Script to extract text from images and scanned PDF files
Posted by Eric Scheibler at April 13, 2015
For friends of the text console, I’ve created a small shell script that extracts text from images and scanned PDF files. You can specify as many input files as you want; the results are merged into a single text file, which you can open in your favorite text editor or pipe to stdout. The text recognition is done by Tesseract.
Some examples:
ocr image.jpg
ocr image.png document.pdf
ocr http://example.org/image.jpg
ocr -l eng -r layout -o image*
ocr document.pdf | grep -i search
Download: ocr
#!/bin/bash
# This script uses Tesseract to extract text from images and PDF files
# Supports local files and URLs
#
# Version: 0.6
# Date: 2024-04-29
# License: GNU General Public License
# Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install curl, imagemagick, pdftk, poppler-utils and tesseract ...
# sudo apt install curl imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
# sudo apt install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng
# Default settings; the command-line options parsed below override them.
exclude_page_numbers=0 # 1: leave the "-- Page N --" headers out of the result
force_ocr=0            # 1: run OCR even on PDFs that already contain text
open_in_editor=0       # 1: open the result in $EDITOR instead of printing it
pdf_reading_order=''   # extra pdftotext switch ("-raw" or "-layout"), empty for none
ocr_language="deu"     # value handed to tesseract -l
psm=12                 # value handed to tesseract --psm

# Private scratch directory; it also holds the merged result file.
temp_folder=$(mktemp -d) || {
  echo "Could not create a temporary directory" >&2
  exit 1
}
result_text_file="$temp_folder/result.txt"
# Single quotes defer the expansion until the trap fires and keep the path
# quoted, so a TMPDIR with spaces or glob characters cannot redirect rm -rf.
trap 'rm -rf -- "$temp_folder"' EXIT
# Delete the path given as $1, but only when it names an existing regular file.
# Anything else (empty string, directory, missing path) is left untouched.
cleanup() {
  local target="$1"
  [ -f "$target" ] || return 0
  rm "$target"
}
# check parameters

# Print the help text (synopsis, options, examples) to stdout.
print_usage() {
  cat << 'EOF'
ocr [-f] [-l language] [-o] [-p psm] [-r mode] [-x] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

General options:
-l lang 3-letter language code (deu, eng, ...); join several with + (deu+eng)
-p psm psm value for tesseract (default is 12)
-o Open results in the default text editor
-x Exclude page numbers

PDF document options:
-f force ocr in already tagged pdf documents
-r mode pdf document reading order (raw or layout)

examples
ocr image1.png image2.jpg
ocr -l eng document.pdf
ocr -x https://path_to_image
EOF
}

# Succeed when $1 looks like a tesseract language spec: one or more names made
# of letters, digits, "_" or "/", joined by "+" (deu, deu_frak, deu+eng).
is_valid_language() {
  local spec_re='^[A-Za-z0-9_/]+([+][A-Za-z0-9_/]+)*$'
  [[ $1 =~ $spec_re ]]
}

# Succeed when $1 is a plain decimal number between 1 and 13.
# The digits-only test runs first so that user input is never evaluated as an
# arithmetic expression; 10# keeps values such as "08" from being read as octal.
is_valid_psm() {
  local digits_re='^[0-9]{1,2}$'
  [[ $1 =~ $digits_re ]] && (( 10#$1 >= 1 && 10#$1 <= 13 ))
}

# The leading ":" selects silent error reporting, so unknown options and
# missing arguments arrive in the "\?" and ":" arms below.
# Diagnostics go to stderr to keep them out of piped result text.
while getopts ":hfl:p:r:ox" opt; do
  case "$opt" in
    h)
      print_usage
      exit 0
      ;;
    f)
      force_ocr=1
      ;;
    o)
      open_in_editor=1
      ;;
    l)
      if ! is_valid_language "$OPTARG"; then
        echo "Invalid language parameter $OPTARG" >&2
        exit 1
      fi
      ocr_language=$OPTARG
      ;;
    p)
      if ! is_valid_psm "$OPTARG"; then
        echo "Invalid value for psm: number between 1 and 13" >&2
        exit 1
      fi
      # normalise, e.g. "08" -> 8
      psm=$((10#$OPTARG))
      ;;
    r)
      case "$OPTARG" in
        raw | layout)
          # stored with its dash, ready to be passed to pdftotext
          pdf_reading_order="-$OPTARG"
          ;;
        *)
          echo "Invalid pdftotext layout parameter $OPTARG. Choose between 'layout' and 'raw'" >&2
          exit 1
          ;;
      esac
      ;;
    x)
      exclude_page_numbers=1
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument" >&2
      exit 1
      ;;
  esac
done
# drop the parsed options; "$@" now holds only the input files / URLs
shift $((OPTIND - 1))
# At least one input file or URL must remain after the options.
# The message goes to stderr so it cannot end up in a pipe that expects
# extracted text (e.g. "ocr ... | grep").
if (( $# == 0 )); then
  echo "Missing file(s)" >&2
  exit 1
fi
# Print pdftotext output ($3) with a "-- Page N -- File: F --" header in front
# of every page. Pages are separated by form feeds, numbering starts at $1 and
# $2 is the file name shown in the headers.
format_pdf_pages() {
  local first_page=$1 name=$2 text=${3%$'\f'}
  # The name travels through the environment because awk -v would interpret
  # backslash escapes inside it.
  printf '%s' "$text" | OCR_FILE_NAME=$name awk -v page="$first_page" '
    BEGIN { RS = "\f"; ORS = "" }
    {
      if (NR > 1) print "\n"
      print "-- Page " page " -- File: " ENVIRON["OCR_FILE_NAME"] " --\n\n" $0
      page++
    }
    END { print "\n" }'
}

# Print the number of pages in pdftotext output ($1): one more than the number
# of form feeds left after dropping a single trailing form feed.
count_pdf_pages() {
  local text=${1%$'\f'} feeds
  feeds=$(printf '%s' "$text" | tr -cd '\f' | wc -c)
  echo $((feeds + 1))
}

# Remove the per-download directory $1, if one was created.
discard_download() {
  if [[ -n $1 ]]; then
    rm -rf -- "$1"
  fi
}

page_number=1
for file in "$@"; do
  echo "Processing file $file" >&2
  source_name=$file
  # strip path
  base_filename=${file##*/}

  # download
  download_dir=""
  if [[ $file == http://* || $file == https://* ]]; then
    # a URL ending in "/" leaves no file name
    if [[ -z $base_filename ]]; then
      base_filename="downloaded_by_ocr_script"
    fi
    # A fresh directory below $temp_folder replaces the predictable
    # /tmp/<name> path, so a stale or planted file can never be picked up.
    if ! download_dir=$(mktemp -d "$temp_folder/download.XXXXXX"); then
      echo "Could not create a download directory for $file" >&2
      exit 1
    fi
    downloaded_into="$download_dir/$base_filename"
    # --fail: do not store HTTP error pages; --location: follow redirects
    if curl --silent --show-error --fail --location -o "$downloaded_into" "$file"; then
      echo "Downloaded $downloaded_into from $file" >&2
    else
      echo "Download of $file failed" >&2
      rm -f -- "$downloaded_into"
    fi
    file="$downloaded_into"
  fi

  # check if file exists
  if [[ ! -f $file ]]; then
    printf '%s\n%s\n\n' "-- Page $page_number --" \
      "$source_name does not exist or is not a file." >> "$result_text_file"
    page_number=$((page_number + 1))
    discard_download "$download_dir"
    continue
  fi

  # if it's a pdf file, check if it already contains text
  # if so, append that text to the result and continue with the next file
  if [[ $force_ocr -eq 0 && "${file,,}" == *.pdf ]]; then
    pdf_contents=$(pdftotext -q ${pdf_reading_order:+"$pdf_reading_order"} "$file" -)
    if [[ -n "$(printf '%s' "$pdf_contents" | tr -dc '[:print:]')" ]]; then
      # found some text
      if [[ $exclude_page_numbers -eq 0 ]]; then
        format_pdf_pages "$page_number" "${file##*/}" "$pdf_contents" >> "$result_text_file"
      else
        # printf keeps backslashes in the extracted text literal
        printf '%s\n' "$pdf_contents" >> "$result_text_file"
      fi
      page_number=$((page_number + $(count_pdf_pages "$pdf_contents")))
      discard_download "$download_dir"
      continue
    fi
  fi

  # OCR path: every input gets its own work directory, so the globs below
  # can only match files that belong to this input.
  # strip file extension
  base_filename=${base_filename%.*}
  if ! work_dir=$(mktemp -d "$temp_folder/work.XXXXXX"); then
    echo "Could not create a work directory for $source_name" >&2
    exit 1
  fi

  # a pdf file without text is split into single pages and rendered to ppm,
  # anything else is copied into the work directory as it is
  if [[ "${file,,}" == *.pdf ]]; then
    pdftk "$file" burst output "$work_dir/$base_filename-p%04d.pdf"
    rm -f -- "$work_dir/doc_data.txt"
    for i in "$work_dir/$base_filename"*; do
      if [[ $i == *.pdf ]]; then
        pdftoppm -r 600 "$i" "${i%.pdf}"
        rm -f -- "$i"
      fi
    done
  else
    cp -- "$file" "$work_dir"
  fi

  # convert to tif
  for i in "$work_dir/$base_filename"*; do
    # an unmatched glob stays literal; skip it
    [[ -f $i ]] || continue
    # take the extension off the file name only: the directory part
    # contains a dot as well
    page_name=${i##*/}
    tif_file="$work_dir/${page_name%.*}.tif"
    convert "$i" -type Grayscale "$tif_file"
    # an input that already is a .tif was converted in place; keep it
    if ! [[ $i -ef $tif_file ]]; then
      rm -f -- "$i"
    fi
  done

  # start tesseract
  for i in "$work_dir/$base_filename"*; do
    if [[ $i == *.tif ]]; then
      # stderr is discarded, as before
      tesseract -l "$ocr_language" --psm "$psm" "$i" "${i%.tif}" 2> /dev/null
      rm -f -- "$i"
    fi
  done

  # concatenate text files
  for i in "$work_dir/$base_filename"*; do
    if [[ $i == *.txt ]]; then
      {
        if [[ $exclude_page_numbers -eq 0 ]]; then
          printf '%s\n\n' "-- Page $page_number -- File: ${file##*/} --"
        else
          printf '\n\n'
        fi
        cat -- "$i"
      } >> "$result_text_file"
      rm -f -- "$i"
      page_number=$((page_number + 1))
    fi
  done

  # cleanup work files and the download from above
  rm -rf -- "$work_dir"
  discard_download "$download_dir"
done
# Run the user's editor on file $1. $EDITOR is used verbatim when it names a
# command; otherwise it is split into words, so values that carry arguments
# (e.g. "emacs -nw") work too.
open_in_user_editor() {
  local -a editor_cmd
  if command -v -- "$EDITOR" > /dev/null 2>&1; then
    editor_cmd=("$EDITOR")
  else
    read -r -a editor_cmd <<< "$EDITOR"
  fi
  "${editor_cmd[@]}" "$1"
}

# Hand the merged text to the user: open it in the editor when
# $open_in_editor is 1, print it to stdout otherwise.
# Returns non-zero when there is no result file to print.
deliver_results() {
  if [[ $open_in_editor -eq 1 ]]; then
    if [[ -z $EDITOR ]]; then
      echo "EDITOR not set. Please set your preferred editor in the EDITOR environment variable." >&2
      exit 1
    fi
    open_in_user_editor "$result_text_file"
  elif [[ -f $result_text_file ]]; then
    # pipe to stdout
    cat -- "$result_text_file"
  else
    # nothing was written to the result file
    echo "No text could be extracted." >&2
    return 1
  fi
}

deliver_results