Skript zum Extrahieren von Text aus Bildern und gescannten PDF-Dokumenten
Geschrieben von Eric Scheibler am 13.04.2015
Für die Freunde der Textkonsole habe ich ein kleines Shell-Skript erstellt, welches mittels OCR Text aus Bildern und gescannten PDF-Dateien extrahiert. Es können beliebig viele Quelldateien angegeben werden. Die Ergebnisse werden in einer einzelnen Textdatei gesammelt und im Texteditor der Wahl geöffnet oder nach stdout weitergeleitet. Für die Texterkennung kommt Tesseract zum Einsatz.
Ein paar Beispiele:
ocr image.jpg
ocr image.png document.pdf
ocr http://example.org/image.jpg
ocr -l eng image*
ocr -s document.pdf | grep -i search
Download: ocr
#!/bin/bash
# This script uses Tesseract to extract text from images and PDF files
# Supports local files and URLs
# Open the results in your favorite text editor or pipe them to stdout
#
# Version: 0.4
# Date: 2021-02-16
# License: GNU General Public License
# Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install curl, imagemagick, pdftk, poppler-utils and tesseract ...
# sudo aptitude install curl imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
# sudo aptitude install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng
# Default settings. The first five can be changed with command line options.

# 1 = always run OCR on PDF files, even if they already contain text (-f)
force_ocr=0
# language passed to tesseract via -l (-l)
ocr_language="deu"
# page segmentation mode passed to tesseract via --psm (-p)
psm=12
# extra pdftotext option; becomes '-raw' with -r
pdf_with_text_layout=""
# program that opens the result; emptied by -s to print to stdout instead
text_editor="/usr/bin/vim"

# working directory and the file that collects the text of all inputs
temp_folder="/tmp/ocr"
result_text_file="${temp_folder}/result.txt"
# check parameters
#
# parse_options ARG...
#   Parses the command line flags and stores them in the global settings
#   force_ocr, ocr_language, psm, pdf_with_text_layout and text_editor.
#   Leaves OPTIND pointing at the first non-option argument.
#   Exits with 0 after printing the help (-h) and with 1 on invalid input;
#   error messages go to stderr so they never end up in piped output.
parse_options() {
  local opt
  # remember the configured default before -p can overwrite it,
  # so the help text always shows the real default
  local default_psm=$psm
  OPTIND=1
  while getopts ":hfl:p:rs" opt; do
    case "$opt" in
      h)
        echo -e "ocr [-f] [-l language] [-p psm] [-r] [-s] file_1 [file_2 ... file_n]"
        echo -e "This script extracts text from image and PDF files\n"
        echo -e "Options:\n -f: force ocr\n -l lang: 3-letter language code (deu, eng, ...)\n -p psm: psm value for tesseract, 1 to 13 (default is ${default_psm})\n -r: raw layout\n -s: Pipe output to stdout\n"
        echo -e "examples\n ocr image1.png image2.jpg\n ocr -l eng -s document.pdf"
        exit 0
        ;;
      f)
        force_ocr=1
        ;;
      l)
        ocr_language=$OPTARG
        if (( ${#ocr_language} != 3 )); then
          echo "Invalid language parameter $ocr_language" >&2
          exit 1
        fi
        ;;
      p)
        psm=$OPTARG
        # Only plain digits may reach the arithmetic test below; 10# keeps
        # values such as "08" from being read as (invalid) octal numbers.
        if [[ ! $psm =~ ^[0-9]{1,2}$ ]] || (( 10#$psm < 1 || 10#$psm > 13 )); then
          echo "Invalid value for psm: number between 1 and 13" >&2
          exit 1
        fi
        psm=$(( 10#$psm ))
        ;;
      r)
        pdf_with_text_layout='-raw'
        ;;
      s)
        text_editor=""
        ;;
      \?)
        echo "Invalid option -$OPTARG" >&2
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# drop the parsed options, only the input files remain
shift $((OPTIND - 1))
# At least one input file or URL has to remain after option parsing.
# Diagnostics go to stderr so they are not mixed into output piped with -s.
if (( $# == 0 )); then
  echo "Missing file(s)" >&2
  exit 1
fi

# Start with an empty temporary folder: remove what an earlier run left behind ...
if [[ -d "$temp_folder" ]] && ! rm -f -R -- "$temp_folder"; then
  echo "Deletion of old temporary ocr folder failed" >&2
  exit 2
fi
# ... and create it again
if ! mkdir -p -- "$temp_folder"; then
  echo "Creation of temporary ocr folder failed" >&2
  exit 2
fi
# cleanup PATH
#   Removes PATH if it is a regular file; anything else (empty string,
#   missing file, directory) is silently left alone.
cleanup() {
  [[ -f "$1" ]] || return 0
  rm "$1"
}
page_number=1
for file in "$@"
do
echo "Processing file $file" >&2
# strip path
base_filename=${file##*/}
# download
downloaded_into=""
if [[ $file == http://* || $file == https://* ]]; then
if [ -z $base_filename ]; then
base_filename = "downloaded_by_ocr_script"
fi
downloaded_into="/tmp/$base_filename"
curl --silent --show-error -o "$downloaded_into" "$file"
echo "Downloaded $downloaded_into from $file" >&2
file="$downloaded_into"
fi
# check if file exists
if [ ! -f "$file" ]; then
echo -e "-- Page $page_number --\n$file does not exist or is not a file.\n" >> "$result_text_file"
page_number=$(( $page_number + 1 ))
continue
fi
# if it's a pdf file, check if it already contains text
# if so, copy the text into the corresponding text file and continue with the next one
if [[ $force_ocr -eq 0 && "$file" == *.pdf ]]; then
pdf_contents=$(pdftotext -q $pdf_with_text_layout "$file" -)
if [ ! -z "$(echo "$pdf_contents" | tr -dc '[:print:]')" ]; then
echo -e "-- Page $page_number -- File: ${file##*/} --\n\n$pdf_contents" \
| sed -e 's/\f$//g' \
| sed -e 's/\f/\n-- Page '$page_number' -- File: '"${file##*/}"' --\n\n/g' \
| perl -pe's/(?<=-- Page )(\d+)/++$page_number/e' >> "$result_text_file"
page_number=$((page_number + $(echo -e "$pdf_contents" | grep -P "\f" | wc -l) ))
cleanup "$downloaded_into"
continue
fi
fi
# if it's a pdf file without text, split it into single pages and convert them to ppm format
# otherweise copy the file into the ocr temp folder
#
# strip file extension
base_filename=${base_filename%.*}
if [[ "$file" == *.pdf ]]; then
pdftk "$file" burst output "$temp_folder/$base_filename-p%04d.pdf"
rm -f "$temp_folder/doc_data.txt"
for i in "$temp_folder/$base_filename"* ; do
if [[ "$i" == *.pdf ]]; then
pdftoppm -r 600 "$i" "${i%.pdf}"
rm -f "$i"
fi
done
else
cp "$file" "$temp_folder"
fi
# convert to tif
for i in "$temp_folder/$base_filename"* ; do
convert "$i" -type Grayscale "${i%.*}.tif"
rm -f "$i"
done
# start tesseract
for i in "$temp_folder/$base_filename"* ; do
if [[ "$i" == *.tif ]]; then
tesseract -l $ocr_language --psm $psm "$i" "${i%.tif}" 2> /dev/null
rm -f "$i"
fi
done
# concatenate text files
for i in "$temp_folder/$base_filename"* ; do
if [[ "$i" == *.txt ]]; then
echo -e "-- Page $page_number -- File: ${file##*/} --\n" | cat - "$i" >> "$result_text_file"
rm -f "$i"
page_number=$(( $page_number + 1 ))
fi
done
# cleanup download from above
cleanup "$downloaded_into"
done
# Present the collected text: open it in the configured editor, or write it
# to stdout when no editor is set (option -s empties text_editor).
if [[ -n "$text_editor" ]]; then
  "$text_editor" "$result_text_file"
else
  cat "$result_text_file"
fi