Skript zum Extrahieren von Text aus Bildern und gescannten PDF-Dokumenten
Geschrieben von Eric Scheibler am 13.04.2015
Für die Freunde der Textkonsole habe ich ein kleines Shell-Skript erstellt, welches mittels OCR Text aus Bildern und gescannten PDF-Dateien extrahiert. Es können beliebig viele Quelldateien angegeben werden. Die Ergebnisse werden in einer einzelnen Textdatei gesammelt und im Texteditor der Wahl geöffnet oder nach stdout weitergeleitet. Für die Texterkennung kommt Tesseract zum Einsatz.
Ein paar Beispiele:
ocr image.jpg
ocr image.png document.pdf
ocr http://example.org/image.jpg
ocr -l eng -r layout -o image*
ocr document.pdf | grep -i search
Download: ocr
#!/bin/bash
# This script uses Tesseract to extract text from images and PDF files
# Supports local files and URLs
#
# Version: 0.6
# Date: 2024-04-29
# License: GNU General Public License
# Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install curl, imagemagick, pdftk, poppler-utils and tesseract ...
# sudo apt install curl imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
# sudo apt install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng
# Default settings; the command line options may override them.
exclude_page_numbers=0  # 1 (-x): write no "-- Page N --" headers
force_ocr=0             # 1 (-f): run OCR even if a PDF already contains text
open_in_editor=0        # 1 (-o): open the result in $EDITOR instead of printing it
pdf_reading_order=''    # extra pdftotext switch ("-raw" or "-layout"), set by -r
ocr_language="deu"      # value passed to tesseract -l
psm=12                  # value passed to tesseract --psm

# Private temp folder that holds all intermediate files and the collected text.
temp_folder=$(mktemp -d) || {
  echo "Could not create a temporary folder" >&2
  exit 1
}
result_text_file="$temp_folder/result.txt"
# Single quotes: the path is expanded (and quoted) when the trap fires, so a
# temp path containing spaces or glob characters is still removed correctly.
trap 'rm -rf -- "$temp_folder"' EXIT
# Deletes the regular file named by the first argument.
# Anything else (empty argument, missing path, directory) is left untouched
# and reported as success.
cleanup() {
  local target=$1
  [[ -f "$target" ]] || return 0
  rm "$target"
}
# check parameters

# Prints the usage text to stdout.
print_usage() {
  cat <<'EOF'
ocr [-f] [-l language] [-o] [-p psm] [-r mode] [-x] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

General options:
  -l lang   3-letter language code (deu, eng, ...)
  -p psm    psm value for tesseract (default is 12)
  -o        Open results in the default text editor
  -x        Exclude page numbers

PDF document options:
  -f        force ocr in already tagged pdf documents
  -r mode   pdf document reading order (layout or raw)

examples
  ocr image1.png image2.jpg
  ocr -l eng document.pdf
  ocr -x https://path_to_image
EOF
}

# Parses the command line options given as arguments.
# Globals written: force_ocr, open_in_editor, ocr_language, psm,
#                  pdf_reading_order, exclude_page_numbers, and OPTIND (left
#                  pointing at the first non-option argument for the caller).
# Exits 0 after printing the help (-h) and 1 on any invalid option or value;
# diagnostics go to stderr because stdout carries the extracted text.
parse_options() {
  local opt
  OPTIND=1
  while getopts ":hfl:p:r:ox" opt; do
    case "$opt" in
      h)
        print_usage
        exit 0
        ;;
      f)
        force_ocr=1
        ;;
      o)
        open_in_editor=1
        ;;
      l)
        ocr_language=$OPTARG
        if (( ${#ocr_language} != 3 )); then
          echo "Invalid language parameter $ocr_language" >&2
          exit 1
        fi
        ;;
      p)
        psm=$OPTARG
        # Check the shape first: feeding arbitrary text to (( )) either
        # raises a syntax error (which would skip the range check) or
        # evaluates it as an arithmetic expression.
        if [[ ! $psm =~ ^[0-9]{1,2}$ ]] || (( 10#$psm < 1 || 10#$psm > 13 )); then
          echo "Invalid value for psm: number between 1 and 13" >&2
          exit 1
        fi
        # normalise e.g. "08" (otherwise read as invalid octal) to "8"
        psm=$((10#$psm))
        ;;
      r)
        pdf_reading_order=$OPTARG
        if [[ $pdf_reading_order != "raw" && $pdf_reading_order != "layout" ]]; then
          echo "Invalid pdftotext layout parameter $pdf_reading_order. Choose between 'layout' and 'raw'" >&2
          exit 1
        fi
        # turn the mode into the pdftotext switch (-raw / -layout)
        pdf_reading_order="-$pdf_reading_order"
        ;;
      x)
        exclude_page_numbers=1
        ;;
      \?)
        echo "Invalid option -$OPTARG" >&2
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed options; what remains are the input files / URLs.
shift $((OPTIND - 1))
if (( $# == 0 )); then
  # stderr, because stdout is reserved for the extracted text
  echo "Missing file(s)" >&2
  exit 1
fi
# Prints the number of pages in a pdftotext dump.
# Arguments: $1 - text as printed by pdftotext (pages separated by form feeds)
# Outputs:   the page count on stdout (0 for empty text)
count_pdf_pages() {
  printf '%s' "$1" | awk -v RS='\f' 'END { print NR }'
}

# Prints a pdftotext dump with a "-- Page N -- File: LABEL --" header in front
# of every page.
# Arguments: $1 - text as printed by pdftotext (pages separated by form feeds)
#            $2 - number of the first page
#            $3 - label printed after "File:"
# The text is written with printf and the label travels through the
# environment, so backslashes, '&' and similar characters in either of them
# are reproduced literally.
format_pdf_pages() {
  printf '%s' "$1" \
    | OCR_PAGE_LABEL=$3 awk -v RS='\f' -v page="$2" '
        {
          sep = (NR > 1) ? "\n" : ""
          printf "%s-- Page %d -- File: %s --\n\n%s", sep, page++, ENVIRON["OCR_PAGE_LABEL"], $0
        }
        END { if (NR > 0) printf "\n" }'
}

# Extracts the text of one input and appends it to $result_text_file.
# Arguments: $1 - local file or http(s) URL
#            $2 - existing, empty scratch directory reserved for this input
# Globals:   reads force_ocr, pdf_reading_order, exclude_page_numbers,
#            ocr_language, psm, result_text_file; advances page_number
# All intermediate files live below $2. Nothing is written to a shared or
# predictable location, and no glob can pick up files of other inputs or the
# result file.
process_input() {
  local input=$1 work_dir=$2
  local file=$input
  local label url_path pdf_contents page_count ocr_dir item

  echo "Processing file $input" >&2

  # download
  if [[ $input == http://* || $input == https://* ]]; then
    # query string and fragment are not part of the file name
    url_path=${input%%[?#]*}
    label=${url_path##*/}
    if [[ -z $label || $label == . || $label == .. ]]; then
      label="downloaded_by_ocr_script"
    fi
    mkdir -- "$work_dir/download" || return 1
    file="$work_dir/download/$label"
    # --fail: do not store (and later OCR) an HTTP error page
    if curl --silent --show-error --fail -o "$file" "$input"; then
      echo "Downloaded $file from $input" >&2
    else
      rm -f -- "$file"
    fi
  else
    # strip path
    label=${input##*/}
  fi

  # check if file exists
  if [[ ! -f $file ]]; then
    printf -- '-- Page %s --\n%s does not exist or is not a file.\n\n' \
      "$page_number" "$input" >> "$result_text_file"
    page_number=$((page_number + 1))
    return 0
  fi

  # A PDF that already contains text is not OCRed (unless forced):
  # its text is copied into the result file as it is.
  if [[ $force_ocr -eq 0 && ${file,,} == *.pdf ]]; then
    pdf_contents=$(pdftotext -q ${pdf_reading_order:+"$pdf_reading_order"} "$file" -)
    if [[ -n $(printf '%s' "$pdf_contents" | tr -dc '[:print:]') ]]; then
      if [[ $exclude_page_numbers -eq 0 ]]; then
        format_pdf_pages "$pdf_contents" "$page_number" "$label" >> "$result_text_file"
      else
        printf '%s\n' "$pdf_contents" >> "$result_text_file"
      fi
      page_count=$(count_pdf_pages "$pdf_contents")
      page_number=$((page_number + page_count))
      return 0
    fi
  fi

  # OCR: a PDF is split into single pages which are rendered to ppm images,
  # any other file is taken as an image.
  ocr_dir="$work_dir/ocr"
  mkdir -- "$ocr_dir" || return 1
  if [[ ${file,,} == *.pdf ]]; then
    # fixed output pattern: the input name may contain '%' or other
    # characters that pdftk would interpret
    pdftk "$file" burst output "$ocr_dir/page-%04d.pdf"
    rm -f -- "$ocr_dir/doc_data.txt"
    for item in "$ocr_dir"/page-*.pdf; do
      [[ -f $item ]] || continue
      pdftoppm -r 600 "$item" "${item%.pdf}"
      rm -f -- "$item"
    done
  else
    cp -- "$file" "$ocr_dir/"
  fi

  # convert to tif; the suffix is appended instead of replacing the extension
  # so that a .tif input is not overwritten and then deleted, and a name
  # without extension is not cut at a dot in the directory part
  for item in "$ocr_dir"/*; do
    [[ -f $item ]] || continue
    convert "$item" -type Grayscale "$item.ocr.tif"
    rm -f -- "$item"
  done

  # start tesseract (its progress messages on stderr are discarded)
  for item in "$ocr_dir"/*.ocr.tif; do
    [[ -f $item ]] || continue
    tesseract -l "$ocr_language" --psm "$psm" "$item" "${item%.ocr.tif}" 2> /dev/null
    rm -f -- "$item"
  done

  # concatenate text files
  for item in "$ocr_dir"/*.txt; do
    [[ -f $item ]] || continue
    if [[ $exclude_page_numbers -eq 0 ]]; then
      printf -- '-- Page %s -- File: %s --\n\n' "$page_number" "$label" >> "$result_text_file"
    else
      printf '\n\n' >> "$result_text_file"
    fi
    cat -- "$item" >> "$result_text_file"
    page_number=$((page_number + 1))
  done
}

# Process all inputs in the given order; pages are numbered continuously
# across inputs.
page_number=1
for file in "$@"; do
  work_dir=$(mktemp -d "$temp_folder/input.XXXXXX") || exit 1
  process_input "$file" "$work_dir"
  # removes the download and all OCR intermediates of this input
  rm -rf -- "$work_dir"
done
# Presents the collected text: opens $result_text_file in $EDITOR when -o was
# given, otherwise prints it to stdout.
# Globals: reads open_in_editor, result_text_file, EDITOR
# Returns: 1 if no result file was produced; exits 1 if -o was given but
#          EDITOR is not set. Diagnostics go to stderr so that they never end
#          up in a pipe that consumes the text.
show_results() {
  if [[ $open_in_editor -eq 1 ]]; then
    if [[ -z $EDITOR ]]; then
      echo "EDITOR not set. Please set your preferred editor in the EDITOR environment variable." >&2
      exit 1
    fi
    "$EDITOR" "$result_text_file"
    return
  fi
  if [[ ! -e $result_text_file ]]; then
    echo "No text was extracted." >&2
    return 1
  fi
  # pipe to stdout
  cat -- "$result_text_file"
}

show_results