Skript zum Extrahieren von Text aus Bildern und gescannten PDF-Dokumenten

Geschrieben von Eric Scheibler am 13.04.2015

Für die Freunde der Textkonsole habe ich ein kleines Shell-Skript erstellt, welches mittels OCR Text aus Bildern und gescannten PDF-Dateien extrahiert. Es können beliebig viele Quelldateien angegeben werden. Die Ergebnisse werden in einer einzelnen Textdatei gesammelt und im Texteditor der Wahl geöffnet oder nach stdout weitergeleitet. Für die Texterkennung kommt Tesseract zum Einsatz.

Ein paar Beispiele:

ocr image.jpg
ocr image.png document.pdf
ocr -l eng image*
ocr -s document.pdf | grep -i search

Download: ocr

#!/bin/bash

# This script uses Tesseract to extract text from images and PDF files
# Open the results in your favorite text editor or pipe them to stdout
#
# Version: 0.3
# Date:    2017-02-26
# License: GNU General Public License
# Author:  Eric Scheibler
# E-Mail:  email [at] eric-scheibler [dot] de
# URL:     http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install imagemagick, pdftk, poppler-utils and tesseract ...
#       sudo aptitude install imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
#       sudo aptitude install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng

# Defaults; the command line options below may override some of them.
force_ocr=0                                  # 1 = run OCR even on PDFs that already contain text (-f)
text_editor="/usr/bin/vim"                   # emptied by -s, which sends the result to stdout instead
ocr_language="deu"                           # passed to tesseract via -l
psm=3                                        # passed to tesseract via -psm
temp_folder="/tmp/ocr"
result_text_file="$temp_folder/result.txt"

# Print the help text to stdout.
print_usage() {
    cat <<'EOF'
ocr [-f] [-l language] [-p psm] [-s] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

Options:
    -f:        force ocr
    -l lang:   3-letter language code (deu, eng, ...)
    -p psm:    psm value for tesseract (default is 3)
    -s:        Pipe output to stdout

examples
    ocr image1.png image2.jpg
    ocr -l eng -s document.pdf
EOF
}

# Succeed only if $1 is a plain decimal number between 1 and 10.
# The digits-only check comes first on purpose: feeding arbitrary text to
# (( ... )) either raises a syntax error (which an "if" silently treats as
# "false") or evaluates it as an expression. 10# keeps "08"/"09" from being
# read as invalid octal numbers.
is_valid_psm() {
    [[ "$1" =~ ^[0-9]{1,2}$ ]] && (( 10#$1 >= 1 && 10#$1 <= 10 ))
}

# check parameters
# The leading ":" in the option string selects silent error reporting, so
# unknown options and missing arguments are handled by the "\?" and ":" arms.
while getopts ":fhl:p:s" opt; do
    case "$opt" in
        f)
            force_ocr=1
            ;;
        h)
            print_usage
            exit 0
            ;;
        l)
            ocr_language=$OPTARG
            if (( ${#ocr_language} != 3 )); then
                echo "Invalid language parameter $ocr_language" >&2
                exit 1
            fi
            ;;
        p)
            if ! is_valid_psm "$OPTARG"; then
                echo "Invalid value for psm: number between 1 and 10" >&2
                exit 1
            fi
            # normalise, so that e.g. "08" reaches tesseract as "8"
            psm=$(( 10#$OPTARG ))
            ;;
        s)
            text_editor=""
            ;;
        \?)
            echo "Invalid option -$OPTARG" >&2
            exit 1
            ;;
        :)
            echo "Option -$OPTARG requires an argument" >&2
            exit 1
            ;;
    esac
done
# drop the parsed options; only the input files remain in "$@"
shift $((OPTIND - 1))

# At least one input file has to be left after option parsing.
if (( $# == 0 )); then
    echo "Missing file(s)" >&2
    exit 1
fi

# Always start from an empty temp folder: remove what a previous run left
# behind, then create the folder anew. Diagnostics go to stderr so that they
# never end up in a pipe reading the script's stdout.
if [[ -d "$temp_folder" ]] && ! rm -rf -- "$temp_folder"; then
    echo "Deletion of old temporary ocr folder failed" >&2
    exit 2
fi
if ! mkdir -p -- "$temp_folder"; then
    echo "Creation of temporary ocr folder failed" >&2
    exit 2
fi

# Print the text layer of a PDF to stdout, one labelled section per page.
# Globals:   page_number (read and advanced by one for every page printed)
# Arguments: $1 - text in which every page is terminated by a form feed
#                 (this is how the pdftotext output is treated here)
#            $2 - file name to show in the page headers
# The text is split and printed with printf '%s', so backslashes in the text
# and characters such as "&" in the file name are reproduced literally.
emit_text_pdf_pages() {
    local ff=$'\f'
    local remaining label page_text
    # the form feed after the last page is a terminator, not a separator
    remaining=${1%"$ff"}
    label=$2
    while :; do
        page_text=${remaining%%"$ff"*}
        printf -- '-- Page %s  --  File: %s --\n\n%s\n' "$page_number" "$label" "$page_text"
        page_number=$(( page_number + 1 ))
        [[ "$remaining" == *"$ff"* ]] || break
        remaining=${remaining#*"$ff"}
    done
}

page_number=1
for file in "$@"; do
    printf 'Processing file %s\n' "$file" >&2

    # a missing input still gets a numbered entry, so the gap shows up in the result
    if [[ ! -f "$file" ]]; then
        printf -- '-- Page %s --\n%s does not exist or is not a file.\n\n' \
            "$page_number" "$file" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
        continue
    fi

    # if it's a pdf file, check if it already contains text
    # if so, append that text page by page and continue with the next file
    if [[ $force_ocr -eq 0 && "$file" == *.pdf ]]; then
        pdf_contents=$(pdftotext "$file" - 2> /dev/null)
        if [[ -n "$(printf '%s' "$pdf_contents" | tr -dc '[:print:]')" ]]; then
            emit_text_pdf_pages "$pdf_contents" "${file##*/}" >> "$result_text_file"
            continue
        fi
    fi

    # All intermediate files of this input live in a private sub-folder, so the
    # globs below can never pick up $result_text_file or anything else.
    work_dir="${temp_folder:?}/pages"
    rm -rf -- "$work_dir"
    if ! mkdir -p -- "$work_dir"; then
        echo "Creation of temporary ocr folder failed" >&2
        exit 2
    fi

    # if it's a pdf file without text, split it into single pages and convert them to ppm format
    # otherwise copy the file into the work folder
    if [[ "$file" == *.pdf ]]; then
        pdftk "$file" burst output "$work_dir/page-%04d.pdf"
        rm -f -- "$work_dir/doc_data.txt"
        for i in "$work_dir"/*.pdf; do
            [[ -e "$i" ]] || continue   # glob matched nothing
            pdftoppm -r 600 "$i" "${i%.pdf}"
            rm -f -- "$i"
        done
    else
        cp -- "$file" "$work_dir/"
    fi

    # convert to tif
    for i in "$work_dir"/*; do
        [[ -f "$i" ]] || continue
        page_name=${i##*/}
        tif_file="$work_dir/${page_name%.*}.tif"
        convert "$i" -type Grayscale "$tif_file"
        # an input that already is a .tif was converted in place; deleting the
        # "source" would delete the only copy
        [[ "$i" == "$tif_file" ]] || rm -f -- "$i"
    done

    # start tesseract
    # NOTE(review): newer Tesseract releases appear to spell this option
    # "--psm"; confirm against the installed version.
    for i in "$work_dir"/*.tif; do
        [[ -e "$i" ]] || continue
        tesseract -l "$ocr_language" -psm "$psm" "$i" "${i%.tif}" 2> /dev/null \
            || printf 'Text recognition failed for %s (%s)\n' "${i##*/}" "$file" >&2
        rm -f -- "$i"
    done

    # concatenate text files, one numbered section per recognised page
    for i in "$work_dir"/*.txt; do
        [[ -e "$i" ]] || continue
        {
            printf -- '-- Page %s  --  File: %s --\n\n' "$page_number" "${file##*/}"
            cat -- "$i"
        } >> "$result_text_file"
        rm -f -- "$i"
        page_number=$(( page_number + 1 ))
    done

    rm -rf -- "$work_dir"
done

# Present the collected text: hand the result file to the configured editor,
# or - when no editor is set - write it to stdout.
if [[ -n "$text_editor" ]]; then
    "$text_editor" "$result_text_file"
else
    cat "$result_text_file"
fi