Skript zum Extrahieren von Text aus Bildern und gescannten PDF-Dokumenten

Geschrieben von Eric Scheibler am 13.04.2015

Für die Freunde der Textkonsole habe ich ein kleines Shell-Skript erstellt, welches mittels OCR Text aus Bildern und gescannten PDF-Dateien extrahiert. Es können beliebig viele Quelldateien angegeben werden. Die Ergebnisse werden in einer einzelnen Textdatei gesammelt und im Texteditor der Wahl geöffnet oder nach stdout weitergeleitet. Für die Texterkennung kommt Tesseract zum Einsatz.

Ein paar Beispiele:

ocr image.jpg
ocr image.png document.pdf
ocr http://example.org/image.jpg
ocr -l eng -r layout -o image*
ocr document.pdf | grep -i search

Download: ocr

#!/bin/bash

# This script uses Tesseract to extract text from images and PDF files
# Supports local files and URLs
#
# Version: 0.6
# Date:    2024-04-29
# License: GNU General Public License
# Author:  Eric Scheibler
# E-Mail:  email [at] eric-scheibler [dot] de
# URL:     http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install curl, imagemagick, pdftk, poppler-utils and tesseract ...
#       sudo apt install curl imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
#       sudo apt install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng

# Default settings; the command line options parsed further down override them.
exclude_page_numbers=0   # 1 (-x): omit the "-- Page N --" headers
force_ocr=0              # 1 (-f): run OCR on PDF files even if they contain text
open_in_editor=0         # 1 (-o): open the result in $EDITOR instead of printing it
pdf_reading_order=''     # "-raw" or "-layout" (-r), handed to pdftotext
ocr_language="deu"       # tesseract language code (-l)
psm=12                   # tesseract page segmentation mode (-p)

# Temp folder and result file.
# Abort if the folder cannot be created: otherwise every path derived from
# $temp_folder would silently point into the file system root.
temp_folder=$(mktemp -d) || {
    echo "Could not create a temporary folder" >&2
    exit 1
}
result_text_file="$temp_folder/result.txt"
# Single quotes defer the expansion until the trap runs and keep the path
# quoted, so a temp path containing whitespace is removed as one argument.
trap 'rm -rf -- "$temp_folder"' EXIT

# Remove a single regular file if it exists.
# Arguments: $1 - path of the file; an empty string, a missing path or a
#                 directory is silently ignored.
# Returns:   0 if there was nothing to remove, otherwise the status of rm.
cleanup() {
    if [[ -f "$1" ]]; then
        # "--" keeps names that start with a dash from being parsed as options
        rm -- "$1"
    fi
}


# check parameters

# Print the command line help to stdout.
print_usage() {
    cat <<'EOF'
ocr [-h] [-f] [-l language] [-o] [-p psm] [-r mode] [-x] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

General options:
    -h         Show this help
    -l lang    3-letter language code (deu, eng, ...)
    -p psm     psm value for tesseract (default is 12)
    -o         Open results in the text editor set in $EDITOR
    -x         Exclude page numbers

PDF document options:
    -f         force ocr in already tagged pdf documents
    -r mode    pdf document reading order (raw or layout)

examples
    ocr image1.png image2.jpg
    ocr -l eng document.pdf
    ocr -x https://path_to_image
EOF
}

# Parse the command line options.
# Globals:   writes force_ocr, open_in_editor, ocr_language, psm,
#            pdf_reading_order, exclude_page_numbers and OPTIND
# Arguments: the script's command line ("$@")
# Outputs:   help text on stdout (-h), diagnostics on stderr
# Returns:   exits 0 after -h and 1 on any invalid option or value; otherwise
#            returns with OPTIND pointing at the first non-option argument
parse_options() {
    local opt
    OPTIND=1
    while getopts ":hfl:p:r:ox" opt; do
        case "$opt" in
            h)
                print_usage
                exit 0
                ;;
            f)
                force_ocr=1
                ;;
            o)
                open_in_editor=1
                ;;
            l)
                ocr_language=$OPTARG
                if (( ${#ocr_language} != 3 )); then
                    echo "Invalid language parameter $ocr_language" >&2
                    exit 1
                fi
                ;;
            p)
                psm=$OPTARG
                # Only plain digits may reach the arithmetic below: (( )) would
                # otherwise evaluate arbitrary expressions taken from the
                # command line. 10# forces base 10 so "08" is not read as octal.
                if [[ ! $psm =~ ^[0-9]{1,2}$ ]] || (( 10#$psm < 1 || 10#$psm > 13 )); then
                    echo "Invalid value for psm: number between 1 and 13" >&2
                    exit 1
                fi
                psm=$(( 10#$psm ))
                ;;
            r)
                if [[ "$OPTARG" != "raw" && "$OPTARG" != "layout" ]]; then
                    echo "Invalid pdftotext layout parameter $OPTARG. Choose between 'layout' and 'raw'" >&2
                    exit 1
                fi
                # stored with the leading dash so it can be handed to pdftotext as is
                pdf_reading_order="-$OPTARG"
                ;;
            x)
                exclude_page_numbers=1
                ;;
            \?)
                echo "Invalid option -$OPTARG" >&2
                exit 1
                ;;
            :)
                echo "Option -$OPTARG requires an argument" >&2
                exit 1
                ;;
        esac
    done
}

parse_options "$@"
# drop the parsed options; the remaining arguments are the input files / URLs
shift $((OPTIND - 1))

# at least one input file or URL is required
if (( $# == 0 )); then
    echo "Missing file(s)" >&2
    exit 1
fi

# Process every input in order and append its text to $result_text_file.
# page_number is a running counter across all inputs. Each input is either
#   - reported as missing,
#   - a PDF with a text layer, whose text is taken from pdftotext, or
#   - rendered / converted to TIFF and passed through tesseract.
page_number=1
for file in "$@"; do
    echo "Processing file $file" >&2

    # download
    downloaded_into=""
    if [[ $file == http://* || $file == https://* ]]; then
        # name the download after the last URL path segment
        base_filename=${file##*/}
        if [[ -z "$base_filename" ]]; then
            base_filename="downloaded_by_ocr_script"
        fi
        # Download into the private temp folder: a fixed /tmp/<name> path could
        # overwrite (and afterwards delete) an unrelated file of that name.
        download_folder="$temp_folder/downloads"
        mkdir -p -- "$download_folder"
        downloaded_into="$download_folder/$base_filename"
        if curl --silent --show-error --fail -o "$downloaded_into" "$file"; then
            echo "Downloaded $downloaded_into from $file" >&2
        else
            echo "Download of $file failed" >&2
            # drop partial data; the existence check below records the failure
            rm -f -- "$downloaded_into"
        fi
        file="$downloaded_into"
    fi

    # check if file exists
    if [[ ! -f "$file" ]]; then
        printf -- '-- Page %s --\n%s does not exist or is not a file.\n\n' \
            "$page_number" "$file" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
        continue
    fi

    # keep a leading dash in a relative name from being parsed as an option
    if [[ $file == -* ]]; then
        file="./$file"
    fi
    source_name=${file##*/}

    # if it's a pdf file, check if it already contains text
    # if so, copy the text into the result file and continue with the next one
    if [[ $force_ocr -eq 0 && "${file,,}" == *.pdf ]]; then
        pdf_contents=$(pdftotext -q ${pdf_reading_order:+"$pdf_reading_order"} "$file" -)
        if [[ -n "$(printf '%s' "$pdf_contents" | tr -dc '[:print:]')" ]]; then
            # found some text
            # Form feeds in the pdftotext output separate the pages. Drop one
            # trailing form feed so that it does not count as an extra page.
            pdf_body=${pdf_contents%$'\f'}
            pdf_pages=$(( $(printf '%s' "$pdf_body" | tr -dc '\f' | wc -c) + 1 ))
            if [[ $exclude_page_numbers -eq 0 ]]; then
                # One header per form-feed separated page, numbered from the
                # running counter. The file name travels through the
                # environment so awk does not interpret backslashes in it.
                printf '%s' "$pdf_body" \
                    | OCR_SOURCE_NAME="$source_name" awk -v first_page="$page_number" '
                        BEGIN { RS = "\f" }
                        {
                            printf "-- Page %d  --  File: %s --\n\n%s\n",
                                first_page + NR - 1, ENVIRON["OCR_SOURCE_NAME"], $0
                        }' >> "$result_text_file"
            else
                printf '%s\n' "$pdf_contents" >> "$result_text_file"
            fi
            page_number=$(( page_number + pdf_pages ))
            cleanup "$downloaded_into"
            continue
        fi
    fi

    # Every input gets its own work folder with one sub folder per stage.
    # Intermediate files therefore never collide with $result_text_file or
    # with each other (e.g. a TIFF input and its converted TIFF).
    if ! work_folder=$(mktemp -d "$temp_folder/ocr.XXXXXX"); then
        echo "Could not create a work folder for $file" >&2
        cleanup "$downloaded_into"
        continue
    fi
    mkdir "$work_folder/src" "$work_folder/tif"

    # if it's a pdf file without text, split it into single pages and convert them to ppm format
    # otherwise copy the file into the work folder
    if [[ "${file,,}" == *.pdf ]]; then
        pdftk "$file" burst output "$work_folder/src/page-%04d.pdf"
        # remove the burst report in case pdftk wrote it next to the pages
        rm -f -- "$work_folder/src/doc_data.txt"
        for i in "$work_folder/src/"*.pdf; do
            [[ -e "$i" ]] || continue
            pdftoppm -r 600 "$i" "${i%.pdf}"
            rm -f -- "$i"
        done
    else
        # Copy under a neutral name; only a plain alphanumeric extension is
        # kept. This keeps hidden files visible to the glob below.
        extension=${source_name##*.}
        if [[ $source_name == *.* && $extension =~ ^[[:alnum:]]+$ ]]; then
            cp -- "$file" "$work_folder/src/image.$extension"
        else
            cp -- "$file" "$work_folder/src/image"
        fi
    fi

    # convert to tif
    for i in "$work_folder/src/"*; do
        [[ -e "$i" ]] || continue
        page_name=${i##*/}
        convert "$i" -type Grayscale "$work_folder/tif/${page_name%.*}.tif"
        rm -f -- "$i"
    done

    # start tesseract (its diagnostics are discarded on purpose)
    for i in "$work_folder/tif/"*.tif; do
        [[ -e "$i" ]] || continue
        tesseract -l "$ocr_language" --psm "$psm" "$i" "${i%.tif}" 2> /dev/null
        rm -f -- "$i"
    done

    # concatenate text files, one page per recognised image
    for i in "$work_folder/tif/"*.txt; do
        [[ -e "$i" ]] || continue
        if [[ $exclude_page_numbers -eq 0 ]]; then
            printf -- '-- Page %s  --  File: %s --\n\n' "$page_number" "$source_name" >> "$result_text_file"
        else
            printf '\n\n' >> "$result_text_file"
        fi
        cat -- "$i" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
    done
    rm -rf -- "$work_folder"

    # cleanup download from above
    cleanup "$downloaded_into"
done

# Deliver the collected text.
# Globals: open_in_editor (read), EDITOR (read), result_text_file (read)
# Outputs: the content of $result_text_file on stdout unless open_in_editor
#          is 1; in that case $EDITOR is started on the file instead.
# Returns: exits 1 if an editor is requested but EDITOR is empty; otherwise
#          the status of the editor or of cat.
emit_results() {
    if [[ $open_in_editor -eq 1 ]]; then
        if [[ -z "$EDITOR" ]]; then
            # diagnostics belong on stderr; stdout is reserved for the extracted text
            echo "EDITOR not set. Please set your preferred editor in the EDITOR environment variable." >&2
            exit 1
        fi
        "$EDITOR" "$result_text_file"
    else
        # pipe to stdout
        cat -- "$result_text_file"
    fi
}

emit_results