Script to extract text from images and scanned PDF files

Posted by Eric Scheibler on April 13, 2015

For friends of the text console, I've created a small shell script that extracts text from images and scanned PDF files. You can specify as many input files as you want. The results are merged into a single text file, which you can open in your favorite text editor or pipe to stdout. The text recognition is done by the program Tesseract.

Some examples:

ocr image.jpg
ocr image.png document.pdf
ocr -l eng image*
ocr -s document.pdf | grep -i search

Download: ocr

#!/bin/bash

# This script uses Tesseract to extract text from images and PDF files
# Open the results in your favorite text editor or pipe them to stdout
#
# Version: 0.3
# Date:    2017-02-26
# License: GNU General Public License
# Author:  Eric Scheibler
# E-Mail:  email [at] eric-scheibler [dot] de
# URL:     http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install imagemagick, pdftk, poppler-utils and tesseract ...
#       sudo aptitude install imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
#       sudo aptitude install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng

# Default settings. The option parser further down overrides force_ocr,
# ocr_language, psm and text_editor when the matching switch is given.
# NOTE(review): temp_folder is a fixed path, so all runs share the same folder.
temp_folder='/tmp/ocr'
result_text_file="${temp_folder}/result.txt"
ocr_language='deu'
psm='3'
force_ocr='0'
text_editor='/usr/bin/vim'

# check parameters
#
# Supported switches:
#   -f        sets force_ocr=1
#   -h        prints the usage text and exits with status 0
#   -l lang   sets ocr_language (must be exactly three characters long)
#   -p psm    sets psm (whole number between 1 and 10)
#   -s        clears text_editor so the result is written to stdout
# Invalid input is reported on stderr and ends the script with status 1.
# After the loop the processed options are shifted away, so "$@" holds
# only the input files.

# Prints the usage text to stdout.
print_usage() {
    cat <<'EOF'
ocr [-f] [-l language] [-p psm] [-s] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

Options:
    -f:        force ocr
    -l lang:   3-letter language code (deu, eng, ...)
    -p psm:    psm value for tesseract (default is 3)
    -s:        Pipe output to stdout

examples
    ocr image1.png image2.jpg
    ocr -l eng -s document.pdf
EOF
}

# Succeeds when $1 is a whole number between 1 and 10.
# The value is matched as text and never evaluated arithmetically, so
# input such as "1 2", "08" or "x[$(cmd)]" is rejected instead of causing
# a syntax error or running embedded commands.
is_valid_psm() {
    local psm_pattern='^(10|[1-9])$'
    [[ "$1" =~ $psm_pattern ]]
}

while getopts ":fhl:p:s" opt; do
    case "$opt" in
        f)
            force_ocr=1
            ;;
        h)
            print_usage
            exit 0
            ;;
        l)
            ocr_language=$OPTARG
            if (( ${#ocr_language} != 3 )); then
                echo "Invalid language parameter $ocr_language" >&2
                exit 1
            fi
            ;;
        p)
            psm=$OPTARG
            if ! is_valid_psm "$psm"; then
                echo "Invalid value for psm: number between 1 and 10" >&2
                exit 1
            fi
            ;;
        s)
            # an empty editor means "write the result to stdout"
            text_editor=""
            ;;
        \?)
            echo "Invalid option -$OPTARG" >&2
            exit 1
            ;;
        :)
            echo "Option -$OPTARG requires an argument" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))

# At least one input file is required.
# Diagnostics are written to stderr so that they never end up in the
# text that is piped to stdout.
if (( $# == 0 )); then
    echo "Missing file(s)" >&2
    exit 1
fi

# Start from an empty temporary folder: remove one that already exists
# and create it again. Either step failing ends the script with status 2.
if [ -d "$temp_folder" ]; then
    # ":?" aborts instead of expanding to an empty path for the recursive rm
    if ! rm -f -R -- "${temp_folder:?}"; then
        echo "Deletion of old temporary ocr folder failed" >&2
        exit 2
    fi
fi
if ! mkdir -p -- "$temp_folder"; then
    echo "Creation of temporary ocr folder failed" >&2
    exit 2
fi

# ---------------------------------------------------------------------------
# Process every input file and append its text to $result_text_file.
#
# Reads:  force_ocr, ocr_language, psm, temp_folder, result_text_file
# Writes: page_number (running page counter across all input files)
#
# Every page is introduced by a "-- Page N  --  File: name --" header.
# PDF files that already contain text are copied via pdftotext; everything
# else is converted to grayscale TIFF and handed to tesseract.
# ---------------------------------------------------------------------------

# Succeeds when the file name $1 ends in ".pdf", ignoring case.
is_pdf_name() {
    [[ "$1" == *.[pP][dD][fF] ]]
}

# Prints the number of pages in pdftotext output $1, i.e. the number of
# form feed separated records.
count_pdf_text_pages() {
    printf '%s' "$1" | awk 'BEGIN { RS = "\f" } END { print NR }'
}

# Prints pdftotext output $1 with a page header in front of every page.
#   $1  text as returned by pdftotext (pages separated by form feeds)
#   $2  number of the first page
#   $3  file name shown in the headers
# The text is passed through unchanged (no backslash interpretation) and
# the file name is handed over via the environment, so characters such as
# "&" or "\" in either of them are kept literally.
format_pdf_text() {
    printf '%s' "$1" \
        | OCR_PAGE_FILE_NAME=$3 awk -v first_page="$2" '
            BEGIN { RS = "\f" }
            {
                printf "-- Page %d  --  File: %s --\n\n%s\n", \
                    first_page + NR - 1, ENVIRON["OCR_PAGE_FILE_NAME"], $0
            }'
}

# Prints the tesseract switch for the page segmentation mode that fits the
# "tesseract --version" output given in $1: "--psm" for major version 4 and
# later, otherwise the older "-psm" spelling.
psm_option_for_version() {
    local version_pattern='^tesseract[[:space:]]+v?([0-9]+)'
    if [[ "$1" =~ $version_pattern ]] && (( 10#${BASH_REMATCH[1]} >= 4 )); then
        printf '%s\n' "--psm"
    else
        printf '%s\n' "-psm"
    fi
}

psm_option=$(psm_option_for_version "$(tesseract --version 2>&1)")

# Intermediate files live in their own sub folder, so an input file called
# "result.*" can never collide with the result text file.
work_folder="$temp_folder/work"

page_number=1
for file in "$@"
do
    # strip path
    file_name=${file##*/}
    # strip file extension
    base_filename=${file_name%.*}
    printf '%s\n' "Processing file $file" >&2

    # check if file exists
    if [ ! -f "$file" ]; then
        printf -- '-- Page %d --\n%s does not exist or is not a file.\n\n' \
            "$page_number" "$file" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
        continue
    fi

    # if it's a pdf file, check if it already contains text
    # if so, copy the text into the result file and continue with the next one
    if [[ $force_ocr -eq 0 ]] && is_pdf_name "$file"; then
        pdf_contents=$(pdftotext "$file" - 2> /dev/null)
        if [[ -n "$(printf '%s' "$pdf_contents" | tr -dc '[:print:]')" ]]; then
            # page headers continue the running page counter
            format_pdf_text "$pdf_contents" "$page_number" "$file_name" >> "$result_text_file"
            page_number=$(( page_number + $(count_pdf_text_pages "$pdf_contents") ))
            continue
        fi
    fi

    # start with an empty work folder for every input file
    rm -f -R -- "${work_folder:?}"
    if ! mkdir -p -- "$work_folder"; then
        echo "Creation of temporary ocr folder failed" >&2
        exit 2
    fi

    # if it's a pdf file without text, split it into single pages and convert them to ppm format
    # otherwise copy the file into the work folder
    if is_pdf_name "$file"; then
        pdftk "$file" burst output "$work_folder/$base_filename-p%04d.pdf"
        rm -f "$work_folder/doc_data.txt"
        for i in "$work_folder/$base_filename"* ; do
            [[ "$i" == *.pdf ]] || continue
            pdftoppm -r 600 "$i" "${i%.pdf}"
            rm -f "$i"
        done
    else
        cp -- "$file" "$work_folder"
    fi

    # convert to tif
    for i in "$work_folder/$base_filename"* ; do
        # an unmatched glob stays a literal pattern; skip it
        [[ -e "$i" ]] || continue
        page_name=${i##*/}
        tif_file="$work_folder/${page_name%.*}.tif"
        convert "$i" -type Grayscale "$tif_file"
        # a .tif input is converted onto itself and must not be deleted
        [[ "$i" == "$tif_file" ]] || rm -f "$i"
    done

    # start tesseract
    for i in "$work_folder/$base_filename"* ; do
        [[ "$i" == *.tif ]] || continue
        tesseract -l "$ocr_language" "$psm_option" "$psm" "$i" "${i%.tif}" 2> /dev/null
        rm -f "$i"
    done

    # concatenate text files
    for i in "$work_folder/$base_filename"* ; do
        [[ "$i" == *.txt ]] || continue
        {
            printf -- '-- Page %d  --  File: %s --\n\n' "$page_number" "$file_name"
            cat "$i"
        } >> "$result_text_file"
        rm -f "$i"
        page_number=$(( page_number + 1 ))
    done
done

# Show the collected text: hand the result file to the configured editor,
# or print it to stdout when no editor is set.
if [[ -n "$text_editor" ]]; then
    "$text_editor" "$result_text_file"
else
    cat "$result_text_file"
fi