Unlocking Insights: Converting PDFs to Text with OCR in Python

Portable Document Format (PDF) files have become ubiquitous in sharing and preserving digital documents. Whether it's a research paper, an e-book, or an official report, PDFs offer a consistent format across various platforms.

Why Extracting Text from PDFs Matters

While PDFs are a robust format, extracting text from them is crucial for several reasons. It enables text analysis, data mining, and easy integration into other applications. Additionally, the ability to decipher text from images within PDFs provides a deeper layer of understanding.

Setting Up Your Environment

Installing Necessary Python Libraries

pip install PyMuPDF pytesseract Pillow

Configuring Tesseract OCR

# Tell pytesseract where the Tesseract executable lives (update this with
# your own install path; requires `import pytesseract` beforehand).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

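If Tesseract is not installed yet, download it first (Windows installers are available from the UB Mannheim Tesseract wiki: https://github.com/UB-Mannheim/tesseract/wiki), then set the path above to the installed tesseract.exe.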

The PDF to Text Journey

Extracting Text Using PyMuPDF

import fitz  # PyMuPDF

def extract_text(pdf_path, password=None):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.
        password: Optional password used to unlock an encrypted PDF.

    Returns:
        The text of all pages joined into a single string.

    Raises:
        ValueError: If the PDF needs a password and none, or a wrong one,
            was supplied.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Encrypted documents must be unlocked before pages can be read;
        # authenticate() returns 0 when the password is rejected.
        if pdf_document.needs_pass:
            if not pdf_document.authenticate(password or ""):
                raise ValueError(
                    f"Cannot read encrypted PDF without a valid password: {pdf_path}"
                )
        # Join once at the end instead of growing a string page by page.
        return "".join(page.get_text("text") for page in pdf_document)

Unveiling the Power of Tesseract OCR for Image Text Extraction

from PIL import Image
import pytesseract

def image_to_text(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text pytesseract recognized in the image.
    """
    # The context manager closes the underlying file as soon as OCR is done;
    # a lingering open handle can block deleting the file on Windows.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

Batch Processing PDFs with OCR

Navigating Through a Folder of PDFs

def extract_text_from_pdfs(pdf_folder, output_file):
    """Outline only: batch-process the PDFs in ``pdf_folder`` with OCR.

    This stub shows the function's signature; the complete implementation
    is listed in the full scripts later in this article. The docstring
    also gives the function a body, so the snippet is valid Python.

    Args:
        pdf_folder: Folder that contains the PDF files to process.
        output_file: Path of the text file the results are written to.
    """
    # Code for batch processing PDFs with OCR
    # ...

# Replace 'your_pdf_folder' with the path to the folder containing your PDFs
pdf_folder = 'your_pdf_folder'
# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'

# Run the batch extraction with the two paths configured above
extract_text_from_pdfs(pdf_folder, output_file)

# Tell the user which file the results were written to
print(f"Text extracted from all PDFs and images has been saved to {output_file}")

Optimizing Text Extraction

Handling Different PDF Layouts and Structures

# Example: extracting plain text from a PDF. extract_text() calls
# page.get_text("text"); PyMuPDF's get_text() also accepts other modes
# (e.g. "blocks" or "words") when layout information is needed.
text = extract_text(pdf_path)

Dealing with Encryption in PDFs

# Example: Extracting text from an encrypted PDF
# NOTE(review): this call only works if extract_text() accepts a `password`
# keyword argument — confirm the definition in use supports it.
text = extract_text(pdf_path, password='your_password')

PDF to Text conversion

Single PDF

import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import os
import io  # Import io module for BytesIO
import shutil  # Import shutil for removing the temporary images folder

# Set the path to the Tesseract executable (update this with your path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text(pdf_path, password=None):
    """Return the plain text of every page of a PDF, using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.
        password: Optional password used to unlock an encrypted PDF.

    Returns:
        The text of all pages joined into a single string, in page order.

    Raises:
        ValueError: If the PDF needs a password and none, or a wrong one,
            was supplied.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Encrypted documents must be unlocked before pages can be read;
        # authenticate() returns 0 when the password is rejected.
        if pdf_document.needs_pass:
            if not pdf_document.authenticate(password or ""):
                raise ValueError(
                    f"Cannot read encrypted PDF without a valid password: {pdf_path}"
                )
        # Join once at the end instead of growing a string page by page.
        return "".join(page.get_text("text") for page in pdf_document)

def image_to_text(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text pytesseract recognized in the image.
    """
    # The context manager closes the underlying file as soon as OCR is done;
    # a lingering open handle can block deleting the file on Windows.
    with Image.open(image_path) as img:
        # Perform OCR on the image
        return pytesseract.image_to_string(img)

def extract_text_from_pdfs(pdf_folder, output_file):
    """Append the text and OCR'd image text of every PDF in a folder to a file.

    For each PDF in ``pdf_folder`` the embedded text is written first,
    followed by the OCR result of every image found on each page. Images
    are saved as PNG files in the ``temp_images`` folder before OCR; that
    folder is created if missing but is not removed here, so the caller
    is responsible for deleting it.

    Args:
        pdf_folder: Folder that contains the PDF files to process.
        output_file: Path of the UTF-8 text file the results are appended to.
    """
    temp_dir = 'temp_images'
    # Make sure the scratch folder exists even when the caller did not create it.
    os.makedirs(temp_dir, exist_ok=True)

    # A separate name for the handle keeps the path argument from being shadowed.
    with open(output_file, 'a', encoding='utf-8') as out:
        # Sort for a reproducible order; os.listdir() order is arbitrary.
        for filename in sorted(os.listdir(pdf_folder)):
            # Case-insensitive match so files named "*.PDF" are not skipped.
            if not filename.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(pdf_folder, filename)
            text_from_pdf = extract_text(pdf_path)

            # Write the extracted text from PDF to the output file
            out.write(f"Text from PDF {filename}:\n")
            out.write(text_from_pdf + '\n\n')

            # Extract images from the PDF using PyMuPDF; the context manager
            # closes the document even if image extraction or OCR fails.
            with fitz.open(pdf_path) as pdf_document:
                for page_number in range(pdf_document.page_count):
                    page = pdf_document[page_number]

                    for img_index, img in enumerate(page.get_images(full=True)):
                        # The first item of each entry is the image's xref.
                        xref = img[0]
                        base_image = pdf_document.extract_image(xref)
                        image = Image.open(io.BytesIO(base_image["image"]))

                        # Save the image to a temporary file
                        image_filename = f"{filename}_page{page_number + 1}_img{img_index}.png"
                        image_path = os.path.join(temp_dir, image_filename)
                        image.save(image_path)

                        # Perform OCR on the image
                        text_from_image = image_to_text(image_path)

                        # Write the extracted text from OCR to the output file
                        out.write(f"Text from OCR on image {image_filename}:\n")
                        out.write(text_from_image + '\n\n')

# Path to the folder containing your PDFs (replace with your own folder)
pdf_folder = 'C:/Users/snawa/OneDrive/Documents/GitHub/shopit/mypdf'

# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'

# Create a temporary folder for images
os.makedirs('temp_images', exist_ok=True)

try:
    extract_text_from_pdfs(pdf_folder, output_file)
    print(f"Text extracted from all PDFs and images has been appended to {output_file}")
finally:
    # Remove the temporary images folder even when extraction fails part-way;
    # ignore_errors keeps a cleanup problem from masking the original error.
    shutil.rmtree('temp_images', ignore_errors=True)

Batch PDF conversion

import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import os
import io  # Import io module for BytesIO
import shutil  # Import shutil for removing the temporary images folder

# Set the path to the Tesseract executable (update this with your path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text(pdf_path, password=None):
    """Return the plain text of every page of a PDF, using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.
        password: Optional password used to unlock an encrypted PDF.

    Returns:
        The text of all pages joined into a single string, in page order.

    Raises:
        ValueError: If the PDF needs a password and none, or a wrong one,
            was supplied.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Encrypted documents must be unlocked before pages can be read;
        # authenticate() returns 0 when the password is rejected.
        if pdf_document.needs_pass:
            if not pdf_document.authenticate(password or ""):
                raise ValueError(
                    f"Cannot read encrypted PDF without a valid password: {pdf_path}"
                )
        # Join once at the end instead of growing a string page by page.
        return "".join(page.get_text("text") for page in pdf_document)

def image_to_text(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text pytesseract recognized in the image.
    """
    # The context manager closes the underlying file as soon as OCR is done;
    # a lingering open handle can block deleting the file on Windows.
    with Image.open(image_path) as img:
        # Perform OCR on the image
        return pytesseract.image_to_string(img)

def extract_text_from_pdfs(pdf_folder, output_file):
    """Write the text and OCR'd image text of every PDF in a folder to a file.

    For each PDF in ``pdf_folder`` the embedded text is written first,
    followed by the OCR result of every image found on each page. Any
    existing content of ``output_file`` is overwritten. Images are saved
    as PNG files in the ``temp_images`` folder before OCR; that folder is
    created if missing but is not removed here, so the caller is
    responsible for deleting it.

    Args:
        pdf_folder: Folder that contains the PDF files to process.
        output_file: Path of the UTF-8 text file the results are written to.
    """
    temp_dir = 'temp_images'
    # Make sure the scratch folder exists even when the caller did not create it.
    os.makedirs(temp_dir, exist_ok=True)

    # A separate name for the handle keeps the path argument from being shadowed.
    with open(output_file, 'w', encoding='utf-8') as out:
        # Sort for a reproducible order; os.listdir() order is arbitrary.
        for filename in sorted(os.listdir(pdf_folder)):
            # Case-insensitive match so files named "*.PDF" are not skipped.
            if not filename.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(pdf_folder, filename)
            text_from_pdf = extract_text(pdf_path)

            # Write the extracted text from PDF to the output file
            out.write(f"Text from PDF {filename}:\n")
            out.write(text_from_pdf + '\n\n')

            # Extract images from the PDF using PyMuPDF; the context manager
            # closes the document even if image extraction or OCR fails.
            with fitz.open(pdf_path) as pdf_document:
                for page_number in range(pdf_document.page_count):
                    page = pdf_document[page_number]

                    for img_index, img in enumerate(page.get_images(full=True)):
                        # The first item of each entry is the image's xref.
                        xref = img[0]
                        base_image = pdf_document.extract_image(xref)
                        image = Image.open(io.BytesIO(base_image["image"]))

                        # Save the image to a temporary file
                        image_filename = f"{filename}_page{page_number + 1}_img{img_index}.png"
                        image_path = os.path.join(temp_dir, image_filename)
                        image.save(image_path)

                        # Perform OCR on the image
                        text_from_image = image_to_text(image_path)

                        # Write the extracted text from OCR to the output file
                        out.write(f"Text from OCR on image {image_filename}:\n")
                        out.write(text_from_image + '\n\n')

# Path to the folder containing your PDFs (replace with your own folder)
pdf_folder = 'C:/Users/IndianTechnoEra/Desktop/PdfFolder'

# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'

# Create a temporary folder for images
os.makedirs('temp_images', exist_ok=True)

try:
    extract_text_from_pdfs(pdf_folder, output_file)
    print(f"Text extracted from all PDFs and images has been saved to {output_file}")
finally:
    # Remove the temporary images folder even when extraction fails part-way;
    # ignore_errors keeps a cleanup problem from masking the original error.
    shutil.rmtree('temp_images', ignore_errors=True)

Conclusion

The Synergy of PyMuPDF and Tesseract OCR

By combining the capabilities of PyMuPDF for PDF text extraction and Tesseract OCR for image text extraction, we unlock a comprehensive solution for converting PDFs to text.

Empowering Data Workflows with Extracted Text

The extracted text serves as a foundation for various data workflows, including text analysis, data mining, and insights generation. As we navigate the evolving landscape of digital documents, the ability to unlock insights from PDFs becomes an invaluable asset.

ite2