Portable Document Format (PDF) files have become ubiquitous for sharing and preserving digital documents. Whether it's a research paper, an e-book, or an official report, PDFs offer a consistent format across various platforms.
Why Extracting Text from PDFs Matters
While PDFs are a robust format, extracting text from them is crucial for several reasons. It enables text analysis, data mining, and easy integration into other applications. Additionally, the ability to decipher text from images within PDFs provides a deeper layer of understanding.
Setting Up Your Environment
Installing Necessary Python Libraries
pip install PyMuPDF pytesseract Pillow
Configuring Tesseract OCR
# Set the path to the Tesseract executable (update this with your path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
The PDF to Text Journey
Extracting Text Using PyMuPDF
import fitz # PyMuPDF
def extract_text(pdf_path, password=None):
    """Return the text layer of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        password: Optional password used to unlock an encrypted PDF. Ignored
            when the document does not require one.

    Returns:
        The concatenation, in page order, of ``page.get_text("text")``.

    Raises:
        ValueError: If the PDF is password-protected and ``password`` is
            missing or rejected.
    """
    with fitz.open(pdf_path) as pdf_document:
        if pdf_document.needs_pass:
            # authenticate() returns 0 when the password is rejected.
            if password is None or not pdf_document.authenticate(password):
                raise ValueError(
                    f"Cannot read encrypted PDF {pdf_path!r}: "
                    "missing or wrong password"
                )
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text("text") for page in pdf_document)
Unveiling the Power of Tesseract OCR for Image Text Extraction
from PIL import Image
import pytesseract
def image_to_text(image_path):
    """Return the text that Tesseract OCR recognises in an image file.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the image file as soon as OCR is done
    # instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
Batch Processing PDFs with OCR
Navigating Through a Folder of PDFs
# NOTE: overview stub only - the function body is elided here ("# ..."), so
# this snippet is not runnable as written; complete versions of
# extract_text_from_pdfs appear in the full listings later in this file.
def extract_text_from_pdfs(pdf_folder, output_file):
# Code for batch processing PDFs with OCR
# ...
# Replace 'your_pdf_folder' with the path to the folder containing your PDFs
pdf_folder = 'your_pdf_folder'
# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'
extract_text_from_pdfs(pdf_folder, output_file)
print(f"Text extracted from all PDFs and images has been saved to {output_file}")
Optimizing Text Extraction
Handling Different PDF Layouts and Structures
# Example: Extracting text in PyMuPDF's plain "text" mode (no layout parameters are passed here)
text = extract_text(pdf_path)
Dealing with Encryption in PDFs
# Example: Extracting text from an encrypted PDF
text = extract_text(pdf_path, password='your_password')
PDF to Text conversion
Single PDF
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
import io # Import io module for BytesIO
import shutil # Import shutil for removing the temporary images folder
# Set the path to the Tesseract executable (update this with your path).
# The override is applied only when that file exists; otherwise pytesseract
# keeps its default command, so the script still works where Tesseract is
# simply available on PATH (e.g. Linux/macOS).
_TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXE
def extract_text(pdf_path, password=None):
    """Return the text layer of every page of a PDF as one string.

    The extraction is done with PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.
        password: Optional password used to unlock an encrypted PDF. Ignored
            when the document does not require one.

    Returns:
        The concatenation, in page order, of ``page.get_text("text")``.

    Raises:
        ValueError: If the PDF is password-protected and ``password`` is
            missing or rejected.
    """
    with fitz.open(pdf_path) as pdf_document:
        if pdf_document.needs_pass:
            # authenticate() returns 0 when the password is rejected.
            if password is None or not pdf_document.authenticate(password):
                raise ValueError(
                    f"Cannot read encrypted PDF {pdf_path!r}: "
                    "missing or wrong password"
                )
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text("text") for page in pdf_document)
def image_to_text(image_path):
    """Return the text that Tesseract OCR recognises in an image file.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the image file as soon as OCR is done, so
    # no handle is left open on a temporary image that is deleted later.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
def extract_text_from_pdfs(pdf_folder, output_file):
    """Append the text and image-OCR text of every PDF in a folder to a file.

    For each file directly inside ``pdf_folder`` whose name ends in ``.pdf``
    (matched case-insensitively), the text layer is written first, followed
    by the Tesseract OCR output of every image embedded in its pages.

    Args:
        pdf_folder: Path of the folder to scan (sub-folders are not visited).
        output_file: Path of the UTF-8 text file to append to.
    """
    # A separate name for the handle keeps the `output_file` path usable.
    with open(output_file, 'a', encoding='utf-8') as out:
        # os.listdir() order is arbitrary; sorting makes the report reproducible.
        for filename in sorted(os.listdir(pdf_folder)):
            if not filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(pdf_folder, filename)
            # Write the extracted text from PDF to the output file
            out.write(f"Text from PDF {filename}:\n")
            out.write(extract_text(pdf_path) + '\n\n')
            _write_image_ocr(pdf_path, filename, out)


def _write_image_ocr(pdf_path, filename, out, temp_dir='temp_images'):
    """OCR every image embedded in one PDF and write the results to ``out``."""
    # Do not rely on the caller having created the temporary folder.
    os.makedirs(temp_dir, exist_ok=True)
    # The context manager closes the document even if decoding or OCR raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = pdf_document.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes))
                # PNG cannot store CMYK pixels, so convert before saving.
                if image.mode == "CMYK":
                    image = image.convert("RGB")
                # Save the image to a temporary file
                image_filename = f"{filename}_page{page_number}_img{img_index}.png"
                image_path = os.path.join(temp_dir, image_filename)
                image.save(image_path)
                # Perform OCR on the image and write the result
                out.write(f"Text from OCR on image {image_filename}:\n")
                out.write(image_to_text(image_path) + '\n\n')
# Replace this path with the path to the folder containing your PDFs
pdf_folder = 'C:/Users/snawa/OneDrive/Documents/GitHub/shopit/mypdf'
# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'
# Create a temporary folder for images
os.makedirs('temp_images', exist_ok=True)
try:
    extract_text_from_pdfs(pdf_folder, output_file)
    print(f"Text extracted from all PDFs and images has been appended to {output_file}")
finally:
    # Remove the temporary images folder, even when extraction fails part-way;
    # cleanup is best-effort so it never hides the original error.
    shutil.rmtree('temp_images', ignore_errors=True)
Batch PDF conversion
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
import io # Import io module for BytesIO
import shutil # Import shutil for removing the temporary images folder
# Set the path to the Tesseract executable (update this with your path).
# The override is applied only when that file exists; otherwise pytesseract
# keeps its default command, so the script still works where Tesseract is
# simply available on PATH (e.g. Linux/macOS).
_TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXE
def extract_text(pdf_path, password=None):
    """Return the text layer of every page of a PDF as one string.

    The extraction is done with PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.
        password: Optional password used to unlock an encrypted PDF. Ignored
            when the document does not require one.

    Returns:
        The concatenation, in page order, of ``page.get_text("text")``.

    Raises:
        ValueError: If the PDF is password-protected and ``password`` is
            missing or rejected.
    """
    with fitz.open(pdf_path) as pdf_document:
        if pdf_document.needs_pass:
            # authenticate() returns 0 when the password is rejected.
            if password is None or not pdf_document.authenticate(password):
                raise ValueError(
                    f"Cannot read encrypted PDF {pdf_path!r}: "
                    "missing or wrong password"
                )
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text("text") for page in pdf_document)
def image_to_text(image_path):
    """Return the text that Tesseract OCR recognises in an image file.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the image file as soon as OCR is done, so
    # no handle is left open on a temporary image that is deleted later.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
def extract_text_from_pdfs(pdf_folder, output_file):
    """Write the text and image-OCR text of every PDF in a folder to a file.

    For each file directly inside ``pdf_folder`` whose name ends in ``.pdf``
    (matched case-insensitively), the text layer is written first, followed
    by the Tesseract OCR output of every image embedded in its pages.

    Args:
        pdf_folder: Path of the folder to scan (sub-folders are not visited).
        output_file: Path of the UTF-8 text file to write; any existing
            content is overwritten.
    """
    # A separate name for the handle keeps the `output_file` path usable.
    with open(output_file, 'w', encoding='utf-8') as out:
        # os.listdir() order is arbitrary; sorting makes the report reproducible.
        for filename in sorted(os.listdir(pdf_folder)):
            if not filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(pdf_folder, filename)
            # Write the extracted text from PDF to the output file
            out.write(f"Text from PDF {filename}:\n")
            out.write(extract_text(pdf_path) + '\n\n')
            _write_image_ocr(pdf_path, filename, out)


def _write_image_ocr(pdf_path, filename, out, temp_dir='temp_images'):
    """OCR every image embedded in one PDF and write the results to ``out``."""
    # Do not rely on the caller having created the temporary folder.
    os.makedirs(temp_dir, exist_ok=True)
    # The context manager closes the document even if decoding or OCR raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = pdf_document.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes))
                # PNG cannot store CMYK pixels, so convert before saving.
                if image.mode == "CMYK":
                    image = image.convert("RGB")
                # Save the image to a temporary file
                image_filename = f"{filename}_page{page_number}_img{img_index}.png"
                image_path = os.path.join(temp_dir, image_filename)
                image.save(image_path)
                # Perform OCR on the image and write the result
                out.write(f"Text from OCR on image {image_filename}:\n")
                out.write(image_to_text(image_path) + '\n\n')
# Replace this path with the path to the folder containing your PDFs
pdf_folder = 'C:/Users/IndianTechnoEra/Desktop/PdfFolder'
# Replace 'output_text.txt' with the desired output text file
output_file = 'output_text.txt'
# Create a temporary folder for images
os.makedirs('temp_images', exist_ok=True)
try:
    extract_text_from_pdfs(pdf_folder, output_file)
    print(f"Text extracted from all PDFs and images has been saved to {output_file}")
finally:
    # Remove the temporary images folder, even when extraction fails part-way;
    # cleanup is best-effort so it never hides the original error.
    shutil.rmtree('temp_images', ignore_errors=True)
Conclusion
The Synergy of PyMuPDF and Tesseract OCR
By combining the capabilities of PyMuPDF for PDF text extraction and Tesseract OCR for image text extraction, we unlock a comprehensive solution for converting PDFs to text.
Empowering Data Workflows with Extracted Text
The extracted text serves as a foundation for various data workflows, including text analysis, data mining, and insights generation. As we navigate the evolving landscape of digital documents, the ability to unlock insights from PDFs becomes an invaluable asset.