Dipan04's picture
Deploy Invoice Digitization Agent
8a859a8
"""
Text extraction utilities for PDF and images.
Supports both digital PDFs and scanned documents (OCR).
"""
import pdfplumber
import fitz # PyMuPDF
import pytesseract
from PIL import Image
from pathlib import Path
from typing import Dict, Tuple
import logging
logger = logging.getLogger(__name__)
def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Read the embedded text layer of a PDF with pdfplumber.

    Pages for which pdfplumber returns no text are skipped; the remaining
    page texts are joined with a blank line between them. If the combined
    text is empty or whitespace-only, the file is handed to
    extract_text_from_pdf_ocr and that function's result is returned.

    Args:
        file_path: Path to the PDF file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count",
        "extraction_method" ("pdfplumber") and "confidence_score"
        (1.0 when more than 50 characters were extracted, otherwise 0.5).

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        with pdfplumber.open(str(file_path)) as document:
            total_pages = len(document.pages)
            per_page = [page.extract_text() for page in document.pages]

        # Drop pages that produced None or an empty string.
        combined = "\n\n".join(chunk for chunk in per_page if chunk)

        # Nothing but whitespace: the PDF is probably scanned, so use OCR.
        if not combined.strip():
            logger.info("No text found with pdfplumber, trying OCR...")
            return extract_text_from_pdf_ocr(file_path)

        if len(combined) > 50:
            confidence = 1.0
        else:
            confidence = 0.5

        details = {
            "page_count": total_pages,
            "extraction_method": "pdfplumber",
            "confidence_score": confidence,
        }
        return combined, details
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        raise
def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a scanned PDF using OCR (PyMuPDF + Tesseract).

    Each page is rendered to a 300 DPI image with PyMuPDF and passed to
    Tesseract; the per-page results are joined with a blank line between
    them.

    Args:
        file_path: Path to the PDF file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count",
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        text_pages = []

        # The context manager closes the document even when rendering or
        # OCR raises part-way through; a trailing doc.close() would be
        # skipped in that case and leak the open file.
        with fitz.open(str(file_path)) as doc:
            page_count = len(doc)

            for page in doc:
                # Convert page to image
                pix = page.get_pixmap(dpi=300)
                # NOTE: "RGB" assumes the pixmap has no alpha channel,
                # which is PyMuPDF's default for get_pixmap().
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )

                # OCR
                text_pages.append(pytesseract.image_to_string(img))

        raw_text = "\n\n".join(text_pages)

        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,  # OCR typically less confident
        }

        return raw_text, metadata
    except Exception as e:
        logger.error(f"OCR extraction failed: {e}")
        raise
def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a single image using OCR (Tesseract).

    Args:
        file_path: Path to the image file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count" (always 1),
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        # Image.open keeps the underlying file open; the context manager
        # releases it deterministically, including when OCR raises.
        with Image.open(str(file_path)) as img:
            raw_text = pytesseract.image_to_string(img)

        metadata = {
            "page_count": 1,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,
        }

        return raw_text, metadata
    except Exception as e:
        logger.error(f"Image OCR failed: {e}")
        raise
def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Main entry point for text extraction.

    Routes to the appropriate extractor based on the document's MIME type.
    The type is compared case-insensitively and any parameters after a
    ";" (e.g. "application/pdf; charset=binary") are ignored.

    Args:
        file_path: Path to document
        mime_type: MIME type of document

    Returns:
        (raw_text, metadata_dict)

    Raises:
        ValueError: if the MIME type is not a supported PDF or image type.
    """
    # MIME types are case-insensitive and may carry parameters, so reduce
    # the value to a bare lowercase "type/subtype" before matching.
    # Non-string input falls through to the ValueError below.
    normalized = ""
    if isinstance(mime_type, str):
        normalized = mime_type.split(";", 1)[0].strip().lower()

    if normalized == "application/pdf":
        return extract_text_from_pdf(file_path)

    if normalized in ("image/png", "image/jpeg", "image/jpg"):
        return extract_text_from_image(file_path)

    # Report the caller's original value, not the normalized form.
    raise ValueError(f"Unsupported file type: {mime_type}")