# Dipan04's picture
# Deploy Invoice Digitization Agent
# 8a859a8
# raw
# history blame
# 3.59 kB
"""
Text extraction utilities for PDF and images.
Supports both digital PDFs and scanned documents (OCR).
"""
import pdfplumber
import fitz # PyMuPDF
import pytesseract
from PIL import Image
from pathlib import Path
from typing import Dict, Tuple
import logging
logger = logging.getLogger(__name__)
def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Pull the embedded text layer out of a PDF with pdfplumber.

    Pages that yield no text are skipped; the remaining page texts are
    joined with blank lines. When the document yields nothing but
    whitespace (it may be a scanned PDF), the work is handed to
    extract_text_from_pdf_ocr and that result is returned instead.

    Args:
        file_path: Location of the PDF on disk.

    Returns:
        (raw_text, metadata), where metadata carries "page_count",
        "extraction_method" and "confidence_score".

    Raises:
        Exception: Any failure raised by pdfplumber or by the OCR fallback
            is logged and re-raised unchanged.
    """
    try:
        with pdfplumber.open(str(file_path)) as document:
            total_pages = len(document.pages)
            # filter(None, ...) drops pages whose extraction gave None or "".
            page_texts = list(
                filter(None, (pg.extract_text() for pg in document.pages))
            )

        combined = "\n\n".join(page_texts)

        # Only whitespace came back: possibly a scanned PDF, so try OCR.
        if not combined.strip():
            logger.info("No text found with pdfplumber, trying OCR...")
            return extract_text_from_pdf_ocr(file_path)

        # Output of 50 characters or fewer gets the lower confidence score.
        if len(combined) > 50:
            score = 1.0
        else:
            score = 0.5

        return combined, {
            "page_count": total_pages,
            "extraction_method": "pdfplumber",
            "confidence_score": score,
        }
    except Exception as exc:
        logger.error(f"PDF extraction failed: {exc}")
        raise
def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).

    Every page is rendered to a 300 DPI pixmap, wrapped in a PIL image and
    passed to Tesseract. The per-page results are joined with blank lines.

    Args:
        file_path: Path to the PDF document.

    Returns:
        (raw_text, metadata), where metadata holds "page_count",
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: Any error from PyMuPDF, PIL or Tesseract is logged and
            re-raised unchanged.
    """
    try:
        text_pages = []
        # The context manager closes the document even when rendering or
        # OCR fails part-way through, so the file handle is never leaked.
        with fitz.open(str(file_path)) as doc:
            page_count = len(doc)
            for page in doc:
                # Convert page to image
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                # OCR
                text_pages.append(pytesseract.image_to_string(img))

        raw_text = "\n\n".join(text_pages)
        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,  # OCR typically less confident
        }
        return raw_text, metadata
    except Exception as e:
        logger.error(f"OCR extraction failed: {e}")
        raise
def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from image using OCR (Tesseract).

    Args:
        file_path: Path to the image file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count" (always 1),
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: Any error from PIL or Tesseract is logged and re-raised
            unchanged.
    """
    try:
        # Image.open is lazy and can leave the underlying file open; the
        # with-block guarantees the handle is released even if OCR fails.
        with Image.open(str(file_path)) as img:
            raw_text = pytesseract.image_to_string(img)

        metadata = {
            "page_count": 1,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,
        }
        return raw_text, metadata
    except Exception as e:
        logger.error(f"Image OCR failed: {e}")
        raise
def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Main entry point for text extraction.

    Routes to the appropriate extractor based on file type. The MIME type
    is compared case-insensitively and any parameters after ";" (for
    example "application/pdf; charset=binary") are ignored.

    Args:
        file_path: Path to document
        mime_type: MIME type of document

    Returns:
        (raw_text, metadata_dict)

    Raises:
        ValueError: If mime_type is not a PDF, PNG or JPEG type.
    """
    # Media types are case-insensitive and may carry parameters, so route
    # on the bare lower-cased "type/subtype" part. Non-string values are
    # left as-is and fall through to the ValueError below.
    if isinstance(mime_type, str):
        essence = mime_type.split(";", 1)[0].strip().lower()
    else:
        essence = mime_type

    if essence == "application/pdf":
        return extract_text_from_pdf(file_path)
    if essence in ("image/png", "image/jpeg", "image/jpg"):
        return extract_text_from_image(file_path)

    # Report the caller's original value, not the normalized one.
    raise ValueError(f"Unsupported file type: {mime_type}")