"""
Text extraction utilities for PDF and images.
Supports both digital PDFs and scanned documents (OCR).
"""

import logging
from pathlib import Path
from typing import Dict, Tuple

import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image

logger = logging.getLogger(__name__)


def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from PDF using pdfplumber (for digital PDFs).

    Pages for which pdfplumber returns no text are skipped; the remaining
    page texts are joined with a blank line. If the joined text is empty
    or whitespace-only, the file is handed to extract_text_from_pdf_ocr
    and that function's result is returned instead.

    Args:
        file_path: Path to the PDF file

    Returns:
        (raw_text, metadata)

    Raises:
        Exception: any error raised while reading the PDF is logged and
            re-raised unchanged; errors from the OCR fallback propagate
            from extract_text_from_pdf_ocr.
    """
    try:
        with pdfplumber.open(str(file_path)) as pdf:
            page_count = len(pdf.pages)
            page_texts = [page.extract_text() for page in pdf.pages]
        raw_text = "\n\n".join(text for text in page_texts if text)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        raise

    # If no text extracted, it might be a scanned PDF. The fallback runs
    # outside the try block above so that an OCR failure is logged once,
    # by the OCR function, and not a second time as a pdfplumber failure.
    if not raw_text.strip():
        logger.info("No text found with pdfplumber, trying OCR...")
        return extract_text_from_pdf_ocr(file_path)

    metadata = {
        "page_count": page_count,
        "extraction_method": "pdfplumber",
        # Very short extractions get a lower score.
        "confidence_score": 1.0 if len(raw_text) > 50 else 0.5,
    }
    return raw_text, metadata


def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).

    Each page is rendered to an RGB image at 300 DPI and passed to
    Tesseract; the per-page results are joined with a blank line.

    Args:
        file_path: Path to the PDF file

    Returns:
        (raw_text, metadata)

    Raises:
        Exception: any error raised while opening, rendering or running
            OCR is logged and re-raised unchanged.
    """
    try:
        text_pages = []
        doc = fitz.open(str(file_path))
        try:
            page_count = len(doc)
            for page in doc:
                # Convert page to image
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                # OCR
                text_pages.append(pytesseract.image_to_string(img))
        finally:
            # Release the document even when rendering or OCR fails.
            doc.close()

        raw_text = "\n\n".join(text_pages)
        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,  # OCR typically less confident
        }
        return raw_text, metadata
    except Exception as e:
        logger.error(f"OCR extraction failed: {e}")
        raise


def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from image using OCR (Tesseract).

    Args:
        file_path: Path to the image file

    Returns:
        (raw_text, metadata); page_count is always 1.

    Raises:
        Exception: any error raised while opening the image or running
            OCR is logged and re-raised unchanged.
    """
    try:
        # The context manager closes the underlying file once OCR is done.
        with Image.open(str(file_path)) as img:
            raw_text = pytesseract.image_to_string(img)
        metadata = {
            "page_count": 1,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,
        }
        return raw_text, metadata
    except Exception as e:
        logger.error(f"Image OCR failed: {e}")
        raise


def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Main entry point for text extraction.
    Routes to appropriate extractor based on file type.

    Args:
        file_path: Path to document
        mime_type: MIME type of document

    Returns:
        (raw_text, metadata_dict)

    Raises:
        ValueError: if mime_type is neither "application/pdf" nor one of
            the supported image types.
    """
    if mime_type == "application/pdf":
        return extract_text_from_pdf(file_path)
    if mime_type in ("image/png", "image/jpeg", "image/jpg"):
        return extract_text_from_image(file_path)
    raise ValueError(f"Unsupported file type: {mime_type}")