"""
Text extraction from PDFs and images using EasyOCR.

Smart extraction: the PDF text layer is tried first (fast); when it holds
too little text the pages are rasterised and run through OCR instead.
"""

import io
from typing import Tuple, Optional

import easyocr
import fitz  # PyMuPDF
import numpy as np
from pdf2image import convert_from_bytes
from PIL import Image

# A text layer with no more than this many non-whitespace-trimmed characters
# is treated as missing, which triggers the OCR fallback.
MIN_TEXT_LAYER_CHARS = 50

# Resolution used when rasterising PDF pages for OCR.
OCR_DPI = 300

# The reader is created once at import time and shared by every function
# below. If creation fails, `reader` stays None and the OCR entry points
# report that instead of crashing at import.
print("Initializing EasyOCR Reader...")
try:
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("✓ EasyOCR Reader initialized successfully")
except Exception as e:
    print(f"✗ EasyOCR initialization failed: {e}")
    reader = None


def extract_text_from_pdf(
    pdf_bytes: bytes,
    *,
    min_text_chars: int = MIN_TEXT_LAYER_CHARS,
) -> Tuple[Optional[str], bool]:
    """
    Extract text from a PDF, falling back to OCR when the text layer is thin.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        min_text_chars: The stripped text layer must be longer than this to
            be accepted; otherwise OCR is used.

    Returns:
        (extracted_text, ocr_used). `extracted_text` is None when the input
        is empty or extraction failed. `ocr_used` is True whenever the OCR
        path completed, even if OCR itself yielded None; it is False when
        the text layer was used, the input was empty, or an exception
        (including an uninitialised reader) was raised.
    """
    if not pdf_bytes:
        return None, False

    try:
        # Try the embedded text layer first (fast). The context manager
        # guarantees the document is closed even if a page fails to parse.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "".join(page.get_text() for page in doc)

        stripped_text = full_text.strip()
        if len(stripped_text) > min_text_chars:
            print(f"✓ Extracted {len(full_text)} chars from text layer")
            return stripped_text, False

        # No usable text layer - use OCR.
        print("⚠ No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True

    except Exception as e:
        print(f"✗ Error in PDF text extraction: {e}")
        return None, False


def extract_text_from_pdf_via_ocr(
    pdf_bytes: bytes,
    *,
    dpi: int = OCR_DPI,
) -> Optional[str]:
    """
    Extract text by running EasyOCR on PDF pages converted to images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution used when rendering pages to images.

    Returns:
        The recognised text with pages separated by a blank line, or None
        if conversion or OCR failed.

    Raises:
        RuntimeError: If the EasyOCR reader was not initialised.
    """
    if reader is None:
        raise RuntimeError("EasyOCR not initialized")

    try:
        # Convert PDF pages to PIL images.
        images = convert_from_bytes(pdf_bytes, dpi=dpi)
        total_pages = len(images)

        page_texts = []
        for page_number, image in enumerate(images, start=1):
            print(f" OCR processing page {page_number}/{total_pages}...")

            # EasyOCR works on numpy arrays rather than PIL images.
            img_array = np.array(image)

            # detail=0 returns plain strings; paragraph=True merges
            # neighbouring boxes into paragraphs.
            results = reader.readtext(img_array, detail=0, paragraph=True)
            page_texts.append(' '.join(results))

        # Every page is followed by a blank line; the trailing one is
        # removed by strip() on return.
        full_text = "".join(page_text + "\n\n" for page_text in page_texts)

        print(f"✓ EasyOCR extracted {len(full_text)} chars from {total_pages} pages")
        return full_text.strip()

    except Exception as e:
        print(f"✗ OCR failed: {e}")
        return None


def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Extract text from an image file using EasyOCR.

    Args:
        image_bytes: Raw bytes of the encoded image.

    Returns:
        The recognised text, or None if the input is empty or the image
        could not be decoded or processed.

    Raises:
        RuntimeError: If the EasyOCR reader was not initialised.
    """
    if reader is None:
        raise RuntimeError("EasyOCR not initialized")

    # Nothing to decode; skip the noisy failure path for empty input.
    if not image_bytes:
        return None

    try:
        print("Processing image with EasyOCR...")

        # Decode the image and normalise it to RGB before handing the
        # pixels to EasyOCR; the context manager releases the PIL image.
        with Image.open(io.BytesIO(image_bytes)) as image:
            rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
            img_array = np.array(rgb_image)

        results = reader.readtext(img_array, detail=0, paragraph=True)
        text = ' '.join(results)

        print(f"✓ EasyOCR extracted {len(text)} chars from image")
        return text.strip()

    except Exception as e:
        print(f"✗ Image OCR failed: {e}")
        return None


def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores.

    Args:
        image_array: Image pixels as a numpy array.

    Returns:
        A list of dicts with keys "text", "confidence" (rounded to three
        decimals) and "bbox", one per detection. The list is empty when
        the reader is not initialised or OCR fails.
    """
    if reader is None:
        return []

    try:
        # detail=1 yields (bbox, text, confidence) triples.
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                "confidence": round(conf, 3),
                "bbox": bbox,
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Best-effort helper: report the failure instead of hiding it, but
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"✗ OCR confidence extraction failed: {e}")
        return []