"""Text extraction from PDF and image files using Chandra OCR."""

import os
import logging
from typing import List
from pathlib import Path

from pdf2image import convert_from_path
from PIL import Image

from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem

logger = logging.getLogger("ocr_engine")

# Lower-cased file suffixes that are opened directly as a single image.
_IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"})

# Lazy-load global so the model loads only once
_chandra_manager: InferenceManager | None = None


def _get_chandra_manager() -> InferenceManager:
    """Return the shared ``InferenceManager``, creating it on first use.

    The instance is stored in the module-level ``_chandra_manager`` so later
    calls reuse it instead of constructing a new one.
    """
    global _chandra_manager
    if _chandra_manager is None:
        logger.info("[Chandra OCR] Loading model (method='hf')...")
        _chandra_manager = InferenceManager(method="hf")
    return _chandra_manager


def _load_images(path: Path) -> List[Image.Image] | None:
    """Load the page images for *path*.

    PDFs are rendered with ``convert_from_path``; files whose suffix is in
    ``_IMAGE_SUFFIXES`` are opened with PIL and converted to RGB.

    Returns:
        The list of images (possibly empty), or ``None`` when the file type
        is unsupported or loading failed. In the ``None`` case the error has
        already been logged, so the caller should not log it again.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        try:
            return convert_from_path(str(path))  # You can set dpi=300 if needed
        except Exception as e:
            logger.error(f"[Chandra OCR] PDF conversion error: {e}", exc_info=True)
            return None

    if suffix in _IMAGE_SUFFIXES:
        try:
            # The context manager closes the opened image once the converted
            # copy has been made, instead of leaving that to garbage
            # collection.
            with Image.open(str(path)) as img:
                return [img.convert("RGB")]
        except Exception as e:
            logger.error(f"[Chandra OCR] Image open error: {e}", exc_info=True)
            return None

    logger.error(f"[Chandra OCR] Unsupported file type: {path.suffix}")
    return None


def extract_text(file_path: str) -> str:
    """
    Extract text from a PDF or Image using Chandra OCR.
    Returns full text content only (no confidence scores).

    Args:
        file_path: Path to a ``.pdf`` file or an image with one of the
            suffixes in ``_IMAGE_SUFFIXES``.

    Returns:
        The text of every page, each prefixed with a ``--- Page N ---``
        header and separated by blank lines. An empty string is returned
        when the file is missing, unsupported, cannot be loaded, yields no
        results, or any unexpected error occurs; this function logs errors
        rather than raising them.
    """
    path = Path(file_path)

    if not path.exists():
        logger.error(f"[Chandra OCR] File not found: {file_path}")
        return ""

    try:
        # Load PDF pages or image
        images = _load_images(path)
        if images is None:
            # Failure already logged by _load_images.
            return ""

        if not images:
            logger.error(f"[Chandra OCR] No images/pages loaded from file: {file_path}")
            return ""

        # Prepare OCR batch
        manager = _get_chandra_manager()
        batch = [
            BatchInputItem(image=img, prompt_type="ocr_layout")
            for img in images
        ]

        # Run OCR
        logger.info(f"[Chandra OCR] Running OCR on {len(batch)} pages...")
        results = manager.generate(batch)

        # Prefer the "markdown" attribute and fall back to "raw"; either may
        # be missing or empty, in which case the page text is "".
        page_texts = [
            (getattr(result, "markdown", None) or getattr(result, "raw", "") or "").strip()
            for result in results
        ]

        # Check the page texts themselves: the joined output always contains
        # the page headers, so it is non-empty even when OCR found nothing.
        if not any(page_texts):
            logger.error(f"[Chandra OCR] OCR returned empty text for {file_path}")

        # Join pages into final text
        return "\n\n".join(
            f"--- Page {page_number} ---\n{page_text}"
            for page_number, page_text in enumerate(page_texts, start=1)
        ).strip()

    except Exception as e:
        # Top-level boundary: callers expect "" on failure, never an exception.
        logger.error(f"[Chandra OCR] Unexpected error: {e}", exc_info=True)
        return ""