Spaces:
Running
Running
| import os | |
| import logging | |
| from typing import List | |
| from pathlib import Path | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| from chandra.model import InferenceManager | |
| from chandra.model.schema import BatchInputItem | |
# Module-level logger; all records from this file are emitted under the name "ocr_engine".
logger = logging.getLogger("ocr_engine")
# Lazy-load global so the model loads only once.
# Stays None until _get_chandra_manager() creates the InferenceManager on first use.
_chandra_manager: InferenceManager | None = None
def _get_chandra_manager() -> InferenceManager:
    """Return the shared Chandra ``InferenceManager``, building it on first call.

    The instance is cached in the module-level ``_chandra_manager`` so later
    calls reuse it instead of constructing a new manager.
    """
    global _chandra_manager
    # Fast path: a manager was already created by an earlier call.
    if _chandra_manager is not None:
        return _chandra_manager
    logger.info("[Chandra OCR] Loading model (method='hf')...")
    _chandra_manager = InferenceManager(method="hf")
    return _chandra_manager
def extract_text(file_path: str) -> str:
    """Extract text from a PDF or image file using Chandra OCR.

    PDFs are rasterized with ``convert_from_path``; image files are opened with
    Pillow and converted to RGB. Each resulting page is submitted to the shared
    Chandra manager with the ``ocr_layout`` prompt, and the per-page output is
    joined under ``--- Page N ---`` headers.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``, ``.tiff``,
            ``.bmp`` or ``.webp`` file (the suffix is matched case-insensitively).

    Returns:
        The combined page text (no confidence scores). Returns ``""`` when the
        file does not exist, has an unsupported suffix, cannot be loaded, yields
        no text on any page, or when any unexpected error occurs. Failures are
        logged rather than raised.
    """
    path = Path(file_path)
    if not path.exists():
        logger.error(f"[Chandra OCR] File not found: {file_path}")
        return ""

    try:
        images = _load_page_images(path)
        if images is None:
            # The helper already logged the specific failure.
            return ""
        if not images:
            logger.error(f"[Chandra OCR] No images/pages loaded from file: {file_path}")
            return ""

        # Prepare OCR batch
        manager = _get_chandra_manager()
        batch = [
            BatchInputItem(image=img, prompt_type="ocr_layout")
            for img in images
        ]

        # Run OCR
        logger.info(f"[Chandra OCR] Running OCR on {len(batch)} pages...")
        results = manager.generate(batch)

        # Prefer the "markdown" attribute of each result, fall back to "raw".
        page_texts = [
            (getattr(result, "markdown", None) or getattr(result, "raw", "") or "").strip()
            for result in results
        ]

        # Check emptiness on the page contents, not on the joined string: the
        # "--- Page N ---" headers make the joined string non-empty even when
        # OCR produced nothing, which would hide this failure from callers.
        if not any(page_texts):
            logger.error(f"[Chandra OCR] OCR returned empty text for {file_path}")
            return ""

        # Join pages into final text
        return "\n\n".join(
            f"--- Page {number} ---\n{text}"
            for number, text in enumerate(page_texts, start=1)
        ).strip()
    except Exception as e:
        # Top-level boundary: callers expect "" on failure, never an exception.
        logger.error(f"[Chandra OCR] Unexpected error: {e}", exc_info=True)
        return ""


def _load_page_images(path: Path) -> "List[Image.Image] | None":
    """Load the pages of *path* as PIL images.

    Returns the list of page images on success, or ``None`` after logging the
    reason when the file type is unsupported or the file cannot be converted.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        try:
            return convert_from_path(str(path))  # You can set dpi=300 if needed
        except Exception as e:
            logger.error(f"[Chandra OCR] PDF conversion error: {e}", exc_info=True)
            return None

    # Image formats
    if suffix in {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"}:
        try:
            # The context manager closes the underlying file; convert() returns
            # a separate in-memory image that remains valid afterwards.
            with Image.open(str(path)) as source:
                return [source.convert("RGB")]
        except Exception as e:
            logger.error(f"[Chandra OCR] Image open error: {e}", exc_info=True)
            return None

    logger.error(f"[Chandra OCR] Unsupported file type: {path.suffix}")
    return None