mcp_ocr_chandra / ocr_engine.py
Vachudev's picture
Update ocr_engine.py
ecf7bf7 verified
import os
import logging
from typing import List
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem
# Module-level logger; the name is fixed to "ocr_engine" rather than derived from __name__.
logger = logging.getLogger("ocr_engine")
# Cached InferenceManager instance. It stays None until _get_chandra_manager()
# creates it on first use, so the model loads only once.
_chandra_manager: InferenceManager | None = None
def _get_chandra_manager() -> InferenceManager:
    """Return the shared InferenceManager, creating it on the first call.

    The instance is stored in the module-level ``_chandra_manager`` so that
    later calls reuse it instead of constructing a new one.
    """
    global _chandra_manager

    # Fast path: the manager was already built by an earlier call.
    if _chandra_manager is not None:
        return _chandra_manager

    logger.info("[Chandra OCR] Loading model (method='hf')...")
    _chandra_manager = InferenceManager(method="hf")
    return _chandra_manager
# Lower-cased file suffixes that are opened directly as a single image.
_IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"})


def _load_page_images(path: Path) -> "List[Image.Image] | None":
    """Load *path* as a list of page images.

    A PDF is rasterised into one image per page; a supported image file
    yields a single RGB image.

    Returns:
        The list of images, or ``None`` if the file type is unsupported or
        loading failed. In the ``None`` case the error has already been logged.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        try:
            return convert_from_path(str(path))  # You can set dpi=300 if needed
        except Exception as e:
            logger.error("[Chandra OCR] PDF conversion error: %s", e, exc_info=True)
            return None

    if suffix in _IMAGE_SUFFIXES:
        try:
            # Image.open is lazy and keeps the file handle open; convert()
            # returns an independent in-memory copy, so the source can be
            # closed as soon as the conversion is done.
            with Image.open(str(path)) as source:
                return [source.convert("RGB")]
        except Exception as e:
            logger.error("[Chandra OCR] Image open error: %s", e, exc_info=True)
            return None

    logger.error("[Chandra OCR] Unsupported file type: %s", path.suffix)
    return None


def extract_text(file_path: str) -> str:
    """
    Extract text from a PDF or Image using Chandra OCR.

    Each page's text is prefixed with a ``--- Page N ---`` header and pages
    are separated by a blank line. No confidence scores are returned.

    Args:
        file_path: Path to a ``.pdf`` file or to an image with one of the
            suffixes in ``_IMAGE_SUFFIXES``.

    Returns:
        The combined page text, or ``""`` when the file is missing or
        unsupported, cannot be loaded, OCR fails, or no page yields any text.
        Failures are logged rather than raised.
    """
    path = Path(file_path)
    if not path.exists():
        logger.error("[Chandra OCR] File not found: %s", file_path)
        return ""

    images = _load_page_images(path)
    if images is None:
        # The helper already logged the specific reason.
        return ""
    if not images:
        logger.error("[Chandra OCR] No images/pages loaded from file: %s", file_path)
        return ""

    try:
        manager = _get_chandra_manager()
        batch = [
            BatchInputItem(image=img, prompt_type="ocr_layout")
            for img in images
        ]

        logger.info("[Chandra OCR] Running OCR on %d pages...", len(batch))
        results = manager.generate(batch)

        # Prefer the markdown rendering, fall back to the raw output.
        page_texts = [
            (getattr(result, "markdown", None) or getattr(result, "raw", "") or "").strip()
            for result in results
        ]
    except Exception as e:
        logger.error("[Chandra OCR] Unexpected error: %s", e, exc_info=True)
        return ""

    # Test the page texts themselves: once the page headers are added the
    # joined string is never empty, which would hide an all-blank OCR result.
    if not any(page_texts):
        logger.error("[Chandra OCR] OCR returned empty text for %s", file_path)
        return ""

    return "\n\n".join(
        f"--- Page {number} ---\n{text}"
        for number, text in enumerate(page_texts, start=1)
    ).strip()