Spaces:
Sleeping
Sleeping
File size: 2,924 Bytes
dc79584 ecf7bf7 dc79584 ecf7bf7 678c3d1 ecf7bf7 dc79584 ecf7bf7 678c3d1 dc79584 ecf7bf7 678c3d1 ecf7bf7 678c3d1 ecf7bf7 678c3d1 ecf7bf7 dc79584 ecf7bf7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import os
import logging
from typing import List
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem
# Module logger; every message emitted by this file goes through "ocr_engine".
logger = logging.getLogger("ocr_engine")
# Cached InferenceManager instance. It starts as None and is populated by
# _get_chandra_manager() on first use, so the model is not loaded at import
# time and the same instance is reused by later calls.
_chandra_manager: InferenceManager | None = None
def _get_chandra_manager() -> InferenceManager:
    """Return the shared ``InferenceManager``, creating it on first use.

    The instance is stored in the module-level ``_chandra_manager`` so that
    subsequent calls reuse it instead of constructing a new one.
    """
    global _chandra_manager

    # Fast path: the manager has already been built by an earlier call.
    if _chandra_manager is not None:
        return _chandra_manager

    logger.info("[Chandra OCR] Loading model (method='hf')...")
    _chandra_manager = InferenceManager(method="hf")
    return _chandra_manager
# Lower-case file suffixes that are opened directly as a single image.
_IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"})


def _load_page_images(path: Path) -> List[Image.Image] | None:
    """Load the file at *path* as a list of page images.

    A ``.pdf`` is rendered with ``convert_from_path``; a file whose suffix is
    in ``_IMAGE_SUFFIXES`` is opened with Pillow and converted to RGB.

    Returns:
        The loaded images (possibly an empty list), or ``None`` when the file
        type is unsupported or loading raised. Every ``None`` case is logged
        here, so the caller only has to bail out.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        try:
            # Default rendering resolution; pass dpi=300 here if needed.
            return convert_from_path(str(path))
        except Exception as e:
            logger.exception("[Chandra OCR] PDF conversion error: %s", e)
            return None

    if suffix in _IMAGE_SUFFIXES:
        try:
            # Close the source file once convert() has produced its own copy
            # of the pixel data; the bare Image.open(...).convert(...) chain
            # left the underlying file handle to the garbage collector.
            with Image.open(str(path)) as source:
                return [source.convert("RGB")]
        except Exception as e:
            logger.exception("[Chandra OCR] Image open error: %s", e)
            return None

    logger.error("[Chandra OCR] Unsupported file type: %s", path.suffix)
    return None


def extract_text(file_path: str) -> str:
    """Extract text from a PDF or image file using Chandra OCR.

    Each page's text is taken from the result's ``markdown`` attribute,
    falling back to ``raw``, and the pages are joined under
    ``--- Page N ---`` headers. No confidence scores are returned.

    Args:
        file_path: Path to a ``.pdf`` file or an image with one of the
            suffixes in ``_IMAGE_SUFFIXES``.

    Returns:
        The combined page text, or ``""`` when the file is missing, has an
        unsupported type, cannot be loaded, yields no text on any page, or
        an unexpected error occurs. All of these cases are logged; this
        function does not raise for them.
    """
    path = Path(file_path)
    if not path.exists():
        logger.error("[Chandra OCR] File not found: %s", file_path)
        return ""

    try:
        images = _load_page_images(path)
        if images is None:
            # The specific failure has already been logged by the loader.
            return ""
        if not images:
            logger.error(
                "[Chandra OCR] No images/pages loaded from file: %s", file_path
            )
            return ""

        manager = _get_chandra_manager()
        batch = [
            BatchInputItem(image=img, prompt_type="ocr_layout") for img in images
        ]

        logger.info("[Chandra OCR] Running OCR on %d pages...", len(batch))
        results = manager.generate(batch)

        page_texts = [
            (
                getattr(result, "markdown", None)
                or getattr(result, "raw", "")
                or ""
            ).strip()
            for result in results
        ]

        # Test emptiness on the page contents, not on the joined string: the
        # "--- Page N ---" headers make the joined string non-empty even when
        # OCR produced nothing, which would hide this failure from callers.
        if not any(page_texts):
            logger.error("[Chandra OCR] OCR returned empty text for %s", file_path)
            return ""

        return "\n\n".join(
            f"--- Page {number} ---\n{text}"
            for number, text in enumerate(page_texts, start=1)
        ).strip()
    except Exception as e:
        # Top-level boundary: callers rely on "" rather than an exception.
        logger.exception("[Chandra OCR] Unexpected error: %s", e)
        return ""
|