Spaces:

arnavam
/

accujuris-api

Sleeping

App Files Files Community

accujuris-api / app /services /ocr_service.py

arnavam

Initial HuggingFace deployment

cd3078d about 2 months ago

raw

history blame contribute delete

6.23 kB

	"""
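	OCR service with pluggable extraction engines:
	- embedded_pdf: native PDF text extraction (fast, accurate for digital PDFs)
	- chandra: Chandra OCR (if installed)
	- tesseract: Tesseract OCR (if installed)

	Engines are tried in the order given by the OCR_ENGINE_ORDER environment
	variable (default: "tesseract,chandra"); the first engine to return text wins.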
	"""

	import logging
	import os
	import tempfile
	import unicodedata
	from io import BytesIO
	from pathlib import Path
	from typing import Callable, Optional

	# Module-level logger shared by every function in this file.
	logger = logging.getLogger(__name__)

	# Lazily created Chandra inference manager; populated on first use by
	# _get_or_load_chandra_model() and reused afterwards.
	_chandra_model = None
	# Signature every engine must implement: (file_bytes, filename) -> text or None.
	OCREngine = Callable[[bytes, str], Optional[str]]
	# Registry of engine name -> engine callable, filled via register_ocr_engine().
	_ocr_engines: dict[str, OCREngine] = {}


	def _preview_text(text: str, limit: int = 240) -> str:
	    """Return a single-line preview of *text* suitable for log messages.

	    All whitespace runs (including newlines) are collapsed to single spaces.
	    If the collapsed text is longer than *limit* characters it is cut to
	    *limit* characters and ``"..."`` is appended.
	    """
	    collapsed = " ".join(text.split())
	    return collapsed if len(collapsed) <= limit else collapsed[:limit] + "..."


	def _normalize_output_text(text: str) -> str:
	    """Strip U+FEFF characters from *text* and return it in Unicode NFC form.

	    NFC composition keeps combining marks stable so the output renders
	    consistently across Malayalam-capable fonts.
	    """
	    without_bom = text.replace("\ufeff", "")
	    return unicodedata.normalize("NFC", without_bom)


	def _extract_pdf_text(file_bytes: bytes) -> str:
	    """Return the embedded text of a PDF, with pages separated by blank lines.

	    ``pypdf`` is imported lazily so the module can load without it. A page
	    whose ``extract_text()`` result is falsy contributes an empty string, and
	    the joined result is stripped of leading/trailing whitespace.
	    """
	    from pypdf import PdfReader

	    document = PdfReader(BytesIO(file_bytes))
	    page_texts = [(page.extract_text() or "") for page in document.pages]
	    return "\n\n".join(page_texts).strip()


	def _embedded_pdf_engine(file_bytes: bytes, filename: str) -> Optional[str]:
	    """OCR-engine adapter around native PDF text extraction.

	    Returns the extracted text for files whose name ends in ``.pdf``
	    (case-insensitive). Returns ``None`` for any other file name, when the
	    PDF yields no text, or when extraction raises (the error is logged).
	    """
	    is_pdf = filename.lower().endswith(".pdf")
	    if is_pdf:
	        try:
	            text = _extract_pdf_text(file_bytes)
	        except Exception:
	            logger.exception("Embedded PDF extraction failed for %s", filename)
	            return None
	        # Empty extraction is reported as "no result" so the next engine can run.
	        return text or None
	    return None


	def _get_or_load_chandra_model():
	    """Return the shared Chandra ``InferenceManager``, creating it on first use.

	    The instance is cached in the module-level ``_chandra_model`` so later
	    calls reuse it. ``chandra`` is imported only when the model actually has
	    to be built.
	    """
	    global _chandra_model
	    if _chandra_model is None:
	        from chandra.model import InferenceManager

	        _chandra_model = InferenceManager(method="hf")
	    return _chandra_model


	def _run_chandra_ocr(file_bytes: bytes, filename: str) -> Optional[str]:
	    """Run Chandra OCR on an uploaded file and return its markdown output.

	    The bytes are written to a temporary file (keeping the original suffix,
	    or ``.bin`` when there is none) because ``load_file`` is given a path.
	    Each result's ``markdown`` is stripped and the results are joined with
	    blank lines.

	    Returns ``None`` when the ``chandra`` package cannot be imported or when
	    the combined output is empty. Errors raised while loading the model or
	    generating are NOT caught here and propagate to the caller.
	    """
	    try:
	        from chandra.input import load_file
	        from chandra.model.schema import BatchInputItem
	    except Exception:
	        # Chandra is optional: any import problem means "engine unavailable".
	        return None

	    extension = Path(filename).suffix or ".bin"
	    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
	        handle.write(file_bytes)
	        scratch_path = handle.name

	    try:
	        manager = _get_or_load_chandra_model()
	        requests = []
	        for page_image in load_file(scratch_path):
	            requests.append(BatchInputItem(image=page_image, prompt_type="ocr_layout"))
	        outputs = manager.generate(requests)
	        chunks = [(item.markdown or "").strip() for item in outputs]
	        combined = "\n\n".join(chunks).strip()
	        return combined or None
	    finally:
	        # delete=False above means cleanup is our job; a failed removal is ignored.
	        try:
	            os.remove(scratch_path)
	        except OSError:
	            pass


	def _run_tesseract_ocr(file_bytes: bytes, filename: str) -> Optional[str]:
	    """Run Tesseract OCR on an image, or on every page of a PDF.

	    PDFs are written to a temporary file, rendered page by page with
	    ``pypdfium2`` at ``scale=2.0`` and each rendered page is passed to
	    ``pytesseract.image_to_string``; non-empty page texts are joined with
	    blank lines. Any other file is opened directly with Pillow.

	    The Tesseract language string comes from ``settings.OCR_LANGS``.

	    Returns:
	        The recognized text, or ``None`` when ``pytesseract``/``Pillow``
	        (or ``pypdfium2`` for PDFs) cannot be imported, when no text was
	        recognized, or when OCR raised (the error is logged).
	    """
	    try:
	        import pytesseract
	        from PIL import Image
	    except Exception:
	        # Tesseract support is optional: treat import problems as "unavailable".
	        return None

	    ext = Path(filename).suffix.lower()
	    from app.config import settings

	    langs = settings.OCR_LANGS

	    try:
	        if ext == ".pdf":
	            try:
	                import pypdfium2 as pdfium
	            except Exception:
	                return None

	            text_parts = []
	            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
	                tmp.write(file_bytes)
	                pdf_path = tmp.name
	            try:
	                pdf = pdfium.PdfDocument(pdf_path)
	                try:
	                    for i in range(len(pdf)):
	                        pil_image = pdf[i].render(scale=2.0).to_pil()
	                        text_parts.append(pytesseract.image_to_string(pil_image, lang=langs))
	                finally:
	                    # Release the document before deleting its backing file;
	                    # an open handle makes the removal below fail on Windows.
	                    pdf.close()
	            finally:
	                try:
	                    os.remove(pdf_path)
	                except OSError:
	                    pass
	            text = "\n\n".join(p.strip() for p in text_parts if p and p.strip()).strip()
	            return text or None

	        with Image.open(BytesIO(file_bytes)) as image:
	            text = pytesseract.image_to_string(image, lang=langs).strip()
	        return text or None
	    except Exception:
	        # Stay best-effort (return None so another engine can be tried), but
	        # leave a trace instead of swallowing the failure silently.
	        logger.exception("Tesseract OCR failed for %s", filename)
	        return None


	def extract_readable_text(file_bytes: bytes, filename: str) -> str:
	    """Return immediately available text for an upload, without running OCR.

	    * ``.txt`` / ``.md`` / ``.csv``: the bytes are decoded as UTF-8 (invalid
	      sequences are replaced), normalized, logged as a preview and returned.
	    * ``.pdf``: a placeholder string noting that OCR is still pending.
	    * anything else: an empty string.

	    The extension check is case-insensitive.
	    """
	    ext = os.path.splitext(filename)[1].lower()

	    if ext in (".txt", ".md", ".csv"):
	        extracted = _normalize_output_text(file_bytes.decode("utf-8", errors="replace"))
	        logger.info("Decoded text from %s (%d chars): %s", filename, len(extracted), _preview_text(extracted))
	        return extracted

	    if ext == ".pdf":
	        return f"[PDF uploaded: {filename} - Awaiting OCR]"

	    # Other file types used to fall off the end and return None despite the
	    # ``-> str`` annotation. Return an empty string instead: it honours the
	    # declared type and is still falsy for callers that test the result.
	    return ""


	def _append_ocr_debug_log(filename: str, engine_name: str, text: str) -> None:
	    """Best-effort append of one OCR result to ``logs/ocr_output.log``."""
	    try:
	        log_dir = Path("logs")
	        # Creating the directory is part of the best-effort step: an
	        # unwritable working directory must not discard a successful OCR.
	        log_dir.mkdir(exist_ok=True)
	        with open(log_dir / "ocr_output.log", "a", encoding="utf-8") as f:
	            f.write(f"=== {filename} ({engine_name}) ===\n")
	            f.write(text)
	            f.write("\n===============================\n\n")
	    except Exception as e:
	        logger.error("Failed to write OCR output to log file: %s", e)


	async def perform_ocr(file_bytes: bytes, filename: str) -> str:
	    """Extract text from an uploaded file using the registered OCR engines.

	    ``.txt`` / ``.md`` / ``.csv`` files are decoded directly via
	    ``extract_readable_text``. For everything else, the engines named in the
	    ``OCR_ENGINE_ORDER`` environment variable (comma-separated, default
	    ``"tesseract,chandra"``) are tried in order; names that are not
	    registered are skipped. The first engine that returns non-empty text
	    wins: its output is normalized, logged, appended to
	    ``logs/ocr_output.log`` and returned.

	    An engine that raises is logged and skipped so the remaining engines
	    still get a chance. Engines are invoked synchronously inside this
	    coroutine.

	    Returns:
	        The recognized text, or a bracketed failure message when no engine
	        produced any text.
	    """
	    ext = os.path.splitext(filename)[1].lower()

	    if ext in (".txt", ".md", ".csv"):
	        return extract_readable_text(file_bytes, filename)

	    order = [x.strip() for x in os.getenv("OCR_ENGINE_ORDER", "tesseract,chandra").split(",") if x.strip()]
	    for engine_name in order:
	        engine = _ocr_engines.get(engine_name)
	        if not engine:
	            continue
	        try:
	            text = engine(file_bytes, filename)
	        except Exception:
	            # One broken engine must not prevent fallback to the next one.
	            logger.exception("%s OCR engine raised for %s", engine_name, filename)
	            continue
	        if not text:
	            continue

	        text = _normalize_output_text(text)
	        logger.info("%s OCR text from %s (%d chars): %s", engine_name, filename, len(text), _preview_text(text))

	        # Save the OCR output for log verification.
	        _append_ocr_debug_log(filename, engine_name, text)
	        return text

	    logger.warning("OCR unavailable/failed for %s", filename)
	    return "[OCR failed or is not configured for this file. Install chandra-ocr or tesseract dependencies.]"


	def register_ocr_engine(name: str, engine: OCREngine) -> None:
	    """Add *engine* to the module registry under *name*.

	    Registering a name that already exists replaces the previous engine.
	    """
	    _ocr_engines.update({name: engine})


	def get_ocr_engines() -> list[str]:
	    """Return the names of all registered OCR engines in sorted order."""
	    return sorted(_ocr_engines)


	# Register the built-in engines at import time. Registration only makes an
	# engine available; perform_ocr() decides which ones run, and in what order,
	# from OCR_ENGINE_ORDER (whose default does not include "embedded_pdf").
	register_ocr_engine("embedded_pdf", _embedded_pdf_engine)
	register_ocr_engine("tesseract", _run_tesseract_ocr)
	register_ocr_engine("chandra", _run_chandra_ocr)