Spaces:

xeeshan404
/

agentic-image2word

Sleeping

App Files Files Community

agentic-image2word / core /ocr_engine.py

xeeshan404

Initial deployment: Agentic Image2Word Converter

5f3b8de verified 27 days ago

raw

history blame contribute delete

4.71 kB

	"""
	OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback.
	"""

	import numpy as np
	from PIL import Image
	from dataclasses import dataclass, field
	from typing import List
	import logging

	logger = logging.getLogger(__name__)


	@dataclass
	class WordResult:
	    """One recognized word together with its bounding box and score.

	    ``x``/``y`` locate the top-left corner of the box and ``width``/``height``
	    give its extent, all expressed in pixels of the source image.
	    """

	    text: str  # recognized string
	    confidence: float  # recognition score reported by the OCR engine
	    x: int  # left edge of the bounding box
	    y: int  # top edge of the bounding box
	    width: int  # horizontal extent of the bounding box
	    height: int  # vertical extent of the bounding box
	    block_id: int = 0  # text block the word was found in
	    line_id: int = 0  # line number assigned by the OCR engine


	@dataclass
	class OCRResult:
	    """Everything one OCR pass produced for a single image.

	    Every field has a default, so an empty result can be created first and
	    filled in by the engine that processes the image.
	    """

	    words: List[WordResult] = field(default_factory=list)  # detected words, in engine order
	    raw_text: str = ""  # word texts joined into one string
	    average_confidence: float = 0.0  # mean word confidence, 0.0 when no words
	    engine_used: str = "unknown"  # label of the engine that produced the result
	    image_width: int = 0  # source image width in pixels
	    image_height: int = 0  # source image height in pixels


	def run_ocr(image_input, engine: str = "auto") -> OCRResult:
	    """Run OCR on an image with the requested engine.

	    Args:
	        image_input: Image to process. It is forwarded unchanged to the
	            engine helpers, which validate its type themselves.
	        engine: ``'doctr'``, ``'tesseract'``, or ``'auto'``. In ``'auto'``
	            mode docTR is tried first and Tesseract is used when docTR
	            raises or detects no words.

	    Returns:
	        The ``OCRResult`` produced by the engine that handled the image.

	    Raises:
	        ValueError: If ``engine`` is not one of the supported names.
	        RuntimeError: In ``'auto'`` mode, when the Tesseract fallback fails
	            too. The Tesseract error is chained as ``__cause__``.
	    """
	    # Explicitly requested engines run without any fallback; their own
	    # exceptions propagate to the caller untouched.
	    if engine == "doctr":
	        return _run_doctr(image_input)
	    if engine == "tesseract":
	        return _run_tesseract(image_input)
	    if engine != "auto":
	        raise ValueError(f"Unknown engine: {engine}")

	    try:
	        result = _run_doctr(image_input)
	    except Exception as e:
	        # Deliberately broad: any docTR problem (missing package, model
	        # download failure, bad input) should trigger the fallback.
	        logger.warning("docTR failed (%s), falling back to Tesseract", e)
	    else:
	        if result.words:
	            logger.info(
	                "docTR: %d words, conf=%.2f",
	                len(result.words), result.average_confidence,
	            )
	            return result
	        # An empty docTR result is treated like a failure, but say so.
	        logger.info("docTR found no words, falling back to Tesseract")

	    try:
	        return _run_tesseract(image_input)
	    except Exception as e:
	        # Chain the cause so the Tesseract traceback is kept.
	        raise RuntimeError(f"All OCR engines failed: {e}") from e


	# Lazily-built docTR predictor shared by every _run_doctr call, so the
	# model is not rebuilt for each image.
	_DOCTR_PREDICTOR = None


	def _get_doctr_predictor():
	    """Return the shared docTR predictor, building it on first use."""
	    global _DOCTR_PREDICTOR
	    if _DOCTR_PREDICTOR is None:
	        from doctr.models import ocr_predictor

	        _DOCTR_PREDICTOR = ocr_predictor(
	            det_arch='db_resnet50', reco_arch='crnn_vgg16_bn',
	            pretrained=True, assume_straight_pages=True,
	        )
	    return _DOCTR_PREDICTOR


	def _run_doctr(image_input) -> OCRResult:
	    """Run OCR using docTR.

	    Args:
	        image_input: Path to an image file (``str`` or ``pathlib.Path``) or
	            a numpy array holding the image.
	            NOTE(review): arrays are handed to docTR unchanged, presumably
	            H x W x 3 uint8. Confirm with callers.

	    Returns:
	        An ``OCRResult`` with ``engine_used="docTR"``, one ``WordResult``
	        per detected word, the space-joined text and the mean confidence.

	    Raises:
	        ValueError: If ``image_input`` is neither a path nor a numpy array.
	    """
	    from pathlib import Path

	    # Validate before the heavy docTR imports so bad input fails fast.
	    if not isinstance(image_input, (str, Path, np.ndarray)):
	        raise ValueError("image_input must be a file path or numpy array")

	    if isinstance(image_input, np.ndarray):
	        doc = [image_input]
	        img_height, img_width = image_input.shape[:2]
	    else:
	        from doctr.io import DocumentFile

	        doc = DocumentFile.from_images(image_input)
	        # Only the size is needed, so release the file handle right away.
	        with Image.open(image_input) as pil_img:
	            img_width, img_height = pil_img.size

	    result = OCRResult(
	        engine_used="docTR", image_width=img_width, image_height=img_height
	    )
	    output = _get_doctr_predictor()(doc)

	    block_id = 0  # running block counter, continues across pages
	    for page in output.pages:
	        for block in page.blocks:
	            for line_idx, line in enumerate(block.lines):
	                for word in line.words:
	                    # Word geometry is scaled by the image size to get
	                    # pixel coordinates.
	                    (x_min, y_min), (x_max, y_max) = word.geometry
	                    result.words.append(WordResult(
	                        text=word.value,
	                        confidence=float(word.confidence),
	                        x=int(x_min * img_width),
	                        y=int(y_min * img_height),
	                        width=int((x_max - x_min) * img_width),
	                        height=int((y_max - y_min) * img_height),
	                        block_id=block_id,
	                        line_id=line_idx,
	                    ))
	            block_id += 1

	    result.raw_text = " ".join(w.text for w in result.words)
	    if result.words:
	        result.average_confidence = (
	            sum(w.confidence for w in result.words) / len(result.words)
	        )
	    return result


	def _run_tesseract(image_input) -> OCRResult:
	    """Run OCR using Tesseract (fallback).

	    Args:
	        image_input: Path to an image file (``str`` or ``pathlib.Path``) or
	            a numpy array that ``PIL.Image.fromarray`` can convert.

	    Returns:
	        An ``OCRResult`` with ``engine_used="Tesseract"``. Entries whose
	        text is empty after stripping are skipped. Confidences are clamped
	        at 0 and rescaled from Tesseract's 0-100 range to 0.0-1.0.

	    Raises:
	        ValueError: If ``image_input`` is neither a path nor a numpy array.
	    """
	    from pathlib import Path

	    # Validate before importing pytesseract so bad input fails fast.
	    if not isinstance(image_input, (str, Path, np.ndarray)):
	        raise ValueError("image_input must be a file path or numpy array")

	    import pytesseract

	    if isinstance(image_input, np.ndarray):
	        pil_img = Image.fromarray(image_input)
	    else:
	        pil_img = Image.open(image_input)

	    result = OCRResult(engine_used="Tesseract")
	    # Using the image as a context manager closes the file handle that
	    # Image.open keeps for path inputs.
	    with pil_img:
	        result.image_width, result.image_height = pil_img.size
	        data = pytesseract.image_to_data(
	            pil_img, output_type=pytesseract.Output.DICT, config='--psm 6'
	        )

	    rows = zip(
	        data['text'], data['conf'],
	        data['left'], data['top'], data['width'], data['height'],
	        data['block_num'], data['line_num'],
	    )
	    for raw_text, raw_conf, left, top, width, height, block_num, line_num in rows:
	        text = raw_text.strip()
	        if not text:
	            continue
	        # Confidence may arrive as an int, a float, or a string such as
	        # '95.000000'; float() accepts all of them, int() does not.
	        conf = max(0.0, float(raw_conf)) / 100.0
	        result.words.append(WordResult(
	            text=text, confidence=conf,
	            x=left, y=top,
	            width=width, height=height,
	            block_id=block_num, line_id=line_num,
	        ))

	    result.raw_text = " ".join(w.text for w in result.words)
	    if result.words:
	        result.average_confidence = (
	            sum(w.confidence for w in result.words) / len(result.words)
	        )
	    logger.info(
	        "Tesseract: %d words, conf=%.2f",
	        len(result.words), result.average_confidence,
	    )
	    return result