Spaces:
Sleeping
Sleeping
| """ | |
| OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback. | |
| """ | |
| import numpy as np | |
| from PIL import Image | |
| from dataclasses import dataclass, field | |
| from typing import List | |
| import logging | |
| logger = logging.getLogger(__name__) | |
@dataclass
class WordResult:
    """A single detected word with position and confidence.

    Attributes:
        text: The recognised word.
        confidence: Recognition score as stored by the engine wrappers in
            this module (Tesseract percentages are rescaled to a fraction).
        x: Left edge of the word's bounding box, in image pixels.
        y: Top edge of the word's bounding box, in image pixels.
        width: Bounding-box width in pixels.
        height: Bounding-box height in pixels.
        block_id: Identifier of the text block the word belongs to.
        line_id: Identifier of the line the word belongs to.
    """

    # @dataclass generates the keyword-argument __init__ that the engine
    # wrappers rely on when they build WordResult(text=..., confidence=...).
    text: str
    confidence: float
    x: int
    y: int
    width: int
    height: int
    block_id: int = 0
    line_id: int = 0
@dataclass
class OCRResult:
    """Complete OCR result for an image.

    Attributes:
        words: Detected words, in the order the engine produced them.
        raw_text: All word texts joined with single spaces.
        average_confidence: Mean of the word confidences; 0.0 when no
            words were detected.
        engine_used: Name of the engine that produced the result.
        image_width: Width of the source image in pixels.
        image_height: Height of the source image in pixels.
    """

    # default_factory gives every instance its own list; it only takes
    # effect because the class is decorated with @dataclass.
    words: List["WordResult"] = field(default_factory=list)
    raw_text: str = ""
    average_confidence: float = 0.0
    engine_used: str = "unknown"
    image_width: int = 0
    image_height: int = 0
def run_ocr(image_input, engine: str = "auto") -> "OCRResult":
    """Run OCR on an image.

    Args:
        image_input: Image to process; passed unchanged to the selected
            engine (the engines accept a file path or a numpy array).
        engine: ``'doctr'``, ``'tesseract'``, or ``'auto'``. ``'auto'``
            tries docTR first and uses Tesseract when docTR raises or
            returns no words.

    Returns:
        The OCRResult produced by the engine that handled the image.

    Raises:
        ValueError: If ``engine`` is not one of the supported names.
        RuntimeError: In ``'auto'`` mode, when the Tesseract fallback
            also fails; the Tesseract error is attached as the cause.
    """
    if engine == "doctr":
        return _run_doctr(image_input)
    if engine == "tesseract":
        return _run_tesseract(image_input)
    if engine != "auto":
        raise ValueError(f"Unknown engine: {engine}")

    # 'auto': docTR is the primary engine.
    try:
        result = _run_doctr(image_input)
    except Exception as e:
        # Best-effort primary engine: any docTR failure (including the
        # package not being installed) triggers the fallback below.
        logger.warning("docTR failed (%s), falling back to Tesseract", e)
    else:
        if result.words:
            logger.info(
                "docTR: %d words, conf=%.2f",
                len(result.words), result.average_confidence,
            )
            return result
        # An empty docTR result also falls through to Tesseract.

    try:
        return _run_tesseract(image_input)
    except Exception as e:
        # Chain explicitly so the traceback shows the Tesseract error as
        # the direct cause.
        raise RuntimeError(f"All OCR engines failed: {e}") from e
| def _run_doctr(image_input) -> OCRResult: | |
| """Run OCR using docTR.""" | |
| from doctr.io import DocumentFile | |
| from doctr.models import ocr_predictor | |
| result = OCRResult(engine_used="docTR") | |
| if isinstance(image_input, str): | |
| doc = DocumentFile.from_images(image_input) | |
| pil_img = Image.open(image_input) | |
| result.image_width, result.image_height = pil_img.size | |
| elif isinstance(image_input, np.ndarray): | |
| doc = [image_input] | |
| result.image_height, result.image_width = image_input.shape[:2] | |
| else: | |
| raise ValueError("image_input must be a file path or numpy array") | |
| predictor = ocr_predictor( | |
| det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', | |
| pretrained=True, assume_straight_pages=True | |
| ) | |
| output = predictor(doc) | |
| block_id = 0 | |
| all_conf = [] | |
| for page in output.pages: | |
| for block in page.blocks: | |
| for line_idx, line in enumerate(block.lines): | |
| for word in line.words: | |
| (x_min, y_min), (x_max, y_max) = word.geometry | |
| w = WordResult( | |
| text=word.value, confidence=word.confidence, | |
| x=int(x_min * result.image_width), y=int(y_min * result.image_height), | |
| width=int((x_max - x_min) * result.image_width), | |
| height=int((y_max - y_min) * result.image_height), | |
| block_id=block_id, line_id=line_idx, | |
| ) | |
| result.words.append(w) | |
| all_conf.append(word.confidence) | |
| block_id += 1 | |
| result.raw_text = " ".join(w.text for w in result.words) | |
| result.average_confidence = sum(all_conf) / len(all_conf) if all_conf else 0.0 | |
| return result | |
| def _run_tesseract(image_input) -> OCRResult: | |
| """Run OCR using Tesseract (fallback).""" | |
| import pytesseract | |
| result = OCRResult(engine_used="Tesseract") | |
| if isinstance(image_input, str): | |
| pil_img = Image.open(image_input) | |
| elif isinstance(image_input, np.ndarray): | |
| pil_img = Image.fromarray(image_input) | |
| else: | |
| raise ValueError("image_input must be a file path or numpy array") | |
| result.image_width, result.image_height = pil_img.size | |
| data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, config='--psm 6') | |
| for i in range(len(data['text'])): | |
| text = data['text'][i].strip() | |
| if not text: | |
| continue | |
| conf = max(0, int(data['conf'][i])) / 100.0 | |
| w = WordResult( | |
| text=text, confidence=conf, | |
| x=data['left'][i], y=data['top'][i], | |
| width=data['width'][i], height=data['height'][i], | |
| block_id=data['block_num'][i], line_id=data['line_num'][i], | |
| ) | |
| result.words.append(w) | |
| result.raw_text = " ".join(w.text for w in result.words) | |
| confs = [w.confidence for w in result.words] | |
| result.average_confidence = sum(confs) / len(confs) if confs else 0.0 | |
| logger.info(f"Tesseract: {len(result.words)} words, conf={result.average_confidence:.2f}") | |
| return result | |