Spaces:
Sleeping
Sleeping
File size: 4,707 Bytes
5f3b8de | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | """
OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback.
"""
import numpy as np
from PIL import Image
from dataclasses import dataclass, field
from typing import List
import logging
logger = logging.getLogger(__name__)
@dataclass
class WordResult:
    """One recognized word plus its bounding box, score and layout ids.

    Instances are created by the engine wrappers in this module, which fill
    ``x``/``y`` with the box's left/top edge and ``width``/``height`` with
    its extent.
    """

    # Recognized text and the confidence the engine reported for it.
    text: str
    confidence: float

    # Bounding box: left/top corner followed by the box extent.
    x: int
    y: int
    width: int
    height: int

    # Layout grouping ids; both fall back to 0 when the caller omits them.
    block_id: int = 0
    line_id: int = 0
@dataclass
class OCRResult:
    """Everything one OCR pass produced for a single image.

    Every field has a default, so an empty result can be created first and
    filled in afterwards.
    """

    # Detected words; a fresh list is created for every instance.
    words: List[WordResult] = field(default_factory=list)

    # Text of all detected words combined into one string.
    raw_text: str = ""

    # Mean of the per-word confidences; stays 0.0 when there are no words.
    average_confidence: float = 0.0

    # Label of the engine that produced this result.
    engine_used: str = "unknown"

    # Size of the source image.
    image_width: int = 0
    image_height: int = 0
def run_ocr(image_input, engine: str = "auto") -> OCRResult:
    """Run OCR on an image with the requested engine.

    Args:
        image_input: The image to recognize. It is forwarded unchanged to
            the selected engine wrapper, which validates its type.
        engine: ``"doctr"``, ``"tesseract"`` or ``"auto"``. In ``"auto"``
            mode docTR is tried first; Tesseract is used when docTR raises
            or returns no words.

    Returns:
        The ``OCRResult`` produced by the engine that was used.

    Raises:
        ValueError: If ``engine`` is not one of the supported names.
        RuntimeError: In ``"auto"`` mode, when the Tesseract fallback also
            fails. The Tesseract error is attached as ``__cause__``.
    """
    if engine == "doctr":
        return _run_doctr(image_input)
    if engine == "tesseract":
        return _run_tesseract(image_input)
    if engine != "auto":
        raise ValueError(f"Unknown engine: {engine}")

    # Auto mode: prefer docTR, but treat any failure as a cue to fall back
    # rather than an error for the caller.
    try:
        result = _run_doctr(image_input)
    except Exception as e:
        logger.warning(f"docTR failed ({e}), falling back to Tesseract")
    else:
        # An empty docTR result is not returned; Tesseract gets a chance.
        if result.words:
            logger.info(f"docTR: {len(result.words)} words, conf={result.average_confidence:.2f}")
            return result

    try:
        return _run_tesseract(image_input)
    except Exception as e:
        # Chain explicitly so the traceback shows the real Tesseract error.
        raise RuntimeError(f"All OCR engines failed: {e}") from e
def _run_doctr(image_input) -> OCRResult:
    """Run OCR using docTR.

    Args:
        image_input: Path to an image file (``str`` or ``os.PathLike``) or
            a numpy array holding the image; the array's first two axes are
            read as height and width.

    Returns:
        An ``OCRResult`` with ``engine_used="docTR"``, one ``WordResult``
        per detected word, the space-joined text and the mean word
        confidence (0.0 when nothing was detected).

    Raises:
        ValueError: If ``image_input`` is neither a path nor a numpy array.
    """
    import os

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    result = OCRResult(engine_used="docTR")
    if isinstance(image_input, (str, os.PathLike)):
        image_path = os.fspath(image_input)
        doc = DocumentFile.from_images(image_path)
        # Only the size is needed here, so release the file handle at once.
        with Image.open(image_path) as pil_img:
            result.image_width, result.image_height = pil_img.size
    elif isinstance(image_input, np.ndarray):
        doc = [image_input]
        result.image_height, result.image_width = image_input.shape[:2]
    else:
        raise ValueError("image_input must be a file path or numpy array")

    # NOTE(review): the predictor is rebuilt on every call; consider caching
    # it at module level if this function is called repeatedly.
    predictor = ocr_predictor(
        det_arch='db_resnet50', reco_arch='crnn_vgg16_bn',
        pretrained=True, assume_straight_pages=True
    )
    output = predictor(doc)

    img_w, img_h = result.image_width, result.image_height
    block_id = 0
    for page in output.pages:
        for block in page.blocks:
            for line_idx, line in enumerate(block.lines):
                for word in line.words:
                    # Geometry is scaled by the image size to get integer
                    # box coordinates (docTR reports relative positions).
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    result.words.append(
                        WordResult(
                            text=word.value,
                            confidence=word.confidence,
                            x=int(x_min * img_w),
                            y=int(y_min * img_h),
                            width=int((x_max - x_min) * img_w),
                            height=int((y_max - y_min) * img_h),
                            block_id=block_id,
                            line_id=line_idx,
                        )
                    )
            # One id per block, running across all pages.
            block_id += 1

    result.raw_text = " ".join(w.text for w in result.words)
    confidences = [w.confidence for w in result.words]
    result.average_confidence = (
        sum(confidences) / len(confidences) if confidences else 0.0
    )
    return result
def _run_tesseract(image_input) -> OCRResult:
    """Run OCR using Tesseract (fallback).

    Args:
        image_input: Path to an image file (``str`` or ``os.PathLike``) or
            a numpy array that ``PIL.Image.fromarray`` can convert.

    Returns:
        An ``OCRResult`` with ``engine_used="Tesseract"``, one
        ``WordResult`` per non-blank entry reported by Tesseract, the
        space-joined text and the mean word confidence (0.0 when nothing
        was detected).

    Raises:
        ValueError: If ``image_input`` is neither a path nor a numpy array.
    """
    import os

    import pytesseract

    result = OCRResult(engine_used="Tesseract")
    if isinstance(image_input, (str, os.PathLike)):
        pil_img = Image.open(image_input)
    elif isinstance(image_input, np.ndarray):
        pil_img = Image.fromarray(image_input)
    else:
        raise ValueError("image_input must be a file path or numpy array")

    # The context manager closes the underlying file when the image was
    # opened from disk, instead of leaving the handle open.
    with pil_img:
        result.image_width, result.image_height = pil_img.size
        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT, config='--psm 6'
        )

    rows = zip(
        data['text'], data['conf'], data['left'], data['top'],
        data['width'], data['height'], data['block_num'], data['line_num'],
    )
    for raw_text, raw_conf, left, top, width, height, block_num, line_num in rows:
        text = raw_text.strip()
        if not text:
            continue
        # Parse via float(): the confidence may arrive as a decimal string
        # (e.g. '96.53'), which int() rejects. Negative values are clamped
        # to 0 before scaling to the 0-1 range.
        conf = max(0.0, float(raw_conf)) / 100.0
        result.words.append(
            WordResult(
                text=text,
                confidence=conf,
                x=left,
                y=top,
                width=width,
                height=height,
                block_id=block_num,
                line_id=line_num,
            )
        )

    result.raw_text = " ".join(w.text for w in result.words)
    confs = [w.confidence for w in result.words]
    result.average_confidence = sum(confs) / len(confs) if confs else 0.0
    logger.info(f"Tesseract: {len(result.words)} words, conf={result.average_confidence:.2f}")
    return result
|