snipebargain / src /ocr_utils.py
theDavidGuy's picture
Update src/ocr_utils.py
d789f56 verified
import io
import re
from typing import Any, Dict, List, Optional, Tuple

from PIL import Image, ImageDraw
# pytesseract is treated as an optional dependency: if importing it fails for
# any reason, both names are bound to None so the rest of the module can test
# `pytesseract is None` and degrade gracefully instead of failing at import.
try:
    import pytesseract
    from pytesseract import Output as TessOutput
except Exception:
    pytesseract = None
    TessOutput = None

# Price-like text, in one of two forms:
#   1. "$", optional whitespace, 1-4 digits, optional ".dd"  ("$5", "$ 12.99")
#   2. a bare number with 1-4 integer digits and exactly two decimals ("12.99")
# NOTE(review): form 1 has no trailing boundary, so a longer number such as
# "$10000" matches as "$1000" -- confirm whether that truncation is intended.
PRICE_RE = re.compile(r"(\$\s*\d{1,4}(?:\.\d{2})?|\b\d{1,4}\.\d{2}\b)")
def ocr_image(image_bytes: bytes) -> Tuple[str, List[Dict[str, Any]], Tuple[int, int]]:
    """Run OCR on encoded image bytes and return the recognised words.

    This function is best-effort: it ALWAYS returns a
    ``(full_text, tokens, size)`` triple, even when the image cannot be
    decoded, pytesseract is not installed, or the OCR call itself fails.

    Args:
        image_bytes: Raw encoded image data to be opened with PIL.

    Returns:
        A tuple ``(full_text, tokens, size)``:

        * ``full_text`` -- the token texts joined with single spaces
          (``""`` when nothing was recognised).
        * ``tokens`` -- one dict per recognised word with keys ``"text"``,
          ``"conf"`` (float, ``-1.0`` when no usable confidence is
          available) and ``"box"`` as ``(left, top, width, height)``.
        * ``size`` -- ``img.size`` of the decoded image, or ``(0, 0)`` when
          the image could not be decoded.
    """
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    except Exception:
        # Deliberately broad: any decoding problem means "no image".
        return "", [], (0, 0)

    if pytesseract is None:
        # OCR backend unavailable; still report the image size.
        return "", [], img.size

    try:
        data = pytesseract.image_to_data(img, output_type=TessOutput.DICT)
    except Exception:
        # Deliberately broad: a failing OCR run must not break the caller.
        return "", [], img.size

    confidences = data.get("conf", [])
    tokens: List[Dict[str, Any]] = []
    for i, txt in enumerate(data.get("text", [])):
        # Skip empty AND whitespace-only entries; the latter would otherwise
        # pollute both the token list and the joined text with blanks.
        if not txt or not txt.strip():
            continue
        try:
            conf = float(confidences[i])
        except (IndexError, TypeError, ValueError):
            conf = -1.0
        box = (
            data["left"][i],
            data["top"][i],
            data["width"][i],
            data["height"][i],
        )
        tokens.append({"text": txt, "conf": conf, "box": box})

    full_text = " ".join(t["text"] for t in tokens)
    return full_text, tokens, img.size
def guess_price(
    tokens: List[Dict[str, Any]],
    *,
    min_price: float = 0.5,
    max_price: float = 1000.0,
) -> Tuple[Optional[float], Any]:
    """Return the smallest plausible price found in OCR tokens.

    Each token's ``"text"`` is scanned with the module-level ``PRICE_RE``
    pattern after commas are removed (so thousands separators do not split a
    number). Every match inside a token is considered, and a value only
    counts as plausible when ``min_price <= value <= max_price``.

    Args:
        tokens: Token dicts carrying a ``"text"`` string and a ``"box"``
            entry. Tokens whose ``"text"`` is missing, empty or not a string
            are skipped. ``None`` is treated like an empty list.
        min_price: Inclusive lower bound for a plausible price.
        max_price: Inclusive upper bound for a plausible price.

    Returns:
        ``(value, box)`` for the smallest plausible price, where ``box`` is
        the ``"box"`` entry of the token it came from (``None`` if that token
        has no ``"box"``). When several tokens tie, the first one wins.
        Returns ``(None, None)`` when no plausible price is found.
    """
    best_value: Optional[float] = None
    best_box: Any = None

    for token in tokens or ():
        text = token.get("text")
        if not isinstance(text, str) or not text:
            continue

        # Drop thousands separators so "1,299.99" is read as one number.
        cleaned = text.replace(",", "")
        for match in PRICE_RE.finditer(cleaned):
            raw = match.group(0).replace("$", "").strip()
            try:
                value = float(raw)
            except ValueError:
                # Not expected for text matched by PRICE_RE; skip defensively.
                continue
            if not min_price <= value <= max_price:
                continue
            # Strict "<" keeps the earliest token on ties.
            if best_value is None or value < best_value:
                best_value, best_box = value, token.get("box")

    return best_value, best_box
def guess_title(text: str) -> str:
"""Make a short human-readable title from OCR text."""
words = re.findall(r"[A-Za-z0-9\-]{3,}", text or "")
ret