Spaces:
Sleeping
Sleeping
| import io, re | |
| from typing import List, Dict, Tuple, Any | |
| from PIL import Image, ImageDraw | |
| try: | |
| import pytesseract | |
| from pytesseract import Output as TessOutput | |
| except Exception: | |
| pytesseract = None | |
| TessOutput = None | |
# Price-like text: either "$" (optional spaces) followed by 1-4 digits with
# optional two-digit cents, or a bare 1-4 digit number with exactly two
# decimals. The (?!\d) branch applies only when there are no cents and stops
# the "$" form from matching a truncated prefix of a longer number
# (previously "$10000" matched as "$1000").
PRICE_RE = re.compile(r"(\$\s*\d{1,4}(?:\.\d{2}|(?!\d))|\b\d{1,4}\.\d{2}\b)")
def ocr_image(image_bytes: bytes) -> Tuple[str, List[Dict[str, Any]], Tuple[int, int]]:
    """Run OCR on encoded image bytes without ever raising.

    Args:
        image_bytes: Raw encoded image data to decode and OCR.

    Returns:
        A ``(full_text, tokens, size)`` tuple:

        * ``full_text`` -- texts of the kept tokens joined by single spaces.
        * ``tokens`` -- one dict per non-blank OCR entry with keys ``"text"``,
          ``"conf"`` (float; ``-1.0`` when missing or unparsable) and
          ``"box"`` as ``(left, top, width, height)``.
        * ``size`` -- the decoded image's ``size``, or ``(0, 0)`` when the
          bytes cannot be decoded.

        When pytesseract is unavailable or the OCR call fails, the result is
        ``("", [], size)``.
    """
    # Deliberately broad: any decode failure must map to the empty result,
    # because callers rely on always getting a tuple back.
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    except Exception:
        return "", [], (0, 0)

    # pytesseract is an optional dependency (set to None when the import failed).
    if pytesseract is None:
        return "", [], img.size

    try:
        data = pytesseract.image_to_data(img, output_type=TessOutput.DICT)
    except Exception:
        return "", [], img.size

    # Read columns defensively: a missing or short column must not break the
    # "always return" contract after OCR itself has succeeded.
    texts = data.get("text") or []
    confs = data.get("conf") or []

    tokens: List[Dict[str, Any]] = []
    for i, txt in enumerate(texts):
        # Skip empty and whitespace-only entries; they carry no text.
        if not txt or not str(txt).strip():
            continue
        try:
            conf = float(confs[i])
        except (IndexError, TypeError, ValueError):
            conf = -1.0
        try:
            box = (
                data["left"][i],
                data["top"][i],
                data["width"][i],
                data["height"][i],
            )
        except (KeyError, IndexError):
            # No usable geometry for this entry; drop it rather than fail.
            continue
        tokens.append({"text": txt, "conf": conf, "box": box})

    full_text = " ".join(t["text"] for t in tokens)
    return full_text, tokens, img.size
def guess_price(tokens: List[Dict[str, Any]]):
    """Return the smallest plausible price found in OCR tokens.

    Commas are removed from each token's text before it is searched with
    ``PRICE_RE``; only the first match per token is considered. A value is
    plausible when ``0.5 <= value <= 1000``.

    Args:
        tokens: Token dicts with a ``"text"`` string and a ``"box"`` entry.

    Returns:
        ``(value, box)`` for the lowest plausible price, where ``box`` is the
        matching token's ``"box"``; ``(None, None)`` when nothing qualifies.
        On ties the earliest token wins.
    """
    best = None
    for t in tokens:
        m = PRICE_RE.search(t["text"].replace(",", ""))
        if m is None:
            continue
        raw = m.group(0).replace("$", "").strip()
        # Only the numeric conversion is guarded, so unrelated bugs are not
        # silently swallowed.
        try:
            val = float(raw)
        except ValueError:
            continue
        if not 0.5 <= val <= 1000:
            continue
        # Tokens without a box are skipped (the old broad except did this
        # implicitly by swallowing the KeyError).
        if "box" not in t:
            continue
        if best is None or val < best[0]:
            best = (val, t["box"])
    return best if best is not None else (None, None)
| def guess_title(text: str) -> str: | |
| """Make a short human-readable title from OCR text.""" | |
| words = re.findall(r"[A-Za-z0-9\-]{3,}", text or "") | |
| ret | |