""" Receipt Scanner — AI-powered receipt parser using MiniCPM-V 4.6 Deploy to Hugging Face Spaces (GPU T4 small or better recommended). """ # `spaces` MUST be imported before torch/transformers on HF Spaces — # the package hooks into CUDA initialisation and raises a RuntimeError # if anything has already touched CUDA before it loads. # The try/except makes the same file work fine when running locally. try: import spaces # noqa: F401 except ImportError: pass import json import re import io import base64 import numpy as np import gradio as gr import torch from PIL import Image from transformers import AutoModelForImageTextToText, AutoProcessor # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── MODEL_ID = "openbmb/MiniCPM-V-4.6" DOWNSAMPLE_MODE = "4x" # "4x" = finer detail, ideal for dense receipt text MAX_SLICE_NUMS = 36 # allow high-res slicing for sharp photos MAX_NEW_TOKENS = 1200 # ───────────────────────────────────────────────────────────────────────────── # Structured extraction prompt # ───────────────────────────────────────────────────────────────────────────── RECEIPT_PROMPT = """\ You are a precise receipt data extractor. Carefully read every part of the receipt image. Return ONLY a valid JSON object — no markdown fences, no explanation, nothing else. 
Use this exact schema (set any unknown field to null): { "store": { "name": "string | null", "address": "string | null", "phone": "string | null" }, "transaction": { "date": "YYYY-MM-DD string | null", "time": "HH:MM string | null", "receipt_number": "string | null", "cashier": "string | null" }, "items": [ { "name": "string", "quantity": number, "unit_price": number | null, "total_price": number } ], "subtotal": number | null, "discounts": number | null, "tax": number | null, "tax_rate": "string | null", "total": number | null, "payment": { "method": "string | null", "amount_tendered": number | null, "change": number | null }, "currency": "string" } Rules: - Numbers must be numeric (e.g. 4.99), never strings. - If quantity is not printed, assume 1. - Extract EVERY line item you can see. - For discounts/coupons, use a positive number (it will be shown as a deduction). - Currency: use the symbol or 3-letter ISO code visible on the receipt (default "$"). """ # ───────────────────────────────────────────────────────────────────────────── # Utility — normalise escaped newlines emitted by some model responses # (taken from the official MiniCPM-V 4.6 model card) # ───────────────────────────────────────────────────────────────────────────── _NL_PATTERN = re.compile( r"(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$" r"|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])" r"|(? 
str: if not isinstance(text, str) or "\\" not in text: return text return _NL_PATTERN.sub(lambda m: m.group(1) or "\n", text) # ───────────────────────────────────────────────────────────────────────────── # Model — lazy-loaded on first inference (required for ZeroGPU) # ───────────────────────────────────────────────────────────────────────────── _processor = None _model = None def _get_model(): global _processor, _model if _model is None: print(f"Loading {MODEL_ID} …") _processor = AutoProcessor.from_pretrained(MODEL_ID) _model = AutoModelForImageTextToText.from_pretrained( MODEL_ID, torch_dtype="auto", device_map="cuda", ) _model.eval() print("✓ Model ready") return _processor, _model # ───────────────────────────────────────────────────────────────────────────── # Inference # ───────────────────────────────────────────────────────────────────────────── def _to_pil(image) -> Image.Image: """Accept numpy array (Gradio) or PIL Image.""" if isinstance(image, np.ndarray): return Image.fromarray(image).convert("RGB") return image.convert("RGB") @spaces.GPU def run_model(pil_image: Image.Image) -> str: """Run the model and return raw text output.""" processor, model = _get_model() messages = [ { "role": "user", "content": [ {"type": "image", "image": pil_image}, {"type": "text", "text": RECEIPT_PROMPT}, ], } ] inputs = processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", downsample_mode=DOWNSAMPLE_MODE, max_slice_nums=MAX_SLICE_NUMS, ).to(model.device) with torch.inference_mode(): generated_ids = model.generate( **inputs, max_new_tokens=MAX_NEW_TOKENS, downsample_mode=DOWNSAMPLE_MODE, do_sample=False, ) trimmed = [ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids) ] text = processor.batch_decode( trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0] return _normalize(text) # 
# ─────────────────────────────────────────────────────────────────────────────
# JSON extraction & formatting
# ─────────────────────────────────────────────────────────────────────────────
def _extract_json(raw: str) -> dict | None:
    """Strip markdown fences and parse the first JSON object found.

    Parsing starts at the first ``{`` and stops at the end of that JSON
    value, so explanatory text after the object (even text containing
    braces) no longer makes the parse fail. Returns ``None`` when *raw* is
    not a string, contains no ``{``, or the value at that position is not a
    valid JSON object.
    """
    if not isinstance(raw, str):
        return None

    # Remove ```json … ``` wrappers
    cleaned = re.sub(
        r"^```(?:json)?\s*|\s*```$", "", raw.strip(), flags=re.MULTILINE
    )
    start = cleaned.find("{")
    if start == -1:
        return None

    try:
        # raw_decode parses one JSON value and ignores whatever follows it.
        obj, _end = json.JSONDecoder().raw_decode(cleaned[start:])
    except json.JSONDecodeError:
        return None
    return obj if isinstance(obj, dict) else None


def _fmt(value, sym: str = "") -> str:
    """Format *value* as ``<sym><amount>`` with two decimals.

    ``None`` becomes an em dash; values that cannot be converted to float
    are returned via ``str(value)`` without the symbol.
    """
    if value is None:
        return "—"
    try:
        return f"{sym}{float(value):.2f}"
    except (TypeError, ValueError):
        return str(value)


# Symbols for ISO codes the model is likely to report; any other
# multi-character currency is rendered as a "<CODE> " prefix.
_CURRENCY_SYMBOLS = {
    "USD": "$",
    "EUR": "€",
    "GBP": "£",
    "JPY": "¥",
    "INR": "₹",
}


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _to_float(value) -> float | None:
    """Return *value* as a float, or ``None`` if it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def build_markdown(d: dict) -> str:
    """Render the parsed receipt dict *d* as a Markdown summary.

    Sections (store, transaction, items, totals, payment) are emitted only
    for the fields that are present. Sub-objects with an unexpected type are
    treated as missing rather than raising, because *d* comes straight from
    model output.
    """
    lines: list[str] = []

    # Currency: a single character is used as-is; a known ISO code maps to
    # its symbol; any other code is used as a textual prefix so that amounts
    # are never mislabelled with "$".
    raw_cur = str(d.get("currency") or "$").strip() or "$"
    if len(raw_cur) == 1:
        sym = raw_cur
    else:
        sym = _CURRENCY_SYMBOLS.get(raw_cur.upper(), f"{raw_cur} ")

    # ── Store ────────────────────────────────────────────────────────────────
    store = _as_dict(d.get("store"))
    if store.get("name"):
        lines.append(f"## 🏪 {store['name']}")
    if store.get("address"):
        lines.append(f"📍 {store['address']}")
    if store.get("phone"):
        lines.append(f"📞 {store['phone']}")

    # ── Transaction metadata ─────────────────────────────────────────────────
    tx = _as_dict(d.get("transaction"))
    tx_lines = []
    if tx.get("date"):
        tx_lines.append(f"📅 **Date:** {tx['date']}")
    if tx.get("time"):
        tx_lines.append(f"🕐 **Time:** {tx['time']}")
    if tx.get("receipt_number"):
        tx_lines.append(f"🧾 **Receipt #:** {tx['receipt_number']}")
    if tx.get("cashier"):
        tx_lines.append(f"👤 **Cashier:** {tx['cashier']}")
    if tx_lines:
        lines.append("")
        lines.extend(tx_lines)

    # ── Line items ───────────────────────────────────────────────────────────
    raw_items = d.get("items")
    items = [
        item
        for item in (raw_items if isinstance(raw_items, list) else [])
        if isinstance(item, dict)
    ]
    if items:
        lines += ["", "---", "### 🛒 Items Purchased", ""]
        for item in items:
            # `or` (not a .get default) so an explicit null also falls back.
            name = item.get("name") or "Unknown"
            qty = item.get("quantity") or 1
            if isinstance(qty, float) and qty.is_integer():
                qty = int(qty)  # show "×2", not "×2.0"
            total = item.get("total_price")
            unit = item.get("unit_price")
            unit_str = ""
            if unit is not None and qty != 1:
                unit_str = f" ({_fmt(unit, sym)} ea.)"
            lines.append(
                f"- **{name}** ×{qty}{unit_str}  →  **{_fmt(total, sym)}**"
            )

    # ── Totals ───────────────────────────────────────────────────────────────
    lines += ["", "---", ""]
    if d.get("subtotal") is not None:
        lines.append(f"Subtotal:   {_fmt(d['subtotal'], sym)}")
    discount = _to_float(d.get("discounts"))
    if discount:
        # Shown as a deduction regardless of the sign the model produced.
        lines.append(f"Discounts:   −{_fmt(abs(discount), sym)}")
    if d.get("tax") is not None:
        rate_str = f" ({d['tax_rate']})" if d.get("tax_rate") else ""
        lines.append(f"Tax{rate_str}:   {_fmt(d['tax'], sym)}")
    if d.get("total") is not None:
        lines.append(f"\n### 💰 Total: {_fmt(d['total'], sym)}")

    # ── Payment ──────────────────────────────────────────────────────────────
    pay = _as_dict(d.get("payment"))
    pay_lines = []
    if pay.get("method"):
        pay_lines.append(f"💳 **Method:** {pay['method']}")
    if pay.get("amount_tendered") is not None:
        pay_lines.append(f"💵 **Tendered:** {_fmt(pay['amount_tendered'], sym)}")
    if pay.get("change") is not None:
        pay_lines.append(f"🔄 **Change:** {_fmt(pay['change'], sym)}")
    if pay_lines:
        lines.append("")
        lines.extend(pay_lines)

    # Currency code (only show when it's a multi-character code, not a symbol)
    if len(raw_cur) > 1:
        lines.append(f"\n*Currency: {raw_cur}*")

    return "\n".join(lines)


# ─────────────────────────────────────────────────────────────────────────────
# Top-level handler wired to Gradio
# ─────────────────────────────────────────────────────────────────────────────
def parse_receipt(image) -> tuple[str, str]:
    """
    Returns (markdown_summary, json_string).
    Gradio calls this with a numpy array or None.

    This is the UI boundary, so failures in image conversion, inference and
    summary formatting are turned into a message for the user instead of
    propagating as an unhandled exception.
    """
    if image is None:
        return "⚠️ Please upload or capture a receipt image to begin.", "{}"

    try:
        pil_image = _to_pil(image)
    except Exception as exc:
        return f"❌ Could not read image: {exc}", "{}"

    try:
        raw_text = run_model(pil_image)
    except Exception as exc:
        return f"❌ Model error: {exc}", "{}"

    data = _extract_json(raw_text)
    if data is None:
        # Model returned non-JSON — show raw text as fallback
        return (
            f"**Raw model output (JSON parse failed):**\n\n```\n{raw_text}\n```",
            "{}",
        )

    json_str = json.dumps(data, indent=2, ensure_ascii=False)
    try:
        markdown = build_markdown(data)
    except Exception as exc:
        # Keep the parsed JSON available even if the summary cannot be built.
        return f"⚠️ Parsed the receipt, but could not format the summary: {exc}", json_str
    return markdown, json_str


# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ─────────────────────────────────────────────────────────────────────────────
TIPS = """\
**Tips for best results:**
- Hold the camera directly above the receipt (avoid angles)
- Make sure the receipt is fully visible and well-lit
- Flatten crumpled receipts before scanning
"""

with gr.Blocks(title="🧾 AI Receipt Scanner") as demo:
    gr.Markdown("""
    # 🧾 AI Receipt Scanner
    Upload a receipt photo or snap one with your camera.
    The model extracts every line item, price, tax, and store metadata automatically.
    """)

    with gr.Row(equal_height=False):
        # ── Input column ─────────────────────────────────────────────────────
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Receipt Image",
                sources=["upload", "webcam", "clipboard"],
                type="numpy",
                height=500,
                image_mode="RGB",
            )
            scan_btn = gr.Button("🔍 Scan Receipt", variant="primary", size="lg")
            gr.Markdown(TIPS)

        # ── Output column ────────────────────────────────────────────────────
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("📋 Summary"):
                    # line_breaks=True: the summary separates fields with
                    # single newlines, which Markdown otherwise folds into
                    # one paragraph.
                    summary_out = gr.Markdown(
                        value="*Scan a receipt to see results here.*",
                        line_breaks=True,
                    )
                with gr.TabItem("{ } Raw JSON"):
                    json_out = gr.Code(
                        value="{}",
                        language="json",
                        label="Structured JSON output",
                        interactive=False,
                    )

    # Wire up the button
    scan_btn.click(
        fn=parse_receipt,
        inputs=[image_input],
        outputs=[summary_out, json_out],
        api_name="scan",
    )

    # Also scan automatically when an image is uploaded/captured
    image_input.change(
        fn=parse_receipt,
        inputs=[image_input],
        outputs=[summary_out, json_out],
    )

    gr.Markdown("""
    ---
    *Powered by [MiniCPM-V 4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) — a lightweight 1.3 B multimodal model.*
    *Source: [OpenBMB / MiniCPM-V](https://github.com/OpenBMB/MiniCPM-V)*
    """)

if __name__ == "__main__":
    demo.launch(
        theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
        share=True,
    )