Spaces:

sammoftah
/

receipt-scanner

Sleeping

File size: 8,090 Bytes

"""
Receipt Scanner
Upload a receipt photo and extract structured data (items, prices, totals).
"""

import gradio as gr
from huggingface_hub import InferenceClient
from PIL import Image
import base64
from io import BytesIO
import json
import pandas as pd
import re
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from shared.components import create_method_panel, create_premium_hero

# Shared Hugging Face Inference client, created once at import time with
# default arguments (no model or token is passed explicitly here; the model is
# chosen per request in scan_receipt).
client = InferenceClient()

# Instruction text sent together with the receipt image in scan_receipt().
# It asks the model to answer with a fenced ```json block using the keys shown
# below, and to use null for anything unclear -- so code that consumes the
# parsed result has to tolerate null values in any field.
EXTRACTION_PROMPT = """Analyze this receipt image and extract ALL information in a structured format.

Extract:
1. **Merchant/Store Name**
2. **Date** (in YYYY-MM-DD format if possible)
3. **Time** (if visible)
4. **Items** - List each item with its price
5. **Subtotal** (if shown)
6. **Tax** (if shown)
7. **Total Amount**
8. **Payment Method** (if visible)

Format your response EXACTLY as JSON:
```json
{
  "merchant": "Store Name",
  "date": "YYYY-MM-DD",
  "time": "HH:MM",
  "items": [
    {"name": "Item 1", "price": 0.00},
    {"name": "Item 2", "price": 0.00}
  ],
  "subtotal": 0.00,
  "tax": 0.00,
  "total": 0.00,
  "payment_method": "Card/Cash/etc"
}
```

Be precise with numbers. If something is unclear, use null."""


def extract_json_from_text(text):
    """Return the JSON object embedded in *text* as a string, or ``None``.

    Lookup order:

    1. A JSON object inside a Markdown code fence (```` ```json ... ``` ````
       or a bare fence).
    2. The first ``{`` in the text from which a complete, valid JSON object
       can be decoded. This avoids swallowing trailing prose that happens to
       contain a ``}``.
    3. As a last resort, the span from the first ``{`` to the last ``}``
       (which may not be valid JSON; the caller is expected to handle the
       resulting ``json.JSONDecodeError``).

    Args:
        text: Raw model output. ``None`` or an empty string yields ``None``.

    Returns:
        The extracted JSON text, or ``None`` when no brace-delimited span
        exists.
    """
    if not text:
        return None

    # Prefer fenced code blocks: that is the format the prompt asks for.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Try each opening brace until one starts a decodable JSON object.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', text):
        start = brace.start()
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        return text[start:end]

    # Nothing decodes cleanly: fall back to the widest brace-delimited span so
    # the caller can still surface a parse error with context.
    widest = re.search(r'\{.*\}', text, re.DOTALL)
    return widest.group(0) if widest else None


def _format_money(value):
    """Format *value* as a dollar amount, tolerating null or non-numeric input.

    Returns ``"N/A"`` for ``None`` or NaN, ``"$<amount>"`` with two decimals
    for anything ``float()`` accepts, and the value's string form otherwise.
    """
    if value is None:
        return "N/A"
    try:
        amount = float(value)
    except (TypeError, ValueError, OverflowError):
        return str(value)
    if amount != amount:  # NaN is the only value that is not equal to itself
        return "N/A"
    return f"${amount:.2f}"


def _text_or_na(value):
    """Return *value* unchanged, or ``"N/A"`` when it is ``None``."""
    return "N/A" if value is None else value


def _empty_items_frame():
    """Return an empty items table with the ``name`` and ``price`` columns."""
    return pd.DataFrame(columns=["name", "price"])


def _normalize_items(raw_items):
    """Return the parsed ``items`` value as a list of dicts with name/price.

    Dict entries are copied (extra keys are kept), a missing or null name
    becomes ``"Unknown"`` and a missing price becomes ``0.0``. Non-dict
    entries are kept as a name with no price. Anything that is not a list
    yields an empty list.
    """
    if not isinstance(raw_items, list):
        return []

    rows = []
    for entry in raw_items:
        if isinstance(entry, dict):
            name = entry.get("name")
            rows.append({
                **entry,
                "name": "Unknown" if name is None else name,
                "price": entry.get("price", 0.0),
            })
        else:
            rows.append({"name": str(entry), "price": None})
    return rows


def scan_receipt(image):
    """Extract structured data from a receipt image using a hosted VLM.

    Args:
        image: A PIL image, a path to an image file, or ``None``.

    Returns:
        A ``(summary, items_table, json_text)`` tuple:

        * ``summary`` -- Markdown text (or an error/warning message).
        * ``items_table`` -- a ``pandas.DataFrame``; empty with ``name`` and
          ``price`` columns when there is nothing to show, including every
          error path, so the table output always receives the same type.
        * ``json_text`` -- the parsed data pretty-printed as JSON, or ``""``
          on error.

    Fields the model reports as ``null`` (which the prompt explicitly allows)
    are rendered as ``N/A`` instead of raising a formatting error.
    No exception escapes: failures are reported through ``summary``.
    """
    if image is None:
        return "❌ Please upload a receipt first!", _empty_items_frame(), ""

    # Bound before the try block so the JSON error handler can always use it.
    raw_response = ""

    try:
        if not os.getenv("HF_TOKEN"):
            data = {
                "merchant": "Manual review required",
                "date": None,
                "time": None,
                "items": [],
                "subtotal": None,
                "tax": None,
                "total": None,
                "payment_method": None,
                "note": "HF_TOKEN is not configured for hosted vision inference. The image was received, but field extraction needs a Space secret or manual entry.",
            }
            summary = """# 🧾 Receipt Ready For Review

The image uploaded correctly, but hosted vision inference is not configured on this Space.

To enable automatic extraction, add a Hugging Face token as a Space secret named `HF_TOKEN`.

Until then, this Space still documents the expected schema and downstream JSON shape.
"""
            return summary, _empty_items_frame(), json.dumps(data, indent=2)

        # Encode the image as a base64 PNG for the data URL sent to the model.
        if isinstance(image, str):
            image = Image.open(image)
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()

        # Ask the vision-language model for OCR + structured extraction.
        response = client.chat_completion(
            model="Qwen/Qwen2-VL-7B-Instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}"}},
                        {"type": "text", "text": EXTRACTION_PROMPT}
                    ]
                }
            ],
            max_tokens=1000,
            temperature=0.1  # Low temperature for accuracy
        )

        raw_response = response.choices[0].message.content

        # Extract JSON from response
        json_str = extract_json_from_text(raw_response)
        if not json_str:
            return (
                f"⚠️ Could not parse receipt data.\n\nRaw response:\n{raw_response}",
                _empty_items_frame(),
                "",
            )

        # Parse JSON; `data` itself is left untouched so the JSON output
        # mirrors exactly what the model returned.
        data = json.loads(json_str)
        items = _normalize_items(data.get('items'))

        # Create formatted summary
        summary = f"""# 🧾 Receipt Analysis

**Merchant**: {_text_or_na(data.get('merchant'))}
**Date**: {_text_or_na(data.get('date'))}
**Time**: {_text_or_na(data.get('time'))}

---

## 📦 Items

"""
        if items:
            summary += "".join(
                f"- **{row['name']}**: {_format_money(row['price'])}\n"
                for row in items
            )
        else:
            summary += "*No items found*\n"

        # A missing key keeps the historical "$0.00"; an explicit null is N/A.
        summary += f"""
---

## 💰 Totals

- **Subtotal**: {_format_money(data.get('subtotal', 0.0))}
- **Tax**: {_format_money(data.get('tax', 0.0))}
- **Total**: {_format_money(data.get('total', 0.0))}

**Payment**: {_text_or_na(data.get('payment_method'))}
"""

        # Create DataFrame for table view with display-formatted prices.
        if items:
            df = pd.DataFrame(
                [{**row, "price": _format_money(row["price"])} for row in items]
            )
        else:
            df = _empty_items_frame()

        # Format JSON for download
        json_output = json.dumps(data, indent=2)

        return summary, df, json_output

    except json.JSONDecodeError as e:
        return (
            f"❌ Error parsing JSON: {str(e)}\n\nRaw response:\n{raw_response}",
            _empty_items_frame(),
            "",
        )
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the app.
        return f"❌ Error scanning receipt: {str(e)}", _empty_items_frame(), ""


# Gradio interface: builds the page layout and wires the scan button to
# scan_receipt(). Everything below runs at import time and binds the app to
# the module-level name `demo`.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and method panel come from the project's shared components
    # module (imported at the top of the file).
    # NOTE(review): the highlights advertise "CSV export", but no CSV export
    # component is visible in this file -- confirm or adjust the copy.
    create_premium_hero(
        "Receipt Scanner",
        "Extract merchant, items, totals, and payment details from receipt images with a vision-language model workflow.",
        "🧾",
        badge="Document Vision",
        highlights=["Vision-language extraction", "Structured JSON", "CSV export"],
    )
    create_method_panel({
        "Technique": "Image-to-structured-data extraction with schema parsing and tabular validation.",
        "What it proves": "You can turn multimodal model output into reliable downstream data products.",
        "HF capability": "Designed for Hub-hosted VLM inference and lightweight Space deployment.",
    })

    # Top row: upload + action on the left, Markdown summary on the right.
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" matches scan_receipt(), which saves the image
            # through the PIL API before sending it to the model.
            image_input = gr.Image(
                label="📸 Upload Receipt Photo",
                type="pil",
                height=400
            )

            scan_btn = gr.Button("🔍 Scan Receipt", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 Tips for Best Results:
            - Good lighting, minimal shadows
            - Receipt should be flat and clear
            - Include the entire receipt
            - High contrast works best
            """)

        with gr.Column(scale=1):
            summary_output = gr.Markdown(label="📊 Summary")

    # Bottom row: itemized table on the left, raw JSON text on the right.
    with gr.Row():
        with gr.Column():
            table_output = gr.Dataframe(
                label="📋 Items Table",
                headers=["name", "price"],
                interactive=False
            )

        with gr.Column():
            # Plain textbox: the label tells users to copy the JSON manually.
            json_output = gr.Textbox(
                label="📄 JSON Data (copy to download)",
                lines=15,
                max_lines=20
            )

    # Event handler: the three outputs are listed in the same order as the
    # (summary, table, json) tuple returned by scan_receipt().
    scan_btn.click(
        fn=scan_receipt,
        inputs=[image_input],
        outputs=[summary_output, table_output, json_output],
        api_name="scan"
    )

    # Static explanatory footer.
    gr.Markdown("""
    ---
    ### 🎓 What This App Does:

    1. **OCR + Understanding**: Doesn't just read text, understands structure
    2. **Data Extraction**: Identifies items, prices, totals, dates
    3. **JSON Export**: Download structured data for expense tracking
    4. **Table View**: See items in an organized format

    ### 📊 Use Cases:

    - **Expense Tracking**: Digitize receipts for accounting
    - **Budget Apps**: Auto-import spending data
    - **Tax Records**: Organize business expenses
    - **Reimbursements**: Submit itemized claims
    - **Personal Finance**: Track spending categories

    *Note: Accuracy depends on receipt clarity and format. Complex layouts may require manual verification.*
    """)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()