Mariem-Daha commited on
Commit
b37fab8
·
verified ·
1 Parent(s): 7b0a2f6

Upload 6 files

Browse files
Files changed (6) hide show
  1. .env +1 -0
  2. .gitignore +56 -0
  3. README.md +82 -13
  4. app_gradio.py +107 -0
  5. requirements.txt +26 -0
  6. smart_ocr_pipeline_textonly.py +259 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a live OpenAI API key was committed in this file — revoke that key
+ # immediately at https://platform.openai.com/api-keys. Never commit .env; supply
+ # the key via the Space's secret settings (secret name: OPENAI_API_KEY).
+ OPENAI_API_KEY=__SET_VIA_SPACE_SECRETS__
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ pip-wheel-metadata/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # Virtual environments
32
+ venv/
33
+ ENV/
34
+ env/
35
+ .venv
36
+
37
+ # IDEs
38
+ .vscode/
39
+ .idea/
40
+ *.swp
41
+ *.swo
42
+ *~
43
+
44
+ # OS
45
+ .DS_Store
46
+ Thumbs.db
47
+
48
+ # Output files (optional - comment out if you want to track them)
49
+ processed_invoice.png
50
+ preview_invoice.png
51
+ ocr_result.json
52
+ ocr_lines.txt
53
+ smart_output.json
54
+
55
+ # Logs
56
+ *.log
README.md CHANGED
@@ -1,13 +1,82 @@
1
- ---
2
- title: OCR Text
3
- emoji: 🌖
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Smart OCR Pipeline Text-Only
3
+ emoji: 💰
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app_gradio.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Smart OCR Pipeline - Cost-Optimized (Text-Only)
14
+
15
+ **10-50x more cost-effective** invoice OCR system - sends only text to GPT, not images.
16
+
17
+ ## Features
18
+
19
+ - 🖼️ **Image Preprocessing**: Automatic denoising and enhancement
20
+ - 📄 **DocTR OCR**: State-of-the-art text extraction
21
+ - 🤖 **GPT-4o-mini**: AI post-processing with text only (no image)
22
+ - 💰 **Cost-Optimized**: Significantly cheaper than sending images
23
+ - 📊 **Token Tracking**: Real-time cost estimation
24
+ - ✅ **Structured Output**: Clean JSON with all invoice data
25
+
26
+ ## How It Works
27
+
28
+ 1. **Upload** an invoice image (JPG, PNG, BMP, TIFF)
29
+ 2. **Process** - The system will:
30
+ - Clean and enhance the image
31
+ - Extract text using DocTR OCR
32
+ - Send only text to GPT-4o-mini for structured extraction
33
+ - Group and format the data
34
+ 3. **Get Results** - Structured JSON with all invoice data + cost estimate
35
+
36
+ ## Cost
37
+
38
+ - **~$0.001-$0.003 per invoice**
39
+ - **10-50x cheaper** than sending images to GPT
40
+ - Perfect for: High-volume processing, clean invoices
41
+
42
+ ## Cost Comparison
43
+
44
+ | Volume | Cost per Invoice | Monthly Cost |
45
+ |--------|------------------|--------------|
46
+ | 100/month | $0.002 | $0.20 |
47
+ | 1,000/month | $0.002 | $2.00 |
48
+ | 10,000/month | $0.002 | $20.00 |
49
+
50
+ Compare to full version: 10,000 invoices would cost **$200-500/month**!
51
+
52
+ ## Configuration
53
+
54
+ This Space requires an OpenAI API key set as a secret:
55
+ - Secret name: `OPENAI_API_KEY`
56
+ - Get your key from: https://platform.openai.com/api-keys
57
+
58
+ ## When to Use This Version
59
+
60
+ ✅ **Use Text-Only when:**
61
+ - Processing high volumes (>100 invoices/day)
62
+ - Invoices are relatively clean and standard format
63
+ - Cost optimization is a priority
64
+ - Need faster processing times
65
+
66
+ ❌ **Use Full Version when:**
67
+ - Invoices are complex or poor quality
68
+ - Need absolute highest accuracy
69
+ - Processing low volumes
70
+ - Budget allows for higher costs
71
+
72
+ ## Use Cases
73
+
74
+ - High-volume invoice processing
75
+ - Receipt digitization at scale
76
+ - Bulk document processing
77
+ - Cost-conscious automation
78
+ - Startup/SMB accounting systems
79
+
80
+ ## License
81
+
82
+ MIT License
app_gradio.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from smart_ocr_pipeline_textonly import main as process_invoice
6
+
7
# Set page title and description
# Static UI copy; `title` and `description` are rendered as Markdown by the
# Blocks layout defined later in this file.
title = "💰 Smart OCR Pipeline - Cost-Optimized (Text-Only)"
description = """
**Cost-Efficient Invoice OCR**

This service uses:
- DocTR for text extraction
- GPT-4o-mini for structured data extraction (text only - no image sent)
- Smart line grouping and validation

**Cost:** ~$0.001-$0.003 per invoice (10-50x cheaper than full version!)
**Best for:** High volume processing, clean invoices
"""
20
+
21
def process_invoice_gradio(image):
    """Process an uploaded invoice image and return structured data as JSON text.

    Args:
        image: PIL image from the Gradio input, or None when nothing was uploaded.

    Returns:
        Pretty-printed JSON string of the pipeline result (with an
        ``estimated_cost`` field when token usage is reported), or a
        human-readable error/usage message.
    """
    if image is None:
        return "Please upload an image first."

    temp_path = None
    try:
        # Save uploaded image temporarily
        temp_dir = "temp_uploads"
        Path(temp_dir).mkdir(exist_ok=True)

        temp_path = os.path.join(temp_dir, "temp_invoice.jpg")
        # JPEG cannot store an alpha channel; PNG/clipboard uploads are often
        # RGBA and would make PIL raise on save — normalize to RGB first.
        image.convert("RGB").save(temp_path)

        # Process with OCR pipeline
        result = process_invoice(temp_path, temp_dir)

        # Estimate cost from real token usage. GPT-4o-mini bills input and
        # output tokens at different rates ($0.15 vs $0.60 per 1M tokens), so
        # price each bucket separately instead of a flat per-token rate.
        usage = result.get("usage")
        if usage:
            prompt_tokens = usage.get("prompt_tokens", 0)
            completion_tokens = usage.get("completion_tokens", 0)
            cost = prompt_tokens * 0.15e-6 + completion_tokens * 0.60e-6
            result["estimated_cost"] = f"${cost:.6f}"

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        return f"Error processing invoice: {str(e)}"
    finally:
        # Best-effort cleanup of the temporary upload.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
49
+
50
# Create Gradio interface
# Layout: two columns — image upload + submit button on the left, JSON output
# textbox on the right — followed by static feature/cost Markdown sections.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    gr.Markdown("### 💡 **Cost Savings:** 10-50x cheaper than sending images to GPT!")

    with gr.Row():
        with gr.Column():
            # PIL output type so process_invoice_gradio can call .save() directly.
            image_input = gr.Image(
                type="pil",
                label="Upload Invoice Image",
                sources=["upload", "clipboard"]
            )
            submit_btn = gr.Button("Process Invoice (Text-Only)", variant="primary")

        with gr.Column():
            # Raw JSON is shown as plain text rather than a gr.JSON component.
            output = gr.Textbox(
                label="Extracted Data (JSON)",
                lines=20,
                max_lines=30
            )

    # Features
    gr.Markdown("### 📋 Features:")
    gr.Markdown("""
    - ✅ Image preprocessing (denoise, enhance)
    - ✅ DocTR OCR extraction
    - ✅ Smart line grouping
    - ✅ GPT-4o-mini post-processing (text only)
    - ✅ Token usage tracking
    - ✅ Cost estimation per invoice
    - ✅ Structured JSON output
    """)

    gr.Markdown("### 💰 Typical Costs:")
    gr.Markdown("""
    | Volume | Cost per Invoice | Monthly Cost |
    |--------|------------------|--------------|
    | 100/month | $0.002 | $0.20 |
    | 1,000/month | $0.002 | $2.00 |
    | 10,000/month | $0.002 | $20.00 |
    """)

    # Event handler
    # Clicking the button runs the full OCR pipeline on the uploaded image.
    submit_btn.click(
        fn=process_invoice_gradio,
        inputs=image_input,
        outputs=output
    )
100
+
101
# Launch
if __name__ == "__main__":
    # Bind on all interfaces at the standard Hugging Face Spaces port so the
    # container's reverse proxy can reach the app; no public share link.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for Smart OCR Pipeline
2
+ openai>=1.3.0
3
+ python-dotenv>=1.0.0
4
+
5
+ # Web framework (FastAPI for Render)
6
+ fastapi>=0.104.0
7
+ uvicorn[standard]>=0.24.0
8
+ python-multipart>=0.0.6
9
+
10
+ # Gradio for Hugging Face Spaces
11
+ gradio>=4.0.0
12
+
13
+ # Image processing
14
+ opencv-python>=4.8.0
15
+ numpy>=1.24.0
16
+ Pillow>=10.0.0
17
+
18
+ # OCR engines
19
+ python-doctr[torch]>=0.7.0
20
+
21
+ # Optional: Tesseract fallback
22
+ # pytesseract>=0.3.10
23
+ # Install Tesseract separately: https://github.com/tesseract-ocr/tesseract
24
+
25
+ # Optional: EasyOCR fallback
26
+ # easyocr>=1.7.0
smart_ocr_pipeline_textonly.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ smart_ocr_pipeline_textonly.py
5
+ ---------------------------------
6
+ Same as smart_ocr_pipeline_final.py but optimized for cost efficiency:
7
+ - Uses DocTR for OCR text extraction.
8
+ - Sends only text (no image) to GPT-4o-mini.
9
+ - Keeps full validation, logging, and token usage tracking.
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import json
15
+ import time
16
+ import base64
17
+ import logging
18
+ from pathlib import Path
19
+ from typing import Dict, List, Tuple
20
+
21
+ import cv2
22
+ import numpy as np
23
+ from PIL import Image
24
+ from doctr.io import DocumentFile
25
+ from doctr.models import ocr_predictor
26
+ from openai import OpenAI
27
+
28
+ try:
29
+ from dotenv import load_dotenv
30
+ load_dotenv()
31
+ except Exception:
32
+ pass
33
+
34
# ------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------
def setup_logger() -> logging.Logger:
    """Return the shared module logger, wiring a stdout handler exactly once.

    Repeated calls yield the same logger object without stacking duplicate
    handlers (which would multiply every log line).
    """
    logger = logging.getLogger("smart_ocr_textonly")
    logger.setLevel(logging.INFO)
    if logger.handlers:
        # Already configured (e.g. module re-imported) — nothing to add.
        return logger
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
    logger.addHandler(handler)
    return logger

log = setup_logger()
47
+
48
+ # ------------------------------------------------------------
49
+ # Setup
50
+ # ------------------------------------------------------------
51
+ def setup_environment() -> OpenAI:
52
+ api_key = os.getenv("OPENAI_API_KEY")
53
+ if not api_key:
54
+ raise ValueError("OPENAI_API_KEY not found in environment variables.")
55
+ log.info("OpenAI client initialized")
56
+ return OpenAI(api_key=api_key)
57
+
58
# Process-wide cache for the DocTR predictor (loading it is expensive).
_DOCTR_MODEL = None

def get_doctr_model():
    """Return the shared DocTR OCR predictor, loading it lazily on first use."""
    global _DOCTR_MODEL
    if _DOCTR_MODEL is not None:
        return _DOCTR_MODEL
    started = time.time()
    _DOCTR_MODEL = ocr_predictor(pretrained=True)
    log.info(f"DocTR model loaded in {time.time() - started:.2f}s")
    return _DOCTR_MODEL
67
+
68
# ------------------------------------------------------------
# Image preprocessing
# ------------------------------------------------------------
def preprocess_image(input_path: str, output_dir: str = ".") -> str:
    """Clean up an invoice image for OCR and save the result as a PNG.

    Pipeline: grayscale → edge-preserving denoise → local contrast boost
    (CLAHE) → min-max intensity normalization. The output filename is fixed
    ("processed_invoice.png"), so concurrent calls sharing an output_dir
    overwrite each other — NOTE(review): consider unique names if parallel
    use is expected.

    Args:
        input_path: Path to the source invoice image.
        output_dir: Directory that receives processed_invoice.png.

    Returns:
        Path to the processed PNG.

    Raises:
        ValueError: If OpenCV cannot read ``input_path``.
    """
    log.info("Loading image for preprocessing...")
    img = cv2.imread(input_path)
    if img is None:
        raise ValueError(f"Cannot load image: {input_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Bilateral filter smooths noise while keeping character edges sharp.
    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
    # CLAHE lifts faint print locally without blowing out bright regions.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    normalized = cv2.normalize(enhanced, None, 0, 255, cv2.NORM_MINMAX)
    processed_path = os.path.join(output_dir, "processed_invoice.png")
    cv2.imwrite(processed_path, normalized)
    log.info(f"Processed image saved: {processed_path}")
    return processed_path
85
+
86
+ # ------------------------------------------------------------
87
+ # OCR extraction
88
+ # ------------------------------------------------------------
89
+ def extract_text_with_doctr(image_path: str, output_dir: str = ".") -> Tuple[str, Dict, List[str]]:
90
+ log.info("Running DocTR OCR...")
91
+ model = get_doctr_model()
92
+ doc = DocumentFile.from_images(image_path)
93
+ result = model(doc)
94
+
95
+ all_lines = []
96
+ for page in result.pages:
97
+ for block in page.blocks:
98
+ for line in block.lines:
99
+ text = " ".join(w.value for w in line.words).strip()
100
+ if text:
101
+ all_lines.append(text)
102
+
103
+ avg_conf = np.mean([w.confidence for p in result.pages for b in p.blocks for l in b.lines for w in l.words])
104
+ ocr_json = {"average_confidence": float(avg_conf), "lines": all_lines}
105
+
106
+ # Smart grouping: pair item lines with their numeric lines
107
+ grouped_lines = []
108
+ pending_item = None
109
+ for line in all_lines:
110
+ if any(x.isdigit() for x in line) and any(w in line.lower() for w in ["pz", "kg", "lt"]):
111
+ # numeric line (quantities)
112
+ if pending_item:
113
+ grouped_lines.append(f"{pending_item} | {line}")
114
+ pending_item = None
115
+ else:
116
+ grouped_lines.append(line)
117
+ else:
118
+ # item line
119
+ if pending_item:
120
+ grouped_lines.append(pending_item)
121
+ pending_item = line
122
+ if pending_item:
123
+ grouped_lines.append(pending_item)
124
+
125
+ ocr_text = "\n".join(grouped_lines)
126
+
127
+ log.info(f"DocTR OCR complete (confidence: {avg_conf:.2f}, lines: {len(all_lines)}, grouped: {len(grouped_lines)})")
128
+ return ocr_text, ocr_json, all_lines
129
+
130
+ # ------------------------------------------------------------
131
+ # GPT post-processing (text only)
132
+ # ------------------------------------------------------------
133
+ def extract_structured_data(client: OpenAI, ocr_text: str, model_name: str = "gpt-4o-mini") -> Dict:
134
+ log.info(f"Processing OCR text with {model_name}...")
135
+
136
+ system_message = """
137
+ You are a professional invoice/receipt parser for ChefCode.
138
+ You receive raw OCR text extracted from an invoice and must convert it into structured JSON.
139
+
140
+ Return ONLY valid JSON with this schema:
141
+ {
142
+ "supplier": "string",
143
+ "invoice_number": "string",
144
+ "date": "YYYY-MM-DD or null",
145
+ "line_items": [
146
+ {
147
+ "lot_number": "string",
148
+ "item_name": "string",
149
+ "unit": "string",
150
+ "quantity": number,
151
+ "unit_price": number or null,
152
+ "line_total": number or null,
153
+ "type": "string"
154
+ }
155
+ ],
156
+ "total_amount": number or null,
157
+ "confidence": "high | medium | low"
158
+ }
159
+ Extraction rules (critical):
160
+ - The table is horizontal: Lot → Item → Unit → Quantity → Unit Price → Line Total.
161
+ - The quantity is the number DIRECTLY AFTER the unit.
162
+ - If numbers for a line appear missing, check up to TWO lines BELOW that line in OCR_LINES,
163
+ - Do not ignore header words (Quantità, Prezzo, Sconto, Importo, IVA).
164
+ - Do not skip any visible row; compare OCR row count with extracted items and recover missing lines.
165
+ - Verify math: quantity × unit_price ≈ line_total (±3%). If off, re-read digits from the image.
166
+ - If two adjacent rows share identical numbers, re-check both in the image; do not merge distinct items.
167
+ - Use "." as decimal separator and strip any currency symbols.
168
+ - Keep supplier and item names exactly as printed; do not translate them.
169
+ - Infer "type" (meat/vegetable/dairy/grain/condiment/beverage/grocery). If invoice language is Italian,
170
+ output these category words in Italian (carne, verdura, latticini, cereali, condimento, bevanda, drogheria).
171
+ - Output ONLY JSON — no prose, no markdown.""".strip()
172
+
173
+ user_message = f"Extract structured data from this OCR text:\n\n{ocr_text[:8000]}"
174
+
175
+ resp = client.chat.completions.create(
176
+ model=model_name,
177
+ temperature=0.1,
178
+ max_completion_tokens=2000,
179
+ messages=[
180
+ {"role": "system", "content": system_message},
181
+ {"role": "user", "content": user_message},
182
+ ],
183
+ )
184
+
185
+ # Capture real token usage
186
+ usage = None
187
+ if hasattr(resp, "usage") and resp.usage:
188
+ usage = {
189
+ "prompt_tokens": resp.usage.prompt_tokens,
190
+ "completion_tokens": resp.usage.completion_tokens,
191
+ "total_tokens": resp.usage.total_tokens,
192
+ }
193
+ print(f"🔢 Token usage: {usage}")
194
+
195
+ raw = resp.choices[0].message.content.strip()
196
+ if raw.startswith("```json"):
197
+ raw = raw.replace("```json", "").replace("```", "").strip()
198
+ elif raw.startswith("```"):
199
+ raw = raw.replace("```", "").strip()
200
+
201
+ try:
202
+ data = json.loads(raw)
203
+ except json.JSONDecodeError:
204
+ log.error("Failed to parse JSON output from GPT.")
205
+ data = {"error": "json_parse_error", "raw": raw}
206
+
207
+ if usage:
208
+ data["usage"] = usage
209
+
210
+ return data
211
+
212
# ------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------
def main(invoice_path: str, output_dir: str = "."):
    """Run the full text-only OCR pipeline on one invoice image.

    Steps: preprocess the image, OCR it with DocTR, post-process the text
    with GPT-4o-mini, then write ``smart_output_textonly.json`` into
    ``output_dir`` and return the same payload.

    Args:
        invoice_path: Path to the invoice image file.
        output_dir: Directory for intermediate and final artifacts.

    Returns:
        dict with status, OCR confidence, line count, structured data,
        elapsed time, and token usage.
    """
    print("\n" + "="*60)
    print("🧠 SMART OCR PIPELINE (TEXT-ONLY, gpt-4o-mini)")
    print("="*60 + "\n")

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    client = setup_environment()  # fails fast when OPENAI_API_KEY is missing

    t0 = time.time()
    processed_path = preprocess_image(invoice_path, output_dir)
    ocr_text, ocr_json, lines = extract_text_with_doctr(processed_path, output_dir)
    structured = extract_structured_data(client, ocr_text, model_name="gpt-4o-mini")

    # NOTE(review): "usage" is duplicated — inside "data" and at the top
    # level; the Gradio front end reads the top-level copy.
    final_output = {
        "status": "success",
        "pipeline_version": "3.2_textonly_gpt4o-mini",
        "input_file": Path(invoice_path).name,
        "ocr_confidence": ocr_json.get("average_confidence", 0.0),
        "lines_detected": len(lines),
        "data": structured,
        "elapsed_sec": round(time.time() - t0, 2),
        "usage": structured.get("usage", None),
    }

    out_path = os.path.join(output_dir, "smart_output_textonly.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    log.info(f"Final output saved: {out_path}")
    log.info(f"OCR Confidence: {final_output['ocr_confidence']:.2f}")
    if final_output["usage"]:
        log.info(f"Token usage: {final_output['usage']}")
    log.info(f"Elapsed time: {final_output['elapsed_sec']}s")

    print("\nDone.\n")
    return final_output
251
+
252
if __name__ == "__main__":
    # CLI entry point: invoice path is required, output directory optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python smart_ocr_pipeline_textonly.py <path/to/invoice.jpg> [output_dir]")
        sys.exit(1)
    main(cli_args[0], cli_args[1] if len(cli_args) > 1 else ".")