Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- .env +1 -0
- .gitignore +56 -0
- README.md +82 -13
- app_gradio.py +107 -0
- requirements.txt +26 -0
- smart_ocr_pipeline_textonly.py +259 -0
.env
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# SECURITY: a live OpenAI API key was committed here — it must be revoked immediately.
# Never commit secrets; set OPENAI_API_KEY as a Space/host secret or local-only .env value.
OPENAI_API_KEY=your-openai-api-key-here
|
.gitignore
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables
|
| 2 |
+
.env
|
| 3 |
+
.env.local
|
| 4 |
+
.env.*.local
|
| 5 |
+
|
| 6 |
+
# Python
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
*.so
|
| 11 |
+
.Python
|
| 12 |
+
build/
|
| 13 |
+
develop-eggs/
|
| 14 |
+
dist/
|
| 15 |
+
downloads/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
pip-wheel-metadata/
|
| 25 |
+
share/python-wheels/
|
| 26 |
+
*.egg-info/
|
| 27 |
+
.installed.cfg
|
| 28 |
+
*.egg
|
| 29 |
+
MANIFEST
|
| 30 |
+
|
| 31 |
+
# Virtual environments
|
| 32 |
+
venv/
|
| 33 |
+
ENV/
|
| 34 |
+
env/
|
| 35 |
+
.venv
|
| 36 |
+
|
| 37 |
+
# IDEs
|
| 38 |
+
.vscode/
|
| 39 |
+
.idea/
|
| 40 |
+
*.swp
|
| 41 |
+
*.swo
|
| 42 |
+
*~
|
| 43 |
+
|
| 44 |
+
# OS
|
| 45 |
+
.DS_Store
|
| 46 |
+
Thumbs.db
|
| 47 |
+
|
| 48 |
+
# Output files (optional - comment out if you want to track them)
|
| 49 |
+
processed_invoice.png
|
| 50 |
+
preview_invoice.png
|
| 51 |
+
ocr_result.json
|
| 52 |
+
ocr_lines.txt
|
| 53 |
+
smart_output.json
smart_output_textonly.json
|
| 54 |
+
|
| 55 |
+
# Logs
|
| 56 |
+
*.log
|
README.md
CHANGED
|
@@ -1,13 +1,82 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: OCR Text
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file:
|
| 9 |
-
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Smart OCR Pipeline Text-Only
|
| 3 |
+
emoji: 💰
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
+
app_file: app_gradio.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Smart OCR Pipeline - Cost-Optimized (Text-Only)
|
| 14 |
+
|
| 15 |
+
**10-50x more cost-effective** invoice OCR system - sends only text to GPT, not images.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- 🖼️ **Image Preprocessing**: Automatic denoising and enhancement
|
| 20 |
+
- 📄 **DocTR OCR**: State-of-the-art text extraction
|
| 21 |
+
- 🤖 **GPT-4o-mini**: AI post-processing with text only (no image)
|
| 22 |
+
- 💰 **Cost-Optimized**: Significantly cheaper than sending images
|
| 23 |
+
- 📊 **Token Tracking**: Real-time cost estimation
|
| 24 |
+
- ✅ **Structured Output**: Clean JSON with all invoice data
|
| 25 |
+
|
| 26 |
+
## How It Works
|
| 27 |
+
|
| 28 |
+
1. **Upload** an invoice image (JPG, PNG, BMP, TIFF)
|
| 29 |
+
2. **Process** - The system will:
|
| 30 |
+
- Clean and enhance the image
|
| 31 |
+
- Extract text using DocTR OCR
|
| 32 |
+
- Send only text to GPT-4o-mini for structured extraction
|
| 33 |
+
- Group and format the data
|
| 34 |
+
3. **Get Results** - Structured JSON with all invoice data + cost estimate
|
| 35 |
+
|
| 36 |
+
## Cost
|
| 37 |
+
|
| 38 |
+
- **~$0.001-$0.003 per invoice**
|
| 39 |
+
- **10-50x cheaper** than sending images to GPT
|
| 40 |
+
- Perfect for: High-volume processing, clean invoices
|
| 41 |
+
|
| 42 |
+
## Cost Comparison
|
| 43 |
+
|
| 44 |
+
| Volume | Cost per Invoice | Monthly Cost |
|
| 45 |
+
|--------|------------------|--------------|
|
| 46 |
+
| 100/month | $0.002 | $0.20 |
|
| 47 |
+
| 1,000/month | $0.002 | $2.00 |
|
| 48 |
+
| 10,000/month | $0.002 | $20.00 |
|
| 49 |
+
|
| 50 |
+
Compare to full version: 10,000 invoices would cost **$200-500/month**!
|
| 51 |
+
|
| 52 |
+
## Configuration
|
| 53 |
+
|
| 54 |
+
This Space requires an OpenAI API key set as a secret:
|
| 55 |
+
- Secret name: `OPENAI_API_KEY`
|
| 56 |
+
- Get your key from: https://platform.openai.com/api-keys
|
| 57 |
+
|
| 58 |
+
## When to Use This Version
|
| 59 |
+
|
| 60 |
+
✅ **Use Text-Only when:**
|
| 61 |
+
- Processing high volumes (>100 invoices/day)
|
| 62 |
+
- Invoices are relatively clean and standard format
|
| 63 |
+
- Cost optimization is a priority
|
| 64 |
+
- Need faster processing times
|
| 65 |
+
|
| 66 |
+
❌ **Use Full Version when:**
|
| 67 |
+
- Invoices are complex or poor quality
|
| 68 |
+
- Need absolute highest accuracy
|
| 69 |
+
- Processing low volumes
|
| 70 |
+
- Budget allows for higher costs
|
| 71 |
+
|
| 72 |
+
## Use Cases
|
| 73 |
+
|
| 74 |
+
- High-volume invoice processing
|
| 75 |
+
- Receipt digitization at scale
|
| 76 |
+
- Bulk document processing
|
| 77 |
+
- Cost-conscious automation
|
| 78 |
+
- Startup/SMB accounting systems
|
| 79 |
+
|
| 80 |
+
## License
|
| 81 |
+
|
| 82 |
+
MIT License
|
app_gradio.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from smart_ocr_pipeline_textonly import main as process_invoice
|
| 6 |
+
|
| 7 |
+
# Set page title and description
|
| 8 |
+
title = "💰 Smart OCR Pipeline - Cost-Optimized (Text-Only)"
|
| 9 |
+
description = """
|
| 10 |
+
**Cost-Efficient Invoice OCR**
|
| 11 |
+
|
| 12 |
+
This service uses:
|
| 13 |
+
- DocTR for text extraction
|
| 14 |
+
- GPT-4o-mini for structured data extraction (text only - no image sent)
|
| 15 |
+
- Smart line grouping and validation
|
| 16 |
+
|
| 17 |
+
**Cost:** ~$0.001-$0.003 per invoice (10-50x cheaper than full version!)
|
| 18 |
+
**Best for:** High volume processing, clean invoices
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def process_invoice_gradio(image):
    """Run the text-only OCR pipeline on an uploaded invoice image.

    Parameters
    ----------
    image : PIL.Image.Image or None
        The image supplied by the Gradio `gr.Image(type="pil")` input.

    Returns
    -------
    str
        Pretty-printed JSON of the pipeline result (with an estimated cost
        when token usage is available), or a human-readable error message.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # Save uploaded image temporarily
        temp_dir = "temp_uploads"
        Path(temp_dir).mkdir(exist_ok=True)

        temp_path = os.path.join(temp_dir, "temp_invoice.jpg")
        # JPEG cannot store an alpha channel or palette: PIL raises OSError
        # when saving RGBA/P/LA images (clipboard pastes are often RGBA),
        # so normalize to RGB before writing.
        if image.mode != "RGB":
            image = image.convert("RGB")
        image.save(temp_path)

        # Process with OCR pipeline
        result = process_invoice(temp_path, temp_dir)

        # Format output as JSON with a cost estimate from real token usage.
        usage = result.get("usage")
        if usage:
            # GPT-4o-mini pricing: $0.15 / 1M input tokens and $0.60 / 1M
            # output tokens. (A flat per-total-token rate would under-count
            # the more expensive completion tokens.)
            cost = (usage["prompt_tokens"] * 0.15
                    + usage["completion_tokens"] * 0.60) / 1_000_000
            result["estimated_cost"] = f"${cost:.6f}"

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        return f"Error processing invoice: {str(e)}"
|
| 49 |
+
|
| 50 |
+
# Create Gradio interface
|
| 51 |
+
# Build the Gradio UI: input column (image + button) on the left,
# JSON output on the right, followed by feature/cost reference tables.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    gr.Markdown("### 💡 **Cost Savings:** 10-50x cheaper than sending images to GPT!")

    with gr.Row():
        with gr.Column():
            # PIL image input so the handler receives a ready-to-save object.
            image_input = gr.Image(type="pil", label="Upload Invoice Image",
                                   sources=["upload", "clipboard"])
            submit_btn = gr.Button("Process Invoice (Text-Only)", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Extracted Data (JSON)", lines=20, max_lines=30)

    # Static reference sections shown below the main controls.
    gr.Markdown("### 📋 Features:")
    gr.Markdown("""
    - ✅ Image preprocessing (denoise, enhance)
    - ✅ DocTR OCR extraction
    - ✅ Smart line grouping
    - ✅ GPT-4o-mini post-processing (text only)
    - ✅ Token usage tracking
    - ✅ Cost estimation per invoice
    - ✅ Structured JSON output
    """)

    gr.Markdown("### 💰 Typical Costs:")
    gr.Markdown("""
    | Volume | Cost per Invoice | Monthly Cost |
    |--------|------------------|--------------|
    | 100/month | $0.002 | $0.20 |
    | 1,000/month | $0.002 | $2.00 |
    | 10,000/month | $0.002 | $20.00 |
    """)

    # Wire the button to the processing handler.
    submit_btn.click(fn=process_invoice_gradio, inputs=image_input, outputs=output)

# Launch the app bound to all interfaces on the standard Spaces port.
if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies for Smart OCR Pipeline
|
| 2 |
+
openai>=1.3.0
|
| 3 |
+
python-dotenv>=1.0.0
|
| 4 |
+
|
| 5 |
+
# Web framework (FastAPI for Render)
|
| 6 |
+
fastapi>=0.104.0
|
| 7 |
+
uvicorn[standard]>=0.24.0
|
| 8 |
+
python-multipart>=0.0.6
|
| 9 |
+
|
| 10 |
+
# Gradio for Hugging Face Spaces
|
| 11 |
+
gradio>=4.0.0
|
| 12 |
+
|
| 13 |
+
# Image processing
|
| 14 |
+
opencv-python>=4.8.0
|
| 15 |
+
numpy>=1.24.0
|
| 16 |
+
Pillow>=10.0.0
|
| 17 |
+
|
| 18 |
+
# OCR engines
|
| 19 |
+
python-doctr[torch]>=0.7.0
|
| 20 |
+
|
| 21 |
+
# Optional: Tesseract fallback
|
| 22 |
+
# pytesseract>=0.3.10
|
| 23 |
+
# Install Tesseract separately: https://github.com/tesseract-ocr/tesseract
|
| 24 |
+
|
| 25 |
+
# Optional: EasyOCR fallback
|
| 26 |
+
# easyocr>=1.7.0
|
smart_ocr_pipeline_textonly.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
smart_ocr_pipeline_textonly.py
|
| 5 |
+
---------------------------------
|
| 6 |
+
Same as smart_ocr_pipeline_final.py but optimized for cost efficiency:
|
| 7 |
+
- Uses DocTR for OCR text extraction.
|
| 8 |
+
- Sends only text (no image) to GPT-4o-mini.
|
| 9 |
+
- Keeps full validation, logging, and token usage tracking.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import json
|
| 15 |
+
import time
|
| 16 |
+
import base64
|
| 17 |
+
import logging
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Dict, List, Tuple
|
| 20 |
+
|
| 21 |
+
import cv2
|
| 22 |
+
import numpy as np
|
| 23 |
+
from PIL import Image
|
| 24 |
+
from doctr.io import DocumentFile
|
| 25 |
+
from doctr.models import ocr_predictor
|
| 26 |
+
from openai import OpenAI
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
from dotenv import load_dotenv
|
| 30 |
+
load_dotenv()
|
| 31 |
+
except Exception:
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
# ------------------------------------------------------------
|
| 35 |
+
# Logging setup
|
| 36 |
+
# ------------------------------------------------------------
|
| 37 |
+
def setup_logger() -> logging.Logger:
    """Return the module logger, attaching a single stdout handler on first use.

    The handler guard makes the function idempotent: repeated calls reuse
    the same logger without duplicating output lines.
    """
    lg = logging.getLogger("smart_ocr_textonly")
    lg.setLevel(logging.INFO)
    if lg.handlers:
        return lg
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
    lg.addHandler(handler)
    return lg
|
| 45 |
+
|
| 46 |
+
log = setup_logger()
|
| 47 |
+
|
| 48 |
+
# ------------------------------------------------------------
|
| 49 |
+
# Setup
|
| 50 |
+
# ------------------------------------------------------------
|
| 51 |
+
def setup_environment() -> OpenAI:
    """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

    Raises:
        ValueError: when the variable is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    log.info("OpenAI client initialized")
    return OpenAI(api_key=key)
|
| 57 |
+
|
| 58 |
+
# Lazily-created singleton predictor: loading DocTR weights is expensive,
# so it happens at most once per process.
_DOCTR_MODEL = None

def get_doctr_model():
    """Return the shared DocTR OCR predictor, loading it on first call."""
    global _DOCTR_MODEL
    if _DOCTR_MODEL is not None:
        return _DOCTR_MODEL
    started = time.time()
    _DOCTR_MODEL = ocr_predictor(pretrained=True)
    log.info(f"DocTR model loaded in {time.time() - started:.2f}s")
    return _DOCTR_MODEL
|
| 67 |
+
|
| 68 |
+
# ------------------------------------------------------------
|
| 69 |
+
# Image preprocessing
|
| 70 |
+
# ------------------------------------------------------------
|
| 71 |
+
def preprocess_image(input_path: str, output_dir: str = ".") -> str:
    """Denoise and contrast-enhance an invoice scan for OCR.

    Steps: grayscale -> edge-preserving bilateral filter -> CLAHE contrast
    equalization -> min-max normalization. The result is written to
    ``processed_invoice.png`` inside *output_dir*.

    Raises:
        ValueError: when OpenCV cannot read *input_path*.
    """
    log.info("Loading image for preprocessing...")
    source = cv2.imread(input_path)
    if source is None:
        raise ValueError(f"Cannot load image: {input_path}")

    grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
    # Bilateral filtering smooths noise while keeping character edges sharp.
    smoothed = cv2.bilateralFilter(grayscale, 9, 75, 75)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrasted = equalizer.apply(smoothed)
    stretched = cv2.normalize(contrasted, None, 0, 255, cv2.NORM_MINMAX)

    destination = os.path.join(output_dir, "processed_invoice.png")
    cv2.imwrite(destination, stretched)
    log.info(f"Processed image saved: {destination}")
    return destination
|
| 85 |
+
|
| 86 |
+
# ------------------------------------------------------------
|
| 87 |
+
# OCR extraction
|
| 88 |
+
# ------------------------------------------------------------
|
| 89 |
+
def extract_text_with_doctr(image_path: str, output_dir: str = ".") -> Tuple[str, Dict, List[str]]:
    """Run DocTR OCR on *image_path* and return grouped text plus metadata.

    Returns:
        A tuple ``(ocr_text, ocr_json, all_lines)`` where *ocr_text* is the
        newline-joined grouped text sent to the LLM, *ocr_json* holds the
        average word confidence and raw lines, and *all_lines* is the raw
        OCR line list.
    """
    log.info("Running DocTR OCR...")
    model = get_doctr_model()
    doc = DocumentFile.from_images(image_path)
    result = model(doc)

    all_lines = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                text = " ".join(w.value for w in line.words).strip()
                if text:
                    all_lines.append(text)

    # Guard the empty case: np.mean([]) returns NaN (with a RuntimeWarning),
    # and NaN is not valid JSON — default to 0.0 when no words were detected.
    confidences = [w.confidence for p in result.pages for b in p.blocks
                   for l in b.lines for w in l.words]
    avg_conf = float(np.mean(confidences)) if confidences else 0.0
    ocr_json = {"average_confidence": avg_conf, "lines": all_lines}

    # Smart grouping: pair item-description lines with the following numeric
    # (quantity/unit) line so each invoice row reaches the LLM on one line.
    grouped_lines = []
    pending_item = None
    for line in all_lines:
        # Heuristic for a "numeric" line: contains digits plus a unit token.
        if any(x.isdigit() for x in line) and any(w in line.lower() for w in ["pz", "kg", "lt"]):
            if pending_item:
                grouped_lines.append(f"{pending_item} | {line}")
                pending_item = None
            else:
                grouped_lines.append(line)
        else:
            # A non-numeric line starts (or replaces) the pending item text.
            if pending_item:
                grouped_lines.append(pending_item)
            pending_item = line
    if pending_item:
        grouped_lines.append(pending_item)

    ocr_text = "\n".join(grouped_lines)

    log.info(f"DocTR OCR complete (confidence: {avg_conf:.2f}, lines: {len(all_lines)}, grouped: {len(grouped_lines)})")
    return ocr_text, ocr_json, all_lines
|
| 129 |
+
|
| 130 |
+
# ------------------------------------------------------------
|
| 131 |
+
# GPT post-processing (text only)
|
| 132 |
+
# ------------------------------------------------------------
|
| 133 |
+
def extract_structured_data(client: OpenAI, ocr_text: str, model_name: str = "gpt-4o-mini") -> Dict:
    """Send OCR text (no image) to the LLM and parse the structured JSON reply.

    Parameters
    ----------
    client : OpenAI
        An initialized OpenAI client.
    ocr_text : str
        Grouped OCR text; truncated to 8000 characters to bound cost.
    model_name : str
        Chat model to use (defaults to the cheap gpt-4o-mini).

    Returns
    -------
    Dict
        Parsed invoice data. On a JSON parse failure the dict contains
        ``{"error": "json_parse_error", "raw": ...}``. When the API reports
        token usage it is attached under the ``"usage"`` key.
    """
    log.info(f"Processing OCR text with {model_name}...")

    # NOTE: this pipeline never sends the image, so the rules below refer
    # only to the OCR text (the previous prompt asked the model to "re-read
    # the image", which it cannot do here and which invites hallucination).
    system_message = """
You are a professional invoice/receipt parser for ChefCode.
You receive raw OCR text extracted from an invoice and must convert it into structured JSON.

Return ONLY valid JSON with this schema:
{
  "supplier": "string",
  "invoice_number": "string",
  "date": "YYYY-MM-DD or null",
  "line_items": [
    {
      "lot_number": "string",
      "item_name": "string",
      "unit": "string",
      "quantity": number,
      "unit_price": number or null,
      "line_total": number or null,
      "type": "string"
    }
  ],
  "total_amount": number or null,
  "confidence": "high | medium | low"
}
Extraction rules (critical):
- The table is horizontal: Lot → Item → Unit → Quantity → Unit Price → Line Total.
- The quantity is the number DIRECTLY AFTER the unit.
- If numbers for a line appear missing, check up to TWO lines BELOW that line in the OCR text.
- Do not ignore header words (Quantità, Prezzo, Sconto, Importo, IVA).
- Do not skip any visible row; compare OCR row count with extracted items and recover missing lines.
- Verify math: quantity × unit_price ≈ line_total (±3%). If off, re-check the digits in the OCR text.
- If two adjacent rows share identical numbers, re-check both in the OCR text; do not merge distinct items.
- Use "." as decimal separator and strip any currency symbols.
- Keep supplier and item names exactly as printed; do not translate them.
- Infer "type" (meat/vegetable/dairy/grain/condiment/beverage/grocery). If invoice language is Italian,
  output these category words in Italian (carne, verdura, latticini, cereali, condimento, bevanda, drogheria).
- Output ONLY JSON — no prose, no markdown.""".strip()

    user_message = f"Extract structured data from this OCR text:\n\n{ocr_text[:8000]}"

    resp = client.chat.completions.create(
        model=model_name,
        temperature=0.1,
        max_completion_tokens=2000,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
    )

    # Capture real token usage for downstream cost estimation.
    usage = None
    if hasattr(resp, "usage") and resp.usage:
        usage = {
            "prompt_tokens": resp.usage.prompt_tokens,
            "completion_tokens": resp.usage.completion_tokens,
            "total_tokens": resp.usage.total_tokens,
        }
        print(f"🔢 Token usage: {usage}")

    raw = resp.choices[0].message.content.strip()
    # Strip a surrounding Markdown code fence, if present, without touching
    # backticks that may legitimately appear inside the JSON payload (the
    # old blanket `.replace("```", "")` would corrupt such content).
    if raw.startswith("```"):
        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
        raw = raw.rstrip()
        if raw.endswith("```"):
            raw = raw[:-3]
        raw = raw.strip()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        log.error("Failed to parse JSON output from GPT.")
        data = {"error": "json_parse_error", "raw": raw}

    if usage:
        data["usage"] = usage

    return data
|
| 211 |
+
|
| 212 |
+
# ------------------------------------------------------------
|
| 213 |
+
# Main pipeline
|
| 214 |
+
# ------------------------------------------------------------
|
| 215 |
+
def main(invoice_path: str, output_dir: str = "."):
    """Run the full text-only pipeline on one invoice image.

    Pipeline: preprocess image -> DocTR OCR -> GPT structured extraction,
    then persist the combined result to ``smart_output_textonly.json`` in
    *output_dir* and return it.
    """
    print("\n" + "="*60)
    print("🧠 SMART OCR PIPELINE (TEXT-ONLY, gpt-4o-mini)")
    print("="*60 + "\n")

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    client = setup_environment()

    started = time.time()
    cleaned = preprocess_image(invoice_path, output_dir)
    ocr_text, ocr_json, lines = extract_text_with_doctr(cleaned, output_dir)
    structured = extract_structured_data(client, ocr_text, model_name="gpt-4o-mini")

    final_output = {
        "status": "success",
        "pipeline_version": "3.2_textonly_gpt4o-mini",
        "input_file": Path(invoice_path).name,
        "ocr_confidence": ocr_json.get("average_confidence", 0.0),
        "lines_detected": len(lines),
        "data": structured,
        "elapsed_sec": round(time.time() - started, 2),
        "usage": structured.get("usage", None),
    }

    out_path = os.path.join(output_dir, "smart_output_textonly.json")
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(final_output, handle, indent=2, ensure_ascii=False)

    log.info(f"Final output saved: {out_path}")
    log.info(f"OCR Confidence: {final_output['ocr_confidence']:.2f}")
    if final_output["usage"]:
        log.info(f"Token usage: {final_output['usage']}")
    log.info(f"Elapsed time: {final_output['elapsed_sec']}s")

    print("\nDone.\n")
    return final_output
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
    # CLI entry point: the invoice path is required, the output directory
    # is optional and defaults to the current directory.
    args = sys.argv[1:]
    if not args:
        print("Usage: python smart_ocr_pipeline_textonly.py <path/to/invoice.jpg> [output_dir]")
        sys.exit(1)
    main(args[0], args[1] if len(args) > 1 else ".")
|