Mariem-Daha commited on
Commit
b37fab8
·
verified ·
1 Parent(s): 7b0a2f6

Upload 6 files

Browse files
Files changed (6) hide show
  1. .env +1 -0
  2. .gitignore +56 -0
  3. README.md +82 -13
  4. app_gradio.py +107 -0
  5. requirements.txt +26 -0
  6. smart_ocr_pipeline_textonly.py +259 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a live OpenAI API key was committed in this file — revoke that key
+ # immediately at https://platform.openai.com/api-keys. Never commit .env; supply
+ # the key via the Space's secret settings (secret name: OPENAI_API_KEY).
+ OPENAI_API_KEY=__SET_VIA_SPACE_SECRETS__
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ pip-wheel-metadata/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # Virtual environments
32
+ venv/
33
+ ENV/
34
+ env/
35
+ .venv
36
+
37
+ # IDEs
38
+ .vscode/
39
+ .idea/
40
+ *.swp
41
+ *.swo
42
+ *~
43
+
44
+ # OS
45
+ .DS_Store
46
+ Thumbs.db
47
+
48
+ # Output files (optional - comment out if you want to track them)
49
+ processed_invoice.png
50
+ preview_invoice.png
51
+ ocr_result.json
52
+ ocr_lines.txt
53
+ smart_output.json
54
+
55
+ # Logs
56
+ *.log
README.md CHANGED
@@ -1,13 +1,82 @@
1
- ---
2
- title: OCR Text
3
- emoji: 🌖
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Smart OCR Pipeline Text-Only
3
+ emoji: 💰
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app_gradio.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Smart OCR Pipeline - Cost-Optimized (Text-Only)
14
+
15
+ **10-50x more cost-effective** invoice OCR system - sends only text to GPT, not images.
16
+
17
+ ## Features
18
+
19
+ - 🖼️ **Image Preprocessing**: Automatic denoising and enhancement
20
+ - 📄 **DocTR OCR**: State-of-the-art text extraction
21
+ - 🤖 **GPT-4o-mini**: AI post-processing with text only (no image)
22
+ - 💰 **Cost-Optimized**: Significantly cheaper than sending images
23
+ - 📊 **Token Tracking**: Real-time cost estimation
24
+ - ✅ **Structured Output**: Clean JSON with all invoice data
25
+
26
+ ## How It Works
27
+
28
+ 1. **Upload** an invoice image (JPG, PNG, BMP, TIFF)
29
+ 2. **Process** - The system will:
30
+ - Clean and enhance the image
31
+ - Extract text using DocTR OCR
32
+ - Send only text to GPT-4o-mini for structured extraction
33
+ - Group and format the data
34
+ 3. **Get Results** - Structured JSON with all invoice data + cost estimate
35
+
36
+ ## Cost
37
+
38
+ - **~$0.001-$0.003 per invoice**
39
+ - **10-50x cheaper** than sending images to GPT
40
+ - Perfect for: High-volume processing, clean invoices
41
+
42
+ ## Cost Comparison
43
+
44
+ | Volume | Cost per Invoice | Monthly Cost |
45
+ |--------|------------------|--------------|
46
+ | 100/month | $0.002 | $0.20 |
47
+ | 1,000/month | $0.002 | $2.00 |
48
+ | 10,000/month | $0.002 | $20.00 |
49
+
50
+ Compare to full version: 10,000 invoices would cost **$200-500/month**!
51
+
52
+ ## Configuration
53
+
54
+ This Space requires an OpenAI API key set as a secret:
55
+ - Secret name: `OPENAI_API_KEY`
56
+ - Get your key from: https://platform.openai.com/api-keys
57
+
58
+ ## When to Use This Version
59
+
60
+ ✅ **Use Text-Only when:**
61
+ - Processing high volumes (>100 invoices/day)
62
+ - Invoices are relatively clean and standard format
63
+ - Cost optimization is a priority
64
+ - Need faster processing times
65
+
66
+ ❌ **Use Full Version when:**
67
+ - Invoices are complex or poor quality
68
+ - Need absolute highest accuracy
69
+ - Processing low volumes
70
+ - Budget allows for higher costs
71
+
72
+ ## Use Cases
73
+
74
+ - High-volume invoice processing
75
+ - Receipt digitization at scale
76
+ - Bulk document processing
77
+ - Cost-conscious automation
78
+ - Startup/SMB accounting systems
79
+
80
+ ## License
81
+
82
+ MIT License
app_gradio.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from smart_ocr_pipeline_textonly import main as process_invoice
6
+
7
# Set page title and description
# Static UI copy; `title` and `description` are rendered as Markdown by the
# Blocks layout defined later in this file.
title = "💰 Smart OCR Pipeline - Cost-Optimized (Text-Only)"
description = """
**Cost-Efficient Invoice OCR**

This service uses:
- DocTR for text extraction
- GPT-4o-mini for structured data extraction (text only - no image sent)
- Smart line grouping and validation

**Cost:** ~$0.001-$0.003 per invoice (10-50x cheaper than full version!)
**Best for:** High volume processing, clean invoices
"""
20
+
21
def process_invoice_gradio(image):
    """Process an uploaded invoice image and return structured data as JSON text.

    Args:
        image: PIL image from the Gradio input, or None when nothing was uploaded.

    Returns:
        Pretty-printed JSON string of the pipeline result (with an
        ``estimated_cost`` field when token usage is reported), or a
        human-readable error/usage message.
    """
    if image is None:
        return "Please upload an image first."

    temp_path = None
    try:
        # Save uploaded image temporarily
        temp_dir = "temp_uploads"
        Path(temp_dir).mkdir(exist_ok=True)

        temp_path = os.path.join(temp_dir, "temp_invoice.jpg")
        # JPEG cannot store an alpha channel; PNG/clipboard uploads are often
        # RGBA and would make PIL raise on save — normalize to RGB first.
        image.convert("RGB").save(temp_path)

        # Process with OCR pipeline
        result = process_invoice(temp_path, temp_dir)

        # Estimate cost from real token usage. GPT-4o-mini bills input and
        # output tokens at different rates ($0.15 vs $0.60 per 1M tokens), so
        # price each bucket separately instead of a flat per-token rate.
        usage = result.get("usage")
        if usage:
            prompt_tokens = usage.get("prompt_tokens", 0)
            completion_tokens = usage.get("completion_tokens", 0)
            cost = prompt_tokens * 0.15e-6 + completion_tokens * 0.60e-6
            result["estimated_cost"] = f"${cost:.6f}"

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        return f"Error processing invoice: {str(e)}"
    finally:
        # Best-effort cleanup of the temporary upload.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
49
+
50
# Create Gradio interface
# Layout: two columns — image upload + submit button on the left, JSON output
# textbox on the right — followed by static feature/cost Markdown sections.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    gr.Markdown("### 💡 **Cost Savings:** 10-50x cheaper than sending images to GPT!")

    with gr.Row():
        with gr.Column():
            # PIL output type so process_invoice_gradio can call .save() directly.
            image_input = gr.Image(
                type="pil",
                label="Upload Invoice Image",
                sources=["upload", "clipboard"]
            )
            submit_btn = gr.Button("Process Invoice (Text-Only)", variant="primary")

        with gr.Column():
            # Raw JSON is shown as plain text rather than a gr.JSON component.
            output = gr.Textbox(
                label="Extracted Data (JSON)",
                lines=20,
                max_lines=30
            )

    # Features
    gr.Markdown("### 📋 Features:")
    gr.Markdown("""
    - ✅ Image preprocessing (denoise, enhance)
    - ✅ DocTR OCR extraction
    - ✅ Smart line grouping
    - ✅ GPT-4o-mini post-processing (text only)
    - ✅ Token usage tracking
    - ✅ Cost estimation per invoice
    - ✅ Structured JSON output
    """)

    gr.Markdown("### 💰 Typical Costs:")
    gr.Markdown("""
    | Volume | Cost per Invoice | Monthly Cost |
    |--------|------------------|--------------|
    | 100/month | $0.002 | $0.20 |
    | 1,000/month | $0.002 | $2.00 |
    | 10,000/month | $0.002 | $20.00 |
    """)

    # Event handler
    # Clicking the button runs the full OCR pipeline on the uploaded image.
    submit_btn.click(
        fn=process_invoice_gradio,
        inputs=image_input,
        outputs=output
    )
100
+
101
# Launch
if __name__ == "__main__":
    # Bind on all interfaces at the standard Hugging Face Spaces port so the
    # container's reverse proxy can reach the app; no public share link.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for Smart OCR Pipeline
2
+ openai>=1.3.0
3
+ python-dotenv>=1.0.0
4
+
5
+ # Web framework (FastAPI for Render)
6
+ fastapi>=0.104.0
7
+ uvicorn[standard]>=0.24.0
8
+ python-multipart>=0.0.6
9
+
10
+ # Gradio for Hugging Face Spaces
11
+ gradio>=4.0.0
12
+
13
+ # Image processing
14
+ opencv-python>=4.8.0
15
+ numpy>=1.24.0
16
+ Pillow>=10.0.0
17
+
18
+ # OCR engines
19
+ python-doctr[torch]>=0.7.0
20
+
21
+ # Optional: Tesseract fallback
22
+ # pytesseract>=0.3.10
23
+ # Install Tesseract separately: https://github.com/tesseract-ocr/tesseract
24
+
25
+ # Optional: EasyOCR fallback
26
+ # easyocr>=1.7.0
smart_ocr_pipeline_textonly.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ smart_ocr_pipeline_textonly.py
5
+ ---------------------------------
6
+ Same as smart_ocr_pipeline_final.py but optimized for cost efficiency:
7
+ - Uses DocTR for OCR text extraction.
8
+ - Sends only text (no image) to GPT-4o-mini.
9
+ - Keeps full validation, logging, and token usage tracking.
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import json
15
+ import time
16
+ import base64
17
+ import logging
18
+ from pathlib import Path
19
+ from typing import Dict, List, Tuple
20
+
21
+ import cv2
22
+ import numpy as np
23
+ from PIL import Image
24
+ from doctr.io import DocumentFile
25
+ from doctr.models import ocr_predictor
26
+ from openai import OpenAI
27
+
28
+ try:
29
+ from dotenv import load_dotenv
30
+ load_dotenv()
31
+ except Exception:
32
+ pass
33
+
34
# ------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------
def setup_logger() -> logging.Logger:
    """Return the shared module logger, wiring a stdout handler exactly once.

    Repeated calls yield the same logger object without stacking duplicate
    handlers (which would multiply every log line).
    """
    logger = logging.getLogger("smart_ocr_textonly")
    logger.setLevel(logging.INFO)
    if logger.handlers:
        # Already configured (e.g. module re-imported) — nothing to add.
        return logger
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
    logger.addHandler(handler)
    return logger

log = setup_logger()
47
+
48
+ # ------------------------------------------------------------
49
+ # Setup
50
+ # ------------------------------------------------------------
51
+ def setup_environment() -> OpenAI:
52
+ api_key = os.getenv("OPENAI_API_KEY")
53
+ if not api_key:
54
+ raise ValueError("OPENAI_API_KEY not found in environment variables.")
55
+ log.info("OpenAI client initialized")
56
+ return OpenAI(api_key=api_key)
57
+
58
# Process-wide cache for the DocTR predictor (loading it is expensive).
_DOCTR_MODEL = None

def get_doctr_model():
    """Return the shared DocTR OCR predictor, loading it lazily on first use."""
    global _DOCTR_MODEL
    if _DOCTR_MODEL is not None:
        return _DOCTR_MODEL
    started = time.time()
    _DOCTR_MODEL = ocr_predictor(pretrained=True)
    log.info(f"DocTR model loaded in {time.time() - started:.2f}s")
    return _DOCTR_MODEL
67
+
68
# ------------------------------------------------------------
# Image preprocessing
# ------------------------------------------------------------
def preprocess_image(input_path: str, output_dir: str = ".") -> str:
    """Clean up an invoice image for OCR and save the result as a PNG.

    Pipeline: grayscale → edge-preserving denoise → local contrast boost
    (CLAHE) → min-max intensity normalization. The output filename is fixed
    ("processed_invoice.png"), so concurrent calls sharing an output_dir
    overwrite each other — NOTE(review): consider unique names if parallel
    use is expected.

    Args:
        input_path: Path to the source invoice image.
        output_dir: Directory that receives processed_invoice.png.

    Returns:
        Path to the processed PNG.

    Raises:
        ValueError: If OpenCV cannot read ``input_path``.
    """
    log.info("Loading image for preprocessing...")
    img = cv2.imread(input_path)
    if img is None:
        raise ValueError(f"Cannot load image: {input_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Bilateral filter smooths noise while keeping character edges sharp.
    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
    # CLAHE lifts faint print locally without blowing out bright regions.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    normalized = cv2.normalize(enhanced, None, 0, 255, cv2.NORM_MINMAX)
    processed_path = os.path.join(output_dir, "processed_invoice.png")
    cv2.imwrite(processed_path, normalized)
    log.info(f"Processed image saved: {processed_path}")
    return processed_path
85
+
86
+ # ------------------------------------------------------------
87
+ # OCR extraction
88
+ # ------------------------------------------------------------
89
+ def extract_text_with_doctr(image_path: str, output_dir: str = ".") -> Tuple[str, Dict, List[str]]:
90
+ log.info("Running DocTR OCR...")
91
+ model = get_doctr_model()
92
+ doc = DocumentFile.from_images(image_path)
93
+ result = model(doc)
94
+
95
+ all_lines = []
96
+ for page in result.pages:
97
+ for block in page.blocks:
98
+ for line in block.lines:
99
+ text = " ".join(w.value for w in line.words).strip()
100
+ if text:
101
+ all_lines.append(text)
102
+
103
+ avg_conf = np.mean([w.confidence for p in result.pages for b in p.blocks for l in b.lines for w in l.words])
104
+ ocr_json = {"average_confidence": float(avg_conf), "lines": all_lines}
105
+
106
+ # Smart grouping: pair item lines with their numeric lines
107
+ grouped_lines = []
108
+ pending_item = None
109
+ for line in all_lines:
110
+ if any(x.isdigit() for x in line) and any(w in line.lower() for w in ["pz", "kg", "lt"]):
111
+ # numeric line (quantities)
112
+ if pending_item:
113
+ grouped_lines.append(f"{pending_item} | {line}")
114
+ pending_item = None
115
+ else:
116
+ grouped_lines.append(line)
117
+ else:
118
+ # item line
119
+ if pending_item:
120
+ grouped_lines.append(pending_item)
121
+ pending_item = line
122
+ if pending_item:
123
+ grouped_lines.append(pending_item)
124
+
125
+ ocr_text = "\n".join(grouped_lines)
126
+
127
+ log.info(f"DocTR OCR complete (confidence: {avg_conf:.2f}, lines: {len(all_lines)}, grouped: {len(grouped_lines)})")
128
+ return ocr_text, ocr_json, all_lines
129
+
130
+ # ------------------------------------------------------------
131
+ # GPT post-processing (text only)
132
+ # ------------------------------------------------------------
133
+ def extract_structured_data(client: OpenAI, ocr_text: str, model_name: str = "gpt-4o-mini") -> Dict:
134
+ log.info(f"Processing OCR text with {model_name}...")
135
+
136
+ system_message = """
137
+ You are a professional invoice/receipt parser for ChefCode.
138
+ You receive raw OCR text extracted from an invoice and must convert it into structured JSON.
139
+
140
+ Return ONLY valid JSON with this schema:
141
+ {
142
+ "supplier": "string",
143
+ "invoice_number": "string",
144
+ "date": "YYYY-MM-DD or null",
145
+ "line_items": [
146
+ {
147
+ "lot_number": "string",
148
+ "item_name": "string",
149
+ "unit": "string",
150
+ "quantity": number,
151
+ "unit_price": number or null,
152
+ "line_total": number or null,
153
+ "type": "string"
154
+ }
155
+ ],
156
+ "total_amount": number or null,
157
+ "confidence": "high | medium | low"
158
+ }
159
+ Extraction rules (critical):
160
+ - The table is horizontal: Lot → Item → Unit → Quantity → Unit Price → Line Total.
161
+ - The quantity is the number DIRECTLY AFTER the unit.
162
+ - If numbers for a line appear missing, check up to TWO lines BELOW that line in OCR_LINES,
163
+ - Do not ignore header words (Quantità, Prezzo, Sconto, Importo, IVA).
164
+ - Do not skip any visible row; compare OCR row count with extracted items and recover missing lines.
165
+ - Verify math: quantity × unit_price ≈ line_total (±3%). If off, re-read digits from the image.
166
+ - If two adjacent rows share identical numbers, re-check both in the image; do not merge distinct items.
167
+ - Use "." as decimal separator and strip any currency symbols.
168
+ - Keep supplier and item names exactly as printed; do not translate them.
169
+ - Infer "type" (meat/vegetable/dairy/grain/condiment/beverage/grocery). If invoice language is Italian,
170
+ output these category words in Italian (carne, verdura, latticini, cereali, condimento, bevanda, drogheria).
171
+ - Output ONLY JSON — no prose, no markdown.""".strip()
172
+
173
+ user_message = f"Extract structured data from this OCR text:\n\n{ocr_text[:8000]}"
174
+
175
+ resp = client.chat.completions.create(
176
+ model=model_name,
177
+ temperature=0.1,
178
+ max_completion_tokens=2000,
179
+ messages=[
180
+ {"role": "system", "content": system_message},
181
+ {"role": "user", "content": user_message},
182
+ ],
183
+ )
184
+
185
+ # Capture real token usage
186
+ usage = None
187
+ if hasattr(resp, "usage") and resp.usage:
188
+ usage = {
189
+ "prompt_tokens": resp.usage.prompt_tokens,
190
+ "completion_tokens": resp.usage.completion_tokens,
191
+ "total_tokens": resp.usage.total_tokens,
192
+ }
193
+ print(f"🔢 Token usage: {usage}")
194
+
195
+ raw = resp.choices[0].message.content.strip()
196
+ if raw.startswith("```json"):
197
+ raw = raw.replace("```json", "").replace("```", "").strip()
198
+ elif raw.startswith("```"):
199
+ raw = raw.replace("```", "").strip()
200
+
201
+ try:
202
+ data = json.loads(raw)
203
+ except json.JSONDecodeError:
204
+ log.error("Failed to parse JSON output from GPT.")
205
+ data = {"error": "json_parse_error", "raw": raw}
206
+
207
+ if usage:
208
+ data["usage"] = usage
209
+
210
+ return data
211
+
212
# ------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------
def main(invoice_path: str, output_dir: str = "."):
    """Run the full text-only OCR pipeline on one invoice image.

    Steps: preprocess the image, OCR it with DocTR, post-process the text
    with GPT-4o-mini, then write ``smart_output_textonly.json`` into
    ``output_dir`` and return the same payload.

    Args:
        invoice_path: Path to the invoice image file.
        output_dir: Directory for intermediate and final artifacts.

    Returns:
        dict with status, OCR confidence, line count, structured data,
        elapsed time, and token usage.
    """
    print("\n" + "="*60)
    print("🧠 SMART OCR PIPELINE (TEXT-ONLY, gpt-4o-mini)")
    print("="*60 + "\n")

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    client = setup_environment()  # fails fast when OPENAI_API_KEY is missing

    t0 = time.time()
    processed_path = preprocess_image(invoice_path, output_dir)
    ocr_text, ocr_json, lines = extract_text_with_doctr(processed_path, output_dir)
    structured = extract_structured_data(client, ocr_text, model_name="gpt-4o-mini")

    # NOTE(review): "usage" is duplicated — inside "data" and at the top
    # level; the Gradio front end reads the top-level copy.
    final_output = {
        "status": "success",
        "pipeline_version": "3.2_textonly_gpt4o-mini",
        "input_file": Path(invoice_path).name,
        "ocr_confidence": ocr_json.get("average_confidence", 0.0),
        "lines_detected": len(lines),
        "data": structured,
        "elapsed_sec": round(time.time() - t0, 2),
        "usage": structured.get("usage", None),
    }

    out_path = os.path.join(output_dir, "smart_output_textonly.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    log.info(f"Final output saved: {out_path}")
    log.info(f"OCR Confidence: {final_output['ocr_confidence']:.2f}")
    if final_output["usage"]:
        log.info(f"Token usage: {final_output['usage']}")
    log.info(f"Elapsed time: {final_output['elapsed_sec']}s")

    print("\nDone.\n")
    return final_output
251
+
252
if __name__ == "__main__":
    # CLI entry point: invoice path is required, output directory optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python smart_ocr_pipeline_textonly.py <path/to/invoice.jpg> [output_dir]")
        sys.exit(1)
    main(cli_args[0], cli_args[1] if len(cli_args) > 1 else ".")