GSoumyajit2005 committed on
Commit
c19ef4d
·
1 Parent(s): f74e17e

feat: Updated UI for cleaner look and ignored temp_uploads

Browse files
Files changed (2) hide show
  1. .gitignore +4 -2
  2. app.py +227 -271
.gitignore CHANGED
@@ -63,11 +63,13 @@ lightning_logs/
63
  wandb/
64
  mlruns/
65
 
66
-
67
  # Ignore all files in the models directory
68
  models/*
69
  !models/.gitkeep
70
  !models/README.md
71
 
72
  # Ignore sroie files in the data directory
73
- data/sroie/
 
 
 
 
63
  wandb/
64
  mlruns/
65
 
 
66
  # Ignore all files in the models directory
67
  models/*
68
  !models/.gitkeep
69
  !models/README.md
70
 
71
  # Ignore sroie files in the data directory
72
+ data/sroie/
73
+
74
+ # Ignore temp uploads
75
+ temp_uploads/
app.py CHANGED
@@ -2,312 +2,268 @@ import streamlit as st
2
  import os
3
  import json
4
  from datetime import datetime
 
5
  from PIL import Image
6
- import numpy as np
7
  import pandas as pd
8
- from pathlib import Path
9
-
10
- # Import our actual, working pipeline function
11
  import sys
12
- sys.path.append('src')
 
 
 
 
13
  from pipeline import process_invoice
14
 
15
- # --- Mock Functions (KEPT AS IS) ---
16
- def detect_invoice_format(ocr_text: str):
17
- """
18
- A mock function to simulate format detection.
19
- In a real system, this would analyze the text layout.
20
- """
21
- if "SDN BHD" in ocr_text:
22
- return {
23
- 'name': 'Template A (Retail)',
24
- 'confidence': 95.0,
25
- 'supported': True,
26
- 'indicators': ["Found 'SDN BHD' suffix", "Date format DD/MM/YYYY detected"]
27
- }
28
- else:
29
  return {
30
- 'name': 'Unknown Format',
31
- 'confidence': 20.0,
32
- 'supported': False,
33
- 'indicators': ["No known company suffixes found"]
34
  }
 
 
 
 
 
 
35
 
36
- def get_format_recommendations(format_info):
37
- """Mock recommendations based on the detected format."""
38
- if format_info['supported']:
39
- return ["• Extraction should be highly accurate."]
40
- else:
41
- return ["• Results may be incomplete.", "• Consider adding patterns for this format."]
42
-
43
- # --- Streamlit App (KEPT AS IS) ---
44
 
 
 
 
45
  st.set_page_config(
46
- page_title="Invoice Processor",
47
- page_icon="📄",
48
- layout="wide",
49
- initial_sidebar_state="expanded"
50
  )
51
 
52
- # Custom CSS (KEPT AS IS)
53
- st.markdown("""
54
- <style>
55
- .main-header {
56
- font-size: 3rem;
57
- color: #1f77b4;
58
- text-align: center;
59
- margin-bottom: 2rem;
60
- }
61
- .success-box {
62
- padding: 1rem;
63
- border-radius: 0.5rem;
64
- background-color: #d4edda;
65
- border: 1px solid #c3e6cb;
66
- margin: 1rem 0;
67
- }
68
- .warning-box {
69
- padding: 1rem;
70
- border-radius: 0.5rem;
71
- background-color: #fff3cd;
72
- border: 1px solid #ffeaa7;
73
- margin: 1rem 0;
74
- }
75
- .error-box {
76
- padding: 1rem;
77
- border-radius: 0.5rem;
78
- background-color: #f8d7da;
79
- border: 1px solid #f5c6cb;
80
- margin: 1rem 0;
81
- }
82
- </style>
83
- """, unsafe_allow_html=True)
84
 
85
- # Title & Sidebar (KEPT AS IS)
86
- st.markdown('<h1 class="main-header">📄 Smart Invoice Processor</h1>', unsafe_allow_html=True)
87
- st.markdown("### Extract structured data from invoices using your custom-built OCR pipeline")
88
 
 
 
 
89
  with st.sidebar:
90
  st.header("ℹ️ About")
91
- st.info("""
92
- This app uses the pipeline you built to automatically extract:
93
- - Receipt/Invoice number
94
- - Date
95
- - Customer information
96
- - Line items
97
- - Total amount
98
-
99
- **Technology Stack:**
100
- - Tesseract OCR
101
- - OpenCV
102
- - Python Regex
103
- - Streamlit
104
- """)
105
-
106
- st.header("📊 Stats")
107
- if 'processed_count' not in st.session_state:
108
- st.session_state.processed_count = 0
109
- st.metric("Invoices Processed Today", st.session_state.processed_count)
110
 
111
- st.header("⚙️ Configuration")
112
  extraction_method = st.selectbox(
113
- "Choose Extraction Method:",
114
- ('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
115
- help="ML-Based is more robust. Rule-Based is faster."
116
  )
117
 
118
- # Main content
119
- tab1, tab2, tab3 = st.tabs(["📤 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"])
 
 
 
 
 
 
 
 
 
 
120
 
 
 
 
121
  with tab1:
122
- st.header("Upload an Invoice")
123
-
124
- uploaded_file = st.file_uploader(
125
- "Choose an invoice image (JPG, PNG) or PDF",
126
- type=['jpg', 'jpeg', 'png', 'pdf'], # Added PDF support
127
- help="Upload a clear image or PDF of an invoice"
128
- )
129
-
130
- if uploaded_file is not None:
131
- col1, col2 = st.columns([1, 1])
132
-
133
- with col1:
134
- st.subheader("📸 Original Document")
135
- # Preview Logic updated for PDF support
 
 
136
  if uploaded_file.type == "application/pdf":
137
- st.info("📄 PDF Uploaded (Preview not supported directly)")
138
  else:
139
  image = Image.open(uploaded_file)
140
- st.image(image, use_container_width=True)
141
- st.caption(f"Filename: {uploaded_file.name}")
142
-
143
- with col2:
144
- st.subheader("🔄 Processing Status")
145
-
146
- if st.button("🚀 Extract Data", type="primary"):
147
- with st.spinner("Executing your custom pipeline..."):
148
- try:
149
- # Save temp file
150
- temp_dir = "temp"
151
- os.makedirs(temp_dir, exist_ok=True)
152
- temp_path = os.path.join(temp_dir, uploaded_file.name)
153
- with open(temp_path, "wb") as f:
154
- f.write(uploaded_file.getbuffer())
155
-
156
- # Call Pipeline
157
- st.write("✅ Calling `process_invoice`...")
158
- method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
159
- st.write(f"⚙️ Using **{method.upper()}** extraction method...")
160
-
161
- # ⚠️ UPDATE: Pass string path
162
- extracted_data = process_invoice(str(temp_path), method=method)
163
-
164
- st.write(" Simulating format detection...")
165
- format_info = detect_invoice_format(extracted_data.get("raw_text", ""))
166
-
167
- st.session_state.extracted_data = extracted_data
168
- st.session_state.format_info = format_info
169
- st.session_state.processed_count += 1
170
-
171
- st.success("✅ Pipeline executed successfully!")
172
-
173
- except Exception as e:
174
- st.error(f" An error occurred in the pipeline: {str(e)}")
175
-
176
- # Display results
177
- if 'extracted_data' in st.session_state:
178
- st.markdown("---")
179
- st.header("📊 Extraction Results")
180
-
181
- # --- Format Detection Section (KEPT AS IS) ---
182
- format_info = st.session_state.format_info
183
- st.subheader("📋 Detected Format (Simulated)")
184
- col1_fmt, col2_fmt = st.columns([2, 3])
185
- with col1_fmt:
186
- st.metric("Format Type", format_info['name'])
187
- st.metric("Detection Confidence", f"{format_info['confidence']:.0f}%")
188
- if format_info['supported']: st.success("✅ Fully Supported")
189
- else: st.warning("⚠️ Limited Support")
190
- with col2_fmt:
191
- st.write("**Detected Indicators:**")
192
- for indicator in format_info['indicators']: st.write(f" {indicator}")
193
- st.write("**Recommendations:**")
194
- for rec in get_format_recommendations(format_info): st.write(rec)
195
- st.markdown("---")
196
-
197
- # --- Main Results Section (UPDATED) ---
198
- data = st.session_state.extracted_data
199
-
200
- # 1. New Validation Display (Replaces old Confidence box)
201
- status = data.get('validation_status', 'unknown')
202
- if status == 'passed':
203
- st.markdown(f'<div class="success-box">✅ <strong>Validation Passed</strong>: Data meets strict quality rules (Pydantic).</div>', unsafe_allow_html=True)
204
- elif status == 'failed':
205
- err_count = len(data.get('validation_errors', []))
206
- st.markdown(f'<div class="error-box">❌ <strong>Validation Failed</strong>: Found {err_count} issues. Check JSON for details.</div>', unsafe_allow_html=True)
207
  else:
208
- st.markdown(f'<div class="warning-box">⚠️ <strong>Status Unknown</strong>: Validation logic was skipped.</div>', unsafe_allow_html=True)
209
-
210
- # 2. Key Metrics (Mapped to NEW keys)
211
- st.metric("🏢 Vendor", data.get('vendor') or "N/A")
212
-
213
- res_col1, res_col2, res_col3 = st.columns(3)
214
- res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
215
- res_col2.metric("📅 Date", data.get('date') or "N/A")
216
- # Handle total (it's now a string from the pipeline, but metric handles strings fine)
217
- total = data.get('total_amount')
218
- res_col3.metric("💵 Total Amount", f"${total}" if total else "N/A")
219
-
220
- # 3. Expanded Details
221
- with st.expander("Show More Details"):
222
- st.markdown(f"**🧾 Receipt Number:** {data.get('receipt_number') or 'N/A'}")
223
-
224
- # Handle bill_to
225
- bill_to = data.get('bill_to')
226
- if isinstance(bill_to, dict):
227
- bill_to_display = bill_to.get('name') or 'N/A'
228
- elif isinstance(bill_to, str):
229
- bill_to_display = bill_to
230
- else:
231
- bill_to_display = 'N/A'
232
- st.markdown(f"**👤 Bill To:** {bill_to_display}")
233
-
234
- st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
235
-
236
- # New: Show Duplicate Hash
237
- st.markdown(f"**🔑 Semantic Hash (Duplicate ID):** `{data.get('semantic_hash') or 'N/A'}`")
238
-
239
- # 4. Line items table
240
- if data.get('items'):
241
- st.subheader("🛒 Line Items")
242
- items_df_data = [{
243
- "Description": item.get("description", "N/A"),
244
- "Qty": item.get("quantity", "N/A"),
245
- "Unit Price": f"${item.get('unit_price', 0.0) if item.get('unit_price') is not None else 0}",
246
- "Total": f"${item.get('total', 0.0) if item.get('total') is not None else 0}"
247
- } for item in data['items']]
248
- df = pd.DataFrame(items_df_data)
249
- st.dataframe(df, use_container_width=True)
250
  else:
251
- st.info("ℹ️ No line items were extracted.")
252
-
253
- # JSON output and download
254
- with st.expander("📄 View Full JSON Output"):
 
 
 
 
 
 
 
 
 
 
 
 
255
  st.json(data)
256
-
257
- json_str = json.dumps(data, indent=2)
258
  st.download_button(
259
- label="💾 Download JSON",
260
- data=json_str,
261
- file_name=f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
262
  mime="application/json"
263
  )
264
-
265
- with st.expander("📝 View Raw OCR Text"):
266
- raw_text = data.get('raw_text', '')
267
- if raw_text:
268
- st.text(raw_text)
269
- else:
270
- st.info("No OCR text available.")
271
 
 
 
 
 
 
 
 
272
  with tab2:
273
  st.header("📚 Sample Invoices")
274
- st.write("Try the sample invoice below to see how the system performs:")
275
-
276
- sample_dir = "data/samples"
277
- if os.path.exists(sample_dir):
278
- sample_files = [f for f in os.listdir(sample_dir) if f.endswith(('.jpg', '.png', '.jpeg', '.pdf'))]
279
-
280
- if sample_files:
281
- file_path = os.path.join(sample_dir, sample_files[0])
282
- st.write(f"**Sample File:** {sample_files[0]}")
283
- if file_path.endswith('.pdf'):
284
- st.info("📄 PDF Sample available. Download and upload it to test.")
285
- else:
286
- st.image(Image.open(file_path), caption=sample_files[0], use_container_width=True)
287
  else:
288
- st.warning("No sample invoices found in `data/samples/`.")
289
  else:
290
- st.error("The `data/samples` directory was not found.")
 
291
 
 
 
 
292
  with tab3:
293
- st.header("ℹ️ How It Works (Your Custom Pipeline)")
294
- st.markdown("""
295
- This app follows the exact pipeline you built:
296
- ```
297
- 1. 📸 Input Handling
298
- Detects JPG vs PDF. Smart Loader extracts text from PDFs instantly.
299
-
300
- 2. 🧠 Hybrid Engine
301
- - Digital PDFs: Direct Text Extraction (Fast)
302
- - Images/Scans: LayoutLMv3 (ML) + Tesseract (OCR)
303
-
304
- 3. 🛡️ Validation Gate
305
- Pydantic Schema ensures data integrity (Decimal precision, Date formats).
306
-
307
- 4. 🔑 Duplicate Detection
308
- Generates a unique semantic hash based on content.
309
-
310
- 5. 📊 Output JSON
311
- Standardized, validated output ready for API response.
312
- ```
313
- """)
 
 
 
 
2
  import os
3
  import json
4
  from datetime import datetime
5
+ from pathlib import Path
6
  from PIL import Image
 
7
  import pandas as pd
 
 
 
8
  import sys
9
+
10
+ # --------------------------------------------------
11
+ # Pipeline import (PURE DATA ONLY)
12
+ # --------------------------------------------------
13
+ sys.path.append("src")
14
  from pipeline import process_invoice
15
 
16
+
17
+ # --------------------------------------------------
18
+ # Mock format detection (UI-level, safe)
19
+ # --------------------------------------------------
20
+ def detect_invoice_format(raw_text: str):
21
+ if raw_text and "SDN BHD" in raw_text:
 
 
 
 
 
 
 
 
22
  return {
23
+ "name": "Retail Invoice (MY)",
24
+ "confidence": 95,
25
+ "supported": True,
26
+ "indicators": ["Detected 'SDN BHD' suffix"]
27
  }
28
+ return {
29
+ "name": "Unknown Format",
30
+ "confidence": 20,
31
+ "supported": False,
32
+ "indicators": ["No known company suffix detected"]
33
+ }
34
 
 
 
 
 
 
 
 
 
35
 
36
+ # --------------------------------------------------
37
+ # Streamlit Page Config
38
+ # --------------------------------------------------
39
  st.set_page_config(
40
+ page_title="Smart Invoice Processor",
41
+ page_icon="🧾",
42
+ layout="wide"
 
43
  )
44
 
45
+ # --------------------------------------------------
46
+ # Header (v2 style)
47
+ # --------------------------------------------------
48
+ st.title("🧾 Smart Invoice Processor (Hybrid ML Pipeline)")
49
+ st.markdown(
50
+ "**System Status:** 🟢 Online &nbsp;&nbsp;|&nbsp;&nbsp; "
51
+ "**Model:** LayoutLMv3 + Rules &nbsp;&nbsp;|&nbsp;&nbsp; "
52
+ "**Pipeline:** OCR → ML → Validation"
53
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ st.divider()
 
 
56
 
57
+ # --------------------------------------------------
58
+ # Sidebar (v1 depth, cleaner)
59
+ # --------------------------------------------------
60
  with st.sidebar:
61
  st.header("ℹ️ About")
62
+ st.info(
63
+ "End-to-end invoice processing system that extracts structured data "
64
+ "from scanned images and PDFs using ML + rule-based validation."
65
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ st.header("⚙️ Extraction Mode")
68
  extraction_method = st.selectbox(
69
+ "Choose extraction method",
70
+ ("ML-Based (LayoutLMv3)", "Rule-Based (Regex)")
 
71
  )
72
 
73
+ st.header("📊 Stats")
74
+ if "processed_count" not in st.session_state:
75
+ st.session_state.processed_count = 0
76
+ st.metric("Invoices Processed", st.session_state.processed_count)
77
+
78
+
79
+ # --------------------------------------------------
80
+ # Tabs
81
+ # --------------------------------------------------
82
+ tab1, tab2, tab3 = st.tabs(
83
+ ["🚀 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"]
84
+ )
85
 
86
+ # ==================================================
87
+ # TAB 1 — Upload & Process (v2 layout + v1 features)
88
+ # ==================================================
89
  with tab1:
90
+ col_left, col_right = st.columns([1, 1])
91
+
92
+ # -----------------------------
93
+ # LEFT Upload + Preview
94
+ # -----------------------------
95
+ with col_left:
96
+ st.subheader("1. Upload Invoice")
97
+
98
+ uploaded_file = st.file_uploader(
99
+ "Upload JPG, PNG, or PDF",
100
+ type=["jpg", "jpeg", "png", "pdf"]
101
+ )
102
+
103
+ if uploaded_file:
104
+ st.caption(f"File: {uploaded_file.name}")
105
+
106
  if uploaded_file.type == "application/pdf":
107
+ st.info("PDF uploaded (preview not available)")
108
  else:
109
  image = Image.open(uploaded_file)
110
+
111
+ st.image(
112
+ image,
113
+ width=350,
114
+ caption="Uploaded Invoice"
115
+ )
116
+
117
+
118
+ # -----------------------------
119
+ # RIGHT Processing + Results
120
+ # -----------------------------
121
+ with col_right:
122
+ st.subheader("2. Extraction Results")
123
+
124
+ if uploaded_file and st.button("✨ Extract Data", type="primary"):
125
+ with st.spinner("Running invoice extraction pipeline..."):
126
+ try:
127
+ temp_dir = Path("temp")
128
+ temp_dir.mkdir(exist_ok=True)
129
+ temp_path = temp_dir / uploaded_file.name
130
+
131
+ with open(temp_path, "wb") as f:
132
+ f.write(uploaded_file.getbuffer())
133
+
134
+ method = "ml" if "ML" in extraction_method else "rules"
135
+ result = process_invoice(str(temp_path), method=method)
136
+
137
+ # Hard guard — prevents DeltaGenerator bugs forever
138
+ if not isinstance(result, dict):
139
+ st.error("Pipeline returned invalid data.")
140
+ st.stop()
141
+
142
+ st.session_state.data = result
143
+ st.session_state.format_info = detect_invoice_format(
144
+ result.get("raw_text", "")
145
+ )
146
+ st.session_state.processed_count += 1
147
+
148
+ st.success("Extraction Complete")
149
+
150
+ except Exception as e:
151
+ st.error(f"Pipeline error: {e}")
152
+
153
+ # -----------------------------
154
+ # Render Results
155
+ # -----------------------------
156
+ if "data" in st.session_state:
157
+ data = st.session_state.data
158
+
159
+ # Validation banner (v2 style)
160
+ status = data.get("validation_status", "unknown")
161
+ if status == "passed":
162
+ st.success(" Data Validation Passed")
163
+ elif status == "failed":
164
+ st.error("❌ Data Validation Failed")
 
 
 
 
 
 
 
 
 
 
 
 
165
  else:
166
+ st.warning("⚠️ Validation Not Performed")
167
+
168
+ # Key metrics (clean & focused)
169
+ m1, m2, m3 = st.columns(3)
170
+ m1.metric("Vendor", data.get("vendor") or "N/A")
171
+ m2.metric("Date", data.get("date") or "N/A")
172
+ total = data.get("total_amount")
173
+ m3.metric("Total Amount", f"${total}" if total else "N/A")
174
+
175
+ st.divider()
176
+
177
+ # Secondary fields
178
+ s1, s2 = st.columns(2)
179
+ s1.metric("Receipt / Invoice #", data.get("receipt_number") or "N/A")
180
+
181
+ bill_to = data.get("bill_to")
182
+ if isinstance(bill_to, dict):
183
+ bill_to = bill_to.get("name")
184
+ s2.metric("Bill To", bill_to or "N/A")
185
+
186
+ # Line items
187
+ st.subheader("🛒 Line Items")
188
+ items = data.get("items", [])
189
+ if items:
190
+ st.dataframe(pd.DataFrame(items), use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  else:
192
+ st.info("No line items extracted.")
193
+
194
+ # -----------------------------
195
+ # Advanced / Engineer View
196
+ # -----------------------------
197
+ with st.expander("🔍 Advanced Details"):
198
+ format_info = st.session_state.format_info
199
+ st.write("**Detected Format:**", format_info["name"])
200
+ st.write("**Detection Confidence:**", f"{format_info['confidence']}%")
201
+ for ind in format_info["indicators"]:
202
+ st.write(f"• {ind}")
203
+
204
+ st.markdown("---")
205
+ st.write("**Semantic Hash:**", data.get("semantic_hash", "N/A"))
206
+
207
+ with st.expander("📄 Full JSON Output"):
208
  st.json(data)
209
+
 
210
  st.download_button(
211
+ "💾 Download JSON",
212
+ json.dumps(data, indent=2),
213
+ file_name=f"invoice_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
214
  mime="application/json"
215
  )
 
 
 
 
 
 
 
216
 
217
+ with st.expander("📝 Raw OCR Text"):
218
+ st.text(data.get("raw_text", "No OCR text available"))
219
+
220
+
221
+ # ==================================================
222
+ # TAB 2 — Samples
223
+ # ==================================================
224
  with tab2:
225
  st.header("📚 Sample Invoices")
226
+
227
+ sample_dir = Path("data/samples")
228
+ if sample_dir.exists():
229
+ samples = list(sample_dir.glob("*"))
230
+ if samples:
231
+ st.image(
232
+ Image.open(samples[0]),
233
+ caption=samples[0].name,
234
+ use_container_width=True
235
+ )
 
 
 
236
  else:
237
+ st.info("No sample invoices found.")
238
  else:
239
+ st.warning("Sample directory not found.")
240
+
241
 
242
+ # ==================================================
243
+ # TAB 3 — How It Works
244
+ # ==================================================
245
  with tab3:
246
+ st.header("ℹ️ System Architecture")
247
+ st.markdown(
248
+ """
249
+ Input Handling
250
+
251
+ JPG / PNG / PDF detection
252
+
253
+ OCR & Layout Parsing
254
+
255
+ Tesseract + LayoutLMv3
256
+
257
+ Hybrid Extraction
258
+
259
+ ML predictions with rule-based fallback
260
+
261
+ Validation
262
+
263
+ Schema & consistency checks
264
+
265
+ Output
266
+
267
+ Structured JSON + UI visualization
268
+ """
269
+ )