GSoumyajit2005 committed on
Commit
90dbe20
·
1 Parent(s): 8f86a3c

feat: added bulk processing, html reporting, and geometric table extraction

Browse files
Files changed (5) hide show
  1. README.md +5 -4
  2. app.py +150 -101
  3. src/ml_extraction.py +8 -2
  4. src/report_generator.py +298 -0
  5. src/table_extraction.py +144 -0
README.md CHANGED
@@ -46,12 +46,14 @@ A production-grade Hybrid Invoice Extraction System that combines the semantic u
46
  - **Defensive Data Handling:** Implemented coordinate clamping to prevent model crashes from negative OCR bounding boxes.
47
  - **GPU-Accelerated OCR:** DocTR (Mindee) with automatic CUDA acceleration for faster inference in production.
48
  - **Clean JSON Output:** Normalized schema handling nested entities, line items, and validation flags.
49
- - **Defensive Persistence:** Optional PostgreSQL integration that automatically saves extracted data when credentials are present, but gracefully degrades (skips saving) in serverless/demo environments like Hugging Face Spaces.
50
- - **Duplicate Prevention:** Implemented *Semantic Hashing* (Vendor + Date + Total + ID) to automatically detect and prevent duplicate invoice entries.
 
51
 
52
  ### 💻 Usability
53
 
54
  - **Streamlit Web UI:** Interactive dashboard for real-time inference, visualization, and side-by-side comparison (ML vs. Regex).
 
55
  - **CLI & Batch Processing:** Process single files or entire directories via command line with JSON export.
56
  - **Auto-Validation:** Heuristic checks to validate that the extracted "Total Amount" matches the sum of line items.
57
 
@@ -236,7 +238,6 @@ docker-compose up -d
236
 
237
  The application will automatically detect the database and start saving invoices.
238
 
239
-
240
  ## 💻 Usage
241
 
242
  ### Web Interface (Recommended)
@@ -428,7 +429,7 @@ in significantly higher latency due to the heavy OCR and layout-aware models.
428
  - [ ] (Optional) Add FATURA (table-focused) for line-item extraction
429
  - [ ] Sliding-window chunking for >512 token documents (to avoid truncation)
430
  - [ ] Table detection (Camelot/Tabula/DeepDeSRT) for line items
431
- - [ ] PDF support (pdf2image) for multipage invoices
432
  - [x] FastAPI backend + Docker
433
  - [x] CI/CD pipeline (GitHub Actions → HuggingFace Spaces auto-deploy)
434
  - [ ] Multilingual OCR (PaddleOCR) and multilingual fine‑tuning
 
46
  - **Defensive Data Handling:** Implemented coordinate clamping to prevent model crashes from negative OCR bounding boxes.
47
  - **GPU-Accelerated OCR:** DocTR (Mindee) with automatic CUDA acceleration for faster inference in production.
48
  - **Clean JSON Output:** Normalized schema handling nested entities, line items, and validation flags.
49
+ - **Defensive Persistence:** Optional PostgreSQL integration (local Docker or cloud Supabase) that automatically saves extracted data when credentials are present, but gracefully degrades (skips saving) in serverless/demo environments.
50
+ - **Async Database Saves:** Background thread processing ensures fast UI response (~5-7s) while database operations happen asynchronously.
51
+ - **Duplicate Prevention:** Implemented _Semantic Hashing_ (Vendor + Date + Total + ID) to automatically detect and prevent duplicate invoice entries.
52
 
53
  ### 💻 Usability
54
 
55
  - **Streamlit Web UI:** Interactive dashboard for real-time inference, visualization, and side-by-side comparison (ML vs. Regex).
56
+ - **PDF Preview & Overlay:** Visual preview of uploaded PDFs with ML-detected bounding boxes overlay for transparency.
57
  - **CLI & Batch Processing:** Process single files or entire directories via command line with JSON export.
58
  - **Auto-Validation:** Heuristic checks to validate that the extracted "Total Amount" matches the sum of line items.
59
 
 
238
 
239
  The application will automatically detect the database and start saving invoices.
240
 
 
241
  ## 💻 Usage
242
 
243
  ### Web Interface (Recommended)
 
429
  - [ ] (Optional) Add FATURA (table-focused) for line-item extraction
430
  - [ ] Sliding-window chunking for >512 token documents (to avoid truncation)
431
  - [ ] Table detection (Camelot/Tabula/DeepDeSRT) for line items
432
+ - [x] PDF support (pdf2image) for multipage invoices
433
  - [x] FastAPI backend + Docker
434
  - [x] CI/CD pipeline (GitHub Actions → HuggingFace Spaces auto-deploy)
435
  - [ ] Multilingual OCR (PaddleOCR) and multilingual fine‑tuning
app.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
6
  from PIL import Image, ImageDraw
7
  import pandas as pd
8
  import sys
 
9
 
10
  # PDF to image conversion
11
  try:
@@ -126,19 +127,75 @@ with tab1:
126
  with col_left:
127
  st.subheader("1. Upload Invoice")
128
 
129
- uploaded_file = st.file_uploader(
130
- "Upload JPG, PNG, or PDF",
131
- type=["jpg", "jpeg", "png", "pdf"]
 
 
132
  )
133
 
134
- if uploaded_file:
135
- st.caption(f"File: {uploaded_file.name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  # Handle PDF preview
138
- if uploaded_file.type == "application/pdf":
139
  if PDF_SUPPORT:
140
- pdf_bytes = uploaded_file.read()
141
- uploaded_file.seek(0) # Reset for later processing
142
  pages = convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
143
  if pages:
144
  pdf_preview_image = pages[0]
@@ -147,7 +204,8 @@ with tab1:
147
  else:
148
  st.warning("PDF preview requires pdf2image. Install with: `pip install pdf2image`")
149
  else:
150
- image = Image.open(uploaded_file)
 
151
  st.image(image, width=250, caption="Uploaded Invoice")
152
 
153
 
@@ -157,101 +215,84 @@ with tab1:
157
  with col_right:
158
  st.subheader("2. Extraction Results")
159
 
160
- if uploaded_file and st.button("✨ Extract Data", type="primary"):
161
- with st.spinner("Running invoice extraction pipeline..."):
162
- try:
163
- temp_dir = Path("temp")
164
- temp_dir.mkdir(exist_ok=True)
165
- temp_path = temp_dir / uploaded_file.name
166
-
167
- with open(temp_path, "wb") as f:
168
- f.write(uploaded_file.getbuffer())
169
-
170
- method = "ml" if "ML" in extraction_method else "rules"
171
-
172
- # CALL PIPELINE
173
- result = process_invoice(str(temp_path), method=method)
174
-
175
- # --- SMART STATUS NOTIFICATIONS ---
176
- db_status = result.get('_db_status', 'disabled')
177
-
178
- if db_status == 'saved':
179
- st.success("✅ Extraction & Storage Complete")
180
- st.toast("Invoice saved to Database!", icon="💾")
181
-
182
- elif db_status == 'queued':
183
- st.success("✅ Extraction Complete")
184
- st.toast("Saving to database...", icon="💾")
185
-
186
- elif db_status == 'duplicate':
187
- st.success("✅ Extraction Complete")
188
- st.toast("Duplicate invoice (already in database)", icon="⚠️")
189
 
190
- elif db_status == 'disabled':
191
- st.success("✅ Extraction Complete")
192
- # Only show "Demo Mode" toast once per session
193
- if not st.session_state.get('_db_warning_shown', False):
194
- st.toast("Database disabled (Demo Mode)", icon="ℹ️")
195
- st.session_state['_db_warning_shown'] = True
196
-
197
- else:
198
- st.success("✅ Extraction Complete")
199
 
200
- # Hard guard prevents DeltaGenerator bugs
201
- if not isinstance(result, dict):
202
- st.error("Pipeline returned invalid data.")
203
- st.stop()
204
 
205
- # Remove the metadata field so it doesn't show up in the JSON view
206
- if '_db_status' in result:
207
- del result['_db_status']
208
-
209
- st.session_state.data = result
210
- st.session_state.format_info = detect_invoice_format(
211
- result.get("raw_text", "")
212
- )
213
- st.session_state.processed_count += 1
214
-
215
- # --- AI Detection Overlay Visualization ---
216
- raw_predictions = result.get("raw_predictions")
217
- if raw_predictions:
218
- # Get the base image for annotation
219
- if uploaded_file.type == "application/pdf":
220
- # Use the converted PDF preview image
221
- if "pdf_preview" in st.session_state:
222
- overlay_image = st.session_state.pdf_preview.copy().convert("RGB")
223
- else:
224
- overlay_image = None
225
  else:
226
- # Reload the original image for annotation
227
- uploaded_file.seek(0)
228
- overlay_image = Image.open(uploaded_file).convert("RGB")
229
-
230
- if overlay_image:
231
- draw = ImageDraw.Draw(overlay_image)
232
-
233
- # Draw red rectangles around each detected entity's bounding boxes
234
- for entity_name, entity_data in raw_predictions.items():
235
- bboxes = entity_data.get("bbox", [])
236
- for box in bboxes:
237
- # bbox format: [x, y, width, height]
238
- x, y, w, h = box
239
- draw.rectangle(
240
- [x, y, x + w, y + h],
241
- outline="red",
242
- width=2
243
- )
244
-
245
- overlay_image.thumbnail((800, 800))
246
-
247
- st.image(
248
- overlay_image,
249
- caption="AI Detection Overlay",
250
- width="content"
251
- )
252
-
253
- except Exception as e:
254
- st.error(f"Pipeline error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  # -----------------------------
257
  # Render Results
@@ -290,7 +331,7 @@ with tab1:
290
  st.subheader("🛒 Line Items")
291
  items = data.get("items", [])
292
  if items:
293
- st.dataframe(pd.DataFrame(items), use_container_width=True)
294
  else:
295
  st.info("No line items extracted.")
296
 
@@ -317,6 +358,14 @@ with tab1:
317
  mime="application/json"
318
  )
319
 
 
 
 
 
 
 
 
 
320
  with st.expander("📝 Raw OCR Text"):
321
  st.text(data.get("raw_text", "No OCR text available"))
322
 
 
6
  from PIL import Image, ImageDraw
7
  import pandas as pd
8
  import sys
9
+ from src.report_generator import generate_bulk_html_report
10
 
11
  # PDF to image conversion
12
  try:
 
127
  with col_left:
128
  st.subheader("1. Upload Invoice")
129
 
130
+ # 1. Allow Multiple Files
131
+ uploaded_files = st.file_uploader(
132
+ "Upload Invoices (Bulk Supported)",
133
+ type=["jpg", "jpeg", "png", "pdf"],
134
+ accept_multiple_files=True
135
  )
136
 
137
+ if "bulk_results" not in st.session_state:
138
+ st.session_state.bulk_results = None
139
+
140
+ if uploaded_files and st.button("✨ Process All Files", type="primary"):
141
+ all_results = []
142
+ progress_bar = st.progress(0)
143
+ status_text = st.empty()
144
+
145
+ with st.spinner(f"Processing {len(uploaded_files)} documents..."):
146
+ temp_dir = Path("temp")
147
+ temp_dir.mkdir(exist_ok=True)
148
+
149
+ for i, uploaded_file in enumerate(uploaded_files):
150
+ status_text.text(f"Processing file {i+1}/{len(uploaded_files)}: {uploaded_file.name}")
151
+ # Save temp file
152
+ temp_path = temp_dir / uploaded_file.name
153
+ with open(temp_path, "wb") as f:
154
+ f.write(uploaded_file.getbuffer())
155
+
156
+ # Run Pipeline
157
+ try:
158
+ # Use 'ml' method as per the requirement
159
+ result = process_invoice(str(temp_path), method='ml')
160
+ all_results.append(result)
161
+ except Exception as e:
162
+ st.error(f"Error processing {uploaded_file.name}: {e}")
163
+
164
+ # Update Progress
165
+ progress_bar.progress((i + 1) / len(uploaded_files))
166
+
167
+ st.success("✅ Bulk Processing Complete!")
168
+ st.session_state.bulk_results = all_results
169
+
170
+ if st.session_state.bulk_results:
171
+ # Generate Report
172
+ html_report = generate_bulk_html_report(st.session_state.bulk_results)
173
+
174
+ # Download Button for the HTML
175
+ st.download_button(
176
+ label="📥 Download Bulk HTML Report",
177
+ data=html_report,
178
+ file_name="bulk_invoice_report.html",
179
+ mime="text/html"
180
+ )
181
+
182
+ # Display Summary Table in UI
183
+ st.subheader("Summary")
184
+ df = pd.DataFrame(st.session_state.bulk_results)
185
+ if not df.empty:
186
+ # Select clean columns for display
187
+ cols = [c for c in ["vendor", "date", "total_amount", "validation_status"] if c in df.columns]
188
+ st.dataframe(df[cols], width='stretch')
189
+ # Preview first file (if any files selected)
190
+ if uploaded_files:
191
+ first_file = uploaded_files[0]
192
+ st.caption(f"Preview: {first_file.name}" + (f" (+{len(uploaded_files)-1} more)" if len(uploaded_files) > 1 else ""))
193
 
194
  # Handle PDF preview
195
+ if first_file.type == "application/pdf":
196
  if PDF_SUPPORT:
197
+ pdf_bytes = first_file.read()
198
+ first_file.seek(0) # Reset for later processing
199
  pages = convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
200
  if pages:
201
  pdf_preview_image = pages[0]
 
204
  else:
205
  st.warning("PDF preview requires pdf2image. Install with: `pip install pdf2image`")
206
  else:
207
+ image = Image.open(first_file)
208
+ first_file.seek(0) # Reset for later processing
209
  st.image(image, width=250, caption="Uploaded Invoice")
210
 
211
 
 
215
  with col_right:
216
  st.subheader("2. Extraction Results")
217
 
218
+ # Single-file extraction (original functionality)
219
+ # Works when exactly 1 file is uploaded
220
+ if uploaded_files and len(uploaded_files) == 1:
221
+ single_file = uploaded_files[0]
222
+ if st.button("✨ Extract Data", type="primary"):
223
+ with st.spinner("Running invoice extraction pipeline..."):
224
+ try:
225
+ temp_dir = Path("temp")
226
+ temp_dir.mkdir(exist_ok=True)
227
+ temp_path = temp_dir / single_file.name
228
+
229
+ with open(temp_path, "wb") as f:
230
+ f.write(single_file.getbuffer())
231
+
232
+ method = "ml" if "ML" in extraction_method else "rules"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
+ # CALL PIPELINE
235
+ result = process_invoice(str(temp_path), method=method)
 
 
 
 
 
 
 
236
 
237
+ # --- SMART STATUS NOTIFICATIONS ---
238
+ db_status = result.get('_db_status', 'disabled')
 
 
239
 
240
+ if db_status == 'saved':
241
+ st.success("✅ Extraction & Storage Complete")
242
+ st.toast("Invoice saved to Database!", icon="💾")
243
+ elif db_status == 'queued':
244
+ st.success("✅ Extraction Complete")
245
+ st.toast("Saving to database...", icon="💾")
246
+ elif db_status == 'duplicate':
247
+ st.success("✅ Extraction Complete")
248
+ st.toast("Duplicate invoice (already in database)", icon="⚠️")
249
+ elif db_status == 'disabled':
250
+ st.success("✅ Extraction Complete")
251
+ if not st.session_state.get('_db_warning_shown', False):
252
+ st.toast("Database disabled (Demo Mode)", icon="ℹ️")
253
+ st.session_state['_db_warning_shown'] = True
 
 
 
 
 
 
254
  else:
255
+ st.success("✅ Extraction Complete")
256
+
257
+ # Hard guard
258
+ if not isinstance(result, dict):
259
+ st.error("Pipeline returned invalid data.")
260
+ st.stop()
261
+
262
+ if '_db_status' in result:
263
+ del result['_db_status']
264
+
265
+ st.session_state.data = result
266
+ st.session_state.format_info = detect_invoice_format(
267
+ result.get("raw_text", "")
268
+ )
269
+ st.session_state.processed_count += 1
270
+
271
+ # --- AI Detection Overlay Visualization ---
272
+ raw_predictions = result.get("raw_predictions")
273
+ if raw_predictions:
274
+ if single_file.type == "application/pdf":
275
+ if "pdf_preview" in st.session_state:
276
+ overlay_image = st.session_state.pdf_preview.copy().convert("RGB")
277
+ else:
278
+ overlay_image = None
279
+ else:
280
+ single_file.seek(0)
281
+ overlay_image = Image.open(single_file).convert("RGB")
282
+
283
+ if overlay_image:
284
+ draw = ImageDraw.Draw(overlay_image)
285
+ for entity_name, entity_data in raw_predictions.items():
286
+ bboxes = entity_data.get("bbox", [])
287
+ for box in bboxes:
288
+ x, y, w, h = box
289
+ draw.rectangle([x, y, x + w, y + h], outline="red", width=2)
290
+
291
+ overlay_image.thumbnail((800, 800))
292
+ st.image(overlay_image, caption="AI Detection Overlay", width="content")
293
+
294
+ except Exception as e:
295
+ st.error(f"Pipeline error: {e}")
296
 
297
  # -----------------------------
298
  # Render Results
 
331
  st.subheader("🛒 Line Items")
332
  items = data.get("items", [])
333
  if items:
334
+ st.dataframe(pd.DataFrame(items), width='stretch')
335
  else:
336
  st.info("No line items extracted.")
337
 
 
358
  mime="application/json"
359
  )
360
 
361
+ html_report = generate_bulk_html_report([data])
362
+ st.download_button(
363
+ "📥 Download HTML Report",
364
+ html_report,
365
+ file_name=f"invoice_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html",
366
+ mime="text/html"
367
+ )
368
+
369
  with st.expander("📝 Raw OCR Text"):
370
  st.text(data.get("raw_text", "No OCR text available"))
371
 
src/ml_extraction.py CHANGED
@@ -9,6 +9,7 @@ from typing import List, Dict, Any, Tuple
9
  import re
10
  import numpy as np
11
  from src.extraction import extract_invoice_number, extract_total, extract_address
 
12
  from doctr.io import DocumentFile
13
  from doctr.models import ocr_predictor
14
 
@@ -155,7 +156,6 @@ def _process_predictions(words, unnormalized_boxes, encoding, predictions, id2la
155
 
156
  return entities
157
 
158
-
159
  def extract_ml_based(image_path: str) -> Dict[str, Any]:
160
  if not MODEL or not PROCESSOR:
161
  raise RuntimeError("ML model is not loaded.")
@@ -176,7 +176,6 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
176
  # Reconstructs lines so regex can work line-by-line
177
  lines = []
178
  current_line = []
179
-
180
  if len(unnormalized_boxes) > 0:
181
  # Initialize with first word's Y and Height
182
  current_y = unnormalized_boxes[0][1]
@@ -330,4 +329,11 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
330
  "bbox": [found_box]
331
  }
332
 
 
 
 
 
 
 
 
333
  return final_output
 
9
  import re
10
  import numpy as np
11
  from src.extraction import extract_invoice_number, extract_total, extract_address
12
+ from src.table_extraction import extract_table_items
13
  from doctr.io import DocumentFile
14
  from doctr.models import ocr_predictor
15
 
 
156
 
157
  return entities
158
 
 
159
  def extract_ml_based(image_path: str) -> Dict[str, Any]:
160
  if not MODEL or not PROCESSOR:
161
  raise RuntimeError("ML model is not loaded.")
 
176
  # Reconstructs lines so regex can work line-by-line
177
  lines = []
178
  current_line = []
 
179
  if len(unnormalized_boxes) > 0:
180
  # Initialize with first word's Y and Height
181
  current_y = unnormalized_boxes[0][1]
 
329
  "bbox": [found_box]
330
  }
331
 
332
+ # --- TABLE EXTRACTION (Geometric Heuristic) ---
333
+ # Use the geometric fallback to extract line items from table region
334
+ if words and unnormalized_boxes:
335
+ extracted_items = extract_table_items(words, unnormalized_boxes)
336
+ if extracted_items:
337
+ final_output["items"] = extracted_items
338
+
339
  return final_output
src/report_generator.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/report_generator.py
2
+
3
+ import os
4
+ from datetime import datetime
5
+
6
# Stylesheet for the report. Kept as a plain (non f-) string so CSS braces do
# not need to be doubled.
_REPORT_CSS = """
        * { box-sizing: border-box; margin: 0; padding: 0; }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            background: linear-gradient(135deg, #f5f7fa 0%, #e4e8ec 100%);
            min-height: 100vh;
            padding: 40px 20px;
            color: #333;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
        }

        /* Header */
        .report-header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px 40px;
            border-radius: 16px;
            margin-bottom: 30px;
            box-shadow: 0 10px 40px rgba(102, 126, 234, 0.3);
        }

        .report-header h1 {
            font-size: 2rem;
            font-weight: 700;
            margin-bottom: 8px;
        }

        .report-header .subtitle {
            opacity: 0.9;
            font-size: 0.95rem;
        }

        /* Stats Cards */
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }

        .stat-card {
            background: white;
            padding: 24px;
            border-radius: 12px;
            box-shadow: 0 4px 15px rgba(0,0,0,0.08);
            text-align: center;
        }

        .stat-card .stat-value {
            font-size: 2rem;
            font-weight: 700;
            color: #667eea;
        }

        .stat-card .stat-label {
            font-size: 0.85rem;
            color: #666;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            margin-top: 4px;
        }

        /* Table */
        .table-wrapper {
            background: white;
            border-radius: 16px;
            overflow: hidden;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
        }

        table {
            width: 100%;
            border-collapse: collapse;
        }

        thead th {
            background: #2d3748;
            color: white;
            padding: 16px 12px;
            text-align: left;
            font-weight: 600;
            font-size: 0.85rem;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }

        tbody td {
            padding: 16px 12px;
            border-bottom: 1px solid #e2e8f0;
            vertical-align: top;
        }

        tbody tr:nth-child(even) {
            background: #f8fafc;
        }

        tbody tr:hover {
            background: #edf2f7;
        }

        .row-num {
            color: #a0aec0;
            font-weight: 600;
            width: 50px;
        }

        .vendor-cell {
            font-weight: 600;
            color: #2d3748;
        }

        .total-cell {
            font-weight: 700;
            color: #38a169;
            font-size: 1.05rem;
        }

        /* Item List */
        .item-list {
            list-style: none;
            padding: 0;
            margin: 0;
            font-size: 0.85rem;
        }

        .item-list li {
            padding: 4px 0;
            color: #4a5568;
            border-bottom: 1px dashed #e2e8f0;
        }

        .item-list li:last-child {
            border-bottom: none;
        }

        .item-list .item-price {
            float: right;
            color: #667eea;
            font-weight: 600;
        }

        .item-list .no-items {
            color: #a0aec0;
            font-style: italic;
        }

        /* Badges */
        .badge {
            display: inline-block;
            padding: 6px 12px;
            border-radius: 20px;
            font-size: 0.75rem;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }

        .badge-passed {
            background: linear-gradient(135deg, #48bb78, #38a169);
            color: white;
        }

        .badge-failed {
            background: linear-gradient(135deg, #fc8181, #e53e3e);
            color: white;
        }

        .badge-unknown {
            background: #e2e8f0;
            color: #4a5568;
        }

        /* Footer */
        .report-footer {
            text-align: center;
            margin-top: 40px;
            color: #718096;
            font-size: 0.85rem;
        }

        @media print {
            body { background: white; padding: 0; }
            .report-header { box-shadow: none; }
            .table-wrapper { box-shadow: none; }
        }
"""


def _as_float(value):
    """Return *value* converted to float, or None when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _render_items(items) -> str:
    """Render one invoice's line items as a sequence of ``<li>`` elements."""
    from html import escape

    parts = []
    for item in items or []:  # tolerate a missing or None "items" entry
        if not isinstance(item, dict):
            continue
        description = escape(str(item.get("description", "Item")))
        amount = _as_float(item.get("total", 0))
        if amount is None:
            # Non-numeric total: show the description without a price.
            parts.append(f"<li>{description}</li>")
        else:
            parts.append(
                f"<li>{description} <span class='item-price'>${amount:.2f}</span></li>"
            )
    return "".join(parts) or "<li class='no-items'>No items detected</li>"


def _render_row(idx: int, res: dict) -> str:
    """Render a single invoice result as one ``<tr>`` of the report table."""
    from html import escape

    total_amt = res.get("total_amount")
    if not total_amt:
        total_display = "N/A"
    else:
        amount = _as_float(total_amt)
        # Fall back to the raw (escaped) text when the total is not numeric.
        total_display = escape(str(total_amt)) if amount is None else f"${amount:,.2f}"

    status = str(res.get("validation_status") or "unknown")
    vendor = escape(str(res.get("vendor") or "Unknown Vendor"))
    date = escape(str(res.get("date") or "N/A"))
    receipt_number = escape(str(res.get("receipt_number") or "N/A"))
    status_class = escape(status)
    status_label = escape(status.title())
    items_list = _render_items(res.get("items"))

    return f"""
                    <tr class="invoice-row">
                        <td class="row-num">{idx}</td>
                        <td class="vendor-cell">{vendor}</td>
                        <td>{date}</td>
                        <td>{receipt_number}</td>
                        <td class="total-cell">{total_display}</td>
                        <td><ul class="item-list">{items_list}</ul></td>
                        <td><span class="badge badge-{status_class}">{status_label}</span></td>
                    </tr>
"""


def generate_bulk_html_report(results: list, output_path: str = "bulk_report.html"):
    """
    Create a single HTML report summarizing multiple invoices.

    All extracted text (vendor, date, receipt number, item descriptions,
    status) is HTML-escaped before being embedded, because it originates from
    OCR of uploaded documents and must not be able to inject markup.

    Args:
        results: List of extraction result dicts. The keys read are
            ``vendor``, ``date``, ``receipt_number``, ``total_amount``,
            ``validation_status`` and ``items`` (a list of dicts with
            ``description`` and ``total``). Missing keys are tolerated.
        output_path: Currently unused; nothing is written to disk. Kept so
            existing callers that pass it keep working.

    Returns:
        The complete HTML document as a string.
    """
    total_invoices = len(results)
    # Non-numeric totals count as 0 instead of aborting the whole report.
    total_value = sum(
        _as_float(r.get("total_amount") or 0) or 0.0 for r in results
    )
    passed_count = sum(1 for r in results if r.get("validation_status") == "passed")

    rows_html = "".join(_render_row(idx, res) for idx, res in enumerate(results, 1))

    # Take a single timestamp so the title and subtitle cannot disagree.
    now = datetime.now()
    title_date = now.strftime("%Y-%m-%d")
    generated_at = now.strftime("%B %d, %Y at %I:%M %p")

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Bulk Invoice Report - {title_date}</title>
    <style>{_REPORT_CSS}    </style>
</head>
<body>
    <div class="container">
        <header class="report-header">
            <h1>🧾 Bulk Invoice Extraction Report</h1>
            <p class="subtitle">Generated on {generated_at}</p>
        </header>

        <div class="stats-grid">
            <div class="stat-card">
                <div class="stat-value">{total_invoices}</div>
                <div class="stat-label">Total Invoices</div>
            </div>
            <div class="stat-card">
                <div class="stat-value">${total_value:,.2f}</div>
                <div class="stat-label">Total Value</div>
            </div>
            <div class="stat-card">
                <div class="stat-value">{passed_count}/{total_invoices}</div>
                <div class="stat-label">Validation Passed</div>
            </div>
        </div>

        <div class="table-wrapper">
            <table>
                <thead>
                    <tr>
                        <th>#</th>
                        <th>Vendor</th>
                        <th>Date</th>
                        <th>Invoice #</th>
                        <th>Total</th>
                        <th>Line Items</th>
                        <th>Status</th>
                    </tr>
                </thead>
                <tbody>
                    {rows_html}
                </tbody>
            </table>
        </div>

        <footer class="report-footer">
            <p>Generated by Smart Invoice Processor • Powered by LayoutLMv3 + DocTR</p>
        </footer>
    </div>
</body>
</html>"""

    return html_content
src/table_extraction.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/table_extraction.py
2
+
3
+ from typing import List, Dict, Any
4
+ import re
5
+
6
# Common phrases that indicate NON-item text (should be filtered out)
EXCLUDE_PHRASES = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice"
]

# Words marking the table's header row and the start of the totals section.
_HEADER_KEYWORDS = frozenset([
    "description", "item", "particulars", "qty", "quantity",
    "price", "amount", "rate", "uom", "unit",
])
_FOOTER_KEYWORDS = frozenset([
    "total", "subtotal", "tax", "grand total", "payment",
    "cash", "change", "gst summary", "tax summary",
])

# Matches 0.90, 12.50, 1,234.56 and also unseparated amounts such as 1234.56.
# The comma-grouped alternative comes first so "1,234.56" is one token.
_NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?")


def extract_table_items(
    words: List[str],
    boxes: List[List[int]],
    *,
    row_tolerance: float = 15,
) -> List[Dict[str, Any]]:
    """
    Geometric heuristic to extract line-item rows from OCR words.

    Logic:
    1. Find the header Y-position (lowest bottom edge of words like
       'Description', 'Item', 'Qty').
    2. Find the footer Y-position (topmost 'Total'-like word below the header).
    3. Keep the words lying strictly between header and footer.
    4. Group those words into rows by similar Y-coordinate.
    5. Turn each row into an item, using the last number in the row as its price.

    Args:
        words: OCR tokens.
        boxes: One box per word, read as ``[x, y, width, height]``.
        row_tolerance: Maximum Y distance from the first word of a row for a
            word to join that row. Defaults to 15, the previous fixed value.

    Returns:
        A list of dicts with keys ``description``, ``quantity`` (always 1),
        ``unit_price`` and ``total`` (both set to the detected price). Rows
        without a positive number, rows matching ``EXCLUDE_PHRASES``, and rows
        whose description is shorter than 2 characters are dropped.
    """
    if not words or not boxes:
        return []

    # 1. Identify anchor points.
    header_y = 0
    for word, box in zip(words, boxes):
        if word.lower() in _HEADER_KEYWORDS:
            header_y = max(header_y, box[1] + box[3])

    footer_y = float("inf")
    for word, box in zip(words, boxes):
        if word.lower() in _FOOTER_KEYWORDS and header_y < box[1] < footer_y:
            footer_y = box[1]

    max_y = max(box[1] for box in boxes)
    # If no header found, assume top 25% is header.
    if header_y == 0:
        header_y = max_y * 0.25
    # If no footer found, assume bottom 25% is footer.
    if footer_y == float("inf"):
        footer_y = max_y * 0.75

    # 2. Keep only the words between header and footer.
    table_words = [
        {"text": word, "box": box}
        for word, box in zip(words, boxes)
        if box[1] > header_y and (box[1] + box[3]) < footer_y
    ]
    if not table_words:
        return []

    # 3. Group by rows (Y-clustering). The reference Y is the first word of
    # the current row, so a row cannot drift downwards word by word.
    table_words.sort(key=lambda entry: entry["box"][1])
    rows = []
    current_row = [table_words[0]]
    current_y = table_words[0]["box"][1]
    for entry in table_words[1:]:
        y = entry["box"][1]
        if abs(y - current_y) < row_tolerance:
            current_row.append(entry)
        else:
            rows.append(current_row)
            current_row = [entry]
            current_y = y
    rows.append(current_row)

    # 4. Convert rows to structured dicts, filtering out non-item rows.
    structured_items = []
    for row in rows:
        row.sort(key=lambda entry: entry["box"][0])  # left-to-right reading order
        full_text = " ".join(entry["text"] for entry in row)

        full_text_lower = full_text.lower()
        if any(phrase in full_text_lower for phrase in EXCLUDE_PHRASES):
            continue

        # Skip very short rows (likely noise).
        if len(full_text.strip()) < 3:
            continue

        matches = list(_NUMBER_RE.finditer(full_text))
        if not matches:
            # No numbers found = not a valid line item.
            continue

        price_match = matches[-1]
        price = float(price_match.group().replace(",", ""))
        if price <= 0:
            continue

        # Cut out only the matched price span; a plain str.replace would also
        # strip identical digits elsewhere (e.g. the "4" in "A4 Paper 4").
        description = (
            full_text[:price_match.start()] + full_text[price_match.end():]
        ).strip()
        if len(description) < 2:
            continue

        structured_items.append({
            "description": description,
            "quantity": 1,
            "unit_price": price,
            "total": price,
        })

    return structured_items