GSoumyajit2005 committed on
Commit
4bdd01c
·
1 Parent(s): 2a944a5

perf: async DB save with duplicate check for faster extraction

Browse files

- Cache init_db() with @st.cache_resource (runs once per session)
- Background thread for database saves (fire-and-forget)
- Quick duplicate check before queueing save
- Accurate status: 'duplicate' vs 'queued' vs 'disabled'

Files changed (2) hide show
  1. app.py +11 -2
  2. src/pipeline.py +28 -23
app.py CHANGED
@@ -20,8 +20,13 @@ except ImportError:
20
  from src.pipeline import process_invoice
21
  from src.database import init_db
22
 
23
- # Initialize database
24
- init_db()
 
 
 
 
 
25
 
26
  # --------------------------------------------------
27
  # Mock format detection (UI-level, safe)
@@ -174,6 +179,10 @@ with tab1:
174
  st.success("✅ Extraction & Storage Complete")
175
  st.toast("Invoice saved to Database!", icon="💾")
176
 
 
 
 
 
177
  elif db_status == 'duplicate':
178
  st.success("✅ Extraction Complete")
179
  st.toast("Duplicate invoice (already in database)", icon="⚠️")
 
20
  from src.pipeline import process_invoice
21
  from src.database import init_db
22
 
23
+ # Initialize database (cached to run only once per session)
24
+ @st.cache_resource
25
+ def initialize_database_once():
26
+ """Run DB init only once per session/restart"""
27
+ init_db()
28
+
29
+ initialize_database_once()
30
 
31
  # --------------------------------------------------
32
  # Mock format detection (UI-level, safe)
 
179
  st.success("✅ Extraction & Storage Complete")
180
  st.toast("Invoice saved to Database!", icon="💾")
181
 
182
+ elif db_status == 'queued':
183
+ st.success("✅ Extraction Complete")
184
+ st.toast("Saving to database...", icon="💾")
185
+
186
  elif db_status == 'duplicate':
187
  st.success("✅ Extraction Complete")
188
  st.toast("Duplicate invoice (already in database)", icon="⚠️")
src/pipeline.py CHANGED
@@ -8,6 +8,7 @@ Orchestrates preprocessing, OCR, and extraction
8
  from typing import Dict, Any, Optional
9
  from pathlib import Path
10
  import json
 
11
  from pydantic import ValidationError
12
  import cv2
13
 
@@ -139,37 +140,41 @@ def process_invoice(image_path: str,
139
  # This gives us a unique fingerprint for this specific business transaction.
140
  final_data['semantic_hash'] = generate_semantic_hash(final_data)
141
 
142
- # --- DATABASE SAVE (The Integration) ---
143
- if not DB_CONNECTED:
144
- # Database not available - skip save entirely (message shown once at startup)
145
- final_data['_db_status'] = 'disabled'
146
- else:
147
- final_data['_db_status'] = 'disabled' # Default assumption
 
 
 
 
 
 
 
 
 
 
148
  try:
149
- print("💾 Attempting to save to Database...")
150
  repo = InvoiceRepository()
151
-
152
  if repo.session:
153
- saved_record = repo.save_invoice(final_data)
154
- if saved_record:
155
- print(f" Successfully saved Invoice #{saved_record.id}")
156
- final_data['_db_status'] = 'saved'
157
  else:
158
- # Check if it's a duplicate by looking up the hash
159
- existing = repo.get_by_hash(final_data.get('semantic_hash', ''))
160
- if existing:
161
- print(" ⚠️ Duplicate invoice detected (already in database)")
162
- final_data['_db_status'] = 'duplicate'
163
- else:
164
- print(" ⚠️ Save failed (unknown error)")
165
- final_data['_db_status'] = 'error'
166
  else:
167
- print(" ⚠️ Skipped DB Save (Database disabled)")
168
  final_data['_db_status'] = 'disabled'
169
-
170
  except Exception as e:
171
- print(f" ⚠️ Database Error (Ignored): {e}")
172
  final_data['_db_status'] = 'error'
 
 
173
 
174
  # --- SAVING STEP ---
175
  if save_results:
 
8
  from typing import Dict, Any, Optional
9
  from pathlib import Path
10
  import json
11
+ import threading
12
  from pydantic import ValidationError
13
  import cv2
14
 
 
140
  # This gives us a unique fingerprint for this specific business transaction.
141
  final_data['semantic_hash'] = generate_semantic_hash(final_data)
142
 
143
+ # --- DATABASE SAVE (ASYNC - Fire and Forget) ---
144
+ def background_save(data_to_save):
145
+ """Save to database in background thread"""
146
+ try:
147
+ repo = InvoiceRepository()
148
+ if repo.session:
149
+ saved = repo.save_invoice(data_to_save)
150
+ if saved:
151
+ print(f" ✅ [Background] Invoice saved: {data_to_save.get('receipt_number')}")
152
+ else:
153
+ print(f" ⚠️ [Background] Duplicate or error for: {data_to_save.get('receipt_number')}")
154
+ except Exception as e:
155
+ print(f" ⚠️ [Background] Save failed: {e}")
156
+
157
+ if DB_CONNECTED:
158
+ # Quick duplicate check before queueing save
159
  try:
 
160
  repo = InvoiceRepository()
 
161
  if repo.session:
162
+ existing = repo.get_by_hash(final_data.get('semantic_hash', ''))
163
+ if existing:
164
+ print(" ⚠️ Duplicate invoice (already in database)")
165
+ final_data['_db_status'] = 'duplicate'
166
  else:
167
+ # Not a duplicate - save in background
168
+ save_thread = threading.Thread(target=background_save, args=(final_data.copy(),))
169
+ save_thread.start()
170
+ final_data['_db_status'] = 'queued'
 
 
 
 
171
  else:
 
172
  final_data['_db_status'] = 'disabled'
 
173
  except Exception as e:
174
+ print(f" ⚠️ Duplicate check failed: {e}")
175
  final_data['_db_status'] = 'error'
176
+ else:
177
+ final_data['_db_status'] = 'disabled'
178
 
179
  # --- SAVING STEP ---
180
  if save_results: