Spaces:
Sleeping
Sleeping
Commit
·
4bdd01c
1
Parent(s):
2a944a5
perf: async DB save with duplicate check for faster extraction
Browse files
- Cache init_db() with @st.cache_resource (runs once per session)
- Background thread for database saves (fire-and-forget)
- Quick duplicate check before queueing save
- Accurate status: 'duplicate' vs 'queued' vs 'disabled'
- app.py +11 -2
- src/pipeline.py +28 -23
app.py
CHANGED
|
@@ -20,8 +20,13 @@ except ImportError:
|
|
| 20 |
from src.pipeline import process_invoice
|
| 21 |
from src.database import init_db
|
| 22 |
|
| 23 |
-
# Initialize database
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# --------------------------------------------------
|
| 27 |
# Mock format detection (UI-level, safe)
|
|
@@ -174,6 +179,10 @@ with tab1:
|
|
| 174 |
st.success("✅ Extraction & Storage Complete")
|
| 175 |
st.toast("Invoice saved to Database!", icon="💾")
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
elif db_status == 'duplicate':
|
| 178 |
st.success("✅ Extraction Complete")
|
| 179 |
st.toast("Duplicate invoice (already in database)", icon="⚠️")
|
|
|
|
| 20 |
from src.pipeline import process_invoice
|
| 21 |
from src.database import init_db
|
| 22 |
|
| 23 |
+
# Initialize database (cached to run only once per session)
|
| 24 |
+
@st.cache_resource
|
| 25 |
+
def initialize_database_once():
|
| 26 |
+
"""Run DB init only once per session/restart"""
|
| 27 |
+
init_db()
|
| 28 |
+
|
| 29 |
+
initialize_database_once()
|
| 30 |
|
| 31 |
# --------------------------------------------------
|
| 32 |
# Mock format detection (UI-level, safe)
|
|
|
|
| 179 |
st.success("✅ Extraction & Storage Complete")
|
| 180 |
st.toast("Invoice saved to Database!", icon="💾")
|
| 181 |
|
| 182 |
+
elif db_status == 'queued':
|
| 183 |
+
st.success("✅ Extraction Complete")
|
| 184 |
+
st.toast("Saving to database...", icon="💾")
|
| 185 |
+
|
| 186 |
elif db_status == 'duplicate':
|
| 187 |
st.success("✅ Extraction Complete")
|
| 188 |
st.toast("Duplicate invoice (already in database)", icon="⚠️")
|
src/pipeline.py
CHANGED
|
@@ -8,6 +8,7 @@ Orchestrates preprocessing, OCR, and extraction
|
|
| 8 |
from typing import Dict, Any, Optional
|
| 9 |
from pathlib import Path
|
| 10 |
import json
|
|
|
|
| 11 |
from pydantic import ValidationError
|
| 12 |
import cv2
|
| 13 |
|
|
@@ -139,37 +140,41 @@ def process_invoice(image_path: str,
|
|
| 139 |
# This gives us a unique fingerprint for this specific business transaction.
|
| 140 |
final_data['semantic_hash'] = generate_semantic_hash(final_data)
|
| 141 |
|
| 142 |
-
# --- DATABASE SAVE (
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
try:
|
| 149 |
-
print("💾 Attempting to save to Database...")
|
| 150 |
repo = InvoiceRepository()
|
| 151 |
-
|
| 152 |
if repo.session:
|
| 153 |
-
|
| 154 |
-
if
|
| 155 |
-
print(
|
| 156 |
-
final_data['_db_status'] = '
|
| 157 |
else:
|
| 158 |
-
#
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
final_data['_db_status'] = 'duplicate'
|
| 163 |
-
else:
|
| 164 |
-
print(" ⚠️ Save failed (unknown error)")
|
| 165 |
-
final_data['_db_status'] = 'error'
|
| 166 |
else:
|
| 167 |
-
print(" ⚠️ Skipped DB Save (Database disabled)")
|
| 168 |
final_data['_db_status'] = 'disabled'
|
| 169 |
-
|
| 170 |
except Exception as e:
|
| 171 |
-
print(f" ⚠️
|
| 172 |
final_data['_db_status'] = 'error'
|
|
|
|
|
|
|
| 173 |
|
| 174 |
# --- SAVING STEP ---
|
| 175 |
if save_results:
|
|
|
|
| 8 |
from typing import Dict, Any, Optional
|
| 9 |
from pathlib import Path
|
| 10 |
import json
|
| 11 |
+
import threading
|
| 12 |
from pydantic import ValidationError
|
| 13 |
import cv2
|
| 14 |
|
|
|
|
| 140 |
# This gives us a unique fingerprint for this specific business transaction.
|
| 141 |
final_data['semantic_hash'] = generate_semantic_hash(final_data)
|
| 142 |
|
| 143 |
+
# --- DATABASE SAVE (ASYNC - Fire and Forget) ---
|
| 144 |
+
def background_save(data_to_save):
|
| 145 |
+
"""Save to database in background thread"""
|
| 146 |
+
try:
|
| 147 |
+
repo = InvoiceRepository()
|
| 148 |
+
if repo.session:
|
| 149 |
+
saved = repo.save_invoice(data_to_save)
|
| 150 |
+
if saved:
|
| 151 |
+
print(f" ✅ [Background] Invoice saved: {data_to_save.get('receipt_number')}")
|
| 152 |
+
else:
|
| 153 |
+
print(f" ⚠️ [Background] Duplicate or error for: {data_to_save.get('receipt_number')}")
|
| 154 |
+
except Exception as e:
|
| 155 |
+
print(f" ⚠️ [Background] Save failed: {e}")
|
| 156 |
+
|
| 157 |
+
if DB_CONNECTED:
|
| 158 |
+
# Quick duplicate check before queueing save
|
| 159 |
try:
|
|
|
|
| 160 |
repo = InvoiceRepository()
|
|
|
|
| 161 |
if repo.session:
|
| 162 |
+
existing = repo.get_by_hash(final_data.get('semantic_hash', ''))
|
| 163 |
+
if existing:
|
| 164 |
+
print(" ⚠️ Duplicate invoice (already in database)")
|
| 165 |
+
final_data['_db_status'] = 'duplicate'
|
| 166 |
else:
|
| 167 |
+
# Not a duplicate - save in background
|
| 168 |
+
save_thread = threading.Thread(target=background_save, args=(final_data.copy(),))
|
| 169 |
+
save_thread.start()
|
| 170 |
+
final_data['_db_status'] = 'queued'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
else:
|
|
|
|
| 172 |
final_data['_db_status'] = 'disabled'
|
|
|
|
| 173 |
except Exception as e:
|
| 174 |
+
print(f" ⚠️ Duplicate check failed: {e}")
|
| 175 |
final_data['_db_status'] = 'error'
|
| 176 |
+
else:
|
| 177 |
+
final_data['_db_status'] = 'disabled'
|
| 178 |
|
| 179 |
# --- SAVING STEP ---
|
| 180 |
if save_results:
|