dipan004 commited on
Commit
aef305f
Β·
verified Β·
1 Parent(s): d275cbe

Update backend/app/api/ingest.py

Browse files
Files changed (1) hide show
  1. backend/app/api/ingest.py +112 -67
backend/app/api/ingest.py CHANGED
@@ -1,11 +1,4 @@
1
- """
2
- Complete ingest pipeline with AUTONOMOUS AGENT INTEGRATION
3
- βœ… Step 1: HF agents extract raw text
4
- βœ… Step 2: HF NER finds entities
5
- βœ… Step 3: Gemini maps to structured invoice fields
6
- βœ… NEW: Autonomous agent orchestrates, validates, and self-corrects
7
- βœ… UPDATED: Retry logic with exponential backoff + Local OCR fallback
8
- """
9
 
10
  import os
11
  import uuid
@@ -27,36 +20,46 @@ logging.basicConfig(
27
  level=logging.INFO,
28
  format='%(asctime)s - %(levelname)s - %(message)s',
29
  handlers=[logging.StreamHandler(sys.stdout)],
30
- force=True # Override any existing config
31
  )
32
 
33
  logger = logging.getLogger(__name__)
34
 
35
- # ============================================
36
- # FORCE DATABASE INITIALIZATION ON MODULE LOAD
37
- # ============================================
38
 
39
- def init_tables_now():
40
- """Force create tables on module import"""
 
 
 
 
 
 
 
 
 
 
 
 
41
  try:
 
42
  DB_PATH.parent.mkdir(parents=True, exist_ok=True)
43
 
44
- with FileLock(str(LOCK_PATH), timeout=30):
45
  conn = sqlite3.connect(str(DB_PATH))
46
  cursor = conn.cursor()
47
 
48
- # Check if table exists
49
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='ingest_jobs'")
50
  if cursor.fetchone():
51
  conn.close()
52
  logger.info("βœ… Database tables already exist")
53
  return
54
 
55
- logger.warning("⚠️ Creating database tables...")
56
 
57
- # Create all tables (abbreviated version shown)
58
- cursor.execute("""
59
- CREATE TABLE IF NOT EXISTS ingest_jobs (
60
  job_id TEXT PRIMARY KEY,
61
  doc_id TEXT,
62
  filename TEXT NOT NULL,
@@ -64,34 +67,30 @@ def init_tables_now():
64
  error_text TEXT,
65
  created_at TEXT DEFAULT CURRENT_TIMESTAMP,
66
  updated_at TEXT DEFAULT CURRENT_TIMESTAMP
67
- )
68
- """)
69
-
70
- cursor.execute("""
71
- CREATE TABLE IF NOT EXISTS documents (
72
  doc_id TEXT PRIMARY KEY,
73
  job_id TEXT NOT NULL,
74
  path TEXT NOT NULL,
75
  filename TEXT NOT NULL,
76
  content_type TEXT NOT NULL,
77
- uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP
78
- )
79
- """)
80
-
81
- cursor.execute("""
82
- CREATE TABLE IF NOT EXISTS extractions (
83
  doc_id TEXT PRIMARY KEY,
84
  raw_text TEXT,
85
  tables_json TEXT,
86
  entities_json TEXT,
87
  classification_json TEXT,
88
  summary_text TEXT,
89
- extracted_at TEXT DEFAULT CURRENT_TIMESTAMP
90
- )
91
- """)
92
-
93
- cursor.execute("""
94
- CREATE TABLE IF NOT EXISTS invoice_fields (
95
  invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
96
  doc_id TEXT NOT NULL,
97
  cust_number TEXT,
@@ -100,30 +99,81 @@ def init_tables_now():
100
  business_code TEXT,
101
  cust_payment_terms TEXT,
102
  confidence_map TEXT,
103
- created_at TEXT DEFAULT CURRENT_TIMESTAMP
104
- )
105
- """)
106
-
107
- cursor.execute("""
108
- CREATE TABLE IF NOT EXISTS batch_jobs (
109
  batch_id TEXT PRIMARY KEY,
110
  total_files INTEGER,
111
  message TEXT,
112
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
113
- )
114
- """)
115
-
116
- cursor.execute("""
117
- CREATE TABLE IF NOT EXISTS batch_job_mapping (
118
  batch_id TEXT,
119
- job_id TEXT
120
- )
121
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # Create indexes
124
- cursor.execute("CREATE INDEX IF NOT EXISTS idx_ingest_jobs_status ON ingest_jobs(status)")
125
- cursor.execute("CREATE INDEX IF NOT EXISTS idx_documents_job_id ON documents(job_id)")
126
- cursor.execute("CREATE INDEX IF NOT EXISTS idx_invoice_fields_doc_id ON invoice_fields(doc_id)")
 
 
 
 
 
 
 
 
127
 
128
  conn.commit()
129
  conn.close()
@@ -135,21 +185,16 @@ def init_tables_now():
135
  import traceback
136
  logger.error(traceback.format_exc())
137
 
138
- # Run immediately on import
139
- logger.info("πŸ” Initializing database on module load...")
140
- init_tables_now()
141
-
142
- # Setup
143
- BASE_DIR = Path(__file__).parent.parent.parent.parent
144
- STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
145
- DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
146
- LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
147
- PREDICT_ENDPOINT = 'http://localhost:7860/predict'
148
 
149
- STORAGE_PATH.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
150
 
151
- logger = logging.getLogger(__name__)
152
- router = APIRouter(prefix="/api", tags=["ingest"])
153
 
154
 
155
  # ============================================
 
1
+
 
 
 
 
 
 
 
2
 
3
  import os
4
  import uuid
 
20
  level=logging.INFO,
21
  format='%(asctime)s - %(levelname)s - %(message)s',
22
  handlers=[logging.StreamHandler(sys.stdout)],
23
+ force=True
24
  )
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
28
 
29
+ BASE_DIR = Path(__file__).parent.parent.parent.parent
30
+ STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
31
+ DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
32
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
33
+ PREDICT_ENDPOINT = 'http://localhost:7860/predict'
34
+
35
+ STORAGE_PATH.mkdir(parents=True, exist_ok=True)
36
+
37
+ router = APIRouter(prefix="/api", tags=["ingest"])
38
+
39
+
40
+
41
+ def _init_db_tables():
42
+ """Create tables on module import - ensures HF Space has tables"""
43
  try:
44
+ logger.info("πŸ” Checking if database tables exist...")
45
  DB_PATH.parent.mkdir(parents=True, exist_ok=True)
46
 
47
+ with FileLock(str(LOCK_PATH), timeout=10):
48
  conn = sqlite3.connect(str(DB_PATH))
49
  cursor = conn.cursor()
50
 
51
+ # Quick check
52
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='ingest_jobs'")
53
  if cursor.fetchone():
54
  conn.close()
55
  logger.info("βœ… Database tables already exist")
56
  return
57
 
58
+ logger.warning("⚠️ Database tables not found, creating...")
59
 
60
+ # Create all tables
61
+ tables_sql = [
62
+ """CREATE TABLE IF NOT EXISTS ingest_jobs (
63
  job_id TEXT PRIMARY KEY,
64
  doc_id TEXT,
65
  filename TEXT NOT NULL,
 
67
  error_text TEXT,
68
  created_at TEXT DEFAULT CURRENT_TIMESTAMP,
69
  updated_at TEXT DEFAULT CURRENT_TIMESTAMP
70
+ )""",
71
+
72
+ """CREATE TABLE IF NOT EXISTS documents (
 
 
73
  doc_id TEXT PRIMARY KEY,
74
  job_id TEXT NOT NULL,
75
  path TEXT NOT NULL,
76
  filename TEXT NOT NULL,
77
  content_type TEXT NOT NULL,
78
+ uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
79
+ FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
80
+ )""",
81
+
82
+ """CREATE TABLE IF NOT EXISTS extractions (
 
83
  doc_id TEXT PRIMARY KEY,
84
  raw_text TEXT,
85
  tables_json TEXT,
86
  entities_json TEXT,
87
  classification_json TEXT,
88
  summary_text TEXT,
89
+ extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
90
+ FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
91
+ )""",
92
+
93
+ """CREATE TABLE IF NOT EXISTS invoice_fields (
 
94
  invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
95
  doc_id TEXT NOT NULL,
96
  cust_number TEXT,
 
99
  business_code TEXT,
100
  cust_payment_terms TEXT,
101
  confidence_map TEXT,
102
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
103
+ FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
104
+ )""",
105
+
106
+ """CREATE TABLE IF NOT EXISTS batch_jobs (
 
107
  batch_id TEXT PRIMARY KEY,
108
  total_files INTEGER,
109
  message TEXT,
110
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
111
+ )""",
112
+
113
+ """CREATE TABLE IF NOT EXISTS batch_job_mapping (
 
 
114
  batch_id TEXT,
115
+ job_id TEXT,
116
+ FOREIGN KEY (batch_id) REFERENCES batch_jobs(batch_id),
117
+ FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
118
+ )""",
119
+
120
+ # ML feature tables
121
+ """CREATE TABLE IF NOT EXISTS customer_aggregates (
122
+ cust_number TEXT PRIMARY KEY,
123
+ cust_avg_days REAL,
124
+ cust_median_days REAL,
125
+ cust_invoice_count INTEGER,
126
+ last_updated TEXT DEFAULT CURRENT_TIMESTAMP
127
+ )""",
128
+
129
+ """CREATE TABLE IF NOT EXISTS payment_terms_aggregates (
130
+ cust_payment_terms TEXT PRIMARY KEY,
131
+ terms_avg_days REAL,
132
+ terms_median_days REAL,
133
+ terms_invoice_count INTEGER,
134
+ last_updated TEXT DEFAULT CURRENT_TIMESTAMP
135
+ )""",
136
+
137
+ """CREATE TABLE IF NOT EXISTS business_code_aggregates (
138
+ business_code TEXT PRIMARY KEY,
139
+ bc_avg_days REAL,
140
+ bc_median_days REAL,
141
+ bc_invoice_count INTEGER,
142
+ last_updated TEXT DEFAULT CURRENT_TIMESTAMP
143
+ )""",
144
+
145
+ """CREATE TABLE IF NOT EXISTS predictions_log (
146
+ prediction_id INTEGER PRIMARY KEY AUTOINCREMENT,
147
+ invoice_id INTEGER,
148
+ cust_number TEXT,
149
+ posting_date TEXT,
150
+ total_open_amount REAL,
151
+ business_code TEXT,
152
+ cust_payment_terms TEXT,
153
+ predicted_days_to_clear REAL,
154
+ predicted_clear_date TEXT,
155
+ model_version TEXT,
156
+ features_json TEXT,
157
+ predicted_at TEXT DEFAULT CURRENT_TIMESTAMP
158
+ )"""
159
+ ]
160
+
161
+ # Execute all CREATE TABLE statements
162
+ for sql in tables_sql:
163
+ cursor.execute(sql)
164
 
165
  # Create indexes
166
+ indexes_sql = [
167
+ "CREATE INDEX IF NOT EXISTS idx_ingest_jobs_status ON ingest_jobs(status)",
168
+ "CREATE INDEX IF NOT EXISTS idx_ingest_jobs_created ON ingest_jobs(created_at DESC)",
169
+ "CREATE INDEX IF NOT EXISTS idx_documents_job_id ON documents(job_id)",
170
+ "CREATE INDEX IF NOT EXISTS idx_invoice_fields_doc_id ON invoice_fields(doc_id)",
171
+ "CREATE INDEX IF NOT EXISTS idx_batch_mapping_batch ON batch_job_mapping(batch_id)",
172
+ "CREATE INDEX IF NOT EXISTS idx_predictions_log_cust ON predictions_log(cust_number)"
173
+ ]
174
+
175
+ for sql in indexes_sql:
176
+ cursor.execute(sql)
177
 
178
  conn.commit()
179
  conn.close()
 
185
  import traceback
186
  logger.error(traceback.format_exc())
187
 
 
 
 
 
 
 
 
 
 
 
188
 
189
+ # Run on module import
190
+ logger.info("πŸ” Initializing database on module load...")
191
+ try:
192
+ _init_db_tables()
193
+ logger.info("βœ… Database initialization complete")
194
+ except Exception as e:
195
+ logger.error(f"❌ Database initialization failed: {e}")
196
+ logger.warning("⚠️ Application may not work correctly!")
197
 
 
 
198
 
199
 
200
  # ============================================