Ram-090 Claude Opus 4.6 (1M context) committed on
Commit
8949afe
·
1 Parent(s): f97e336

Enable full pipeline for all document types with startup preloading

Browse files

- All file types (PDF, TXT, DOCX, Excel, CSV) now use the full verification pipeline
- Pipeline pre-loads ML models at startup in background thread
- Excel/CSV also load into structured data store for analytical queries
- Groq handles response generation, verification algorithm handles claim checking

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. api.py +21 -20
api.py CHANGED
@@ -396,44 +396,31 @@ def verify_claims(req: VerifyRequest):
396
  @app.post("/api/upload")
397
  async def upload_document(file: UploadFile = File(...)):
398
  """Upload and ingest a document (TXT, PDF, DOCX, Excel, CSV)."""
399
- # Validate extension
400
  ext = os.path.splitext(file.filename)[1].lower()
401
  if ext not in ALLOWED_EXTENSIONS:
402
  raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")
403
 
404
- # Save file
405
  save_path = os.path.join(UPLOAD_DIR, file.filename)
406
  content = await file.read()
407
  with open(save_path, "wb") as f:
408
  f.write(content)
409
 
410
  try:
411
- chunks_added = 0
 
 
412
 
413
- # For Excel/CSV: load into structured data store (fast, no ML models needed)
414
  if ext in (".xlsx", ".xls"):
415
- rows = data_store.load_excel(save_path)
416
- chunks_added = rows
417
  elif ext == ".csv":
418
- rows = data_store.load_csv(save_path)
419
- chunks_added = rows
420
-
421
- # For text documents: need the full pipeline with ML models
422
- if ext in (".txt", ".pdf", ".docx"):
423
- p = get_pipeline()
424
- chunks_added = p.ingest_file(save_path)
425
-
426
- # Also ingest into vector store for RAG queries (if pipeline already loaded)
427
- if pipeline is not None and ext in (".xlsx", ".xls", ".csv"):
428
- chunks_added = pipeline.ingest_file(save_path)
429
-
430
- uploaded_files.append(file.filename)
431
 
432
  return {
433
  "filename": file.filename,
434
  "file_type": ext,
435
  "chunks_added": chunks_added,
436
- "total_chunks": pipeline.document_count if pipeline else chunks_added,
437
  }
438
  except Exception as e:
439
  raise HTTPException(500, f"Failed to process {file.filename}: {str(e)}")
@@ -484,6 +471,20 @@ def delete_file(req: DeleteRequest):
484
  }
485
 
486
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  # ── Serve React build ────────────────────────────────────────────────────────
488
  frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
489
  if os.path.exists(frontend_dist):
 
396
  @app.post("/api/upload")
397
  async def upload_document(file: UploadFile = File(...)):
398
  """Upload and ingest a document (TXT, PDF, DOCX, Excel, CSV)."""
 
399
  ext = os.path.splitext(file.filename)[1].lower()
400
  if ext not in ALLOWED_EXTENSIONS:
401
  raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")
402
 
 
403
  save_path = os.path.join(UPLOAD_DIR, file.filename)
404
  content = await file.read()
405
  with open(save_path, "wb") as f:
406
  f.write(content)
407
 
408
  try:
409
+ p = get_pipeline()
410
+ chunks_added = p.ingest_file(save_path)
411
+ uploaded_files.append(file.filename)
412
 
413
+ # Also load into structured data store for Excel/CSV analytical queries
414
  if ext in (".xlsx", ".xls"):
415
+ data_store.load_excel(save_path)
 
416
  elif ext == ".csv":
417
+ data_store.load_csv(save_path)
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
  return {
420
  "filename": file.filename,
421
  "file_type": ext,
422
  "chunks_added": chunks_added,
423
+ "total_chunks": p.document_count,
424
  }
425
  except Exception as e:
426
  raise HTTPException(500, f"Failed to process {file.filename}: {str(e)}")
 
471
  }
472
 
473
 
474
+ # ── Pre-load pipeline at startup ─────────────────────────────────────────────
475
+ import threading
476
+
477
+ def _preload_pipeline():
478
+ """Load ML models in background so first request is fast."""
479
+ print("Pre-loading VDHF pipeline (this may take a minute)...")
480
+ get_pipeline()
481
+ print("Pipeline ready!")
482
+
483
+ @app.on_event("startup")
484
+ def startup_event():
485
+ threading.Thread(target=_preload_pipeline, daemon=True).start()
486
+
487
+
488
  # ── Serve React build ────────────────────────────────────────────────────────
489
  frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
490
  if os.path.exists(frontend_dist):