Spaces:
Sleeping
Sleeping
Enable full pipeline for all document types with startup preloading
Browse files
- All file types (PDF, TXT, DOCX, Excel, CSV) now use the full verification pipeline
- Pipeline pre-loads ML models at startup in background thread
- Excel/CSV also load into structured data store for analytical queries
- Groq handles response generation, verification algorithm handles claim checking
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
api.py
CHANGED
|
@@ -396,44 +396,31 @@ def verify_claims(req: VerifyRequest):
|
|
| 396 |
@app.post("/api/upload")
|
| 397 |
async def upload_document(file: UploadFile = File(...)):
|
| 398 |
"""Upload and ingest a document (TXT, PDF, DOCX, Excel, CSV)."""
|
| 399 |
-
# Validate extension
|
| 400 |
ext = os.path.splitext(file.filename)[1].lower()
|
| 401 |
if ext not in ALLOWED_EXTENSIONS:
|
| 402 |
raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")
|
| 403 |
|
| 404 |
-
# Save file
|
| 405 |
save_path = os.path.join(UPLOAD_DIR, file.filename)
|
| 406 |
content = await file.read()
|
| 407 |
with open(save_path, "wb") as f:
|
| 408 |
f.write(content)
|
| 409 |
|
| 410 |
try:
|
| 411 |
-
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
#
|
| 414 |
if ext in (".xlsx", ".xls"):
|
| 415 |
-
|
| 416 |
-
chunks_added = rows
|
| 417 |
elif ext == ".csv":
|
| 418 |
-
|
| 419 |
-
chunks_added = rows
|
| 420 |
-
|
| 421 |
-
# For text documents: need the full pipeline with ML models
|
| 422 |
-
if ext in (".txt", ".pdf", ".docx"):
|
| 423 |
-
p = get_pipeline()
|
| 424 |
-
chunks_added = p.ingest_file(save_path)
|
| 425 |
-
|
| 426 |
-
# Also ingest into vector store for RAG queries (if pipeline already loaded)
|
| 427 |
-
if pipeline is not None and ext in (".xlsx", ".xls", ".csv"):
|
| 428 |
-
chunks_added = pipeline.ingest_file(save_path)
|
| 429 |
-
|
| 430 |
-
uploaded_files.append(file.filename)
|
| 431 |
|
| 432 |
return {
|
| 433 |
"filename": file.filename,
|
| 434 |
"file_type": ext,
|
| 435 |
"chunks_added": chunks_added,
|
| 436 |
-
"total_chunks":
|
| 437 |
}
|
| 438 |
except Exception as e:
|
| 439 |
raise HTTPException(500, f"Failed to process {file.filename}: {str(e)}")
|
|
@@ -484,6 +471,20 @@ def delete_file(req: DeleteRequest):
|
|
| 484 |
}
|
| 485 |
|
| 486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
# ── Serve React build ────────────────────────────────────────────────────────
|
| 488 |
frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
|
| 489 |
if os.path.exists(frontend_dist):
|
|
|
|
| 396 |
@app.post("/api/upload")
async def upload_document(file: UploadFile = File(...)):
    """Upload and ingest a document (TXT, PDF, DOCX, Excel, CSV).

    The upload is written to UPLOAD_DIR, ingested through the pipeline
    returned by get_pipeline(), and recorded in uploaded_files. Excel and
    CSV files are additionally loaded into data_store.

    Returns:
        A dict with "filename", "file_type", "chunks_added" and
        "total_chunks" (the pipeline's document_count after ingestion).

    Raises:
        HTTPException: 400 when the filename is missing or its extension is
            not in ALLOWED_EXTENSIONS; 500 when ingestion or loading fails.
    """
    # The filename is chosen by the client. Keep only its final path
    # component so a name like "../../x.txt" cannot be written outside
    # UPLOAD_DIR.
    filename = os.path.basename(file.filename or "")
    if not filename:
        raise HTTPException(400, "Missing filename")

    ext = os.path.splitext(filename)[1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")

    save_path = os.path.join(UPLOAD_DIR, filename)
    content = await file.read()
    with open(save_path, "wb") as f:
        f.write(content)

    try:
        p = get_pipeline()
        chunks_added = p.ingest_file(save_path)
        uploaded_files.append(filename)

        # Also load into structured data store for Excel/CSV analytical queries
        if ext in (".xlsx", ".xls"):
            data_store.load_excel(save_path)
        elif ext == ".csv":
            data_store.load_csv(save_path)

        return {
            "filename": filename,
            "file_type": ext,
            "chunks_added": chunks_added,
            "total_chunks": p.document_count,
        }
    except Exception as e:
        raise HTTPException(500, f"Failed to process {filename}: {str(e)}")
|
|
|
|
| 471 |
}
|
| 472 |
|
| 473 |
|
| 474 |
+
# ── Pre-load pipeline at startup ─────────────────────────────────────────────
|
| 475 |
+
import threading
|
| 476 |
+
|
| 477 |
+
def _preload_pipeline():
    """Build the pipeline ahead of time so the first request is fast.

    Calls get_pipeline() purely for its side effect of loading the ML
    models, and prints a progress message before and after.
    """
    print("Pre-loading VDHF pipeline (this may take a minute)...")

    # The return value is intentionally ignored; only the loading matters.
    get_pipeline()

    print("Pipeline ready!")
|
| 482 |
+
|
| 483 |
+
@app.on_event("startup")
def startup_event():
    """Start pipeline pre-loading on a background thread at app startup."""
    # daemon=True: the preload thread must not keep the process alive on exit.
    preloader = threading.Thread(target=_preload_pipeline, daemon=True)
    preloader.start()
|
| 486 |
+
|
| 487 |
+
|
| 488 |
# ── Serve React build ────────────────────────────────────────────────────────
|
| 489 |
frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
|
| 490 |
if os.path.exists(frontend_dist):
|