Spaces:
Sleeping
Sleeping
File size: 9,174 Bytes
92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 92314c4 f08e772 5c5d420 f08e772 5c5d420 f08e772 da57193 f08e772 5c5d420 f08e772 5c5d420 f08e772 5c5d420 f08e772 f975d5b f08e772 f975d5b 4eec2ab f975d5b 4eec2ab f975d5b 4eec2ab f975d5b 4eec2ab f975d5b 4eec2ab f975d5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 |
import os
import time
from typing import List, Dict
from fastapi import FastAPI, UploadFile, File, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from sqlalchemy.orm import Session
from .db import Base, engine, SessionLocal
from .models import ExtractionRecord
from .schemas import ExtractionRecordBase, ExtractionStage
from .openrouter_client import extract_fields_from_document
# Ensure the data directory exists before the SQLite database is touched.
os.makedirs("data", exist_ok=True)

# Create any tables declared on Base that do not exist yet.
Base.metadata.create_all(bind=engine)

app = FastAPI(title="Document Capture Demo – Backend")

# CORS: every origin, method and header is allowed (tighten later if needed).
# Credentials are deliberately disabled: a wildcard origin must not be combined
# with allow_credentials=True (see the FastAPI CORS documentation), because any
# website could then issue credentialed requests against this API. If
# cookie/auth-based cross-origin access is ever required, list the trusted
# origins explicitly and only then turn credentials back on.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
def get_db():
    """Yield a fresh ``SessionLocal`` session and always close it afterwards.

    Written as a generator so it can be used with ``Depends``: the code after
    ``yield`` runs once the consumer is done with the session, whether or not
    an error occurred in between.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Release the session even if the consumer raised.
        session.close()
@app.get("/ping")
def ping():
    """Liveness probe: return a fixed payload showing the backend responds."""
    payload = {"status": "ok", "message": "backend alive"}
    return payload
def make_stages(total_ms: int, status: str) -> Dict[str, ExtractionStage]:
    """
    Fabricate per-stage timing entries for the History UI.

    The total duration is split across four fixed stages using constant
    shares (15% / 55% / 20% / 10%). A non-positive ``total_ms`` is replaced
    by 1000 ms. When ``status`` is anything other than "completed", the
    AI-analysis stage is reported as "failed" and the two later stages as
    "skipped"; the upload stage is always "completed".
    """
    if total_ms <= 0:
        total_ms = 1000

    succeeded = status == "completed"

    # (stage key, share of total time, status used when the run did not
    #  complete, variation label) — order defines the order of the result.
    layout = (
        ("uploading", 0.15, "completed", "normal"),
        ("aiAnalysis", 0.55, "failed", "normal"),
        ("dataExtraction", 0.2, "skipped", "fast"),
        ("outputRendering", 0.1, "skipped", "normal"),
    )

    return {
        stage_key: ExtractionStage(
            time=int(total_ms * share),
            status="completed" if succeeded else fallback_status,
            variation=variation,
        )
        for stage_key, share, fallback_status, variation in layout
    }
def _coerce_confidence(raw, default: float = 90.0) -> float:
    """Return ``raw`` as a float, or ``default`` when it is None or not numeric."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return default


def _collect_fields(extracted: dict) -> dict:
    """Build the field mapping for the frontend from the extractor output.

    Starts from ``extracted["fields"]`` and adds ``full_text`` and ``pages``
    when the extractor supplied them. A missing, null or non-dict ``fields``
    value is replaced by an empty dict so the extra keys can always be added.
    """
    fields = extracted.get("fields")
    if not isinstance(fields, dict):
        if fields:
            print(
                "[WARN] Ignoring non-dict 'fields' payload of type "
                f"{type(fields).__name__}"
            )
        fields = {}

    # Include full_text in fields if present (for frontend display)
    full_text = extracted.get("full_text", "")
    if full_text:
        fields["full_text"] = full_text
        full_text_words = len(str(full_text).split())
        print(f"[INFO] Full text extracted: {full_text_words} words")

    # Also expose the per-page array when the extractor returned one
    pages_data = extracted.get("pages", [])
    if pages_data and isinstance(pages_data, list):
        print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
        fields["pages"] = pages_data

    return fields


@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """
    Main extraction endpoint used by the Dashboard.

    1) Read the uploaded file
    2) Run it through ``extract_fields_from_document``
    3) Store a record in the database (for failures as well)
    4) Return extraction result + metadata

    Extraction errors are not propagated as HTTP errors: they are caught,
    logged, persisted with status "failed", and reported through the
    ``status`` / ``errorMessage`` keys of the normal response body.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments while the extraction is running.
    start = time.perf_counter()
    content = await file.read()
    content_type = file.content_type or "application/octet-stream"
    size_mb = len(content) / 1024 / 1024
    size_str = f"{size_mb:.2f} MB"

    try:
        print(f"[INFO] Starting extraction for file: {file.filename}, type: {content_type}, size: {size_str}")
        extracted = await extract_fields_from_document(content, content_type, file.filename)
        total_ms = int((time.perf_counter() - start) * 1000)

        print(f"[INFO] Extraction completed. Response keys: {list(extracted.keys())}")
        print(f"[INFO] Fields extracted: {extracted.get('fields', {})}")

        # Tolerate null / malformed values so that a usable extraction is not
        # recorded as a failure just because one key is oddly typed.
        confidence = _coerce_confidence(extracted.get("confidence", 90))
        fields = _collect_fields(extracted)

        # Count fields, including full_text / pages if present
        fields_extracted = len(fields)
        print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")

        status = "completed"
        error_message = None
    except Exception as e:  # endpoint boundary: record the failure, don't crash
        import traceback

        total_ms = int((time.perf_counter() - start) * 1000)
        confidence = 0.0
        fields = {}
        fields_extracted = 0
        status = "failed"
        error_message = str(e)
        print(f"[ERROR] Extraction failed: {error_message}")
        print(f"[ERROR] Traceback: {traceback.format_exc()}")

    # Save record to DB
    rec = ExtractionRecord(
        file_name=file.filename,
        file_type=content_type,
        file_size=size_str,
        status=status,
        confidence=confidence,
        fields_extracted=fields_extracted,
        total_time_ms=total_ms,
        raw_output=str(fields),
        error_message=error_message,
    )
    db.add(rec)
    db.commit()
    # Refresh so attributes populated on insert (e.g. the id) are available.
    db.refresh(rec)

    stages = make_stages(total_ms, status)

    # Response shape that frontend will consume
    return {
        "id": rec.id,
        "fileName": rec.file_name,
        "fileType": rec.file_type,
        "fileSize": rec.file_size,
        "status": status,
        "confidence": confidence,
        "fieldsExtracted": fields_extracted,
        "totalTime": total_ms,
        "fields": fields,
        "stages": {k: v.dict() for k, v in stages.items()},
        "errorMessage": error_message,
    }
@app.get("/api/history", response_model=List[ExtractionRecordBase])
def get_history(db: Session = Depends(get_db)):
    """
    Feed the History page.

    Loads at most 100 extraction records, newest first (by ``created_at``),
    and converts each into an ``ExtractionRecordBase`` with synthetic stage
    timings from ``make_stages``. Null columns fall back to neutral defaults.
    """
    newest_first = ExtractionRecord.created_at.desc()
    rows = db.query(ExtractionRecord).order_by(newest_first).limit(100).all()

    def to_schema(row) -> ExtractionRecordBase:
        # Convert one database row into the response schema.
        return ExtractionRecordBase(
            id=row.id,
            fileName=row.file_name,
            fileType=row.file_type or "",
            fileSize=row.file_size or "",
            extractedAt=row.created_at,
            status=row.status or "completed",
            confidence=row.confidence or 0.0,
            fieldsExtracted=row.fields_extracted or 0,
            totalTime=row.total_time_ms or 0,
            stages=make_stages(row.total_time_ms or 1000, row.status or "completed"),
            errorMessage=row.error_message,
        )

    return [to_schema(row) for row in rows]
# Static frontend mounting (used after we build React)
# Dockerfile copies the Vite build into backend/frontend_dist
# IMPORTANT: API routes must be defined BEFORE this so they take precedence
frontend_dir = os.path.join(
    os.path.dirname(os.path.dirname(__file__)), "frontend_dist"
)

# Paths the SPA catch-all must never answer with index.html.
_RESERVED_EXACT_PATHS = frozenset(
    {"docs", "openapi.json", "logo.png", "favicon.ico"}
)
_RESERVED_PATH_PREFIXES = ("api/", "docs/", "assets/")


def is_reserved_path(full_path: str) -> bool:
    """Return True when ``full_path`` belongs to the backend, not the SPA.

    ``full_path`` is the request path without its leading slash. Matching is
    done on whole path segments, so client-side routes that merely start with
    a reserved word (for example ``documents`` or ``docs-archive``) are not
    treated as reserved and still receive index.html.
    """
    return (
        full_path in _RESERVED_EXACT_PATHS
        or full_path.startswith(_RESERVED_PATH_PREFIXES)
    )


if os.path.isdir(frontend_dir):
    # Imported here because they are only needed when a frontend build exists.
    from fastapi import HTTPException
    from fastapi.responses import FileResponse

    # Serve static files (JS, CSS, images, etc.) from assets directory
    assets_dir = os.path.join(frontend_dir, "assets")
    if os.path.isdir(assets_dir):
        app.mount(
            "/assets",
            StaticFiles(directory=assets_dir),
            name="assets",
        )

    # Serve static files from root (logo.png, favicon.ico, etc.)
    # Files in public/ directory are copied to dist/ root during Vite build
    # These routes must be defined BEFORE the catch-all route
    @app.get("/logo.png")
    async def serve_logo():
        """Serve logo.png from frontend_dist root, or 404 if it is missing."""
        logo_path = os.path.join(frontend_dir, "logo.png")
        if os.path.isfile(logo_path):
            return FileResponse(logo_path, media_type="image/png")
        raise HTTPException(status_code=404)

    @app.get("/favicon.ico")
    async def serve_favicon():
        """Serve favicon.ico from frontend_dist root, or 404 if it is missing."""
        favicon_path = os.path.join(frontend_dir, "favicon.ico")
        if os.path.isfile(favicon_path):
            return FileResponse(favicon_path, media_type="image/x-icon")
        raise HTTPException(status_code=404)

    # Catch-all route to serve index.html for React Router
    # This must be last so API routes and static files are matched first
    @app.get("/{full_path:path}")
    async def serve_frontend(full_path: str):
        """
        Serve the React app for all non-backend routes.

        Reserved backend paths (API, docs, OpenAPI schema, static assets and
        the known root files) that reach this handler get a 404; every other
        path receives index.html so React Router can resolve it client-side.
        """
        if is_reserved_path(full_path):
            raise HTTPException(status_code=404)

        index_path = os.path.join(frontend_dir, "index.html")
        if os.path.isfile(index_path):
            return FileResponse(index_path)
        raise HTTPException(status_code=404)
|