Spaces:

Seth0330
/

DocClassify

Sleeping

File size: 3,203 Bytes

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
from app.pdf_processor import extract_text_from_pdf
from app.classifier import get_classifier

app = FastAPI()

# CORS policy applied to every route registered on ``app``.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination -- confirm it is intended before this service
# handles cookies or other credentials.
_cors_settings = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_settings)

# Module-level cache for the classifier; stays None until it is first needed.
classifier = None

def get_classifier_instance():
    """Return the shared classifier, creating it via get_classifier() on first use.

    The created object is stored in the module-level ``classifier`` variable
    so later calls reuse it instead of building a new one.
    """
    global classifier
    if classifier is not None:
        # Already built by an earlier call.
        return classifier
    classifier = get_classifier()
    return classifier

@app.on_event("startup")
async def startup_event():
    """Warm up the classifier when the application starts.

    Any failure while loading is reported on stdout and otherwise ignored,
    so the application still starts and the load is retried on first use.
    """
    print("Preloading classifier on startup...")
    try:
        get_classifier_instance()
        print("✅ Classifier loaded and ready!")
    except Exception as exc:
        # Best effort only: a failed preload must not abort startup.
        print(
            f"⚠️  Warning: Could not preload classifier: {exc}",
            "Classifier will be loaded on first request.",
            sep="\n",
        )

# ---- API ----
@app.get("/api/health")
def health():
    """Health-check endpoint: always answers with a fixed "ok" status."""
    return dict(status="ok")

@app.get("/api/hello")
def hello():
    """Return a fixed greeting payload."""
    greeting = "Hello from FastAPI"
    return {"message": greeting}

@app.post("/api/classify")
async def classify_document(file: UploadFile = File(...)):
    """
    Classify a PDF document.

    Args:
        file: Uploaded PDF file. Only the filename extension is checked
            (case-insensitively); the content itself is validated
            implicitly by text extraction.

    Returns:
        JSONResponse with the keys ``success``, ``filename``,
        ``classification`` (the value returned by the classifier) and
        ``text_length`` (length of the extracted text).

    Raises:
        HTTPException: 400 if the filename is missing or does not end in
            ``.pdf``, or if no text could be extracted; 500 for any other
            failure while reading, extracting or classifying.
    """
    # Validate file type. UploadFile.filename is optional, so treat a missing
    # name as "not a PDF" instead of crashing on None.lower().
    filename = file.filename or ""
    if not filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        # Read file content
        contents = await file.read()

        # Extract text from PDF
        text = extract_text_from_pdf(contents)

        if not text:
            raise HTTPException(
                status_code=400,
                detail="Could not extract text from PDF. The file might be empty, corrupted, or image-based."
            )

        # Classify the document
        classifier_instance = get_classifier_instance()
        result = classifier_instance.classify_document(text)

        return JSONResponse(content={
            "success": True,
            "filename": file.filename,
            "classification": result,
            "text_length": len(text)
        })

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) pass through unchanged.
        raise
    except Exception as e:
        # Anything else becomes a 500; keep the original exception as the
        # cause so the traceback is preserved for debugging.
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}") from e

# ---- Frontend static serving ----
FRONTEND_DIST = Path(__file__).resolve().parents[2] / "frontend" / "dist"
INDEX_FILE = FRONTEND_DIST / "index.html"

if FRONTEND_DIST.exists():
    # Routes are matched in registration order. A StaticFiles mount at "/"
    # registered ahead of the catch-all would receive every request first and
    # answer 404 for client-side routes, so the fallback would never run.
    # The catch-all therefore serves built files itself and falls back to
    # index.html for everything else.
    @app.api_route("/{full_path:path}", methods=["GET", "HEAD"])
    def spa_fallback(full_path: str):
        """Serve a built frontend file, or index.html for client-side routes.

        Args:
            full_path: Request path without the leading slash.

        Returns:
            FileResponse for the matching file inside FRONTEND_DIST (a
            directory resolves to its own index.html), otherwise a
            FileResponse for INDEX_FILE.

        Raises:
            HTTPException: 404 for unmatched ``api`` paths, or when
                INDEX_FILE itself does not exist.
        """
        # Unknown API paths must be real 404s, not a 200 with an error body
        # and not the SPA shell.
        if full_path == "api" or full_path.startswith("api/"):
            raise HTTPException(status_code=404, detail="Not Found")

        try:
            dist_root = FRONTEND_DIST.resolve()
            candidate = (dist_root / full_path).resolve()
            if candidate.is_dir():
                candidate = candidate / "index.html"
            # Containment check: never serve anything that resolves outside
            # the build directory (e.g. via ".." or an absolute path).
            if dist_root in candidate.parents and candidate.is_file():
                return FileResponse(str(candidate))
        except (OSError, ValueError):
            # Unresolvable path (e.g. embedded NUL byte): treat as a
            # client-side route and fall through to index.html.
            pass

        if not INDEX_FILE.is_file():
            raise HTTPException(status_code=404, detail="Not Found")
        return FileResponse(str(INDEX_FILE))