"""
main.py — FastAPI application entry point.

Why FastAPI?
- Native async support, Pydantic integration, automatic OpenAPI docs.
- Extremely low boilerplate for typed REST APIs.
- Standard choice for Python ML/LLM API services.

Startup pattern: we load the catalog and build/load the index once at startup
using FastAPI's lifespan context manager (the modern replacement for
@app.on_event("startup")). All request handlers then receive these pre-loaded
objects via app.state — no global variables, no singleton anti-patterns.

Interview Q: "Why app.state instead of module-level globals?"
A: Module-level globals can't be easily mocked in tests, and their
   initialization order is implicit. app.state is explicit, testable, and
   scoped to the application instance.

Interview Q: "How do you make this horizontally scalable?"
A: The service is stateless — no per-user data is stored. Multiple instances
   can run behind a load balancer with no sticky sessions required.
"""

import logging
import os
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse

from .schemas import ChatRequest, ChatResponse
from .catalog_loader import load_catalog
from .retrieval import get_or_build_index
from .agent import run_agent

# ---------------------------------------------------------------------------
# Logging: structured logs to stdout so HF Spaces / Docker captures them.
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Lifespan: load all expensive artifacts once at startup.
# ---------------------------------------------------------------------------
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    FastAPI lifespan context manager.

    Everything before `yield` runs at startup; everything after it runs at
    shutdown. We load the catalog and build/load the TF-IDF index here so that:
      1. Startup failures are loud and immediate (not silent until the first
         request).
      2. Request handlers pay zero I/O cost for these artifacts.

    The loaded objects are published on `app.state` as `catalog`,
    `catalog_url_set`, `vectorizer` and `tfidf_matrix`.
    """
    logger.info("Loading SHL catalog...")
    catalog = load_catalog()
    logger.info("Catalog loaded: %s items.", len(catalog))

    logger.info("Building/loading TF-IDF index...")
    vectorizer, tfidf_matrix = get_or_build_index(catalog)
    logger.info("Index ready.")

    # Build a URL set for O(1) hallucination checks in the agent.
    # A catalog item without a "url" key raises KeyError here, which aborts
    # startup rather than surfacing later inside a request.
    catalog_url_set = {item["url"] for item in catalog}

    # Store on app.state so all request handlers can access without globals.
    app.state.catalog = catalog
    app.state.catalog_url_set = catalog_url_set
    app.state.vectorizer = vectorizer
    app.state.tfidf_matrix = tfidf_matrix

    logger.info("SHL Agent ready.")
    yield

    # Shutdown: nothing to clean up (no DB connections, no file handles).
    logger.info("Shutting down SHL Agent.")


# ---------------------------------------------------------------------------
# Application instance
# ---------------------------------------------------------------------------
app = FastAPI(
    title="SHL Assessment Recommendation Agent",
    description=(
        "Conversational agent for recommending SHL psychometric assessments. "
        "Stateless API — send full conversation history on every POST /chat call."
    ),
    version="1.0.0",
    lifespan=lifespan,
)


# ---------------------------------------------------------------------------
# Global exception handler: returns JSON (not HTML) for unexpected errors.
# This is important for automated evaluators that expect JSON responses.
# ---------------------------------------------------------------------------
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """
    Convert any otherwise-unhandled exception into a JSON 500 response.

    The full traceback is written to the log; the client only receives a
    generic message so internal details are not leaked.
    """
    logger.error("Unhandled exception: %s", exc, exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error. Please check server logs."},
    )


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health")
async def health() -> dict:
    """
    Health check endpoint.

    Returns {"status": "ok"} when the service is running and catalog is loaded.

    Design: this is the canonical liveness probe for HF Spaces and load
    balancers. We intentionally don't check the LLM API here — that would make
    health checks flaky and expensive. LLM availability is tested at the first
    /chat call.
    """
    return {"status": "ok"}


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest, req: Request) -> ChatResponse:
    """
    Main conversational endpoint.

    Accepts: full conversation history (stateless — caller owns state).
    Returns: reply, recommendations (0–10 items), end_of_conversation flag.

    Parameters:
      request: the validated request body (conversation history).
      req:     the raw Starlette/FastAPI request, used only to reach
               `app.state` for the artifacts loaded at startup.

    Error handling:
      - Pydantic validates the request shape; FastAPI returns 422 on invalid
        input.
      - ValueError raised by the agent (e.g., empty messages) is returned
        as 400 with the exception message as the detail.
      - Every other exception raised by the agent (including upstream LLM API
        errors it does not handle itself) propagates to the global handler
        and becomes a 500.
    """
    logger.info("POST /chat — %s message(s) in history.", len(request.messages))

    state = req.app.state
    try:
        # run_agent is a synchronous call; running it directly inside this
        # coroutine would block the event loop (and therefore /health and all
        # other requests) for the duration of the agent call. Offload it to
        # the worker thread pool instead.
        response = await run_in_threadpool(
            run_agent,
            messages=request.messages,
            vectorizer=state.vectorizer,
            tfidf_matrix=state.tfidf_matrix,
            catalog=state.catalog,
            catalog_url_set=state.catalog_url_set,
        )
    except ValueError as e:
        logger.warning("Bad request: %s", e)
        raise HTTPException(status_code=400, detail=str(e)) from e

    logger.info(
        "Response: end_of_conversation=%s, recommendations=%s",
        response.end_of_conversation,
        len(response.recommendations),
    )
    return response