# Author: khubchand
# Commit 09bc714: Optimize Hugging Face Space: add eager model loading, reduce max tokens, fix stop tokens, limit CPU threads
import traceback
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from routes.chat import router as chat_router
from routes.upload import router as upload_router
from routes.health import router as health_router
from utils.logger import logger
from llm.model_loader import get_llm
from embeddings.embedding_model import get_embedding_model
from embeddings.vector_store import load_vector_store
def _load_on_startup(label: str, loader) -> None:
    """Run one startup loader, logging progress, success, or failure.

    Args:
        label: Human-readable name of the resource, used in log messages
            (e.g. ``"vector store"``).
        loader: Zero-argument callable that performs the load. Its return
            value is ignored.

    Any ``Exception`` raised by ``loader`` is logged (message plus
    traceback) and swallowed so the remaining loaders and application
    startup still proceed.
    """
    logger.info(f"Eagerly loading {label} on startup...")
    try:
        loader()
    except Exception as e:
        # Best-effort warm-up: record the full traceback (same format the
        # global exception handler uses) but do not abort startup.
        logger.error(
            f"Error loading {label} on startup: {e}\n{traceback.format_exc()}"
        )
    else:
        # Capitalize only the first character so "LLM model" stays intact.
        logger.info(f"{label[0].upper()}{label[1:]} loaded successfully!")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Eagerly load the LLM, embedding model, and vector store on startup.

    Each loader is attempted independently, in order; a failure in one is
    logged and does not prevent the others from running or the application
    from starting. Nothing is done after the ``yield`` (no shutdown work).

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    startup_loaders = (
        ("LLM model", get_llm),
        ("embedding model", get_embedding_model),
        ("vector store", load_vector_store),
    )
    for label, loader in startup_loaders:
        _load_on_startup(label, loader)
    yield
# Build the application with the startup lifespan, then attach the feature
# routers in a fixed order: chat, upload, health.
app = FastAPI(lifespan=lifespan, title="AI Assistant")
for _router in (chat_router, upload_router, health_router):
    app.include_router(_router)
# Global fallback: convert unhandled exceptions to JSON 500 with detail
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Log an unhandled exception and answer with a JSON 500 response.

    Args:
        request: The request being processed when the exception escaped
            (unused).
        exc: The unhandled exception.

    Returns:
        A ``JSONResponse`` with status 500 whose body carries the exception
        class name under ``"error"`` and ``str(exc)`` under ``"detail"``.
    """
    # Format the traceback from ``exc`` itself rather than from the ambient
    # ``sys.exc_info()`` state, so the log is correct even if this handler
    # is invoked outside an active ``except`` block.
    trace = "".join(
        traceback.format_exception(type(exc), exc, exc.__traceback__)
    )
    logger.error(f"Unhandled exception: {exc}\n{trace}")
    # NOTE(review): ``str(exc)`` is returned to the client verbatim; confirm
    # that exposing internal error messages is acceptable for this service.
    return JSONResponse(
        status_code=500,
        content={
            "error": type(exc).__name__,
            "detail": str(exc),
        },
    )
@app.get("/")
def root():
    """Return a static banner message for the root path."""
    banner = {"message": "AI Engine Running"}
    return banner