"""Main FastAPI application entry point."""

import logging
import threading
from typing import Dict

from fastapi import FastAPI, status
from fastapi.responses import JSONResponse

from app import __version__
from app.config import settings
from app.middleware import api_key_guard
from app.middleware.rate_limit import rate_limit_middleware
from app.routers import openai_api

# Configure logging with level from settings
try:
    log_level = getattr(logging, settings.log_level.upper())
except AttributeError:
    print(f"Warning: Invalid log level '{settings.log_level}'. Falling back to INFO.")
    log_level = logging.INFO
logging.basicConfig(level=log_level)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="LLM Pro Finance API (Transformers)",
    description="OpenAI-compatible API for financial LLM inference",
    version=__version__
)

# Mount routers
app.include_router(openai_api.router, prefix="/v1")

# Rate limiting middleware. Note that Starlette runs the most recently
# registered middleware first, so the API key guard registered below is the
# outermost layer and executes before the rate limiter on each request.
app.middleware("http")(rate_limit_middleware)

# Optional API key middleware (outermost: runs before rate limiting)
app.middleware("http")(api_key_guard)


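# NOTE: recent FastAPI releases mark on_event() as deprecated in favor of
# lifespan handlers; on_event() remains functional and is used here.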
@app.on_event("startup")
async def startup_event() -> None:
    """Startup event - initialize model in background thread.
    
    Loads the model asynchronously to avoid blocking the API startup.
    Model loading happens in a daemon thread so it doesn't prevent shutdown.
    """
    logger.info("Starting LLM Pro Finance API...")
    
    force_reload = settings.force_model_reload
    if force_reload:
        logger.info("Force model reload enabled (FORCE_MODEL_RELOAD=true)")
    
    logger.info("Initializing model in background thread...")
    
    def load_model() -> None:
        """Load the model in a background thread."""
        from app.providers.transformers_provider import initialize_model
        initialize_model(force_reload=force_reload)
    
    # Start model loading in background thread
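    # Readiness is surfaced through the /ready endpoint, which checks is_model_ready().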
    thread = threading.Thread(target=load_model, daemon=True)
    thread.start()
    logger.info("Model initialization started in background")


@app.get("/")
async def root() -> Dict[str, str]:
    """Root endpoint returning API status and information.
    
    Returns:
        Dictionary containing API status, service name, version, model, and backend.
    """
    return {
        "status": "ok", 
        "service": "Qwen Open Finance R 8B Inference", 
        "version": __version__,
        "model": settings.model,
        "backend": "Transformers"
    }


@app.get("/health")
async def health() -> Dict[str, str]:
    """Liveness check endpoint for monitoring and load balancers.
    
    Returns:
        Dictionary indicating the service is alive.
    """
    return {"status": "service alive", "service": "LLM Pro Finance API"}


@app.get("/ready")
async def ready() -> JSONResponse:
    """Readiness check endpoint for orchestrators and load balancers.
    
    Checks if the model is loaded and ready to handle requests.
    Returns 503 Service Unavailable if the model is not ready.
    
    Returns:
        JSONResponse with ready/model_loaded fields and appropriate status code.
    """
    from app.providers.transformers_provider import is_model_ready
    
    model_loaded = is_model_ready()

    response_data = {
        "ready": model_loaded,
        "model_loaded": model_loaded,
        "service": "LLM Pro Finance API"
    }

    status_code = (
        status.HTTP_200_OK if model_loaded
        else status.HTTP_503_SERVICE_UNAVAILABLE
    )
    return JSONResponse(content=response_data, status_code=status_code)
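

# Optional local-run helper: a minimal sketch assuming uvicorn is installed and
# that this module is importable as "app.main" (i.e. the file lives at
# app/main.py), so the equivalent CLI invocation would be
# `uvicorn app.main:app --host 0.0.0.0 --port 8000`. Host and port are
# illustrative defaults, not settings taken from this project.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)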