jeanbaptdzd committed
Commit 192844a · 2 Parent(s): d4fd4e1 6a4421a

Merge feat/tool-enabling into master - resolve conflicts

- Keep tool calls implementation from feat/tool-enabling
- Keep latest changes from origin/master for main.py and middleware.py

.coderabbit.yaml CHANGED
@@ -16,7 +16,6 @@ review:
   simple: false # Set to true for faster, simpler reviews
   high_level_summary: true
   estimate_time: true
-  project_language: python
 
   chat:
     enabled: true
app/config.py CHANGED
@@ -1,11 +1,33 @@
+"""Application configuration using Pydantic settings."""
+
+from typing import Literal
+from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class Settings(BaseSettings):
-    model: str = "DragonLLM/qwen3-8b-fin-v1.0"
-    service_api_key: str | None = None
-    log_level: str = "info"
-    force_model_reload: bool = False  # Set FORCE_MODEL_RELOAD=true to bypass cache on startup
+    """Application settings loaded from environment variables.
+
+    Supports loading from .env file with UTF-8 encoding.
+    All settings can be overridden via environment variables.
+    """
+
+    model: str = Field(
+        default="DragonLLM/qwen3-8b-fin-v1.0",
+        description="Hugging Face model identifier"
+    )
+    service_api_key: str | None = Field(
+        default=None,
+        description="Optional API key for authentication (SERVICE_API_KEY env var)"
+    )
+    log_level: Literal["debug", "info", "warning", "error"] = Field(
+        default="info",
+        description="Logging level"
+    )
+    force_model_reload: bool = Field(
+        default=False,
+        description="Force model reload from Hugging Face, bypassing cache (FORCE_MODEL_RELOAD env var)"
+    )
 
     model_config = SettingsConfigDict(
        env_file=".env",
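
For illustration, a minimal sketch of how the new Field-based settings resolve from the environment. This relies on pydantic-settings' default mapping of each field name to an upper-cased env var; the override values below are made up:

import os

from app.config import Settings

# Pydantic maps each field to an env var by name: MODEL, SERVICE_API_KEY,
# LOG_LEVEL, FORCE_MODEL_RELOAD (case-insensitive).
os.environ["LOG_LEVEL"] = "debug"
os.environ["FORCE_MODEL_RELOAD"] = "true"

settings = Settings()
assert settings.log_level == "debug"        # validated against the Literal choices
assert settings.force_model_reload is True  # "true" coerced to bool
print(settings.model)                       # default: DragonLLM/qwen3-8b-fin-v1.0
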
app/main.py CHANGED
@@ -1,30 +1,39 @@
-from typing import Dict, Any
+"""Main FastAPI application entry point."""
+
+import logging
+import threading
+from typing import Dict
+
 from fastapi import FastAPI
+
+from app.config import settings
 from app.middleware import api_key_guard
-from app.middleware.rate_limit import rate_limit_middleware
 from app.routers import openai_api
-from app.config import settings
-from app.providers.transformers_provider import model, _initialized
-from app.utils.stats import get_stats_tracker
-import logging
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-app = FastAPI(title="LLM Pro Finance API (Transformers)")
+app = FastAPI(
+    title="LLM Pro Finance API (Transformers)",
+    description="OpenAI-compatible API for financial LLM inference",
+    version="1.0.0"
+)
 
 # Mount routers
 app.include_router(openai_api.router, prefix="/v1")
 
-# Middleware order: rate limiting first, then API key guard
-app.middleware("http")(rate_limit_middleware)
+# Optional API key middleware
 app.middleware("http")(api_key_guard)
 
+
 @app.on_event("startup")
-async def startup_event():
-    """Startup event - initialize model in background"""
-    import threading
+async def startup_event() -> None:
+    """Startup event - initialize model in background thread.
+
+    Loads the model asynchronously to avoid blocking the API startup.
+    Model loading happens in a daemon thread so it doesn't prevent shutdown.
+    """
     logger.info("Starting LLM Pro Finance API...")
 
     force_reload = settings.force_model_reload
@@ -33,7 +42,8 @@ async def startup_event():
 
     logger.info("Initializing model in background thread...")
 
-    def load_model():
+    def load_model() -> None:
+        """Load the model in a background thread."""
         from app.providers.transformers_provider import initialize_model
         initialize_model(force_reload=force_reload)
 
@@ -42,32 +52,30 @@ async def startup_event():
     thread.start()
     logger.info("Model initialization started in background")
 
+
 @app.get("/")
 async def root() -> Dict[str, str]:
-    """Root endpoint returning API status and information."""
+    """Root endpoint returning API status and information.
+
+    Returns:
+        Dictionary containing API status, service name, version, model, and backend.
+    """
     return {
         "status": "ok",
         "service": "Qwen Open Finance R 8B Inference",
         "version": "1.0.0",
-        "model": "DragonLLM/qwen3-8b-fin-v1.0",
+        "model": settings.model,
         "backend": "Transformers"
     }
 
-@app.get("/health")
-async def health() -> Dict[str, Any]:
-    """Health check endpoint with model readiness status."""
-    model_ready = _initialized and model is not None
-    return {
-        "status": "healthy" if model_ready else "initializing",
-        "service": "LLM Pro Finance API",
-        "model_ready": model_ready,
-    }
-
-
-@app.get("/v1/stats")
-async def get_stats() -> Dict[str, Any]:
-    """Get API usage statistics and token counts."""
-    stats_tracker = get_stats_tracker()
-    return stats_tracker.get_stats()
+
+@app.get("/health")
+async def health() -> Dict[str, str]:
+    """Health check endpoint for monitoring and load balancers.
+
+    Returns:
+        Dictionary with service health status.
+    """
+    return {"status": "healthy", "service": "LLM Pro Finance API"}
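
A quick smoke test of the merged endpoints, sketched with httpx against a locally running instance (the port and client library are assumptions):

import httpx

BASE = "http://localhost:8000"  # assumed local dev address

# "/" now reports the configured model via settings.model instead of a hard-coded string
root = httpx.get(f"{BASE}/").json()
print(root["model"], root["backend"])

# "/health" returns a static liveness payload after this merge; it no longer
# reflects whether the background thread has finished loading the model
print(httpx.get(f"{BASE}/health").json())  # {"status": "healthy", ...}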
 
app/middleware.py CHANGED
@@ -1,26 +1,46 @@
-from fastapi import Request, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi import Request
+from fastapi.responses import JSONResponse, Response
+from typing import Callable, Awaitable, Union
 
 from app.config import settings
 
+# Public endpoints that don't require authentication
+PUBLIC_PATHS = frozenset(["/", "/health", "/docs", "/redoc", "/openapi.json"])
 
-async def api_key_guard(request: Request, call_next):
-    # Public endpoints that don't require authentication
-    public_paths = ["/", "/health", "/docs", "/redoc", "/openapi.json", "/v1/stats"]
+
+async def api_key_guard(request: Request, call_next: Callable[[Request], Awaitable[Response]]) -> Union[Response, JSONResponse]:
+    """
+    Middleware to protect API endpoints with optional API key authentication.
+
+    Args:
+        request: FastAPI request object
+        call_next: Next middleware/handler in the chain
+
+    Returns:
+        Response from next handler or 401 if unauthorized
+    """
     # Skip auth for public endpoints
-    if request.url.path in public_paths:
+    if request.url.path in PUBLIC_PATHS:
         return await call_next(request)
 
     # Skip auth if no API key is configured
     if not settings.service_api_key:
         return await call_next(request)
 
-    # Check API key
-    key = request.headers.get("x-api-key") or request.headers.get("authorization")
-    if key and key.replace("Bearer ", "").strip() == settings.service_api_key:
+    # Check API key from headers
+    api_key = request.headers.get("x-api-key")
+    if not api_key:
+        # Also check Authorization header with Bearer token
+        auth_header = request.headers.get("authorization", "")
+        if auth_header.startswith("Bearer "):
+            api_key = auth_header.replace("Bearer ", "").strip()
+
+    if api_key and api_key == settings.service_api_key:
         return await call_next(request)
 
-    return JSONResponse({"error": "unauthorized"}, status_code=401)
+    return JSONResponse(
+        content={"error": {"message": "unauthorized", "type": "authentication_error"}},
+        status_code=401
+    )
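
A sketch of how the rewritten guard behaves, using Starlette's TestClient. The /v1/models route and the placeholder key are assumptions, and TestClient only runs the startup hook inside a with block, so no model is loaded here:

import os

os.environ["SERVICE_API_KEY"] = "secret"  # placeholder; set before app.config is imported

from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)

# Public paths bypass the guard entirely
assert client.get("/health").status_code == 200

# Either header form carries the key
assert client.get("/v1/models", headers={"x-api-key": "secret"}).status_code != 401
assert client.get("/v1/models", headers={"Authorization": "Bearer secret"}).status_code != 401

# A missing or wrong key now gets the OpenAI-style error body
resp = client.get("/v1/models", headers={"x-api-key": "wrong"})
print(resp.status_code, resp.json())  # 401 {'error': {'message': 'unauthorized', ...}}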
 
app/providers/base.py CHANGED
@@ -1,11 +1,33 @@
-from typing import Protocol, Dict, Any
+"""Base protocol for LLM providers."""
+
+from typing import Any, Dict, Protocol
 
 
 class LLMProvider(Protocol):
+    """Protocol defining the interface for LLM providers.
+
+    Any class implementing this protocol must provide async methods
+    for listing models and generating chat completions.
+    """
+
     async def list_models(self) -> Dict[str, Any]:
+        """List available models.
+
+        Returns:
+            Dictionary containing model information.
+        """
         ...
 
     async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Any:
+        """Generate chat completion.
+
+        Args:
+            payload: Request payload containing messages and parameters
+            stream: Whether to stream the response
+
+        Returns:
+            Chat completion response (varies by implementation)
+        """
         ...
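
Because LLMProvider is a typing.Protocol, conformance is structural rather than nominal. A hypothetical stub like the following type-checks without subclassing it:

from typing import Any, Dict

from app.providers.base import LLMProvider


class EchoProvider:
    """Hypothetical provider used only to illustrate structural typing."""

    async def list_models(self) -> Dict[str, Any]:
        return {"object": "list", "data": [{"id": "echo", "object": "model"}]}

    async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Any:
        # Echo the last user message back in OpenAI response shape
        last = payload["messages"][-1]["content"]
        return {"choices": [{"message": {"role": "assistant", "content": last}}]}


provider: LLMProvider = EchoProvider()  # accepted: method signatures match the Protocol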
 
app/services/chat_service.py CHANGED
@@ -1,13 +1,33 @@
-from typing import Any, Dict
+"""Chat service layer providing abstraction over the provider."""
+from typing import Any, Dict, Union, AsyncIterator
 
 from app.providers import transformers_provider as provider
 
 
 async def list_models() -> Dict[str, Any]:
+    """
+    List available models.
+
+    Returns:
+        Dictionary containing model list in OpenAI-compatible format
+    """
     return await provider.list_models()
 
 
-async def chat(payload: Dict[str, Any], stream: bool = False):
+async def chat(
+    payload: Dict[str, Any],
+    stream: bool = False
+) -> Union[Dict[str, Any], AsyncIterator[str]]:
+    """
+    Process chat completion request.
+
+    Args:
+        payload: Request payload containing messages and generation parameters
+        stream: Whether to stream the response
+
+    Returns:
+        Response dictionary or async iterator for streaming
+    """
     return await provider.chat(payload, stream=stream)
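
Usage sketch for the annotated service layer. The payload mirrors the OpenAI chat format; actually running it requires the transformers provider to have loaded the model:

import asyncio

from app.services import chat_service


async def main() -> None:
    payload = {
        "model": "DragonLLM/qwen3-8b-fin-v1.0",
        "messages": [{"role": "user", "content": "Define duration risk."}],
    }
    # stream=False returns a dict; stream=True returns an AsyncIterator[str],
    # per the new return annotation
    result = await chat_service.chat(payload, stream=False)
    print(result)


asyncio.run(main())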
 
app/utils/constants.py CHANGED
@@ -1,18 +1,25 @@
-"""Application-wide constants."""
+"""Application-wide constants and configuration."""
 
 import os
+from typing import Final, List
+
 
 # Model configuration
-MODEL_NAME = "DragonLLM/qwen3-8b-fin-v1.0"
+MODEL_NAME: Final[str] = "DragonLLM/qwen3-8b-fin-v1.0"
 
 # Cache directory - respect HF_HOME if set, otherwise use default
-CACHE_DIR = os.getenv("HF_HOME", "/tmp/huggingface")
+CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
 
 # Hugging Face token environment variable priority order
-HF_TOKEN_VARS = ["HF_TOKEN_LC2", "HF_TOKEN_LC", "HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"]
+HF_TOKEN_VARS: Final[List[str]] = [
+    "HF_TOKEN_LC2",
+    "HF_TOKEN_LC",
+    "HF_TOKEN",
+    "HUGGING_FACE_HUB_TOKEN"
+]
 
 # French language detection patterns
-FRENCH_PHRASES = [
+FRENCH_PHRASES: Final[List[str]] = [
     "en français",
     "répondez en français",
     "réponse française",
@@ -20,9 +27,11 @@ FRENCH_PHRASES = [
     "expliquez en français",
 ]
 
-FRENCH_CHARS = ["é", "è", "ê", "à", "ç", "ù", "ô", "î", "â", "û", "ë", "ï"]
+FRENCH_CHARS: Final[List[str]] = [
+    "é", "è", "ê", "à", "ç", "ù", "ô", "î", "â", "û", "ë", "ï"
+]
 
-FRENCH_PATTERNS = [
+FRENCH_PATTERNS: Final[List[str]] = [
     "qu'est-ce",
     "qu'est",
     "expliquez",
@@ -38,7 +47,7 @@ FRENCH_PATTERNS = [
     "définissez",
 ]
 
-FRENCH_SYSTEM_PROMPT = (
+FRENCH_SYSTEM_PROMPT: Final[str] = (
     "Vous êtes un assistant financier expert. "
     "Répondez TOUJOURS en français. "
     "Soyez concis et précis dans vos explications. "
@@ -46,15 +55,15 @@ FRENCH_SYSTEM_PROMPT = (
 )
 
 # Qwen3 EOS tokens
-EOS_TOKENS = [151645, 151643]  # [<|im_end|>, <|endoftext|>]
-PAD_TOKEN_ID = 151643  # <|endoftext|>
+EOS_TOKENS: Final[List[int]] = [151645, 151643]  # [<|im_end|>, <|endoftext|>]
+PAD_TOKEN_ID: Final[int] = 151643  # <|endoftext|>
 
 # Generation defaults
-DEFAULT_MAX_TOKENS = 1000  # Increased for complete answers with concise reasoning
-DEFAULT_TEMPERATURE = 0.7
-DEFAULT_TOP_P = 1.0
-DEFAULT_TOP_K = 20
-REPETITION_PENALTY = 1.05
+DEFAULT_MAX_TOKENS: Final[int] = 1000  # Increased for complete answers with concise reasoning
+DEFAULT_TEMPERATURE: Final[float] = 0.7
+DEFAULT_TOP_P: Final[float] = 1.0
+DEFAULT_TOP_K: Final[int] = 20
+REPETITION_PENALTY: Final[float] = 1.05
 
 # Model initialization constants
 MODEL_INIT_TIMEOUT_SECONDS = 300  # 5 minutes timeout for model initialization
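
What the Final annotations buy: a type checker now rejects rebinding these names, though it does not freeze container contents. A minimal illustration (mypy message paraphrased):

from app.utils.constants import DEFAULT_TEMPERATURE, EOS_TOKENS

DEFAULT_TEMPERATURE = 0.9  # mypy: Cannot assign to final name "DEFAULT_TEMPERATURE"

# Final does not deep-freeze containers, so this still passes the type checker;
# the lists remain mutable and are read-only by convention only
EOS_TOKENS.append(0)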
app/utils/helpers.py CHANGED
@@ -2,7 +2,7 @@
 
 import os
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List, Dict, Any
 
 from app.utils.constants import HF_TOKEN_VARS, FRENCH_PHRASES, FRENCH_CHARS, FRENCH_PATTERNS
 
@@ -24,7 +24,7 @@ def get_hf_token() -> Tuple[Optional[str], str]:
     return None, "none"
 
 
-def is_french_request(messages: list) -> bool:
+def is_french_request(messages: List[Dict[str, Any]]) -> bool:
     """
     Detect if the request is in French based on user messages.
 
@@ -55,7 +55,7 @@ def is_french_request(messages: list) -> bool:
     return False
 
 
-def has_french_system_prompt(messages: list) -> bool:
+def has_french_system_prompt(messages: List[Dict[str, Any]]) -> bool:
     """Check if messages already contain a French system prompt."""
     return any(
         "français" in msg.get("content", "").lower()
app/utils/memory.py CHANGED
@@ -1,12 +1,23 @@
 """GPU memory management utilities."""
 
 import gc
+from typing import Optional, Any
+
 import torch
-from typing import Optional
 
 
-def clear_gpu_memory(model=None, tokenizer=None):
-    """Clear GPU memory completely."""
+def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
+    """Clear GPU memory completely.
+
+    This function performs aggressive GPU memory cleanup by:
+    1. Deleting model and tokenizer objects if provided
+    2. Clearing CUDA cache
+    3. Running multiple garbage collection passes
+
+    Args:
+        model: Optional model object to delete
+        tokenizer: Optional tokenizer object to delete
+    """
     if not torch.cuda.is_available():
         return
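
Typical usage of the annotated helper, as a sketch: the caller should drop its own references first, since the function can only delete the objects it is handed, and the early return makes it safe on CPU-only machines:

import torch

from app.utils.memory import clear_gpu_memory

# No-op without CUDA, thanks to the early return
clear_gpu_memory()

if torch.cuda.is_available():
    tensor = torch.empty(1024, 1024, device="cuda")
    del tensor               # drop our own reference first; the helper can only
    clear_gpu_memory()       # delete what it is handed, then gc + empty_cache
    print(torch.cuda.memory_reserved())  # cached blocks released back to the driver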