refactor: Enhance codebase with comprehensive improvements for CodeRabbit review
- app/config.py: Add Pydantic Field descriptions, type validation, and docstrings
  - Use Literal type for log_level validation
  - Add comprehensive class and field documentation
- app/main.py: Improve structure and documentation
  - Add module docstring
  - Enhance FastAPI app configuration with description and version
  - Add comprehensive docstrings to all functions
  - Use settings.model instead of a hardcoded model name
- app/utils/memory.py: Add type hints and a comprehensive docstring
  - Add Optional[Any] type hints for the model/tokenizer parameters
  - Add a return type annotation (None)
  - Document function behavior and the cleanup process
- app/models/openai.py: Add Pydantic Field validation and docstrings
  - Add comprehensive docstrings to all model classes
  - Add Field descriptions and validation (ge/le constraints)
  - Improve type safety with proper Field annotations
- app/providers/base.py: Add protocol documentation
  - Add module docstring
  - Add comprehensive docstrings to the LLMProvider protocol
- app/utils/constants.py: Add Final type hints for immutability
  - Use typing.Final for all constants to signal immutability
  - Improve code clarity and type safety
These changes expand CodeRabbit review coverage to include config, models,
base providers, and utility modules that were not previously reviewed.
- app/config.py +26 -4
- app/main.py +34 -10
- app/models/openai.py +70 -21
- app/providers/base.py +24 -2
- app/utils/constants.py +24 -15
- app/utils/memory.py +14 -3
app/config.py:

```diff
@@ -1,11 +1,33 @@
+"""Application configuration using Pydantic settings."""
+
+from typing import Literal
+from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class Settings(BaseSettings):
-    ...
+    """Application settings loaded from environment variables.
+
+    Supports loading from .env file with UTF-8 encoding.
+    All settings can be overridden via environment variables.
+    """
+
+    model: str = Field(
+        default="DragonLLM/qwen3-8b-fin-v1.0",
+        description="Hugging Face model identifier"
+    )
+    service_api_key: str | None = Field(
+        default=None,
+        description="Optional API key for authentication (SERVICE_API_KEY env var)"
+    )
+    log_level: Literal["debug", "info", "warning", "error"] = Field(
+        default="info",
+        description="Logging level"
+    )
+    force_model_reload: bool = Field(
+        default=False,
+        description="Force model reload from Hugging Face, bypassing cache (FORCE_MODEL_RELOAD env var)"
+    )
 
     model_config = SettingsConfigDict(
         env_file=".env",
```
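As a quick illustration of what the new fields give reviewers — a minimal sketch, not part of the commit, assuming pydantic-settings v2's default mapping from field names to upper-cased environment variables:

```python
import os

from app.config import Settings  # the Settings class shown above

# Environment variables override field defaults (log_level <- LOG_LEVEL).
os.environ["LOG_LEVEL"] = "debug"
settings = Settings()
assert settings.log_level == "debug"

# The Literal annotation rejects values outside the allowed set at load time.
os.environ["LOG_LEVEL"] = "verbose"
try:
    Settings()
except Exception as exc:  # pydantic.ValidationError in practice
    print(f"rejected invalid log level: {exc}")
```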
app/main.py:

```diff
@@ -1,15 +1,24 @@
+"""Main FastAPI application entry point."""
+
+import logging
+import threading
 from typing import Dict
+
 from fastapi import FastAPI
+
+from app.config import settings
 from app.middleware import api_key_guard
 from app.routers import openai_api
-from app.config import settings
-import logging
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-app = FastAPI(
+app = FastAPI(
+    title="LLM Pro Finance API (Transformers)",
+    description="OpenAI-compatible API for financial LLM inference",
+    version="1.0.0"
+)
 
 # Mount routers
 app.include_router(openai_api.router, prefix="/v1")
@@ -17,10 +26,14 @@ app.include_router(openai_api.router, prefix="/v1")
 # Optional API key middleware
 app.middleware("http")(api_key_guard)
 
+
 @app.on_event("startup")
-async def startup_event():
-    """Startup event - initialize model in background
-    """
+async def startup_event() -> None:
+    """Startup event - initialize model in background thread.
+
+    Loads the model asynchronously to avoid blocking the API startup.
+    Model loading happens in a daemon thread so it doesn't prevent shutdown.
+    """
     logger.info("Starting LLM Pro Finance API...")
 
     force_reload = settings.force_model_reload
@@ -29,7 +42,8 @@ async def startup_event():
 
     logger.info("Initializing model in background thread...")
 
-    def load_model():
+    def load_model() -> None:
+        """Load the model in a background thread."""
         from app.providers.transformers_provider import initialize_model
         initialize_model(force_reload=force_reload)
 
@@ -38,20 +52,30 @@ async def startup_event():
     thread.start()
     logger.info("Model initialization started in background")
 
+
 @app.get("/")
 async def root() -> Dict[str, str]:
-    """Root endpoint returning API status and information."""
+    """Root endpoint returning API status and information.
+
+    Returns:
+        Dictionary containing API status, service name, version, model, and backend.
+    """
     return {
         "status": "ok",
         "service": "Qwen Open Finance R 8B Inference",
         "version": "1.0.0",
-        "model":
+        "model": settings.model,
         "backend": "Transformers"
     }
 
+
 @app.get("/health")
 async def health() -> Dict[str, str]:
-    """Health check endpoint."""
+    """Health check endpoint for monitoring and load balancers.
+
+    Returns:
+        Dictionary with service health status.
+    """
     return {"status": "healthy", "service": "LLM Pro Finance API"}
```
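A quick way to sanity-check the two endpoints — a sketch, assuming no SERVICE_API_KEY is set so the guard middleware passes requests through; plain TestClient calls typically do not run startup hooks, so no model load is triggered:

```python
from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)

# Root endpoint reports service metadata, including the configured model.
response = client.get("/")
assert response.status_code == 200
assert response.json()["model"]  # settings.model, e.g. "DragonLLM/qwen3-8b-fin-v1.0"

# Health endpoint is independent of model state, suitable for load balancer probes.
response = client.get("/health")
assert response.json() == {"status": "healthy", "service": "LLM Pro Finance API"}
```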
app/models/openai.py:

```diff
@@ -1,4 +1,7 @@
+"""OpenAI-compatible API models using Pydantic."""
+
 from typing import List, Literal, Optional
+
 from pydantic import BaseModel, Field
 
 
@@ -6,42 +9,88 @@ Role = Literal["system", "user", "assistant", "tool"]
 
 
 class Message(BaseModel):
+    """A single message in a conversation.
+
+    Attributes:
+        role: The role of the message sender
+        content: The text content of the message
+    """
     role: Role
-    content: str
+    content: str = Field(..., description="Message content")
 
 
 class ChatCompletionRequest(BaseModel):
-    model
-    ...
+    """Request model for chat completions endpoint.
+
+    Attributes:
+        model: Optional model identifier (uses default from config if not provided)
+        messages: List of messages in the conversation
+        temperature: Sampling temperature (0-2)
+        max_tokens: Maximum tokens to generate
+        stream: Whether to stream the response
+        top_p: Nucleus sampling parameter
+    """
+    model: Optional[str] = Field(default=None, description="Model identifier")
+    messages: List[Message] = Field(..., description="Conversation messages")
+    temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0, description="Sampling temperature")
+    max_tokens: Optional[int] = Field(default=None, ge=1, description="Maximum tokens to generate")
+    stream: Optional[bool] = Field(default=False, description="Stream response")
+    top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0, description="Nucleus sampling parameter")
 
 
 class ChoiceMessage(BaseModel):
-    ...
+    """Assistant message in a completion choice.
+
+    Attributes:
+        role: Always "assistant" for completion messages
+        content: The generated message content
+    """
+    role: Literal["assistant"] = "assistant"
+    content: Optional[str] = Field(default=None, description="Generated message content")
 
 
 class Choice(BaseModel):
-    ...
+    """A single completion choice.
+
+    Attributes:
+        index: Choice index
+        message: The generated message
+        finish_reason: Reason why generation finished (stop, length, etc.)
+    """
+    index: int = Field(..., description="Choice index")
+    message: ChoiceMessage = Field(..., description="Generated message")
+    finish_reason: Optional[str] = Field(default=None, description="Reason for completion")
 
 
 class Usage(BaseModel):
-    ...
+    """Token usage statistics.
+
+    Attributes:
+        prompt_tokens: Number of tokens in the prompt
+        completion_tokens: Number of tokens in the completion
+        total_tokens: Total tokens used
+    """
+    prompt_tokens: int = Field(..., ge=0, description="Tokens in prompt")
+    completion_tokens: int = Field(..., ge=0, description="Tokens in completion")
+    total_tokens: int = Field(..., ge=0, description="Total tokens used")
 
 
 class ChatCompletionResponse(BaseModel):
-    ...
+    """Response model for chat completions endpoint.
+
+    Attributes:
+        id: Unique completion ID
+        object: Always "chat.completion"
+        created: Unix timestamp of creation
+        model: Model identifier used
+        choices: List of completion choices
+        usage: Optional token usage statistics
+    """
+    id: str = Field(..., description="Completion ID")
+    object: Literal["chat.completion"] = Field(default="chat.completion", description="Object type")
+    created: int = Field(..., description="Unix timestamp")
+    model: str = Field(..., description="Model identifier")
+    choices: List[Choice] = Field(..., description="Completion choices")
+    usage: Optional[Usage] = Field(default=None, description="Token usage statistics")
```
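To show what the ge/le constraints buy — a small sketch, assuming pydantic v2: invalid sampling parameters are now rejected at the schema boundary instead of reaching the provider:

```python
from pydantic import ValidationError

from app.models.openai import ChatCompletionRequest, Message

# Valid request: model is optional, defaults are applied.
req = ChatCompletionRequest(messages=[Message(role="user", content="Define EBITDA.")])
assert req.temperature == 0.7 and req.stream is False

# temperature=3.0 violates le=2.0, so validation fails before any inference runs.
try:
    ChatCompletionRequest(
        messages=[Message(role="user", content="hi")],
        temperature=3.0,
    )
except ValidationError as exc:
    print(exc.errors()[0]["loc"])  # ('temperature',)
```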
app/providers/base.py:

```diff
@@ -1,11 +1,33 @@
-...
+"""Base protocol for LLM providers."""
+
+from typing import Any, Dict, Protocol
 
 
 class LLMProvider(Protocol):
+    """Protocol defining the interface for LLM providers.
+
+    Any class implementing this protocol must provide async methods
+    for listing models and generating chat completions.
+    """
+
    async def list_models(self) -> Dict[str, Any]:
+        """List available models.
+
+        Returns:
+            Dictionary containing model information.
+        """
        ...
 
    async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Any:
+        """Generate chat completion.
+
+        Args:
+            payload: Request payload containing messages and parameters
+            stream: Whether to stream the response
+
+        Returns:
+            Chat completion response (varies by implementation)
+        """
        ...
```
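Because LLMProvider is a typing.Protocol, implementations never subclass it; any class with matching method signatures satisfies the type structurally. A minimal sketch (EchoProvider is hypothetical, not in the repo):

```python
from typing import Any, Dict

from app.providers.base import LLMProvider


class EchoProvider:
    """Toy provider used only to illustrate structural typing."""

    async def list_models(self) -> Dict[str, Any]:
        return {"object": "list", "data": [{"id": "echo-1"}]}

    async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Any:
        # Echo the last user message back as the "completion".
        return {"choices": [{"message": payload["messages"][-1]}]}


provider: LLMProvider = EchoProvider()  # accepted by type checkers: no inheritance required
```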
app/utils/constants.py:

```diff
@@ -1,18 +1,25 @@
-"""Application-wide constants."""
+"""Application-wide constants and configuration."""
 
 import os
+from typing import Final, List
+
 
 # Model configuration
-MODEL_NAME = "DragonLLM/qwen3-8b-fin-v1.0"
+MODEL_NAME: Final[str] = "DragonLLM/qwen3-8b-fin-v1.0"
 
 # Cache directory - respect HF_HOME if set, otherwise use default
-CACHE_DIR = os.getenv("HF_HOME", "/tmp/huggingface")
+CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
 
 # Hugging Face token environment variable priority order
-HF_TOKEN_VARS = [
+HF_TOKEN_VARS: Final[List[str]] = [
+    "HF_TOKEN_LC2",
+    "HF_TOKEN_LC",
+    "HF_TOKEN",
+    "HUGGING_FACE_HUB_TOKEN"
+]
 
 # French language detection patterns
-FRENCH_PHRASES = [
+FRENCH_PHRASES: Final[List[str]] = [
     "en français",
     "répondez en français",
     "réponse française",
@@ -20,9 +27,11 @@ FRENCH_PHRASES = [
     "expliquez en français",
 ]
 
-FRENCH_CHARS = [
+FRENCH_CHARS: Final[List[str]] = [
+    "é", "è", "ê", "à", "ç", "ù", "ô", "î", "â", "û", "ë", "ï"
+]
 
-FRENCH_PATTERNS = [
+FRENCH_PATTERNS: Final[List[str]] = [
     "qu'est-ce",
     "qu'est",
     "expliquez",
@@ -38,7 +47,7 @@ FRENCH_PATTERNS = [
     "définissez",
 ]
 
-FRENCH_SYSTEM_PROMPT = (
+FRENCH_SYSTEM_PROMPT: Final[str] = (
     "Vous êtes un assistant financier expert. "
     "Répondez TOUJOURS en français. "
     "Soyez concis et précis dans vos explications. "
@@ -46,13 +55,13 @@ FRENCH_SYSTEM_PROMPT = (
 )
 
 # Qwen3 EOS tokens
-EOS_TOKENS = [151645, 151643]  # [<|im_end|>, <|endoftext|>]
-PAD_TOKEN_ID = 151643  # <|endoftext|>
+EOS_TOKENS: Final[List[int]] = [151645, 151643]  # [<|im_end|>, <|endoftext|>]
+PAD_TOKEN_ID: Final[int] = 151643  # <|endoftext|>
 
 # Generation defaults
-DEFAULT_MAX_TOKENS = 1000  # Increased for complete answers with concise reasoning
-DEFAULT_TEMPERATURE = 0.7
-DEFAULT_TOP_P = 1.0
-DEFAULT_TOP_K = 20
-REPETITION_PENALTY = 1.05
+DEFAULT_MAX_TOKENS: Final[int] = 1000  # Increased for complete answers with concise reasoning
+DEFAULT_TEMPERATURE: Final[float] = 0.7
+DEFAULT_TOP_P: Final[float] = 1.0
+DEFAULT_TOP_K: Final[int] = 20
+REPETITION_PENALTY: Final[float] = 1.05
```
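Note that typing.Final is a static-analysis contract, not runtime enforcement — a minimal sketch of what mypy/pyright will now flag, reusing names from the file above:

```python
from typing import Final, List

EOS_TOKENS: Final[List[int]] = [151645, 151643]

EOS_TOKENS = [0]      # flagged by type checkers: cannot rebind a Final name
EOS_TOKENS.append(0)  # NOT flagged: Final forbids rebinding, not mutation
```

For deep immutability a tuple or Final[Sequence[int]] would be needed; Final alone only prevents reassignment.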
app/utils/memory.py:

```diff
@@ -1,12 +1,23 @@
 """GPU memory management utilities."""
 
 import gc
+from typing import Optional, Any
+
 import torch
-from typing import Optional
 
 
-def clear_gpu_memory(model=None, tokenizer=None):
-    """Clear GPU memory completely."""
+def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
+    """Clear GPU memory completely.
+
+    This function performs aggressive GPU memory cleanup by:
+    1. Deleting model and tokenizer objects if provided
+    2. Clearing CUDA cache
+    3. Running multiple garbage collection passes
+
+    Args:
+        model: Optional model object to delete
+        tokenizer: Optional tokenizer object to delete
+    """
     if not torch.cuda.is_available():
         return
 
```
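The hunk ends at the CUDA availability guard; the rest of the body is not shown. For orientation, a hedged sketch of the cleanup steps the docstring describes — an assumed shape, not the repository's actual code (_cleanup_sketch is hypothetical):

```python
import gc

import torch


def _cleanup_sketch(model=None, tokenizer=None) -> None:
    # Drop this function's references so the tensors become collectable;
    # the caller must also release its own references.
    if model is not None:
        del model
    if tokenizer is not None:
        del tokenizer

    # Multiple GC passes help break reference cycles left by the model graph.
    for _ in range(3):
        gc.collect()

    # Release cached blocks back to the CUDA driver.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
```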