Commit
·
9db586c
1
Parent(s):
192844a
Fix model ID and improve memory management
Browse files- Replace non-existent model ID 'DragonLLM/qwen3-8b-fin-v1.0' with 'DragonLLM/Qwen-Open-Finance-R-8B'
- Update all references across codebase (38 occurrences in 11 files)
- Fix memory management: remove ineffective del statements that only cleared local references
- Update clear_gpu_memory() to focus on CUDA cache clearing and document caller responsibility
- Update docstrings to clarify that callers must set references to None for proper cleanup
- README.md +8 -8
- app/config.py +1 -1
- app/main.py +35 -4
- app/middleware.py +6 -2
- app/providers/transformers_provider.py +12 -1
- app/utils/constants.py +1 -1
- app/utils/memory.py +13 -19
- docs/qwen3_specifications.md +1 -1
- pydanticai_app/config.py +1 -1
- test_tool_calls.py +3 -3
- tests/performance/benchmark.py +7 -7
- tests/performance/test_inference_speed.py +6 -6
- tests/performance/test_openai_compatibility.py +8 -8
- tests/test_config.py +1 -1
README.md
CHANGED
|
@@ -11,7 +11,7 @@ suggested_hardware: l4x1
|
|
| 11 |
|
| 12 |
# Open Finance LLM 8B
|
| 13 |
|
| 14 |
-
OpenAI-compatible API powered by DragonLLM/qwen3-8b-fin-v1.0 using Transformers.
|
| 15 |
|
| 16 |
## Overview
|
| 17 |
|
|
@@ -39,7 +39,7 @@ curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/models"
|
|
| 39 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 40 |
-H "Content-Type: application/json" \
|
| 41 |
-d '{
|
| 42 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 43 |
"messages": [{"role": "user", "content": "What is compound interest?"}],
|
| 44 |
"temperature": 0.7,
|
| 45 |
"max_tokens": 500
|
|
@@ -51,7 +51,7 @@ curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completio
|
|
| 51 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 52 |
-H "Content-Type: application/json" \
|
| 53 |
-d '{
|
| 54 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 55 |
"messages": [{"role": "user", "content": "Explain Value at Risk"}],
|
| 56 |
"stream": true
|
| 57 |
}'
|
|
@@ -84,7 +84,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
|
|
| 84 |
- `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
|
| 85 |
|
| 86 |
**Optional:**
|
| 87 |
-
- `MODEL` - Model name (default: DragonLLM/qwen3-8b-fin-v1.0)
|
| 88 |
- `SERVICE_API_KEY` - API key for authentication
|
| 89 |
- `LOG_LEVEL` - Logging level (default: info)
|
| 90 |
- `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
|
|
@@ -92,7 +92,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
|
|
| 92 |
|
| 93 |
Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
|
| 94 |
|
| 95 |
-
**Note:** Accept model terms at https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0 before use.
|
| 96 |
|
| 97 |
## Integration
|
| 98 |
|
|
@@ -122,7 +122,7 @@ client = OpenAI(
|
|
| 122 |
)
|
| 123 |
|
| 124 |
response = client.chat.completions.create(
|
| 125 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 126 |
messages=[{"role": "user", "content": "What is compound interest?"}],
|
| 127 |
max_tokens=500
|
| 128 |
)
|
|
@@ -134,7 +134,7 @@ response = client.chat.completions.create(
|
|
| 134 |
import dspy
|
| 135 |
|
| 136 |
lm = dspy.OpenAI(
|
| 137 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 138 |
api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
|
| 139 |
)
|
| 140 |
```
|
|
@@ -142,7 +142,7 @@ lm = dspy.OpenAI(
|
|
| 142 |
## Technical Specifications
|
| 143 |
|
| 144 |
**Model:**
|
| 145 |
-
- DragonLLM/qwen3-8b-fin-v1.0 (8B parameters)
|
| 146 |
- Fine-tuned on financial data
|
| 147 |
- English and French support
|
| 148 |
|
|
|
|
| 11 |
|
| 12 |
# Open Finance LLM 8B
|
| 13 |
|
| 14 |
+
OpenAI-compatible API powered by DragonLLM/Qwen-Open-Finance-R-8B using Transformers.
|
| 15 |
|
| 16 |
## Overview
|
| 17 |
|
|
|
|
| 39 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 40 |
-H "Content-Type: application/json" \
|
| 41 |
-d '{
|
| 42 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 43 |
"messages": [{"role": "user", "content": "What is compound interest?"}],
|
| 44 |
"temperature": 0.7,
|
| 45 |
"max_tokens": 500
|
|
|
|
| 51 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 52 |
-H "Content-Type: application/json" \
|
| 53 |
-d '{
|
| 54 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 55 |
"messages": [{"role": "user", "content": "Explain Value at Risk"}],
|
| 56 |
"stream": true
|
| 57 |
}'
|
|
|
|
| 84 |
- `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
|
| 85 |
|
| 86 |
**Optional:**
|
| 87 |
+
- `MODEL` - Model name (default: DragonLLM/Qwen-Open-Finance-R-8B)
|
| 88 |
- `SERVICE_API_KEY` - API key for authentication
|
| 89 |
- `LOG_LEVEL` - Logging level (default: info)
|
| 90 |
- `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
|
|
|
|
| 92 |
|
| 93 |
Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
|
| 94 |
|
| 95 |
+
**Note:** Accept model terms at https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B before use.
|
| 96 |
|
| 97 |
## Integration
|
| 98 |
|
|
|
|
| 122 |
)
|
| 123 |
|
| 124 |
response = client.chat.completions.create(
|
| 125 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 126 |
messages=[{"role": "user", "content": "What is compound interest?"}],
|
| 127 |
max_tokens=500
|
| 128 |
)
|
|
|
|
| 134 |
import dspy
|
| 135 |
|
| 136 |
lm = dspy.OpenAI(
|
| 137 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 138 |
api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
|
| 139 |
)
|
| 140 |
```
|
|
|
|
| 142 |
## Technical Specifications
|
| 143 |
|
| 144 |
**Model:**
|
| 145 |
+
- DragonLLM/Qwen-Open-Finance-R-8B (8B parameters)
|
| 146 |
- Fine-tuned on financial data
|
| 147 |
- English and French support
|
| 148 |
|
app/config.py
CHANGED
|
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
model: str = Field(
|
| 16 |
-
default="DragonLLM/qwen3-8b-fin-v1.0",
|
| 17 |
description="Hugging Face model identifier"
|
| 18 |
)
|
| 19 |
service_api_key: str | None = Field(
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
model: str = Field(
|
| 16 |
+
default="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 17 |
description="Hugging Face model identifier"
|
| 18 |
)
|
| 19 |
service_api_key: str | None = Field(
|
app/main.py
CHANGED
|
@@ -4,7 +4,8 @@ import logging
|
|
| 4 |
import threading
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
-
from fastapi import FastAPI
|
|
|
|
| 8 |
|
| 9 |
from app.config import settings
|
| 10 |
from app.middleware import api_key_guard
|
|
@@ -71,11 +72,41 @@ async def root() -> Dict[str, str]:
|
|
| 71 |
|
| 72 |
@app.get("/health")
|
| 73 |
async def health() -> Dict[str, str]:
|
| 74 |
-
"""
|
| 75 |
|
| 76 |
Returns:
|
| 77 |
-
Dictionary
|
| 78 |
"""
|
| 79 |
-
return {"status": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
|
|
|
| 4 |
import threading
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
+
from fastapi import FastAPI, status
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
|
| 10 |
from app.config import settings
|
| 11 |
from app.middleware import api_key_guard
|
|
|
|
| 72 |
|
| 73 |
@app.get("/health")
|
| 74 |
async def health() -> Dict[str, str]:
|
| 75 |
+
"""Liveness check endpoint for monitoring and load balancers.
|
| 76 |
|
| 77 |
Returns:
|
| 78 |
+
Dictionary indicating the service is alive.
|
| 79 |
"""
|
| 80 |
+
return {"status": "service alive", "service": "LLM Pro Finance API"}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@app.get("/ready")
|
| 84 |
+
async def ready() -> JSONResponse:
|
| 85 |
+
"""Readiness check endpoint for orchestrators and load balancers.
|
| 86 |
+
|
| 87 |
+
Checks if the model is loaded and ready to handle requests.
|
| 88 |
+
Returns 503 Service Unavailable if the model is not ready.
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
JSONResponse with ready/model_loaded fields and appropriate status code.
|
| 92 |
+
"""
|
| 93 |
+
from app.providers.transformers_provider import is_model_ready
|
| 94 |
+
|
| 95 |
+
model_loaded = is_model_ready()
|
| 96 |
+
ready_status = model_loaded
|
| 97 |
+
|
| 98 |
+
response_data = {
|
| 99 |
+
"ready": ready_status,
|
| 100 |
+
"model_loaded": model_loaded,
|
| 101 |
+
"service": "LLM Pro Finance API"
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
if ready_status:
|
| 105 |
+
return JSONResponse(content=response_data, status_code=status.HTTP_200_OK)
|
| 106 |
+
else:
|
| 107 |
+
return JSONResponse(
|
| 108 |
+
content=response_data,
|
| 109 |
+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE
|
| 110 |
+
)
|
| 111 |
|
| 112 |
|
app/middleware.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from fastapi import Request
|
| 2 |
from fastapi.responses import JSONResponse, Response
|
| 3 |
from typing import Callable, Awaitable, Union
|
|
@@ -35,8 +36,11 @@ async def api_key_guard(request: Request, call_next: Callable[[Request], Awaitab
|
|
| 35 |
if auth_header.startswith("Bearer "):
|
| 36 |
api_key = auth_header.replace("Bearer ", "").strip()
|
| 37 |
|
| 38 |
-
if api_key
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
return JSONResponse(
|
| 42 |
content={"error": {"message": "unauthorized", "type": "authentication_error"}},
|
|
|
|
| 1 |
+
import hmac
|
| 2 |
from fastapi import Request
|
| 3 |
from fastapi.responses import JSONResponse, Response
|
| 4 |
from typing import Callable, Awaitable, Union
|
|
|
|
| 36 |
if auth_header.startswith("Bearer "):
|
| 37 |
api_key = auth_header.replace("Bearer ", "").strip()
|
| 38 |
|
| 39 |
+
if api_key:
|
| 40 |
+
# Use constant-time comparison to prevent timing attacks
|
| 41 |
+
expected_key = str(settings.service_api_key) if settings.service_api_key else ""
|
| 42 |
+
if hmac.compare_digest(str(api_key), expected_key):
|
| 43 |
+
return await call_next(request)
|
| 44 |
|
| 45 |
return JSONResponse(
|
| 46 |
content={"error": {"message": "unauthorized", "type": "authentication_error"}},
|
app/providers/transformers_provider.py
CHANGED
|
@@ -186,7 +186,7 @@ def initialize_model(force_reload: bool = False):
|
|
| 186 |
if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
|
| 187 |
print("\nAuthentication Error Detected!")
|
| 188 |
print("1. Ensure HF_TOKEN_LC2 is set in your environment")
|
| 189 |
-
print("2. Accept model terms at: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
|
| 190 |
print("3. Verify token has access to DragonLLM models")
|
| 191 |
|
| 192 |
raise
|
|
@@ -559,6 +559,17 @@ class TransformersProvider:
|
|
| 559 |
_provider = TransformersProvider()
|
| 560 |
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
# Module-level functions for direct import
|
| 563 |
async def list_models() -> Dict[str, Any]:
|
| 564 |
"""List available models."""
|
|
|
|
| 186 |
if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
|
| 187 |
print("\nAuthentication Error Detected!")
|
| 188 |
print("1. Ensure HF_TOKEN_LC2 is set in your environment")
|
| 189 |
+
print("2. Accept model terms at: https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B")
|
| 190 |
print("3. Verify token has access to DragonLLM models")
|
| 191 |
|
| 192 |
raise
|
|
|
|
| 559 |
_provider = TransformersProvider()
|
| 560 |
|
| 561 |
|
| 562 |
+
def is_model_ready() -> bool:
|
| 563 |
+
"""
|
| 564 |
+
Thread-safe check if the model is loaded and ready for inference.
|
| 565 |
+
|
| 566 |
+
Returns:
|
| 567 |
+
True if model is initialized and loaded, False otherwise.
|
| 568 |
+
"""
|
| 569 |
+
with _init_lock:
|
| 570 |
+
return _initialized and model is not None and tokenizer is not None
|
| 571 |
+
|
| 572 |
+
|
| 573 |
# Module-level functions for direct import
|
| 574 |
async def list_models() -> Dict[str, Any]:
|
| 575 |
"""List available models."""
|
app/utils/constants.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Final, List
|
|
| 5 |
|
| 6 |
|
| 7 |
# Model configuration
|
| 8 |
-
MODEL_NAME: Final[str] = "DragonLLM/qwen3-8b-fin-v1.0"
|
| 9 |
|
| 10 |
# Cache directory - respect HF_HOME if set, otherwise use default
|
| 11 |
CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
# Model configuration
|
| 8 |
+
MODEL_NAME: Final[str] = "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 9 |
|
| 10 |
# Cache directory - respect HF_HOME if set, otherwise use default
|
| 11 |
CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
|
app/utils/memory.py
CHANGED
|
@@ -7,33 +7,27 @@ import torch
|
|
| 7 |
|
| 8 |
|
| 9 |
def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
|
| 10 |
-
"""Clear GPU memory
|
| 11 |
|
| 12 |
This function performs aggressive GPU memory cleanup by:
|
| 13 |
-
1.
|
| 14 |
-
2.
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
Args:
|
| 18 |
-
model: Optional model object to
|
| 19 |
-
tokenizer: Optional tokenizer object to
|
| 20 |
"""
|
| 21 |
if not torch.cuda.is_available():
|
| 22 |
return
|
| 23 |
|
| 24 |
-
# Delete model and tokenizer if provided
|
| 25 |
-
if model is not None:
|
| 26 |
-
try:
|
| 27 |
-
del model
|
| 28 |
-
except Exception:
|
| 29 |
-
pass
|
| 30 |
-
|
| 31 |
-
if tokenizer is not None:
|
| 32 |
-
try:
|
| 33 |
-
del tokenizer
|
| 34 |
-
except Exception:
|
| 35 |
-
pass
|
| 36 |
-
|
| 37 |
# Clear CUDA cache
|
| 38 |
torch.cuda.empty_cache()
|
| 39 |
torch.cuda.synchronize()
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
|
| 10 |
+
"""Clear GPU memory by emptying CUDA cache and running garbage collection.
|
| 11 |
|
| 12 |
This function performs aggressive GPU memory cleanup by:
|
| 13 |
+
1. Clearing CUDA cache
|
| 14 |
+
2. Running multiple garbage collection passes
|
| 15 |
+
|
| 16 |
+
Important: This function does NOT delete model or tokenizer objects.
|
| 17 |
+
The caller must set their references to None (e.g., `model = None`)
|
| 18 |
+
for the objects to be garbage collected and GPU memory to be freed.
|
| 19 |
+
|
| 20 |
+
The model and tokenizer parameters are accepted for API compatibility
|
| 21 |
+
but are not used internally. They serve as documentation that the caller
|
| 22 |
+
should clear their references after calling this function.
|
| 23 |
|
| 24 |
Args:
|
| 25 |
+
model: Optional model object (caller must set reference to None)
|
| 26 |
+
tokenizer: Optional tokenizer object (caller must set reference to None)
|
| 27 |
"""
|
| 28 |
if not torch.cuda.is_available():
|
| 29 |
return
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Clear CUDA cache
|
| 32 |
torch.cuda.empty_cache()
|
| 33 |
torch.cuda.synchronize()
|
docs/qwen3_specifications.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
## Fenêtre de contexte maximale
|
| 4 |
|
| 5 |
-
Le modèle **DragonLLM/qwen3-8b-fin-v1.0** (basé sur Qwen-3 8B) supporte:
|
| 6 |
|
| 7 |
### Fenêtre de base
|
| 8 |
- **32 768 tokens** (32K tokens)
|
|
|
|
| 2 |
|
| 3 |
## Fenêtre de contexte maximale
|
| 4 |
|
| 5 |
+
Le modèle **DragonLLM/Qwen-Open-Finance-R-8B** (basé sur Qwen-3 8B) supporte:
|
| 6 |
|
| 7 |
### Fenêtre de base
|
| 8 |
- **32 768 tokens** (32K tokens)
|
pydanticai_app/config.py
CHANGED
|
@@ -11,7 +11,7 @@ class Settings(BaseSettings):
|
|
| 11 |
|
| 12 |
# OpenAI-compatible API settings
|
| 13 |
api_key: str = "not-needed" # No authentication required
|
| 14 |
-
model_name: str = "DragonLLM/qwen3-8b-fin-v1.0"
|
| 15 |
|
| 16 |
# API configuration
|
| 17 |
timeout: float = 120.0
|
|
|
|
| 11 |
|
| 12 |
# OpenAI-compatible API settings
|
| 13 |
api_key: str = "not-needed" # No authentication required
|
| 14 |
+
model_name: str = "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 15 |
|
| 16 |
# API configuration
|
| 17 |
timeout: float = 120.0
|
test_tool_calls.py
CHANGED
|
@@ -56,7 +56,7 @@ def test_tool_calls_basic():
|
|
| 56 |
|
| 57 |
# Make request with tools
|
| 58 |
payload = {
|
| 59 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 60 |
"messages": [
|
| 61 |
{
|
| 62 |
"role": "user",
|
|
@@ -159,7 +159,7 @@ def test_tool_calls_multiple():
|
|
| 159 |
]
|
| 160 |
|
| 161 |
payload = {
|
| 162 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 163 |
"messages": [
|
| 164 |
{
|
| 165 |
"role": "user",
|
|
@@ -228,7 +228,7 @@ def test_tool_calls_format():
|
|
| 228 |
]
|
| 229 |
|
| 230 |
payload = {
|
| 231 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 232 |
"messages": [
|
| 233 |
{
|
| 234 |
"role": "user",
|
|
|
|
| 56 |
|
| 57 |
# Make request with tools
|
| 58 |
payload = {
|
| 59 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 60 |
"messages": [
|
| 61 |
{
|
| 62 |
"role": "user",
|
|
|
|
| 159 |
]
|
| 160 |
|
| 161 |
payload = {
|
| 162 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 163 |
"messages": [
|
| 164 |
{
|
| 165 |
"role": "user",
|
|
|
|
| 228 |
]
|
| 229 |
|
| 230 |
payload = {
|
| 231 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 232 |
"messages": [
|
| 233 |
{
|
| 234 |
"role": "user",
|
tests/performance/benchmark.py
CHANGED
|
@@ -39,7 +39,7 @@ class Benchmark:
|
|
| 39 |
tokens_per_sec = []
|
| 40 |
|
| 41 |
payload = {
|
| 42 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 43 |
"messages": [
|
| 44 |
{"role": "user", "content": "What is artificial intelligence?"}
|
| 45 |
],
|
|
@@ -91,7 +91,7 @@ class Benchmark:
|
|
| 91 |
|
| 92 |
async def make_request(request_id: int):
|
| 93 |
payload = {
|
| 94 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 95 |
"messages": [
|
| 96 |
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
|
| 97 |
],
|
|
@@ -155,7 +155,7 @@ class Benchmark:
|
|
| 155 |
|
| 156 |
for test_case in test_cases:
|
| 157 |
payload = {
|
| 158 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 159 |
"messages": [
|
| 160 |
{"role": "user", "content": "Write about the history of computing."}
|
| 161 |
],
|
|
@@ -231,7 +231,7 @@ class Benchmark:
|
|
| 231 |
# Test 3: System message
|
| 232 |
try:
|
| 233 |
payload = {
|
| 234 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 235 |
"messages": [
|
| 236 |
{"role": "system", "content": "Be helpful."},
|
| 237 |
{"role": "user", "content": "Hi"}
|
|
@@ -247,7 +247,7 @@ class Benchmark:
|
|
| 247 |
# Test 4: Conversation history
|
| 248 |
try:
|
| 249 |
payload = {
|
| 250 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 251 |
"messages": [
|
| 252 |
{"role": "user", "content": "My name is Alice"},
|
| 253 |
{"role": "assistant", "content": "Hello Alice"},
|
|
@@ -264,7 +264,7 @@ class Benchmark:
|
|
| 264 |
# Test 5: Temperature parameter
|
| 265 |
try:
|
| 266 |
payload = {
|
| 267 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 268 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 269 |
"temperature": 0.5
|
| 270 |
}
|
|
@@ -278,7 +278,7 @@ class Benchmark:
|
|
| 278 |
# Test 6: Max tokens parameter
|
| 279 |
try:
|
| 280 |
payload = {
|
| 281 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 282 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 283 |
"max_tokens": 10
|
| 284 |
}
|
|
|
|
| 39 |
tokens_per_sec = []
|
| 40 |
|
| 41 |
payload = {
|
| 42 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 43 |
"messages": [
|
| 44 |
{"role": "user", "content": "What is artificial intelligence?"}
|
| 45 |
],
|
|
|
|
| 91 |
|
| 92 |
async def make_request(request_id: int):
|
| 93 |
payload = {
|
| 94 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 95 |
"messages": [
|
| 96 |
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
|
| 97 |
],
|
|
|
|
| 155 |
|
| 156 |
for test_case in test_cases:
|
| 157 |
payload = {
|
| 158 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 159 |
"messages": [
|
| 160 |
{"role": "user", "content": "Write about the history of computing."}
|
| 161 |
],
|
|
|
|
| 231 |
# Test 3: System message
|
| 232 |
try:
|
| 233 |
payload = {
|
| 234 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 235 |
"messages": [
|
| 236 |
{"role": "system", "content": "Be helpful."},
|
| 237 |
{"role": "user", "content": "Hi"}
|
|
|
|
| 247 |
# Test 4: Conversation history
|
| 248 |
try:
|
| 249 |
payload = {
|
| 250 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 251 |
"messages": [
|
| 252 |
{"role": "user", "content": "My name is Alice"},
|
| 253 |
{"role": "assistant", "content": "Hello Alice"},
|
|
|
|
| 264 |
# Test 5: Temperature parameter
|
| 265 |
try:
|
| 266 |
payload = {
|
| 267 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 268 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 269 |
"temperature": 0.5
|
| 270 |
}
|
|
|
|
| 278 |
# Test 6: Max tokens parameter
|
| 279 |
try:
|
| 280 |
payload = {
|
| 281 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 282 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 283 |
"max_tokens": 10
|
| 284 |
}
|
tests/performance/test_inference_speed.py
CHANGED
|
@@ -20,7 +20,7 @@ def client():
|
|
| 20 |
async def test_single_request_latency(client):
|
| 21 |
"""Test latency for a single chat completion request"""
|
| 22 |
payload = {
|
| 23 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 24 |
"messages": [
|
| 25 |
{"role": "user", "content": "What is the capital of France?"}
|
| 26 |
],
|
|
@@ -66,7 +66,7 @@ async def test_token_throughput_various_lengths(client):
|
|
| 66 |
|
| 67 |
for test_case in test_cases:
|
| 68 |
payload = {
|
| 69 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 70 |
"messages": [{"role": "user", "content": test_case["prompt"]}],
|
| 71 |
"max_tokens": test_case["max_tokens"],
|
| 72 |
"temperature": 0.7
|
|
@@ -98,7 +98,7 @@ async def test_concurrent_requests(client):
|
|
| 98 |
|
| 99 |
async def make_request(request_id: int):
|
| 100 |
payload = {
|
| 101 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 102 |
"messages": [
|
| 103 |
{"role": "user", "content": f"Request {request_id}: What is 2+2?"}
|
| 104 |
],
|
|
@@ -142,7 +142,7 @@ async def test_concurrent_requests(client):
|
|
| 142 |
async def test_time_to_first_token(client):
|
| 143 |
"""Test time to first token (TTFT) using streaming"""
|
| 144 |
payload = {
|
| 145 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 146 |
"messages": [
|
| 147 |
{"role": "user", "content": "Count from 1 to 10."}
|
| 148 |
],
|
|
@@ -190,7 +190,7 @@ async def test_prompt_processing_speed(client):
|
|
| 190 |
|
| 191 |
for i, prompt in enumerate(prompts):
|
| 192 |
payload = {
|
| 193 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 194 |
"messages": [{"role": "user", "content": prompt}],
|
| 195 |
"max_tokens": 50,
|
| 196 |
"temperature": 0.7
|
|
@@ -221,7 +221,7 @@ async def test_temperature_variance(client):
|
|
| 221 |
|
| 222 |
for temp in temperatures:
|
| 223 |
payload = {
|
| 224 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 225 |
"messages": [{"role": "user", "content": prompt}],
|
| 226 |
"max_tokens": 50,
|
| 227 |
"temperature": temp
|
|
|
|
| 20 |
async def test_single_request_latency(client):
|
| 21 |
"""Test latency for a single chat completion request"""
|
| 22 |
payload = {
|
| 23 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 24 |
"messages": [
|
| 25 |
{"role": "user", "content": "What is the capital of France?"}
|
| 26 |
],
|
|
|
|
| 66 |
|
| 67 |
for test_case in test_cases:
|
| 68 |
payload = {
|
| 69 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 70 |
"messages": [{"role": "user", "content": test_case["prompt"]}],
|
| 71 |
"max_tokens": test_case["max_tokens"],
|
| 72 |
"temperature": 0.7
|
|
|
|
| 98 |
|
| 99 |
async def make_request(request_id: int):
|
| 100 |
payload = {
|
| 101 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 102 |
"messages": [
|
| 103 |
{"role": "user", "content": f"Request {request_id}: What is 2+2?"}
|
| 104 |
],
|
|
|
|
| 142 |
async def test_time_to_first_token(client):
|
| 143 |
"""Test time to first token (TTFT) using streaming"""
|
| 144 |
payload = {
|
| 145 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 146 |
"messages": [
|
| 147 |
{"role": "user", "content": "Count from 1 to 10."}
|
| 148 |
],
|
|
|
|
| 190 |
|
| 191 |
for i, prompt in enumerate(prompts):
|
| 192 |
payload = {
|
| 193 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 194 |
"messages": [{"role": "user", "content": prompt}],
|
| 195 |
"max_tokens": 50,
|
| 196 |
"temperature": 0.7
|
|
|
|
| 221 |
|
| 222 |
for temp in temperatures:
|
| 223 |
payload = {
|
| 224 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 225 |
"messages": [{"role": "user", "content": prompt}],
|
| 226 |
"max_tokens": 50,
|
| 227 |
"temperature": temp
|
tests/performance/test_openai_compatibility.py
CHANGED
|
@@ -58,7 +58,7 @@ class TestEndpointCompatibility:
|
|
| 58 |
async def test_chat_completions_endpoint(self, httpx_client):
|
| 59 |
"""Test POST /v1/chat/completions endpoint"""
|
| 60 |
payload = {
|
| 61 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 62 |
"messages": [
|
| 63 |
{"role": "user", "content": "Say hello"}
|
| 64 |
]
|
|
@@ -109,7 +109,7 @@ class TestOpenAIClientLibrary:
|
|
| 109 |
"""Test chat completion using official OpenAI client"""
|
| 110 |
try:
|
| 111 |
response = openai_client.chat.completions.create(
|
| 112 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 113 |
messages=[
|
| 114 |
{"role": "user", "content": "What is 2+2?"}
|
| 115 |
],
|
|
@@ -133,7 +133,7 @@ class TestOpenAIClientLibrary:
|
|
| 133 |
"""Test streaming with official OpenAI client"""
|
| 134 |
try:
|
| 135 |
stream = openai_client.chat.completions.create(
|
| 136 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 137 |
messages=[
|
| 138 |
{"role": "user", "content": "Count to 5"}
|
| 139 |
],
|
|
@@ -162,7 +162,7 @@ class TestMessageFormats:
|
|
| 162 |
async def test_system_message(self, httpx_client):
|
| 163 |
"""Test with system message"""
|
| 164 |
payload = {
|
| 165 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 166 |
"messages": [
|
| 167 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 168 |
{"role": "user", "content": "Hello"}
|
|
@@ -185,7 +185,7 @@ class TestMessageFormats:
|
|
| 185 |
async def test_conversation_history(self, httpx_client):
|
| 186 |
"""Test with conversation history"""
|
| 187 |
payload = {
|
| 188 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 189 |
"messages": [
|
| 190 |
{"role": "user", "content": "My name is Alice."},
|
| 191 |
{"role": "assistant", "content": "Hello Alice! Nice to meet you."},
|
|
@@ -220,7 +220,7 @@ class TestMessageFormats:
|
|
| 220 |
|
| 221 |
for params in parameters:
|
| 222 |
payload = {
|
| 223 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 224 |
"messages": [{"role": "user", "content": "Hello"}],
|
| 225 |
**params
|
| 226 |
}
|
|
@@ -276,7 +276,7 @@ class TestErrorHandling:
|
|
| 276 |
async def test_empty_message(self, httpx_client):
|
| 277 |
"""Test with empty message content"""
|
| 278 |
payload = {
|
| 279 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 280 |
"messages": [{"role": "user", "content": ""}],
|
| 281 |
"max_tokens": 50
|
| 282 |
}
|
|
@@ -297,7 +297,7 @@ class TestResponseFormat:
|
|
| 297 |
async def test_response_schema(self, httpx_client):
|
| 298 |
"""Validate complete response schema"""
|
| 299 |
payload = {
|
| 300 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 301 |
"messages": [{"role": "user", "content": "Test"}],
|
| 302 |
"max_tokens": 50
|
| 303 |
}
|
|
|
|
| 58 |
async def test_chat_completions_endpoint(self, httpx_client):
|
| 59 |
"""Test POST /v1/chat/completions endpoint"""
|
| 60 |
payload = {
|
| 61 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 62 |
"messages": [
|
| 63 |
{"role": "user", "content": "Say hello"}
|
| 64 |
]
|
|
|
|
| 109 |
"""Test chat completion using official OpenAI client"""
|
| 110 |
try:
|
| 111 |
response = openai_client.chat.completions.create(
|
| 112 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 113 |
messages=[
|
| 114 |
{"role": "user", "content": "What is 2+2?"}
|
| 115 |
],
|
|
|
|
| 133 |
"""Test streaming with official OpenAI client"""
|
| 134 |
try:
|
| 135 |
stream = openai_client.chat.completions.create(
|
| 136 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 137 |
messages=[
|
| 138 |
{"role": "user", "content": "Count to 5"}
|
| 139 |
],
|
|
|
|
| 162 |
async def test_system_message(self, httpx_client):
|
| 163 |
"""Test with system message"""
|
| 164 |
payload = {
|
| 165 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 166 |
"messages": [
|
| 167 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 168 |
{"role": "user", "content": "Hello"}
|
|
|
|
| 185 |
async def test_conversation_history(self, httpx_client):
|
| 186 |
"""Test with conversation history"""
|
| 187 |
payload = {
|
| 188 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 189 |
"messages": [
|
| 190 |
{"role": "user", "content": "My name is Alice."},
|
| 191 |
{"role": "assistant", "content": "Hello Alice! Nice to meet you."},
|
|
|
|
| 220 |
|
| 221 |
for params in parameters:
|
| 222 |
payload = {
|
| 223 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 224 |
"messages": [{"role": "user", "content": "Hello"}],
|
| 225 |
**params
|
| 226 |
}
|
|
|
|
| 276 |
async def test_empty_message(self, httpx_client):
|
| 277 |
"""Test with empty message content"""
|
| 278 |
payload = {
|
| 279 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 280 |
"messages": [{"role": "user", "content": ""}],
|
| 281 |
"max_tokens": 50
|
| 282 |
}
|
|
|
|
| 297 |
async def test_response_schema(self, httpx_client):
|
| 298 |
"""Validate complete response schema"""
|
| 299 |
payload = {
|
| 300 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 301 |
"messages": [{"role": "user", "content": "Test"}],
|
| 302 |
"max_tokens": 50
|
| 303 |
}
|
tests/test_config.py
CHANGED
|
@@ -9,7 +9,7 @@ from app.config import Settings
|
|
| 9 |
def test_settings_defaults():
|
| 10 |
"""Test that settings have correct default values."""
|
| 11 |
settings = Settings()
|
| 12 |
-
assert settings.model == "DragonLLM/qwen3-8b-fin-v1.0"
|
| 13 |
assert settings.service_api_key is None
|
| 14 |
assert settings.log_level == "info"
|
| 15 |
|
|
|
|
| 9 |
def test_settings_defaults():
|
| 10 |
"""Test that settings have correct default values."""
|
| 11 |
settings = Settings()
|
| 12 |
+
assert settings.model == "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 13 |
assert settings.service_api_key is None
|
| 14 |
assert settings.log_level == "info"
|
| 15 |
|