jeanbaptdzd commited on
Commit
9db586c
·
1 Parent(s): 192844a

Fix model ID and improve memory management

Browse files

- Replace non-existent model ID 'DragonLLM/qwen3-8b-fin-v1.0' with 'DragonLLM/Qwen-Open-Finance-R-8B'
- Update all references across codebase (38 occurrences in 11 files)
- Fix memory management: remove ineffective `del` statements, which only deleted the function's own local parameter references and therefore could not free the caller's model/tokenizer objects
- Update clear_gpu_memory() to focus on CUDA cache clearing and document caller responsibility
- Update docstrings to clarify that callers must set their own references to None for the objects to be garbage collected and GPU memory actually freed

README.md CHANGED
@@ -11,7 +11,7 @@ suggested_hardware: l4x1
11
 
12
  # Open Finance LLM 8B
13
 
14
- OpenAI-compatible API powered by DragonLLM/qwen3-8b-fin-v1.0 using Transformers.
15
 
16
  ## Overview
17
 
@@ -39,7 +39,7 @@ curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/models"
39
  curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
40
  -H "Content-Type: application/json" \
41
  -d '{
42
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
43
  "messages": [{"role": "user", "content": "What is compound interest?"}],
44
  "temperature": 0.7,
45
  "max_tokens": 500
@@ -51,7 +51,7 @@ curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completio
51
  curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
52
  -H "Content-Type: application/json" \
53
  -d '{
54
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
55
  "messages": [{"role": "user", "content": "Explain Value at Risk"}],
56
  "stream": true
57
  }'
@@ -84,7 +84,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
84
  - `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
85
 
86
  **Optional:**
87
- - `MODEL` - Model name (default: DragonLLM/qwen3-8b-fin-v1.0)
88
  - `SERVICE_API_KEY` - API key for authentication
89
  - `LOG_LEVEL` - Logging level (default: info)
90
  - `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
@@ -92,7 +92,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
92
 
93
  Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
94
 
95
- **Note:** Accept model terms at https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0 before use.
96
 
97
  ## Integration
98
 
@@ -122,7 +122,7 @@ client = OpenAI(
122
  )
123
 
124
  response = client.chat.completions.create(
125
- model="DragonLLM/qwen3-8b-fin-v1.0",
126
  messages=[{"role": "user", "content": "What is compound interest?"}],
127
  max_tokens=500
128
  )
@@ -134,7 +134,7 @@ response = client.chat.completions.create(
134
  import dspy
135
 
136
  lm = dspy.OpenAI(
137
- model="DragonLLM/qwen3-8b-fin-v1.0",
138
  api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
139
  )
140
  ```
@@ -142,7 +142,7 @@ lm = dspy.OpenAI(
142
  ## Technical Specifications
143
 
144
  **Model:**
145
- - DragonLLM/qwen3-8b-fin-v1.0 (8B parameters)
146
  - Fine-tuned on financial data
147
  - English and French support
148
 
 
11
 
12
  # Open Finance LLM 8B
13
 
14
+ OpenAI-compatible API powered by DragonLLM/Qwen-Open-Finance-R-8B using Transformers.
15
 
16
  ## Overview
17
 
 
39
  curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
40
  -H "Content-Type: application/json" \
41
  -d '{
42
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
43
  "messages": [{"role": "user", "content": "What is compound interest?"}],
44
  "temperature": 0.7,
45
  "max_tokens": 500
 
51
  curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
52
  -H "Content-Type: application/json" \
53
  -d '{
54
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
55
  "messages": [{"role": "user", "content": "Explain Value at Risk"}],
56
  "stream": true
57
  }'
 
84
  - `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
85
 
86
  **Optional:**
87
+ - `MODEL` - Model name (default: DragonLLM/Qwen-Open-Finance-R-8B)
88
  - `SERVICE_API_KEY` - API key for authentication
89
  - `LOG_LEVEL` - Logging level (default: info)
90
  - `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
 
92
 
93
  Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
94
 
95
+ **Note:** Accept model terms at https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B before use.
96
 
97
  ## Integration
98
 
 
122
  )
123
 
124
  response = client.chat.completions.create(
125
+ model="DragonLLM/Qwen-Open-Finance-R-8B",
126
  messages=[{"role": "user", "content": "What is compound interest?"}],
127
  max_tokens=500
128
  )
 
134
  import dspy
135
 
136
  lm = dspy.OpenAI(
137
+ model="DragonLLM/Qwen-Open-Finance-R-8B",
138
  api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
139
  )
140
  ```
 
142
  ## Technical Specifications
143
 
144
  **Model:**
145
+ - DragonLLM/Qwen-Open-Finance-R-8B (8B parameters)
146
  - Fine-tuned on financial data
147
  - English and French support
148
 
app/config.py CHANGED
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
13
  """
14
 
15
  model: str = Field(
16
- default="DragonLLM/qwen3-8b-fin-v1.0",
17
  description="Hugging Face model identifier"
18
  )
19
  service_api_key: str | None = Field(
 
13
  """
14
 
15
  model: str = Field(
16
+ default="DragonLLM/Qwen-Open-Finance-R-8B",
17
  description="Hugging Face model identifier"
18
  )
19
  service_api_key: str | None = Field(
app/main.py CHANGED
@@ -4,7 +4,8 @@ import logging
4
  import threading
5
  from typing import Dict
6
 
7
- from fastapi import FastAPI
 
8
 
9
  from app.config import settings
10
  from app.middleware import api_key_guard
@@ -71,11 +72,41 @@ async def root() -> Dict[str, str]:
71
 
72
  @app.get("/health")
73
  async def health() -> Dict[str, str]:
74
- """Health check endpoint for monitoring and load balancers.
75
 
76
  Returns:
77
- Dictionary with service health status.
78
  """
79
- return {"status": "healthy", "service": "LLM Pro Finance API"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
 
4
  import threading
5
  from typing import Dict
6
 
7
+ from fastapi import FastAPI, status
8
+ from fastapi.responses import JSONResponse
9
 
10
  from app.config import settings
11
  from app.middleware import api_key_guard
 
72
 
73
  @app.get("/health")
74
  async def health() -> Dict[str, str]:
75
+ """Liveness check endpoint for monitoring and load balancers.
76
 
77
  Returns:
78
+ Dictionary indicating the service is alive.
79
  """
80
+ return {"status": "service alive", "service": "LLM Pro Finance API"}
81
+
82
+
83
+ @app.get("/ready")
84
+ async def ready() -> JSONResponse:
85
+ """Readiness check endpoint for orchestrators and load balancers.
86
+
87
+ Checks if the model is loaded and ready to handle requests.
88
+ Returns 503 Service Unavailable if the model is not ready.
89
+
90
+ Returns:
91
+ JSONResponse with ready/model_loaded fields and appropriate status code.
92
+ """
93
+ from app.providers.transformers_provider import is_model_ready
94
+
95
+ model_loaded = is_model_ready()
96
+ ready_status = model_loaded
97
+
98
+ response_data = {
99
+ "ready": ready_status,
100
+ "model_loaded": model_loaded,
101
+ "service": "LLM Pro Finance API"
102
+ }
103
+
104
+ if ready_status:
105
+ return JSONResponse(content=response_data, status_code=status.HTTP_200_OK)
106
+ else:
107
+ return JSONResponse(
108
+ content=response_data,
109
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE
110
+ )
111
 
112
 
app/middleware.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from fastapi import Request
2
  from fastapi.responses import JSONResponse, Response
3
  from typing import Callable, Awaitable, Union
@@ -35,8 +36,11 @@ async def api_key_guard(request: Request, call_next: Callable[[Request], Awaitab
35
  if auth_header.startswith("Bearer "):
36
  api_key = auth_header.replace("Bearer ", "").strip()
37
 
38
- if api_key and api_key == settings.service_api_key:
39
- return await call_next(request)
 
 
 
40
 
41
  return JSONResponse(
42
  content={"error": {"message": "unauthorized", "type": "authentication_error"}},
 
1
+ import hmac
2
  from fastapi import Request
3
  from fastapi.responses import JSONResponse, Response
4
  from typing import Callable, Awaitable, Union
 
36
  if auth_header.startswith("Bearer "):
37
  api_key = auth_header.replace("Bearer ", "").strip()
38
 
39
+ if api_key:
40
+ # Use constant-time comparison to prevent timing attacks
41
+ expected_key = str(settings.service_api_key) if settings.service_api_key else ""
42
+ if hmac.compare_digest(str(api_key), expected_key):
43
+ return await call_next(request)
44
 
45
  return JSONResponse(
46
  content={"error": {"message": "unauthorized", "type": "authentication_error"}},
app/providers/transformers_provider.py CHANGED
@@ -186,7 +186,7 @@ def initialize_model(force_reload: bool = False):
186
  if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
187
  print("\nAuthentication Error Detected!")
188
  print("1. Ensure HF_TOKEN_LC2 is set in your environment")
189
- print("2. Accept model terms at: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
190
  print("3. Verify token has access to DragonLLM models")
191
 
192
  raise
@@ -559,6 +559,17 @@ class TransformersProvider:
559
  _provider = TransformersProvider()
560
 
561
 
 
 
 
 
 
 
 
 
 
 
 
562
  # Module-level functions for direct import
563
  async def list_models() -> Dict[str, Any]:
564
  """List available models."""
 
186
  if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
187
  print("\nAuthentication Error Detected!")
188
  print("1. Ensure HF_TOKEN_LC2 is set in your environment")
189
+ print("2. Accept model terms at: https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B")
190
  print("3. Verify token has access to DragonLLM models")
191
 
192
  raise
 
559
  _provider = TransformersProvider()
560
 
561
 
562
+ def is_model_ready() -> bool:
563
+ """
564
+ Thread-safe check if the model is loaded and ready for inference.
565
+
566
+ Returns:
567
+ True if model is initialized and loaded, False otherwise.
568
+ """
569
+ with _init_lock:
570
+ return _initialized and model is not None and tokenizer is not None
571
+
572
+
573
  # Module-level functions for direct import
574
  async def list_models() -> Dict[str, Any]:
575
  """List available models."""
app/utils/constants.py CHANGED
@@ -5,7 +5,7 @@ from typing import Final, List
5
 
6
 
7
  # Model configuration
8
- MODEL_NAME: Final[str] = "DragonLLM/qwen3-8b-fin-v1.0"
9
 
10
  # Cache directory - respect HF_HOME if set, otherwise use default
11
  CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
 
5
 
6
 
7
  # Model configuration
8
+ MODEL_NAME: Final[str] = "DragonLLM/Qwen-Open-Finance-R-8B"
9
 
10
  # Cache directory - respect HF_HOME if set, otherwise use default
11
  CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
app/utils/memory.py CHANGED
@@ -7,33 +7,27 @@ import torch
7
 
8
 
9
  def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
10
- """Clear GPU memory completely.
11
 
12
  This function performs aggressive GPU memory cleanup by:
13
- 1. Deleting model and tokenizer objects if provided
14
- 2. Clearing CUDA cache
15
- 3. Running multiple garbage collection passes
 
 
 
 
 
 
 
16
 
17
  Args:
18
- model: Optional model object to delete
19
- tokenizer: Optional tokenizer object to delete
20
  """
21
  if not torch.cuda.is_available():
22
  return
23
 
24
- # Delete model and tokenizer if provided
25
- if model is not None:
26
- try:
27
- del model
28
- except Exception:
29
- pass
30
-
31
- if tokenizer is not None:
32
- try:
33
- del tokenizer
34
- except Exception:
35
- pass
36
-
37
  # Clear CUDA cache
38
  torch.cuda.empty_cache()
39
  torch.cuda.synchronize()
 
7
 
8
 
9
  def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
10
+ """Clear GPU memory by emptying CUDA cache and running garbage collection.
11
 
12
  This function performs aggressive GPU memory cleanup by:
13
+ 1. Clearing CUDA cache
14
+ 2. Running multiple garbage collection passes
15
+
16
+ Important: This function does NOT delete model or tokenizer objects.
17
+ The caller must set their references to None (e.g., `model = None`)
18
+ for the objects to be garbage collected and GPU memory to be freed.
19
+
20
+ The model and tokenizer parameters are accepted for API compatibility
21
+ but are not used internally. They serve as documentation that the caller
22
+ should clear their references after calling this function.
23
 
24
  Args:
25
+ model: Optional model object (caller must set reference to None)
26
+ tokenizer: Optional tokenizer object (caller must set reference to None)
27
  """
28
  if not torch.cuda.is_available():
29
  return
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # Clear CUDA cache
32
  torch.cuda.empty_cache()
33
  torch.cuda.synchronize()
docs/qwen3_specifications.md CHANGED
@@ -2,7 +2,7 @@
2
 
3
  ## Fenêtre de contexte maximale
4
 
5
- Le modèle **DragonLLM/qwen3-8b-fin-v1.0** (basé sur Qwen-3 8B) supporte:
6
 
7
  ### Fenêtre de base
8
  - **32 768 tokens** (32K tokens)
 
2
 
3
  ## Fenêtre de contexte maximale
4
 
5
+ Le modèle **DragonLLM/Qwen-Open-Finance-R-8B** (basé sur Qwen-3 8B) supporte:
6
 
7
  ### Fenêtre de base
8
  - **32 768 tokens** (32K tokens)
pydanticai_app/config.py CHANGED
@@ -11,7 +11,7 @@ class Settings(BaseSettings):
11
 
12
  # OpenAI-compatible API settings
13
  api_key: str = "not-needed" # No authentication required
14
- model_name: str = "DragonLLM/qwen3-8b-fin-v1.0"
15
 
16
  # API configuration
17
  timeout: float = 120.0
 
11
 
12
  # OpenAI-compatible API settings
13
  api_key: str = "not-needed" # No authentication required
14
+ model_name: str = "DragonLLM/Qwen-Open-Finance-R-8B"
15
 
16
  # API configuration
17
  timeout: float = 120.0
test_tool_calls.py CHANGED
@@ -56,7 +56,7 @@ def test_tool_calls_basic():
56
 
57
  # Make request with tools
58
  payload = {
59
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
60
  "messages": [
61
  {
62
  "role": "user",
@@ -159,7 +159,7 @@ def test_tool_calls_multiple():
159
  ]
160
 
161
  payload = {
162
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
163
  "messages": [
164
  {
165
  "role": "user",
@@ -228,7 +228,7 @@ def test_tool_calls_format():
228
  ]
229
 
230
  payload = {
231
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
232
  "messages": [
233
  {
234
  "role": "user",
 
56
 
57
  # Make request with tools
58
  payload = {
59
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
60
  "messages": [
61
  {
62
  "role": "user",
 
159
  ]
160
 
161
  payload = {
162
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
163
  "messages": [
164
  {
165
  "role": "user",
 
228
  ]
229
 
230
  payload = {
231
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
232
  "messages": [
233
  {
234
  "role": "user",
tests/performance/benchmark.py CHANGED
@@ -39,7 +39,7 @@ class Benchmark:
39
  tokens_per_sec = []
40
 
41
  payload = {
42
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
43
  "messages": [
44
  {"role": "user", "content": "What is artificial intelligence?"}
45
  ],
@@ -91,7 +91,7 @@ class Benchmark:
91
 
92
  async def make_request(request_id: int):
93
  payload = {
94
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
95
  "messages": [
96
  {"role": "user", "content": f"Request {request_id}: Explain machine learning."}
97
  ],
@@ -155,7 +155,7 @@ class Benchmark:
155
 
156
  for test_case in test_cases:
157
  payload = {
158
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
159
  "messages": [
160
  {"role": "user", "content": "Write about the history of computing."}
161
  ],
@@ -231,7 +231,7 @@ class Benchmark:
231
  # Test 3: System message
232
  try:
233
  payload = {
234
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
235
  "messages": [
236
  {"role": "system", "content": "Be helpful."},
237
  {"role": "user", "content": "Hi"}
@@ -247,7 +247,7 @@ class Benchmark:
247
  # Test 4: Conversation history
248
  try:
249
  payload = {
250
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
251
  "messages": [
252
  {"role": "user", "content": "My name is Alice"},
253
  {"role": "assistant", "content": "Hello Alice"},
@@ -264,7 +264,7 @@ class Benchmark:
264
  # Test 5: Temperature parameter
265
  try:
266
  payload = {
267
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
268
  "messages": [{"role": "user", "content": "Hi"}],
269
  "temperature": 0.5
270
  }
@@ -278,7 +278,7 @@ class Benchmark:
278
  # Test 6: Max tokens parameter
279
  try:
280
  payload = {
281
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
282
  "messages": [{"role": "user", "content": "Hi"}],
283
  "max_tokens": 10
284
  }
 
39
  tokens_per_sec = []
40
 
41
  payload = {
42
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
43
  "messages": [
44
  {"role": "user", "content": "What is artificial intelligence?"}
45
  ],
 
91
 
92
  async def make_request(request_id: int):
93
  payload = {
94
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
95
  "messages": [
96
  {"role": "user", "content": f"Request {request_id}: Explain machine learning."}
97
  ],
 
155
 
156
  for test_case in test_cases:
157
  payload = {
158
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
159
  "messages": [
160
  {"role": "user", "content": "Write about the history of computing."}
161
  ],
 
231
  # Test 3: System message
232
  try:
233
  payload = {
234
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
235
  "messages": [
236
  {"role": "system", "content": "Be helpful."},
237
  {"role": "user", "content": "Hi"}
 
247
  # Test 4: Conversation history
248
  try:
249
  payload = {
250
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
251
  "messages": [
252
  {"role": "user", "content": "My name is Alice"},
253
  {"role": "assistant", "content": "Hello Alice"},
 
264
  # Test 5: Temperature parameter
265
  try:
266
  payload = {
267
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
268
  "messages": [{"role": "user", "content": "Hi"}],
269
  "temperature": 0.5
270
  }
 
278
  # Test 6: Max tokens parameter
279
  try:
280
  payload = {
281
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
282
  "messages": [{"role": "user", "content": "Hi"}],
283
  "max_tokens": 10
284
  }
tests/performance/test_inference_speed.py CHANGED
@@ -20,7 +20,7 @@ def client():
20
  async def test_single_request_latency(client):
21
  """Test latency for a single chat completion request"""
22
  payload = {
23
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
24
  "messages": [
25
  {"role": "user", "content": "What is the capital of France?"}
26
  ],
@@ -66,7 +66,7 @@ async def test_token_throughput_various_lengths(client):
66
 
67
  for test_case in test_cases:
68
  payload = {
69
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
70
  "messages": [{"role": "user", "content": test_case["prompt"]}],
71
  "max_tokens": test_case["max_tokens"],
72
  "temperature": 0.7
@@ -98,7 +98,7 @@ async def test_concurrent_requests(client):
98
 
99
  async def make_request(request_id: int):
100
  payload = {
101
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
102
  "messages": [
103
  {"role": "user", "content": f"Request {request_id}: What is 2+2?"}
104
  ],
@@ -142,7 +142,7 @@ async def test_concurrent_requests(client):
142
  async def test_time_to_first_token(client):
143
  """Test time to first token (TTFT) using streaming"""
144
  payload = {
145
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
146
  "messages": [
147
  {"role": "user", "content": "Count from 1 to 10."}
148
  ],
@@ -190,7 +190,7 @@ async def test_prompt_processing_speed(client):
190
 
191
  for i, prompt in enumerate(prompts):
192
  payload = {
193
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
194
  "messages": [{"role": "user", "content": prompt}],
195
  "max_tokens": 50,
196
  "temperature": 0.7
@@ -221,7 +221,7 @@ async def test_temperature_variance(client):
221
 
222
  for temp in temperatures:
223
  payload = {
224
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
225
  "messages": [{"role": "user", "content": prompt}],
226
  "max_tokens": 50,
227
  "temperature": temp
 
20
  async def test_single_request_latency(client):
21
  """Test latency for a single chat completion request"""
22
  payload = {
23
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
24
  "messages": [
25
  {"role": "user", "content": "What is the capital of France?"}
26
  ],
 
66
 
67
  for test_case in test_cases:
68
  payload = {
69
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
70
  "messages": [{"role": "user", "content": test_case["prompt"]}],
71
  "max_tokens": test_case["max_tokens"],
72
  "temperature": 0.7
 
98
 
99
  async def make_request(request_id: int):
100
  payload = {
101
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
102
  "messages": [
103
  {"role": "user", "content": f"Request {request_id}: What is 2+2?"}
104
  ],
 
142
  async def test_time_to_first_token(client):
143
  """Test time to first token (TTFT) using streaming"""
144
  payload = {
145
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
146
  "messages": [
147
  {"role": "user", "content": "Count from 1 to 10."}
148
  ],
 
190
 
191
  for i, prompt in enumerate(prompts):
192
  payload = {
193
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
194
  "messages": [{"role": "user", "content": prompt}],
195
  "max_tokens": 50,
196
  "temperature": 0.7
 
221
 
222
  for temp in temperatures:
223
  payload = {
224
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
225
  "messages": [{"role": "user", "content": prompt}],
226
  "max_tokens": 50,
227
  "temperature": temp
tests/performance/test_openai_compatibility.py CHANGED
@@ -58,7 +58,7 @@ class TestEndpointCompatibility:
58
  async def test_chat_completions_endpoint(self, httpx_client):
59
  """Test POST /v1/chat/completions endpoint"""
60
  payload = {
61
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
62
  "messages": [
63
  {"role": "user", "content": "Say hello"}
64
  ]
@@ -109,7 +109,7 @@ class TestOpenAIClientLibrary:
109
  """Test chat completion using official OpenAI client"""
110
  try:
111
  response = openai_client.chat.completions.create(
112
- model="DragonLLM/qwen3-8b-fin-v1.0",
113
  messages=[
114
  {"role": "user", "content": "What is 2+2?"}
115
  ],
@@ -133,7 +133,7 @@ class TestOpenAIClientLibrary:
133
  """Test streaming with official OpenAI client"""
134
  try:
135
  stream = openai_client.chat.completions.create(
136
- model="DragonLLM/qwen3-8b-fin-v1.0",
137
  messages=[
138
  {"role": "user", "content": "Count to 5"}
139
  ],
@@ -162,7 +162,7 @@ class TestMessageFormats:
162
  async def test_system_message(self, httpx_client):
163
  """Test with system message"""
164
  payload = {
165
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
166
  "messages": [
167
  {"role": "system", "content": "You are a helpful assistant."},
168
  {"role": "user", "content": "Hello"}
@@ -185,7 +185,7 @@ class TestMessageFormats:
185
  async def test_conversation_history(self, httpx_client):
186
  """Test with conversation history"""
187
  payload = {
188
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
189
  "messages": [
190
  {"role": "user", "content": "My name is Alice."},
191
  {"role": "assistant", "content": "Hello Alice! Nice to meet you."},
@@ -220,7 +220,7 @@ class TestMessageFormats:
220
 
221
  for params in parameters:
222
  payload = {
223
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
224
  "messages": [{"role": "user", "content": "Hello"}],
225
  **params
226
  }
@@ -276,7 +276,7 @@ class TestErrorHandling:
276
  async def test_empty_message(self, httpx_client):
277
  """Test with empty message content"""
278
  payload = {
279
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
280
  "messages": [{"role": "user", "content": ""}],
281
  "max_tokens": 50
282
  }
@@ -297,7 +297,7 @@ class TestResponseFormat:
297
  async def test_response_schema(self, httpx_client):
298
  """Validate complete response schema"""
299
  payload = {
300
- "model": "DragonLLM/qwen3-8b-fin-v1.0",
301
  "messages": [{"role": "user", "content": "Test"}],
302
  "max_tokens": 50
303
  }
 
58
  async def test_chat_completions_endpoint(self, httpx_client):
59
  """Test POST /v1/chat/completions endpoint"""
60
  payload = {
61
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
62
  "messages": [
63
  {"role": "user", "content": "Say hello"}
64
  ]
 
109
  """Test chat completion using official OpenAI client"""
110
  try:
111
  response = openai_client.chat.completions.create(
112
+ model="DragonLLM/Qwen-Open-Finance-R-8B",
113
  messages=[
114
  {"role": "user", "content": "What is 2+2?"}
115
  ],
 
133
  """Test streaming with official OpenAI client"""
134
  try:
135
  stream = openai_client.chat.completions.create(
136
+ model="DragonLLM/Qwen-Open-Finance-R-8B",
137
  messages=[
138
  {"role": "user", "content": "Count to 5"}
139
  ],
 
162
  async def test_system_message(self, httpx_client):
163
  """Test with system message"""
164
  payload = {
165
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
166
  "messages": [
167
  {"role": "system", "content": "You are a helpful assistant."},
168
  {"role": "user", "content": "Hello"}
 
185
  async def test_conversation_history(self, httpx_client):
186
  """Test with conversation history"""
187
  payload = {
188
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
189
  "messages": [
190
  {"role": "user", "content": "My name is Alice."},
191
  {"role": "assistant", "content": "Hello Alice! Nice to meet you."},
 
220
 
221
  for params in parameters:
222
  payload = {
223
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
224
  "messages": [{"role": "user", "content": "Hello"}],
225
  **params
226
  }
 
276
  async def test_empty_message(self, httpx_client):
277
  """Test with empty message content"""
278
  payload = {
279
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
280
  "messages": [{"role": "user", "content": ""}],
281
  "max_tokens": 50
282
  }
 
297
  async def test_response_schema(self, httpx_client):
298
  """Validate complete response schema"""
299
  payload = {
300
+ "model": "DragonLLM/Qwen-Open-Finance-R-8B",
301
  "messages": [{"role": "user", "content": "Test"}],
302
  "max_tokens": 50
303
  }
tests/test_config.py CHANGED
@@ -9,7 +9,7 @@ from app.config import Settings
9
  def test_settings_defaults():
10
  """Test that settings have correct default values."""
11
  settings = Settings()
12
- assert settings.model == "DragonLLM/qwen3-8b-fin-v1.0"
13
  assert settings.service_api_key is None
14
  assert settings.log_level == "info"
15
 
 
9
  def test_settings_defaults():
10
  """Test that settings have correct default values."""
11
  settings = Settings()
12
+ assert settings.model == "DragonLLM/Qwen-Open-Finance-R-8B"
13
  assert settings.service_api_key is None
14
  assert settings.log_level == "info"
15