Commit
·
9db586c
1
Parent(s):
192844a
Fix model ID and improve memory management
Browse files- Replace non-existent model ID 'DragonLLM/qwen3-8b-fin-v1.0' with 'DragonLLM/Qwen-Open-Finance-R-8B'
- Update all references across codebase (38 occurrences in 11 files)
- Fix memory management: remove ineffective del statements that only cleared local references
- Update clear_gpu_memory() to focus on CUDA cache clearing and document caller responsibility
- Update docstrings to clarify that callers must set references to None for proper cleanup
- README.md +8 -8
- app/config.py +1 -1
- app/main.py +35 -4
- app/middleware.py +6 -2
- app/providers/transformers_provider.py +12 -1
- app/utils/constants.py +1 -1
- app/utils/memory.py +13 -19
- docs/qwen3_specifications.md +1 -1
- pydanticai_app/config.py +1 -1
- test_tool_calls.py +3 -3
- tests/performance/benchmark.py +7 -7
- tests/performance/test_inference_speed.py +6 -6
- tests/performance/test_openai_compatibility.py +8 -8
- tests/test_config.py +1 -1
README.md
CHANGED
|
@@ -11,7 +11,7 @@ suggested_hardware: l4x1
|
|
| 11 |
|
| 12 |
# Open Finance LLM 8B
|
| 13 |
|
| 14 |
-
OpenAI-compatible API powered by DragonLLM/qwen3-8b-fin-v1.0 using Transformers.
|
| 15 |
|
| 16 |
## Overview
|
| 17 |
|
|
@@ -39,7 +39,7 @@ curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/models"
|
|
| 39 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 40 |
-H "Content-Type: application/json" \
|
| 41 |
-d '{
|
| 42 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 43 |
"messages": [{"role": "user", "content": "What is compound interest?"}],
|
| 44 |
"temperature": 0.7,
|
| 45 |
"max_tokens": 500
|
|
@@ -51,7 +51,7 @@ curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completio
|
|
| 51 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 52 |
-H "Content-Type: application/json" \
|
| 53 |
-d '{
|
| 54 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 55 |
"messages": [{"role": "user", "content": "Explain Value at Risk"}],
|
| 56 |
"stream": true
|
| 57 |
}'
|
|
@@ -84,7 +84,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
|
|
| 84 |
- `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
|
| 85 |
|
| 86 |
**Optional:**
|
| 87 |
-
- `MODEL` - Model name (default: DragonLLM/qwen3-8b-fin-v1.0)
|
| 88 |
- `SERVICE_API_KEY` - API key for authentication
|
| 89 |
- `LOG_LEVEL` - Logging level (default: info)
|
| 90 |
- `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
|
|
@@ -92,7 +92,7 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the a
|
|
| 92 |
|
| 93 |
Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
|
| 94 |
|
| 95 |
-
**Note:** Accept model terms at https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0 before use.
|
| 96 |
|
| 97 |
## Integration
|
| 98 |
|
|
@@ -122,7 +122,7 @@ client = OpenAI(
|
|
| 122 |
)
|
| 123 |
|
| 124 |
response = client.chat.completions.create(
|
| 125 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 126 |
messages=[{"role": "user", "content": "What is compound interest?"}],
|
| 127 |
max_tokens=500
|
| 128 |
)
|
|
@@ -134,7 +134,7 @@ response = client.chat.completions.create(
|
|
| 134 |
import dspy
|
| 135 |
|
| 136 |
lm = dspy.OpenAI(
|
| 137 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 138 |
api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
|
| 139 |
)
|
| 140 |
```
|
|
@@ -142,7 +142,7 @@ lm = dspy.OpenAI(
|
|
| 142 |
## Technical Specifications
|
| 143 |
|
| 144 |
**Model:**
|
| 145 |
-
- DragonLLM/qwen3-8b-fin-v1.0 (8B parameters)
|
| 146 |
- Fine-tuned on financial data
|
| 147 |
- English and French support
|
| 148 |
|
|
|
|
| 11 |
|
| 12 |
# Open Finance LLM 8B
|
| 13 |
|
| 14 |
+
OpenAI-compatible API powered by DragonLLM/Qwen-Open-Finance-R-8B using Transformers.
|
| 15 |
|
| 16 |
## Overview
|
| 17 |
|
|
|
|
| 39 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 40 |
-H "Content-Type: application/json" \
|
| 41 |
-d '{
|
| 42 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 43 |
"messages": [{"role": "user", "content": "What is compound interest?"}],
|
| 44 |
"temperature": 0.7,
|
| 45 |
"max_tokens": 500
|
|
|
|
| 51 |
curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
|
| 52 |
-H "Content-Type: application/json" \
|
| 53 |
-d '{
|
| 54 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 55 |
"messages": [{"role": "user", "content": "Explain Value at Risk"}],
|
| 56 |
"stream": true
|
| 57 |
}'
|
|
|
|
| 84 |
- `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
|
| 85 |
|
| 86 |
**Optional:**
|
| 87 |
+
- `MODEL` - Model name (default: DragonLLM/Qwen-Open-Finance-R-8B)
|
| 88 |
- `SERVICE_API_KEY` - API key for authentication
|
| 89 |
- `LOG_LEVEL` - Logging level (default: info)
|
| 90 |
- `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
|
|
|
|
| 92 |
|
| 93 |
Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
|
| 94 |
|
| 95 |
+
**Note:** Accept model terms at https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B before use.
|
| 96 |
|
| 97 |
## Integration
|
| 98 |
|
|
|
|
| 122 |
)
|
| 123 |
|
| 124 |
response = client.chat.completions.create(
|
| 125 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 126 |
messages=[{"role": "user", "content": "What is compound interest?"}],
|
| 127 |
max_tokens=500
|
| 128 |
)
|
|
|
|
| 134 |
import dspy
|
| 135 |
|
| 136 |
lm = dspy.OpenAI(
|
| 137 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 138 |
api_base="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1"
|
| 139 |
)
|
| 140 |
```
|
|
|
|
| 142 |
## Technical Specifications
|
| 143 |
|
| 144 |
**Model:**
|
| 145 |
+
- DragonLLM/Qwen-Open-Finance-R-8B (8B parameters)
|
| 146 |
- Fine-tuned on financial data
|
| 147 |
- English and French support
|
| 148 |
|
app/config.py
CHANGED
|
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
model: str = Field(
|
| 16 |
-
default="DragonLLM/qwen3-8b-fin-v1.0",
|
| 17 |
description="Hugging Face model identifier"
|
| 18 |
)
|
| 19 |
service_api_key: str | None = Field(
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
model: str = Field(
|
| 16 |
+
default="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 17 |
description="Hugging Face model identifier"
|
| 18 |
)
|
| 19 |
service_api_key: str | None = Field(
|
app/main.py
CHANGED
|
@@ -4,7 +4,8 @@ import logging
|
|
| 4 |
import threading
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
-
from fastapi import FastAPI
|
|
|
|
| 8 |
|
| 9 |
from app.config import settings
|
| 10 |
from app.middleware import api_key_guard
|
|
@@ -71,11 +72,41 @@ async def root() -> Dict[str, str]:
|
|
| 71 |
|
| 72 |
@app.get("/health")
|
| 73 |
async def health() -> Dict[str, str]:
|
| 74 |
-
"""
|
| 75 |
|
| 76 |
Returns:
|
| 77 |
-
Dictionary
|
| 78 |
"""
|
| 79 |
-
return {"status": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
|
|
|
| 4 |
import threading
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
+
from fastapi import FastAPI, status
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
|
| 10 |
from app.config import settings
|
| 11 |
from app.middleware import api_key_guard
|
|
|
|
| 72 |
|
| 73 |
@app.get("/health")
|
| 74 |
async def health() -> Dict[str, str]:
|
| 75 |
+
"""Liveness check endpoint for monitoring and load balancers.
|
| 76 |
|
| 77 |
Returns:
|
| 78 |
+
Dictionary indicating the service is alive.
|
| 79 |
"""
|
| 80 |
+
return {"status": "service alive", "service": "LLM Pro Finance API"}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@app.get("/ready")
|
| 84 |
+
async def ready() -> JSONResponse:
|
| 85 |
+
"""Readiness check endpoint for orchestrators and load balancers.
|
| 86 |
+
|
| 87 |
+
Checks if the model is loaded and ready to handle requests.
|
| 88 |
+
Returns 503 Service Unavailable if the model is not ready.
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
JSONResponse with ready/model_loaded fields and appropriate status code.
|
| 92 |
+
"""
|
| 93 |
+
from app.providers.transformers_provider import is_model_ready
|
| 94 |
+
|
| 95 |
+
model_loaded = is_model_ready()
|
| 96 |
+
ready_status = model_loaded
|
| 97 |
+
|
| 98 |
+
response_data = {
|
| 99 |
+
"ready": ready_status,
|
| 100 |
+
"model_loaded": model_loaded,
|
| 101 |
+
"service": "LLM Pro Finance API"
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
if ready_status:
|
| 105 |
+
return JSONResponse(content=response_data, status_code=status.HTTP_200_OK)
|
| 106 |
+
else:
|
| 107 |
+
return JSONResponse(
|
| 108 |
+
content=response_data,
|
| 109 |
+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE
|
| 110 |
+
)
|
| 111 |
|
| 112 |
|
app/middleware.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from fastapi import Request
|
| 2 |
from fastapi.responses import JSONResponse, Response
|
| 3 |
from typing import Callable, Awaitable, Union
|
|
@@ -35,8 +36,11 @@ async def api_key_guard(request: Request, call_next: Callable[[Request], Awaitab
|
|
| 35 |
if auth_header.startswith("Bearer "):
|
| 36 |
api_key = auth_header.replace("Bearer ", "").strip()
|
| 37 |
|
| 38 |
-
if api_key
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
return JSONResponse(
|
| 42 |
content={"error": {"message": "unauthorized", "type": "authentication_error"}},
|
|
|
|
| 1 |
+
import hmac
|
| 2 |
from fastapi import Request
|
| 3 |
from fastapi.responses import JSONResponse, Response
|
| 4 |
from typing import Callable, Awaitable, Union
|
|
|
|
| 36 |
if auth_header.startswith("Bearer "):
|
| 37 |
api_key = auth_header.replace("Bearer ", "").strip()
|
| 38 |
|
| 39 |
+
if api_key:
|
| 40 |
+
# Use constant-time comparison to prevent timing attacks
|
| 41 |
+
expected_key = str(settings.service_api_key) if settings.service_api_key else ""
|
| 42 |
+
if hmac.compare_digest(str(api_key), expected_key):
|
| 43 |
+
return await call_next(request)
|
| 44 |
|
| 45 |
return JSONResponse(
|
| 46 |
content={"error": {"message": "unauthorized", "type": "authentication_error"}},
|
app/providers/transformers_provider.py
CHANGED
|
@@ -186,7 +186,7 @@ def initialize_model(force_reload: bool = False):
|
|
| 186 |
if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
|
| 187 |
print("\nAuthentication Error Detected!")
|
| 188 |
print("1. Ensure HF_TOKEN_LC2 is set in your environment")
|
| 189 |
-
print("2. Accept model terms at: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
|
| 190 |
print("3. Verify token has access to DragonLLM models")
|
| 191 |
|
| 192 |
raise
|
|
@@ -559,6 +559,17 @@ class TransformersProvider:
|
|
| 559 |
_provider = TransformersProvider()
|
| 560 |
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
# Module-level functions for direct import
|
| 563 |
async def list_models() -> Dict[str, Any]:
|
| 564 |
"""List available models."""
|
|
|
|
| 186 |
if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
|
| 187 |
print("\nAuthentication Error Detected!")
|
| 188 |
print("1. Ensure HF_TOKEN_LC2 is set in your environment")
|
| 189 |
+
print("2. Accept model terms at: https://huggingface.co/DragonLLM/Qwen-Open-Finance-R-8B")
|
| 190 |
print("3. Verify token has access to DragonLLM models")
|
| 191 |
|
| 192 |
raise
|
|
|
|
| 559 |
_provider = TransformersProvider()
|
| 560 |
|
| 561 |
|
| 562 |
+
def is_model_ready() -> bool:
|
| 563 |
+
"""
|
| 564 |
+
Thread-safe check if the model is loaded and ready for inference.
|
| 565 |
+
|
| 566 |
+
Returns:
|
| 567 |
+
True if model is initialized and loaded, False otherwise.
|
| 568 |
+
"""
|
| 569 |
+
with _init_lock:
|
| 570 |
+
return _initialized and model is not None and tokenizer is not None
|
| 571 |
+
|
| 572 |
+
|
| 573 |
# Module-level functions for direct import
|
| 574 |
async def list_models() -> Dict[str, Any]:
|
| 575 |
"""List available models."""
|
app/utils/constants.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Final, List
|
|
| 5 |
|
| 6 |
|
| 7 |
# Model configuration
|
| 8 |
-
MODEL_NAME: Final[str] = "DragonLLM/qwen3-8b-fin-v1.0"
|
| 9 |
|
| 10 |
# Cache directory - respect HF_HOME if set, otherwise use default
|
| 11 |
CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
# Model configuration
|
| 8 |
+
MODEL_NAME: Final[str] = "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 9 |
|
| 10 |
# Cache directory - respect HF_HOME if set, otherwise use default
|
| 11 |
CACHE_DIR: Final[str] = os.getenv("HF_HOME", "/tmp/huggingface")
|
app/utils/memory.py
CHANGED
|
@@ -7,33 +7,27 @@ import torch
|
|
| 7 |
|
| 8 |
|
| 9 |
def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
|
| 10 |
-
"""Clear GPU memory
|
| 11 |
|
| 12 |
This function performs aggressive GPU memory cleanup by:
|
| 13 |
-
1.
|
| 14 |
-
2.
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
Args:
|
| 18 |
-
model: Optional model object to
|
| 19 |
-
tokenizer: Optional tokenizer object to
|
| 20 |
"""
|
| 21 |
if not torch.cuda.is_available():
|
| 22 |
return
|
| 23 |
|
| 24 |
-
# Delete model and tokenizer if provided
|
| 25 |
-
if model is not None:
|
| 26 |
-
try:
|
| 27 |
-
del model
|
| 28 |
-
except Exception:
|
| 29 |
-
pass
|
| 30 |
-
|
| 31 |
-
if tokenizer is not None:
|
| 32 |
-
try:
|
| 33 |
-
del tokenizer
|
| 34 |
-
except Exception:
|
| 35 |
-
pass
|
| 36 |
-
|
| 37 |
# Clear CUDA cache
|
| 38 |
torch.cuda.empty_cache()
|
| 39 |
torch.cuda.synchronize()
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def clear_gpu_memory(model: Optional[Any] = None, tokenizer: Optional[Any] = None) -> None:
|
| 10 |
+
"""Clear GPU memory by emptying CUDA cache and running garbage collection.
|
| 11 |
|
| 12 |
This function performs aggressive GPU memory cleanup by:
|
| 13 |
+
1. Clearing CUDA cache
|
| 14 |
+
2. Running multiple garbage collection passes
|
| 15 |
+
|
| 16 |
+
Important: This function does NOT delete model or tokenizer objects.
|
| 17 |
+
The caller must set their references to None (e.g., `model = None`)
|
| 18 |
+
for the objects to be garbage collected and GPU memory to be freed.
|
| 19 |
+
|
| 20 |
+
The model and tokenizer parameters are accepted for API compatibility
|
| 21 |
+
but are not used internally. They serve as documentation that the caller
|
| 22 |
+
should clear their references after calling this function.
|
| 23 |
|
| 24 |
Args:
|
| 25 |
+
model: Optional model object (caller must set reference to None)
|
| 26 |
+
tokenizer: Optional tokenizer object (caller must set reference to None)
|
| 27 |
"""
|
| 28 |
if not torch.cuda.is_available():
|
| 29 |
return
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Clear CUDA cache
|
| 32 |
torch.cuda.empty_cache()
|
| 33 |
torch.cuda.synchronize()
|
docs/qwen3_specifications.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
## Fenêtre de contexte maximale
|
| 4 |
|
| 5 |
-
Le modèle **DragonLLM/qwen3-8b-fin-v1.0** (basé sur Qwen-3 8B) supporte:
|
| 6 |
|
| 7 |
### Fenêtre de base
|
| 8 |
- **32 768 tokens** (32K tokens)
|
|
|
|
| 2 |
|
| 3 |
## Fenêtre de contexte maximale
|
| 4 |
|
| 5 |
+
Le modèle **DragonLLM/Qwen-Open-Finance-R-8B** (basé sur Qwen-3 8B) supporte:
|
| 6 |
|
| 7 |
### Fenêtre de base
|
| 8 |
- **32 768 tokens** (32K tokens)
|
pydanticai_app/config.py
CHANGED
|
@@ -11,7 +11,7 @@ class Settings(BaseSettings):
|
|
| 11 |
|
| 12 |
# OpenAI-compatible API settings
|
| 13 |
api_key: str = "not-needed" # No authentication required
|
| 14 |
-
model_name: str = "DragonLLM/qwen3-8b-fin-v1.0"
|
| 15 |
|
| 16 |
# API configuration
|
| 17 |
timeout: float = 120.0
|
|
|
|
| 11 |
|
| 12 |
# OpenAI-compatible API settings
|
| 13 |
api_key: str = "not-needed" # No authentication required
|
| 14 |
+
model_name: str = "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 15 |
|
| 16 |
# API configuration
|
| 17 |
timeout: float = 120.0
|
test_tool_calls.py
CHANGED
|
@@ -56,7 +56,7 @@ def test_tool_calls_basic():
|
|
| 56 |
|
| 57 |
# Make request with tools
|
| 58 |
payload = {
|
| 59 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 60 |
"messages": [
|
| 61 |
{
|
| 62 |
"role": "user",
|
|
@@ -159,7 +159,7 @@ def test_tool_calls_multiple():
|
|
| 159 |
]
|
| 160 |
|
| 161 |
payload = {
|
| 162 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 163 |
"messages": [
|
| 164 |
{
|
| 165 |
"role": "user",
|
|
@@ -228,7 +228,7 @@ def test_tool_calls_format():
|
|
| 228 |
]
|
| 229 |
|
| 230 |
payload = {
|
| 231 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 232 |
"messages": [
|
| 233 |
{
|
| 234 |
"role": "user",
|
|
|
|
| 56 |
|
| 57 |
# Make request with tools
|
| 58 |
payload = {
|
| 59 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 60 |
"messages": [
|
| 61 |
{
|
| 62 |
"role": "user",
|
|
|
|
| 159 |
]
|
| 160 |
|
| 161 |
payload = {
|
| 162 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 163 |
"messages": [
|
| 164 |
{
|
| 165 |
"role": "user",
|
|
|
|
| 228 |
]
|
| 229 |
|
| 230 |
payload = {
|
| 231 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 232 |
"messages": [
|
| 233 |
{
|
| 234 |
"role": "user",
|
tests/performance/benchmark.py
CHANGED
|
@@ -39,7 +39,7 @@ class Benchmark:
|
|
| 39 |
tokens_per_sec = []
|
| 40 |
|
| 41 |
payload = {
|
| 42 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 43 |
"messages": [
|
| 44 |
{"role": "user", "content": "What is artificial intelligence?"}
|
| 45 |
],
|
|
@@ -91,7 +91,7 @@ class Benchmark:
|
|
| 91 |
|
| 92 |
async def make_request(request_id: int):
|
| 93 |
payload = {
|
| 94 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 95 |
"messages": [
|
| 96 |
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
|
| 97 |
],
|
|
@@ -155,7 +155,7 @@ class Benchmark:
|
|
| 155 |
|
| 156 |
for test_case in test_cases:
|
| 157 |
payload = {
|
| 158 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 159 |
"messages": [
|
| 160 |
{"role": "user", "content": "Write about the history of computing."}
|
| 161 |
],
|
|
@@ -231,7 +231,7 @@ class Benchmark:
|
|
| 231 |
# Test 3: System message
|
| 232 |
try:
|
| 233 |
payload = {
|
| 234 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 235 |
"messages": [
|
| 236 |
{"role": "system", "content": "Be helpful."},
|
| 237 |
{"role": "user", "content": "Hi"}
|
|
@@ -247,7 +247,7 @@ class Benchmark:
|
|
| 247 |
# Test 4: Conversation history
|
| 248 |
try:
|
| 249 |
payload = {
|
| 250 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 251 |
"messages": [
|
| 252 |
{"role": "user", "content": "My name is Alice"},
|
| 253 |
{"role": "assistant", "content": "Hello Alice"},
|
|
@@ -264,7 +264,7 @@ class Benchmark:
|
|
| 264 |
# Test 5: Temperature parameter
|
| 265 |
try:
|
| 266 |
payload = {
|
| 267 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 268 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 269 |
"temperature": 0.5
|
| 270 |
}
|
|
@@ -278,7 +278,7 @@ class Benchmark:
|
|
| 278 |
# Test 6: Max tokens parameter
|
| 279 |
try:
|
| 280 |
payload = {
|
| 281 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 282 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 283 |
"max_tokens": 10
|
| 284 |
}
|
|
|
|
| 39 |
tokens_per_sec = []
|
| 40 |
|
| 41 |
payload = {
|
| 42 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 43 |
"messages": [
|
| 44 |
{"role": "user", "content": "What is artificial intelligence?"}
|
| 45 |
],
|
|
|
|
| 91 |
|
| 92 |
async def make_request(request_id: int):
|
| 93 |
payload = {
|
| 94 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 95 |
"messages": [
|
| 96 |
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
|
| 97 |
],
|
|
|
|
| 155 |
|
| 156 |
for test_case in test_cases:
|
| 157 |
payload = {
|
| 158 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 159 |
"messages": [
|
| 160 |
{"role": "user", "content": "Write about the history of computing."}
|
| 161 |
],
|
|
|
|
| 231 |
# Test 3: System message
|
| 232 |
try:
|
| 233 |
payload = {
|
| 234 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 235 |
"messages": [
|
| 236 |
{"role": "system", "content": "Be helpful."},
|
| 237 |
{"role": "user", "content": "Hi"}
|
|
|
|
| 247 |
# Test 4: Conversation history
|
| 248 |
try:
|
| 249 |
payload = {
|
| 250 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 251 |
"messages": [
|
| 252 |
{"role": "user", "content": "My name is Alice"},
|
| 253 |
{"role": "assistant", "content": "Hello Alice"},
|
|
|
|
| 264 |
# Test 5: Temperature parameter
|
| 265 |
try:
|
| 266 |
payload = {
|
| 267 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 268 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 269 |
"temperature": 0.5
|
| 270 |
}
|
|
|
|
| 278 |
# Test 6: Max tokens parameter
|
| 279 |
try:
|
| 280 |
payload = {
|
| 281 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 282 |
"messages": [{"role": "user", "content": "Hi"}],
|
| 283 |
"max_tokens": 10
|
| 284 |
}
|
tests/performance/test_inference_speed.py
CHANGED
|
@@ -20,7 +20,7 @@ def client():
|
|
| 20 |
async def test_single_request_latency(client):
|
| 21 |
"""Test latency for a single chat completion request"""
|
| 22 |
payload = {
|
| 23 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 24 |
"messages": [
|
| 25 |
{"role": "user", "content": "What is the capital of France?"}
|
| 26 |
],
|
|
@@ -66,7 +66,7 @@ async def test_token_throughput_various_lengths(client):
|
|
| 66 |
|
| 67 |
for test_case in test_cases:
|
| 68 |
payload = {
|
| 69 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 70 |
"messages": [{"role": "user", "content": test_case["prompt"]}],
|
| 71 |
"max_tokens": test_case["max_tokens"],
|
| 72 |
"temperature": 0.7
|
|
@@ -98,7 +98,7 @@ async def test_concurrent_requests(client):
|
|
| 98 |
|
| 99 |
async def make_request(request_id: int):
|
| 100 |
payload = {
|
| 101 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 102 |
"messages": [
|
| 103 |
{"role": "user", "content": f"Request {request_id}: What is 2+2?"}
|
| 104 |
],
|
|
@@ -142,7 +142,7 @@ async def test_concurrent_requests(client):
|
|
| 142 |
async def test_time_to_first_token(client):
|
| 143 |
"""Test time to first token (TTFT) using streaming"""
|
| 144 |
payload = {
|
| 145 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 146 |
"messages": [
|
| 147 |
{"role": "user", "content": "Count from 1 to 10."}
|
| 148 |
],
|
|
@@ -190,7 +190,7 @@ async def test_prompt_processing_speed(client):
|
|
| 190 |
|
| 191 |
for i, prompt in enumerate(prompts):
|
| 192 |
payload = {
|
| 193 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 194 |
"messages": [{"role": "user", "content": prompt}],
|
| 195 |
"max_tokens": 50,
|
| 196 |
"temperature": 0.7
|
|
@@ -221,7 +221,7 @@ async def test_temperature_variance(client):
|
|
| 221 |
|
| 222 |
for temp in temperatures:
|
| 223 |
payload = {
|
| 224 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 225 |
"messages": [{"role": "user", "content": prompt}],
|
| 226 |
"max_tokens": 50,
|
| 227 |
"temperature": temp
|
|
|
|
| 20 |
async def test_single_request_latency(client):
|
| 21 |
"""Test latency for a single chat completion request"""
|
| 22 |
payload = {
|
| 23 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 24 |
"messages": [
|
| 25 |
{"role": "user", "content": "What is the capital of France?"}
|
| 26 |
],
|
|
|
|
| 66 |
|
| 67 |
for test_case in test_cases:
|
| 68 |
payload = {
|
| 69 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 70 |
"messages": [{"role": "user", "content": test_case["prompt"]}],
|
| 71 |
"max_tokens": test_case["max_tokens"],
|
| 72 |
"temperature": 0.7
|
|
|
|
| 98 |
|
| 99 |
async def make_request(request_id: int):
|
| 100 |
payload = {
|
| 101 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 102 |
"messages": [
|
| 103 |
{"role": "user", "content": f"Request {request_id}: What is 2+2?"}
|
| 104 |
],
|
|
|
|
| 142 |
async def test_time_to_first_token(client):
|
| 143 |
"""Test time to first token (TTFT) using streaming"""
|
| 144 |
payload = {
|
| 145 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 146 |
"messages": [
|
| 147 |
{"role": "user", "content": "Count from 1 to 10."}
|
| 148 |
],
|
|
|
|
| 190 |
|
| 191 |
for i, prompt in enumerate(prompts):
|
| 192 |
payload = {
|
| 193 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 194 |
"messages": [{"role": "user", "content": prompt}],
|
| 195 |
"max_tokens": 50,
|
| 196 |
"temperature": 0.7
|
|
|
|
| 221 |
|
| 222 |
for temp in temperatures:
|
| 223 |
payload = {
|
| 224 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 225 |
"messages": [{"role": "user", "content": prompt}],
|
| 226 |
"max_tokens": 50,
|
| 227 |
"temperature": temp
|
tests/performance/test_openai_compatibility.py
CHANGED
|
@@ -58,7 +58,7 @@ class TestEndpointCompatibility:
|
|
| 58 |
async def test_chat_completions_endpoint(self, httpx_client):
|
| 59 |
"""Test POST /v1/chat/completions endpoint"""
|
| 60 |
payload = {
|
| 61 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 62 |
"messages": [
|
| 63 |
{"role": "user", "content": "Say hello"}
|
| 64 |
]
|
|
@@ -109,7 +109,7 @@ class TestOpenAIClientLibrary:
|
|
| 109 |
"""Test chat completion using official OpenAI client"""
|
| 110 |
try:
|
| 111 |
response = openai_client.chat.completions.create(
|
| 112 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 113 |
messages=[
|
| 114 |
{"role": "user", "content": "What is 2+2?"}
|
| 115 |
],
|
|
@@ -133,7 +133,7 @@ class TestOpenAIClientLibrary:
|
|
| 133 |
"""Test streaming with official OpenAI client"""
|
| 134 |
try:
|
| 135 |
stream = openai_client.chat.completions.create(
|
| 136 |
-
model="DragonLLM/qwen3-8b-fin-v1.0",
|
| 137 |
messages=[
|
| 138 |
{"role": "user", "content": "Count to 5"}
|
| 139 |
],
|
|
@@ -162,7 +162,7 @@ class TestMessageFormats:
|
|
| 162 |
async def test_system_message(self, httpx_client):
|
| 163 |
"""Test with system message"""
|
| 164 |
payload = {
|
| 165 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 166 |
"messages": [
|
| 167 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 168 |
{"role": "user", "content": "Hello"}
|
|
@@ -185,7 +185,7 @@ class TestMessageFormats:
|
|
| 185 |
async def test_conversation_history(self, httpx_client):
|
| 186 |
"""Test with conversation history"""
|
| 187 |
payload = {
|
| 188 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 189 |
"messages": [
|
| 190 |
{"role": "user", "content": "My name is Alice."},
|
| 191 |
{"role": "assistant", "content": "Hello Alice! Nice to meet you."},
|
|
@@ -220,7 +220,7 @@ class TestMessageFormats:
|
|
| 220 |
|
| 221 |
for params in parameters:
|
| 222 |
payload = {
|
| 223 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 224 |
"messages": [{"role": "user", "content": "Hello"}],
|
| 225 |
**params
|
| 226 |
}
|
|
@@ -276,7 +276,7 @@ class TestErrorHandling:
|
|
| 276 |
async def test_empty_message(self, httpx_client):
|
| 277 |
"""Test with empty message content"""
|
| 278 |
payload = {
|
| 279 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 280 |
"messages": [{"role": "user", "content": ""}],
|
| 281 |
"max_tokens": 50
|
| 282 |
}
|
|
@@ -297,7 +297,7 @@ class TestResponseFormat:
|
|
| 297 |
async def test_response_schema(self, httpx_client):
|
| 298 |
"""Validate complete response schema"""
|
| 299 |
payload = {
|
| 300 |
-
"model": "DragonLLM/qwen3-8b-fin-v1.0",
|
| 301 |
"messages": [{"role": "user", "content": "Test"}],
|
| 302 |
"max_tokens": 50
|
| 303 |
}
|
|
|
|
| 58 |
async def test_chat_completions_endpoint(self, httpx_client):
|
| 59 |
"""Test POST /v1/chat/completions endpoint"""
|
| 60 |
payload = {
|
| 61 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 62 |
"messages": [
|
| 63 |
{"role": "user", "content": "Say hello"}
|
| 64 |
]
|
|
|
|
| 109 |
"""Test chat completion using official OpenAI client"""
|
| 110 |
try:
|
| 111 |
response = openai_client.chat.completions.create(
|
| 112 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 113 |
messages=[
|
| 114 |
{"role": "user", "content": "What is 2+2?"}
|
| 115 |
],
|
|
|
|
| 133 |
"""Test streaming with official OpenAI client"""
|
| 134 |
try:
|
| 135 |
stream = openai_client.chat.completions.create(
|
| 136 |
+
model="DragonLLM/Qwen-Open-Finance-R-8B",
|
| 137 |
messages=[
|
| 138 |
{"role": "user", "content": "Count to 5"}
|
| 139 |
],
|
|
|
|
| 162 |
async def test_system_message(self, httpx_client):
|
| 163 |
"""Test with system message"""
|
| 164 |
payload = {
|
| 165 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 166 |
"messages": [
|
| 167 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 168 |
{"role": "user", "content": "Hello"}
|
|
|
|
| 185 |
async def test_conversation_history(self, httpx_client):
|
| 186 |
"""Test with conversation history"""
|
| 187 |
payload = {
|
| 188 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 189 |
"messages": [
|
| 190 |
{"role": "user", "content": "My name is Alice."},
|
| 191 |
{"role": "assistant", "content": "Hello Alice! Nice to meet you."},
|
|
|
|
| 220 |
|
| 221 |
for params in parameters:
|
| 222 |
payload = {
|
| 223 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 224 |
"messages": [{"role": "user", "content": "Hello"}],
|
| 225 |
**params
|
| 226 |
}
|
|
|
|
| 276 |
async def test_empty_message(self, httpx_client):
|
| 277 |
"""Test with empty message content"""
|
| 278 |
payload = {
|
| 279 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 280 |
"messages": [{"role": "user", "content": ""}],
|
| 281 |
"max_tokens": 50
|
| 282 |
}
|
|
|
|
| 297 |
async def test_response_schema(self, httpx_client):
|
| 298 |
"""Validate complete response schema"""
|
| 299 |
payload = {
|
| 300 |
+
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
|
| 301 |
"messages": [{"role": "user", "content": "Test"}],
|
| 302 |
"max_tokens": 50
|
| 303 |
}
|
tests/test_config.py
CHANGED
|
@@ -9,7 +9,7 @@ from app.config import Settings
|
|
| 9 |
def test_settings_defaults():
|
| 10 |
"""Test that settings have correct default values."""
|
| 11 |
settings = Settings()
|
| 12 |
-
assert settings.model == "DragonLLM/qwen3-8b-fin-v1.0"
|
| 13 |
assert settings.service_api_key is None
|
| 14 |
assert settings.log_level == "info"
|
| 15 |
|
|
|
|
| 9 |
def test_settings_defaults():
|
| 10 |
"""Test that settings have correct default values."""
|
| 11 |
settings = Settings()
|
| 12 |
+
assert settings.model == "DragonLLM/Qwen-Open-Finance-R-8B"
|
| 13 |
assert settings.service_api_key is None
|
| 14 |
assert settings.log_level == "info"
|
| 15 |
|