Fix critical bugs: OOM errors, race conditions, truncation, and French language support
Major fixes:
1. Thread-safe model initialization with locking to prevent concurrent loads
2. Proper GPU memory cleanup on failed initialization attempts
3. Added PYTORCH_CUDA_ALLOC_CONF for better memory fragmentation handling
4. Set max_memory limit (20GiB) to prevent OOM during model load
5. Automatic French language detection and system prompt injection
6. Increased default max_tokens to 500 for more complete answers
7. Removed min_new_tokens constraint that was causing truncation
8. Added comprehensive memory cleanup in finally blocks
Technical details:
- Added _init_lock to prevent race conditions
- Added _initializing and _initialized flags for state tracking
- Created _clear_gpu_memory() for thorough cleanup
- French detection based on accented characters and common French words
- Automatic French system prompt: 'Répondez TOUJOURS en français'
- Dockerfile +2 -0
- app/providers/transformers_provider.py +199 -100
- memory_test_results.txt +137 -0
- test_memory_stress.py +302 -0
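
For orientation, the thread-safe initialization is an instance of the classic double-checked locking idiom. A minimal sketch of the pattern follows (illustrative only: _load_model_and_tokenizer is a hypothetical stand-in for the expensive load, and the real code in the diff below additionally tracks an _initializing flag with a timeout):

from threading import Lock

_init_lock = Lock()
_initialized = False

def initialize_once():
    global _initialized
    # First check without the lock: cheap fast path once loaded.
    if _initialized:
        return
    with _init_lock:
        # Second check under the lock: another thread may have finished
        # initializing while this one was waiting to acquire it.
        if _initialized:
            return
        _load_model_and_tokenizer()  # hypothetical stand-in for the expensive load
        _initialized = True  # flip the flag only after the load succeeds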
Dockerfile (+2 -0)

@@ -10,6 +10,8 @@ RUN echo "Build cache bust: ${CACHE_BUST}" && \
 ENV PYTHONUNBUFFERED=1
 ENV DEBIAN_FRONTEND=noninteractive
 ENV BUILD_ID=transformers_backend_20250130
+# PyTorch CUDA memory management to prevent fragmentation
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

 # Install Python 3.11 and build dependencies
 RUN apt-get update && apt-get install -y \
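For context, expandable_segments:True tells PyTorch's caching allocator to grow a few expandable memory segments instead of reserving many fixed-size blocks, which targets exactly the "reserved by PyTorch but unallocated" fragmentation visible in the test log further down. A small, illustrative way to observe the allocator's numbers (not part of this commit):

import os
# Must be set before the first CUDA allocation to take effect.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")  # force an allocation
    allocated = torch.cuda.memory_allocated()   # bytes held by live tensors
    reserved = torch.cuda.memory_reserved()     # bytes reserved by the caching allocator
    # A large reserved-minus-allocated gap is the fragmentation symptom
    # this setting is meant to reduce.
    print(f"allocated={allocated / 2**20:.1f} MiB, reserved={reserved / 2**20:.1f} MiB")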
app/providers/transformers_provider.py (+199 -100)

@@ -4,128 +4,205 @@ import gc
 import torch
 from typing import Dict, Any, AsyncIterator, Union
 import asyncio
+from threading import Thread, Lock
 from huggingface_hub import login
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread

 # Model configuration
 model_name = "DragonLLM/qwen3-8b-fin-v1.0"
 model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
+_init_lock = Lock()  # Lock to prevent concurrent initialization
+_initializing = False  # Track if initialization is in progress
+_initialized = False  # Track if initialization completed successfully
+
+def _clear_gpu_memory():
+    """Clear GPU memory completely."""
+    global model, tokenizer
+    if torch.cuda.is_available():
+        if model is not None:
+            try:
+                del model
+            except:
+                pass
+        if tokenizer is not None:
+            try:
+                del tokenizer
+            except:
+                pass
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        gc.collect()
+        # Force garbage collection multiple times
+        for _ in range(3):
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

 def initialize_model():
     """Initialize Transformers model with Qwen3

+    Thread-safe initialization with proper memory cleanup on failure.
     Handles authentication with Hugging Face Hub for accessing DragonLLM models.
     Prioritizes HF_TOKEN_LC2 (DragonLLM access) over HF_TOKEN_LC.
     """
-    global model, tokenizer
-
-    hf_token = (
-        os.getenv("HF_TOKEN_LC2") or
-        os.getenv("HF_TOKEN_LC") or
-        os.getenv("HF_TOKEN") or
-        os.getenv("HUGGING_FACE_HUB_TOKEN")
-    )
-
-    if hf_token:
-        # Determine token source for logging
-        if os.getenv("HF_TOKEN_LC2"):
-            token_source = "HF_TOKEN_LC2"
-        elif os.getenv("HF_TOKEN_LC"):
-            token_source = "HF_TOKEN_LC"
-        elif os.getenv("HF_TOKEN"):
-            token_source = "HF_TOKEN"
-        else:
-            token_source = "HUGGING_FACE_HUB_TOKEN"
-
-        logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
-        print(f"✅ {token_source} found (length: {len(hf_token)})")
-
-        # Authenticate with Hugging Face Hub
-        try:
-            login(token=hf_token, add_to_git_credential=False)
-            logger.info("✅ Successfully authenticated with Hugging Face Hub")
-            print("✅ Successfully authenticated with Hugging Face Hub")
-        except Exception as e:
-            logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-            print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-
-        # Set all possible environment variables
-        os.environ["HF_TOKEN"] = hf_token
-        os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
-        os.environ["HF_API_TOKEN"] = hf_token
-
-        logger.info("✅ Hugging Face token environment variables set")
-    else:
-        logger.warning("⚠️ WARNING: No HF token found in environment!")
-        print("⚠️ WARNING: No HF token found in environment!")
-        print(f" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
-        print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")
+    global model, tokenizer, _initializing, _initialized
+
+    # If already initialized, return immediately
+    if _initialized and model is not None:
+        return
+
+    # Acquire lock to prevent concurrent initialization
+    with _init_lock:
+        # Double-check after acquiring lock
+        if _initialized and model is not None:
+            return
+
+        # If already initializing, wait
+        if _initializing:
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.warning("Model initialization already in progress, waiting...")
+            # Wait for initialization to complete (with timeout)
+            wait_count = 0
+            while _initializing and wait_count < 300:  # 5 minute timeout
+                time.sleep(1)
+                wait_count += 1
+            if _initialized and model is not None:
+                return
+            if wait_count >= 300:
+                logger.error("Model initialization timeout!")
+                raise RuntimeError("Model initialization timed out")
+            return
+
+        # Clear any previous failed attempts
+        if model is None and torch.cuda.is_available():
+            _clear_gpu_memory()
+
+        _initializing = True

     try:
+        import logging
+        logger = logging.getLogger(__name__)
+
+        logger.info(f"Initializing Transformers with model: {model_name}")
+        print(f"Initializing Transformers with model: {model_name}")
+
+        # Get HF token from environment (Hugging Face Space secret)
+        # Priority: HF_TOKEN_LC2 (for DragonLLM access) > HF_TOKEN_LC > HF_TOKEN
+        hf_token = (
+            os.getenv("HF_TOKEN_LC2") or
+            os.getenv("HF_TOKEN_LC") or
+            os.getenv("HF_TOKEN") or
+            os.getenv("HUGGING_FACE_HUB_TOKEN")
+        )
+
+        if hf_token:
+            # Determine token source for logging
+            if os.getenv("HF_TOKEN_LC2"):
+                token_source = "HF_TOKEN_LC2"
+            elif os.getenv("HF_TOKEN_LC"):
+                token_source = "HF_TOKEN_LC"
+            elif os.getenv("HF_TOKEN"):
+                token_source = "HF_TOKEN"
+            else:
+                token_source = "HUGGING_FACE_HUB_TOKEN"
+
+            logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
+            print(f"✅ {token_source} found (length: {len(hf_token)})")
+
+            # Authenticate with Hugging Face Hub
+            try:
+                login(token=hf_token, add_to_git_credential=False)
+                logger.info("✅ Successfully authenticated with Hugging Face Hub")
+                print("✅ Successfully authenticated with Hugging Face Hub")
+            except Exception as e:
+                logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+                print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+
+            # Set all possible environment variables
+            os.environ["HF_TOKEN"] = hf_token
+            os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
+            os.environ["HF_API_TOKEN"] = hf_token
+
+            logger.info("✅ Hugging Face token environment variables set")
+        else:
+            logger.warning("⚠️ WARNING: No HF token found in environment!")
+            print("⚠️ WARNING: No HF token found in environment!")
+            print(f" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
+            print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")

-        print(f"Model type: DragonLLM Qwen3 8B")
-        print(f"Device: {device}")
-        print(f"Trust remote code: True")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            token=hf_token,
-            trust_remote_code=True,
-            cache_dir="/tmp/huggingface"
-        )
-        logger.info("✅ Tokenizer loaded")
-        print("✅ Tokenizer loaded")
-
-        model = AutoModelForCausalLM.from_pretrained(
-            device_map="auto",
-            cache_dir="/tmp/huggingface"
-        )
+        try:
+            logger.info(f"Loading model: {model_name}")
+            print(f"Loading model: {model_name}")
+            print(f"Model type: DragonLLM Qwen3 8B")
+            print(f"Device: {device}")
+            print(f"Trust remote code: True")
+
+            # Load tokenizer
+            print("📥 Loading tokenizer...")
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                token=hf_token,
+                trust_remote_code=True,
+                cache_dir="/tmp/huggingface"
+            )
+            logger.info("✅ Tokenizer loaded")
+            print("✅ Tokenizer loaded")
+
+            # Clear GPU memory before loading model
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+
+            # Load model with optimizations and memory limits
+            print("📥 Loading model (this may take a few minutes)...")
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                token=hf_token,
+                trust_remote_code=True,
+                dtype=torch.bfloat16,  # Use dtype instead of torch_dtype (newer API)
+                device_map="auto",
+                max_memory={0: "20GiB"} if torch.cuda.is_available() else None,  # Leave 2GB buffer
+                cache_dir="/tmp/huggingface",
+                low_cpu_mem_usage=True
+            )
+
+            # Set to eval mode for inference
+            model.eval()
+
+            # Mark as initialized only after successful load
+            _initialized = True
+
+            print(f"✅ Model loaded successfully!")
+            logger.info("✅ Model initialized successfully")
+
+        except Exception as e:
+            error_msg = f"❌ Error initializing model: {e}"
+            logger.error(error_msg, exc_info=True)
+            print(error_msg)
+
+            # Clear memory on failure
+            _clear_gpu_memory()
+            model = None
+            tokenizer = None
+
+            # Provide helpful error message for authentication issues
+            if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
+                print("\n🔐 Authentication Error Detected!")
+                print("   This usually means:")
+                print("   1. HF_TOKEN_LC2 is missing or invalid")
+                print("   2. You haven't accepted the model's terms on Hugging Face")
+                print("   3. The token doesn't have access to DragonLLM models")
+                print("\n   To fix:")
+                print("   1. Visit: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
+                print("   2. Accept the model's terms of use")
+                print("   3. Ensure HF_TOKEN_LC2 is set as a secret in your HF Space")
+
+            raise
+    finally:
+        _initializing = False


 class TransformersProvider:

@@ -162,9 +239,31 @@
 
         messages = payload.get("messages", [])
         temperature = payload.get("temperature", 0.7)
-        max_tokens = payload.get("max_tokens",
+        max_tokens = payload.get("max_tokens", 500)  # Increased default for complete answers
         top_p = payload.get("top_p", 1.0)

+        # Detect if French language is requested and add system prompt
+        user_messages = [msg for msg in messages if msg.get("role") == "user"]
+        system_messages = [msg for msg in messages if msg.get("role") == "system"]
+
+        # Check if any user message is in French or explicitly requests French
+        is_french_request = False
+        for msg in user_messages:
+            content = msg.get("content", "").lower()
+            if any(phrase in content for phrase in ["répondez en français", "en français", "réponse française", "répondez uniquement en français"]):
+                is_french_request = True
+                break
+            # Simple French detection - check for common French words
+            if any(word in content for word in ["expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle", "définir", "définition"]):
+                # Additional check: has French characters or common French words
+                if any(char in content for char in ["é", "è", "ê", "à", "ç", "ù", "ô"]) or "c'est" in content:
+                    is_french_request = True
+                    break
+
+        # Add French system prompt if needed and not already present
+        if is_french_request and not any("français" in msg.get("content", "").lower() for msg in system_messages):
+            messages = [{"role": "system", "content": "Vous êtes un assistant financier expert. Répondez TOUJOURS en français. Utilisez uniquement le français dans vos réponses, y compris dans les calculs et explications."}] + messages
+
         # Convert messages to prompt using tokenizer's chat template
         if hasattr(tokenizer, "apply_chat_template"):
             prompt = tokenizer.apply_chat_template(

@@ -196,9 +295,9 @@
             do_sample=temperature > 0,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
+            # Don't set min_new_tokens too high - let model finish naturally
+            repetition_penalty=1.05,
+            length_penalty=1.0
         )

         # Save token counts before cleanup
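The French-detection heuristic added above is easiest to sanity-check in isolation; the following is an illustrative standalone replay of the same logic, not code from this commit:

# Replay of the detection added above: an explicit French request, or a common
# French question word combined with accented characters or "c'est".
FRENCH_PHRASES = ["répondez en français", "en français", "réponse française",
                  "répondez uniquement en français"]
FRENCH_WORDS = ["expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle",
                "définir", "définition"]
FRENCH_CHARS = ["é", "è", "ê", "à", "ç", "ù", "ô"]

def looks_french(text: str) -> bool:
    content = text.lower()
    if any(phrase in content for phrase in FRENCH_PHRASES):
        return True
    if any(word in content for word in FRENCH_WORDS):
        return any(char in content for char in FRENCH_CHARS) or "c'est" in content
    return False

assert looks_french("Expliquez brièvement ce qu'est une obligation.")
assert not looks_french("Calculate compound interest on $5,000 at 4% for 2 years.")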
memory_test_results.txt (+137 -0)

@@ -0,0 +1,137 @@
+Starting comprehensive tests...
+
+================================================================================
+MEMORY STRESS TEST - 15 sequential requests
+================================================================================
+
+[Request 1/15]
+ ✅ Status: stop
+ ⏱️ Time: 17.12s
+ 📝 Tokens: 250/285
+ 📄 Length: 829 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ears. So the formula becomes A = 5000*(1 + 0.04/1)^(1*2). That simplifies to 5000*(1.04)^2.
+
+Calcul
+
+[Request 2/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 864 chars
+ ✅ Complete: Yes
+
+[Request 3/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 871 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ut step by step.
+
+First, calculate the rate per period: r/n = 0.04 / 1 = 0.04. Then add 1 to that: 1
+
+[Request 4/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.82s
+ 📝 Tokens: 250/285
+ 📄 Length: 764 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...t simplifies to 5000*(1.04)^2. Calculating 1.04 squared... 1.04 * 1.04 is 1.0816. Then multiply by 5
+
+[Request 5/15]
+❌ Error: Exception: The read operation timed out
+
+[Request 6/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+
+[Request 7/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 8/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 9/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 10/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 11/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 12/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 13/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 14/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 15/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+MEMORY STRESS TEST SUMMARY
+================================================================================
+Total requests: 15
+Successful: 4
+Failed: 11
+
+❌ Errors:
+ Request 5: Exception: The read operation timed out
+ Request 6: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+ Request 7: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 8: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 9: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 10: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 11: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 12: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 13: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 14: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 15: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+📊 Performance:
+ Average time: 16.89s
+ Min time: 16.81s
+ Max time: 17.12s
+ Average tokens: 250
+
+================================================================================
+FRENCH LANGUAGE TEST
+================================================================================
+
+[Test 1/4] Simple French question
+Prompt: Expliquez brièvement ce qu'est une obligation (bond).
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 2/4] French with explicit instruction
+Prompt: Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 3/4] French calculation
+Prompt: Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 4/4] French finance terms
+Prompt: Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+FRENCH LANGUAGE TEST SUMMARY
+================================================================================
+Total tests: 4
+French answers: 0/4
+Complete answers: 0/4
+
+❌ Some answers are not in French!
+
+================================================================================
+FINAL SUMMARY
+================================================================================
+Memory management: ❌ FAIL
+French language: ❌ FAIL
test_memory_stress.py (+302 -0)

@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""
+Stress test memory management with multiple sequential requests.
+Also checks if responses are complete and in French when requested.
+"""
+
+import httpx
+import json
+import time
+import sys
+from typing import List, Dict, Any
+
+BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+def test_memory_stability(num_requests: int = 10):
+    """Send multiple requests sequentially to test memory cleanup."""
+    print("="*80)
+    print(f"MEMORY STRESS TEST - {num_requests} sequential requests")
+    print("="*80)
+
+    errors = []
+    times = []
+    token_counts = []
+
+    for i in range(1, num_requests + 1):
+        print(f"\n[Request {i}/{num_requests}]")
+        start_time = time.time()
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": f"Question {i}: Calculate compound interest on $5,000 at 4% for 2 years. Show your work."
+                        }
+                    ],
+                    "max_tokens": 250,
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            elapsed = time.time() - start_time
+
+            if response.status_code != 200:
+                error_msg = f"HTTP {response.status_code}: {response.text}"
+                print(f"❌ Error: {error_msg}")
+                errors.append((i, error_msg))
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                error_msg = data["error"]["message"]
+                print(f"❌ API Error: {error_msg}")
+                errors.append((i, error_msg))
+
+                # Check if it's an OOM error
+                if "out of memory" in error_msg.lower() or "cuda" in error_msg.lower():
+                    print(f"🚨 MEMORY ERROR DETECTED at request {i}!")
+                continue
+
+            # Extract response data
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+            usage = data.get("usage", {})
+
+            prompt_tokens = usage.get("prompt_tokens", 0)
+            completion_tokens = usage.get("completion_tokens", 0)
+            total_tokens = usage.get("total_tokens", 0)
+
+            times.append(elapsed)
+            token_counts.append(completion_tokens)
+
+            # Check if response is complete
+            is_complete = finish_reason == "stop"
+            is_truncated = finish_reason == "length"
+
+            # Check if answer seems complete (doesn't end mid-sentence)
+            ends_properly = (
+                content.strip().endswith(".") or
+                content.strip().endswith("!") or
+                content.strip().endswith("?") or
+                content.strip().endswith("€") or
+                content.strip().endswith("$")
+            )
+
+            print(f" ✅ Status: {finish_reason}")
+            print(f" ⏱️ Time: {elapsed:.2f}s")
+            print(f" 📝 Tokens: {completion_tokens}/{total_tokens}")
+            print(f" 📄 Length: {len(content)} chars")
+            print(f" ✅ Complete: {'Yes' if is_complete and ends_properly else 'No'}")
+
+            if is_truncated or (not is_complete) or (not ends_properly):
+                print(f" ⚠️ WARNING: Response may be truncated!")
+                print(f" Last 100 chars: ...{content[-100:]}")
+
+        except Exception as e:
+            elapsed = time.time() - start_time
+            error_msg = f"Exception: {str(e)}"
+            print(f"❌ Error: {error_msg}")
+            errors.append((i, error_msg))
+
+        # Small delay between requests
+        if i < num_requests:
+            time.sleep(1)
+
+    # Summary
+    print("\n" + "="*80)
+    print("MEMORY STRESS TEST SUMMARY")
+    print("="*80)
+    print(f"Total requests: {num_requests}")
+    print(f"Successful: {num_requests - len(errors)}")
+    print(f"Failed: {len(errors)}")
+
+    if errors:
+        print("\n❌ Errors:")
+        for req_num, error in errors:
+            print(f" Request {req_num}: {error}")
+
+    if times:
+        print(f"\n📊 Performance:")
+        print(f" Average time: {sum(times)/len(times):.2f}s")
+        print(f" Min time: {min(times):.2f}s")
+        print(f" Max time: {max(times):.2f}s")
+        print(f" Average tokens: {sum(token_counts)/len(token_counts):.0f}")
+
+        # Check for memory leaks (increasing response times)
+        if len(times) > 3:
+            first_half = sum(times[:len(times)//2]) / (len(times)//2)
+            second_half = sum(times[len(times)//2:]) / (len(times) - len(times)//2)
+            if second_half > first_half * 1.5:
+                print(f" ⚠️ WARNING: Response times increasing ({first_half:.2f}s → {second_half:.2f}s)")
+                print(f" This may indicate memory leak!")
+
+    return len(errors) == 0
+
+
+def test_french_language():
+    """Test if French prompts produce French answers."""
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST")
+    print("="*80)
+
+    test_questions = [
+        {
+            "name": "Simple French question",
+            "prompt": "Expliquez brièvement ce qu'est une obligation (bond).",
+            "max_tokens": 200
+        },
+        {
+            "name": "French with explicit instruction",
+            "prompt": "Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.",
+            "max_tokens": 250
+        },
+        {
+            "name": "French calculation",
+            "prompt": "Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.",
+            "max_tokens": 300
+        },
+        {
+            "name": "French finance terms",
+            "prompt": "Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.",
+            "max_tokens": 350
+        }
+    ]
+
+    results = []
+
+    for i, test in enumerate(test_questions, 1):
+        print(f"\n[Test {i}/{len(test_questions)}] {test['name']}")
+        print(f"Prompt: {test['prompt']}")
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Vous êtes un assistant financier expert. Répondez toujours en français."
+                        },
+                        {
+                            "role": "user",
+                            "content": test["prompt"]
+                        }
+                    ],
+                    "max_tokens": test["max_tokens"],
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            if response.status_code != 200:
+                print(f"❌ HTTP {response.status_code}: {response.text}")
+                results.append({"test": test["name"], "status": "error", "error": response.text})
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                print(f"❌ API Error: {data['error']['message']}")
+                results.append({"test": test["name"], "status": "error", "error": data["error"]["message"]})
+                continue
+
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+
+            # Check if answer is in French (simple heuristic)
+            # Remove reasoning tags for analysis
+            answer_only = content
+            if "<think>" in answer_only:
+                parts = answer_only.split("</think>")
+                if len(parts) > 1:
+                    answer_only = parts[-1].strip()
+
+            # Check for French words
+            french_indicators = ["est", "sont", "pour", "dans", "avec", "comme", "une", "le", "la", "les", "l'", "c'est", "qu'est", "fonctionne"]
+            english_indicators = ["is", "are", "for", "in", "with", "the", "a", "an", "it's", "what's", "works"]
+
+            french_count = sum(1 for word in french_indicators if word.lower() in answer_only.lower())
+            english_count = sum(1 for word in english_indicators if word.lower() in answer_only.lower())
+
+            is_french = french_count > english_count * 2 or french_count > 3
+
+            # Check completeness
+            is_complete = finish_reason == "stop"
+            ends_properly = answer_only.strip().endswith((".", "!", "?", "€", "$", ":"))
+
+            print(f"\n📄 Full Response (first 500 chars):")
+            print(content[:500] + ("..." if len(content) > 500 else ""))
+
+            print(f"\n📄 Answer Only (after reasoning):")
+            print(answer_only[:400] + ("..." if len(answer_only) > 400 else ""))
+
+            print(f"\n📊 Analysis:")
+            print(f" Finish reason: {finish_reason}")
+            print(f" French words found: {french_count}")
+            print(f" English words found: {english_count}")
+            print(f" Is French: {'✅ Yes' if is_french else '❌ No'}")
+            print(f" Is complete: {'✅ Yes' if is_complete and ends_properly else '❌ No'}")
+
+            if not is_french:
+                print(f" ⚠️ WARNING: Answer appears to be in English!")
+
+            results.append({
+                "test": test["name"],
+                "status": "success" if is_french and is_complete else "partial",
+                "is_french": is_french,
+                "is_complete": is_complete,
+                "content": content,
+                "answer_only": answer_only
+            })
+
+        except Exception as e:
+            print(f"❌ Exception: {str(e)}")
+            results.append({"test": test["name"], "status": "error", "error": str(e)})
+
+    # Summary
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST SUMMARY")
+    print("="*80)
+
+    french_count = sum(1 for r in results if r.get("is_french", False))
+    complete_count = sum(1 for r in results if r.get("is_complete", False))
+
+    print(f"Total tests: {len(results)}")
+    print(f"French answers: {french_count}/{len(results)}")
+    print(f"Complete answers: {complete_count}/{len(results)}")
+
+    if french_count < len(results):
+        print("\n❌ Some answers are not in French!")
+
+    return french_count == len(results) and complete_count == len(results)
+
+
+if __name__ == "__main__":
+    print("Starting comprehensive tests...\n")
+
+    # Test memory stability
+    memory_ok = test_memory_stability(num_requests=15)
+
+    # Test French language
+    french_ok = test_french_language()
+
+    # Final summary
+    print("\n" + "="*80)
+    print("FINAL SUMMARY")
+    print("="*80)
+    print(f"Memory management: {'✅ PASS' if memory_ok else '❌ FAIL'}")
+    print(f"French language: {'✅ PASS' if french_ok else '❌ FAIL'}")
+
+    sys.exit(0 if (memory_ok and french_ok) else 1)