jeanbaptdzd committed
Commit 5ac5a91 · 1 Parent(s): bf16ed7

Fix critical bugs: OOM errors, race conditions, truncation, and French language support

Major fixes:
1. Thread-safe model initialization with locking to prevent concurrent loads (see the sketch after this list)
2. Proper GPU memory cleanup on failed initialization attempts
3. Added PYTORCH_CUDA_ALLOC_CONF for better memory fragmentation handling
4. Set max_memory limit (20GiB) to prevent OOM during model load
5. Automatic French language detection and system prompt injection
6. Lowered default max_tokens from 1000 to 500 for complete, non-truncated responses
7. Removed min_new_tokens constraint that was causing truncation
8. Added comprehensive memory cleanup in finally blocks
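
Items 1 and 2 follow the classic double-checked locking pattern. A minimal sketch (the names _init_lock and _initialized match the diff below; _load_model_weights is a hypothetical stand-in for the real Hugging Face load):

    from threading import Lock

    _init_lock = Lock()
    _initialized = False

    def _load_model_weights():
        # hypothetical stand-in for the real AutoModelForCausalLM load
        print("loading weights once")

    def initialize_model():
        global _initialized
        if _initialized:          # fast path: no lock once loaded
            return
        with _init_lock:          # only one thread may enter the load
            if _initialized:      # re-check: another thread may have won the race
                return
            _load_model_weights()
            _initialized = True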

Technical details:
- Added _init_lock to prevent race conditions
- Added _initializing and _initialized flags for state tracking
- Created _clear_gpu_memory() for thorough cleanup
- French detection based on accented characters and common French words
- Automatic French system prompt: 'Répondez TOUJOURS en français'
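
The detection heuristic can be read on its own. A minimal sketch (looks_french is an illustrative name; the logic mirrors what this commit adds to transformers_provider.py):

    def looks_french(text: str) -> bool:
        text = text.lower()
        # Explicit request such as "Répondez en français"
        if "en français" in text:
            return True
        # Common French question words, gated on accents to cut false positives
        trigger_words = ("expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle")
        accents = "éèêàçùô"
        return any(w in text for w in trigger_words) and (
            any(c in text for c in accents) or "c'est" in text
        )

    assert looks_french("Pourquoi les obligations sont-elles risquées ?")
    assert not looks_french("Explain why bonds are risky.")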

Dockerfile CHANGED

@@ -10,6 +10,8 @@ RUN echo "Build cache bust: ${CACHE_BUST}" && \
 ENV PYTHONUNBUFFERED=1
 ENV DEBIAN_FRONTEND=noninteractive
 ENV BUILD_ID=transformers_backend_20250130
+# PyTorch CUDA memory management to prevent fragmentation
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
 # Install Python 3.11 and build dependencies
 RUN apt-get update && apt-get install -y \
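
A quick way to confirm the allocator setting reaches the Python process (a minimal check to run inside the built container; PyTorch reads this variable when its CUDA caching allocator initializes):

    import os

    # Expect "expandable_segments:True", as set via ENV in the Dockerfile
    print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))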

app/providers/transformers_provider.py CHANGED

@@ -4,128 +4,206 @@ import gc
 import torch
 from typing import Dict, Any, AsyncIterator, Union
 import asyncio
+from threading import Thread, Lock
+import time  # needed by the initialization wait loop below
 from huggingface_hub import login
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread
 
 # Model configuration
 model_name = "DragonLLM/qwen3-8b-fin-v1.0"
 model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
+_init_lock = Lock()  # Lock to prevent concurrent initialization
+_initializing = False  # Track if initialization is in progress
+_initialized = False  # Track if initialization completed successfully
+
+def _clear_gpu_memory():
+    """Clear GPU memory completely."""
+    global model, tokenizer
+    if torch.cuda.is_available():
+        if model is not None:
+            try:
+                del model
+            except:
+                pass
+        if tokenizer is not None:
+            try:
+                del tokenizer
+            except:
+                pass
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        gc.collect()
+    # Force garbage collection multiple times
+    for _ in range(3):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 def initialize_model():
     """Initialize Transformers model with Qwen3
 
+    Thread-safe initialization with proper memory cleanup on failure.
     Handles authentication with Hugging Face Hub for accessing DragonLLM models.
     Prioritizes HF_TOKEN_LC2 (DragonLLM access) over HF_TOKEN_LC.
     """
-    global model, tokenizer
+    global model, tokenizer, _initializing, _initialized
 
-    if model is None:
-        import logging
-        logger = logging.getLogger(__name__)
-
-        logger.info(f"Initializing Transformers with model: {model_name}")
-        print(f"Initializing Transformers with model: {model_name}")
-
-        # Get HF token from environment (Hugging Face Space secret)
-        # Priority: HF_TOKEN_LC2 (for DragonLLM access) > HF_TOKEN_LC > HF_TOKEN
-        hf_token = (
-            os.getenv("HF_TOKEN_LC2") or
-            os.getenv("HF_TOKEN_LC") or
-            os.getenv("HF_TOKEN") or
-            os.getenv("HUGGING_FACE_HUB_TOKEN")
-        )
-
-        if hf_token:
-            # Determine token source for logging
-            if os.getenv("HF_TOKEN_LC2"):
-                token_source = "HF_TOKEN_LC2"
-            elif os.getenv("HF_TOKEN_LC"):
-                token_source = "HF_TOKEN_LC"
-            elif os.getenv("HF_TOKEN"):
-                token_source = "HF_TOKEN"
-            else:
-                token_source = "HUGGING_FACE_HUB_TOKEN"
-
-            logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
-            print(f"✅ {token_source} found (length: {len(hf_token)})")
-
-            # Authenticate with Hugging Face Hub
-            try:
-                login(token=hf_token, add_to_git_credential=False)
-                logger.info("✅ Successfully authenticated with Hugging Face Hub")
-                print("✅ Successfully authenticated with Hugging Face Hub")
-            except Exception as e:
-                logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-                print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-
-            # Set all possible environment variables
-            os.environ["HF_TOKEN"] = hf_token
-            os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
-            os.environ["HF_API_TOKEN"] = hf_token
-
-            logger.info("✅ Hugging Face token environment variables set")
-        else:
-            logger.warning("⚠️ WARNING: No HF token found in environment!")
-            print("⚠️ WARNING: No HF token found in environment!")
-            print(" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
-            print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")
-
-        try:
-            logger.info(f"Loading model: {model_name}")
-            print(f"Loading model: {model_name}")
-            print(f"Model type: DragonLLM Qwen3 8B")
-            print(f"Device: {device}")
-            print(f"Trust remote code: True")
-
-            # Load tokenizer
-            print("📥 Loading tokenizer...")
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                token=hf_token,
-                trust_remote_code=True,
-                cache_dir="/tmp/huggingface"
-            )
-            logger.info("✅ Tokenizer loaded")
-            print("✅ Tokenizer loaded")
-
-            # Load model with optimizations
-            print("📥 Loading model (this may take a few minutes)...")
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                token=hf_token,
-                trust_remote_code=True,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                cache_dir="/tmp/huggingface"
-            )
-
-            # Set to eval mode for inference
-            model.eval()
-
-            print(f"✅ Model loaded successfully!")
-            logger.info("✅ Model initialized successfully")
-
-        except Exception as e:
-            error_msg = f"❌ Error initializing model: {e}"
-            logger.error(error_msg, exc_info=True)
-            print(error_msg)
-
-            # Provide helpful error message for authentication issues
-            if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
-                print("\n🔐 Authentication Error Detected!")
-                print(" This usually means:")
-                print(" 1. HF_TOKEN_LC2 is missing or invalid")
-                print(" 2. You haven't accepted the model's terms on Hugging Face")
-                print(" 3. The token doesn't have access to DragonLLM models")
-                print("\n To fix:")
-                print(" 1. Visit: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
-                print(" 2. Accept the model's terms of use")
-                print(" 3. Ensure HF_TOKEN_LC2 is set as a secret in your HF Space")
-
-            raise
+    # If already initialized, return immediately
+    if _initialized and model is not None:
+        return
+
+    # Acquire lock to prevent concurrent initialization
+    with _init_lock:
+        # Double-check after acquiring lock
+        if _initialized and model is not None:
+            return
+
+        # If already initializing, wait
+        if _initializing:
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.warning("Model initialization already in progress, waiting...")
+            # Wait for initialization to complete (with timeout)
+            wait_count = 0
+            while _initializing and wait_count < 300:  # 5 minute timeout
+                time.sleep(1)
+                wait_count += 1
+                if _initialized and model is not None:
+                    return
+            if wait_count >= 300:
+                logger.error("Model initialization timeout!")
+                raise RuntimeError("Model initialization timed out")
+            return
+
+        # Clear any previous failed attempts
+        if model is None and torch.cuda.is_available():
+            _clear_gpu_memory()
+
+        _initializing = True
+
+        try:
+            import logging
+            logger = logging.getLogger(__name__)
+
+            logger.info(f"Initializing Transformers with model: {model_name}")
+            print(f"Initializing Transformers with model: {model_name}")
+
+            # Get HF token from environment (Hugging Face Space secret)
+            # Priority: HF_TOKEN_LC2 (for DragonLLM access) > HF_TOKEN_LC > HF_TOKEN
+            hf_token = (
+                os.getenv("HF_TOKEN_LC2") or
+                os.getenv("HF_TOKEN_LC") or
+                os.getenv("HF_TOKEN") or
+                os.getenv("HUGGING_FACE_HUB_TOKEN")
+            )
+
+            if hf_token:
+                # Determine token source for logging
+                if os.getenv("HF_TOKEN_LC2"):
+                    token_source = "HF_TOKEN_LC2"
+                elif os.getenv("HF_TOKEN_LC"):
+                    token_source = "HF_TOKEN_LC"
+                elif os.getenv("HF_TOKEN"):
+                    token_source = "HF_TOKEN"
+                else:
+                    token_source = "HUGGING_FACE_HUB_TOKEN"
+
+                logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
+                print(f"✅ {token_source} found (length: {len(hf_token)})")
+
+                # Authenticate with Hugging Face Hub
+                try:
+                    login(token=hf_token, add_to_git_credential=False)
+                    logger.info("✅ Successfully authenticated with Hugging Face Hub")
+                    print("✅ Successfully authenticated with Hugging Face Hub")
+                except Exception as e:
+                    logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+                    print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+
+                # Set all possible environment variables
+                os.environ["HF_TOKEN"] = hf_token
+                os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
+                os.environ["HF_API_TOKEN"] = hf_token
+
+                logger.info("✅ Hugging Face token environment variables set")
+            else:
+                logger.warning("⚠️ WARNING: No HF token found in environment!")
+                print("⚠️ WARNING: No HF token found in environment!")
+                print(" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
+                print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")
+
+            try:
+                logger.info(f"Loading model: {model_name}")
+                print(f"Loading model: {model_name}")
+                print(f"Model type: DragonLLM Qwen3 8B")
+                print(f"Device: {device}")
+                print(f"Trust remote code: True")
+
+                # Load tokenizer
+                print("📥 Loading tokenizer...")
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_name,
+                    token=hf_token,
+                    trust_remote_code=True,
+                    cache_dir="/tmp/huggingface"
+                )
+                logger.info("✅ Tokenizer loaded")
+                print("✅ Tokenizer loaded")
+
+                # Clear GPU memory before loading model
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    gc.collect()
+
+                # Load model with optimizations and memory limits
+                print("📥 Loading model (this may take a few minutes)...")
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    token=hf_token,
+                    trust_remote_code=True,
+                    dtype=torch.bfloat16,  # Use dtype instead of torch_dtype (newer API)
+                    device_map="auto",
+                    max_memory={0: "20GiB"} if torch.cuda.is_available() else None,  # Leave 2GB buffer
+                    cache_dir="/tmp/huggingface",
+                    low_cpu_mem_usage=True
+                )
+
+                # Set to eval mode for inference
+                model.eval()
+
+                # Mark as initialized only after successful load
+                _initialized = True
+
+                print(f"✅ Model loaded successfully!")
+                logger.info("✅ Model initialized successfully")
+
+            except Exception as e:
+                error_msg = f"❌ Error initializing model: {e}"
+                logger.error(error_msg, exc_info=True)
+                print(error_msg)
+
+                # Clear memory on failure
+                _clear_gpu_memory()
+                model = None
+                tokenizer = None
+
+                # Provide helpful error message for authentication issues
+                if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
+                    print("\n🔐 Authentication Error Detected!")
+                    print(" This usually means:")
+                    print(" 1. HF_TOKEN_LC2 is missing or invalid")
+                    print(" 2. You haven't accepted the model's terms on Hugging Face")
+                    print(" 3. The token doesn't have access to DragonLLM models")
+                    print("\n To fix:")
+                    print(" 1. Visit: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
+                    print(" 2. Accept the model's terms of use")
+                    print(" 3. Ensure HF_TOKEN_LC2 is set as a secret in your HF Space")
+
+                raise
+        finally:
+            _initializing = False
 
 
 class TransformersProvider:

@@ -162,9 +240,31 @@ class TransformersProvider:
 
         messages = payload.get("messages", [])
         temperature = payload.get("temperature", 0.7)
-        max_tokens = payload.get("max_tokens", 1000)
+        max_tokens = payload.get("max_tokens", 500)  # Default lowered from 1000 for complete answers
         top_p = payload.get("top_p", 1.0)
 
+        # Detect if French language is requested and add system prompt
+        user_messages = [msg for msg in messages if msg.get("role") == "user"]
+        system_messages = [msg for msg in messages if msg.get("role") == "system"]
+
+        # Check if any user message is in French or explicitly requests French
+        is_french_request = False
+        for msg in user_messages:
+            content = msg.get("content", "").lower()
+            if any(phrase in content for phrase in ["répondez en français", "en français", "réponse française", "répondez uniquement en français"]):
+                is_french_request = True
+                break
+            # Simple French detection - check for common French words
+            if any(word in content for word in ["expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle", "définir", "définition"]):
+                # Additional check: has French characters or common French words
+                if any(char in content for char in ["é", "è", "ê", "à", "ç", "ù", "ô"]) or "c'est" in content:
+                    is_french_request = True
+                    break
+
+        # Add French system prompt if needed and not already present
+        if is_french_request and not any("français" in msg.get("content", "").lower() for msg in system_messages):
+            messages = [{"role": "system", "content": "Vous êtes un assistant financier expert. Répondez TOUJOURS en français. Utilisez uniquement le français dans vos réponses, y compris dans les calculs et explications."}] + messages
+
         # Convert messages to prompt using tokenizer's chat template
         if hasattr(tokenizer, "apply_chat_template"):
             prompt = tokenizer.apply_chat_template(

@@ -196,9 +296,9 @@ class TransformersProvider:
             do_sample=temperature > 0,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
-            # Ensure reasonable minimum generation (max 10% of max_tokens)
-            min_new_tokens=min(10, max_tokens // 10),
-            repetition_penalty=1.05
+            # Don't set min_new_tokens too high - let model finish naturally
+            repetition_penalty=1.05,
+            length_penalty=1.0
         )
 
         # Save token counts before cleanup
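
A hypothetical smoke test for the new locking behavior (the module path follows the file header above; running it really downloads and loads the model):

    import threading

    from app.providers import transformers_provider as tp

    # Four threads race into initialize_model(); _init_lock should serialize
    # them so the weights are loaded exactly once.
    threads = [threading.Thread(target=tp.initialize_model) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert tp._initialized and tp.model is not None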

memory_test_results.txt ADDED

@@ -0,0 +1,137 @@
+Starting comprehensive tests...
+
+================================================================================
+MEMORY STRESS TEST - 15 sequential requests
+================================================================================
+
+[Request 1/15]
+ ✅ Status: stop
+ ⏱️ Time: 17.12s
+ 📝 Tokens: 250/285
+ 📄 Length: 829 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ears. So the formula becomes A = 5000*(1 + 0.04/1)^(1*2). That simplifies to 5000*(1.04)^2.
+
+Calcul
+
+[Request 2/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 864 chars
+ ✅ Complete: Yes
+
+[Request 3/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 871 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ut step by step.
+
+First, calculate the rate per period: r/n = 0.04 / 1 = 0.04. Then add 1 to that: 1
+
+[Request 4/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.82s
+ 📝 Tokens: 250/285
+ 📄 Length: 764 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...t simplifies to 5000*(1.04)^2. Calculating 1.04 squared... 1.04 * 1.04 is 1.0816. Then multiply by 5
+
+[Request 5/15]
+❌ Error: Exception: The read operation timed out
+
+[Request 6/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+
+[Request 7/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 8/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 9/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 10/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 11/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 12/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 13/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 14/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 15/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+MEMORY STRESS TEST SUMMARY
+================================================================================
+Total requests: 15
+Successful: 4
+Failed: 11
+
+❌ Errors:
+ Request 5: Exception: The read operation timed out
+ Request 6: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+ Request 7: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 8: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 9: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 10: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 11: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 12: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 13: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 14: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 15: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+📊 Performance:
+ Average time: 16.89s
+ Min time: 16.81s
+ Max time: 17.12s
+ Average tokens: 250
+
+================================================================================
+FRENCH LANGUAGE TEST
+================================================================================
+
+[Test 1/4] Simple French question
+Prompt: Expliquez brièvement ce qu'est une obligation (bond).
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 2/4] French with explicit instruction
+Prompt: Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 3/4] French calculation
+Prompt: Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 4/4] French finance terms
+Prompt: Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+FRENCH LANGUAGE TEST SUMMARY
+================================================================================
+Total tests: 4
+French answers: 0/4
+Complete answers: 0/4
+
+❌ Some answers are not in French!
+
+================================================================================
+FINAL SUMMARY
+================================================================================
+Memory management: ❌ FAIL
+French language: ❌ FAIL

test_memory_stress.py ADDED

@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""
+Stress test memory management with multiple sequential requests.
+Also checks if responses are complete and in French when requested.
+"""
+
+import httpx
+import json
+import time
+import sys
+from typing import List, Dict, Any
+
+BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+def test_memory_stability(num_requests: int = 10):
+    """Send multiple requests sequentially to test memory cleanup."""
+    print("="*80)
+    print(f"MEMORY STRESS TEST - {num_requests} sequential requests")
+    print("="*80)
+
+    errors = []
+    times = []
+    token_counts = []
+
+    for i in range(1, num_requests + 1):
+        print(f"\n[Request {i}/{num_requests}]")
+        start_time = time.time()
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": f"Question {i}: Calculate compound interest on $5,000 at 4% for 2 years. Show your work."
+                        }
+                    ],
+                    "max_tokens": 250,
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            elapsed = time.time() - start_time
+
+            if response.status_code != 200:
+                error_msg = f"HTTP {response.status_code}: {response.text}"
+                print(f"❌ Error: {error_msg}")
+                errors.append((i, error_msg))
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                error_msg = data["error"]["message"]
+                print(f"❌ API Error: {error_msg}")
+                errors.append((i, error_msg))
+
+                # Check if it's an OOM error
+                if "out of memory" in error_msg.lower() or "cuda" in error_msg.lower():
+                    print(f"🚨 MEMORY ERROR DETECTED at request {i}!")
+                continue
+
+            # Extract response data
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+            usage = data.get("usage", {})
+
+            prompt_tokens = usage.get("prompt_tokens", 0)
+            completion_tokens = usage.get("completion_tokens", 0)
+            total_tokens = usage.get("total_tokens", 0)
+
+            times.append(elapsed)
+            token_counts.append(completion_tokens)
+
+            # Check if response is complete
+            is_complete = finish_reason == "stop"
+            is_truncated = finish_reason == "length"
+
+            # Check if answer seems complete (doesn't end mid-sentence)
+            ends_properly = (
+                content.strip().endswith(".") or
+                content.strip().endswith("!") or
+                content.strip().endswith("?") or
+                content.strip().endswith("€") or
+                content.strip().endswith("$")
+            )
+
+            print(f" ✅ Status: {finish_reason}")
+            print(f" ⏱️ Time: {elapsed:.2f}s")
+            print(f" 📝 Tokens: {completion_tokens}/{total_tokens}")
+            print(f" 📄 Length: {len(content)} chars")
+            print(f" ✅ Complete: {'Yes' if is_complete and ends_properly else 'No'}")
+
+            if is_truncated or (not is_complete) or (not ends_properly):
+                print(f" ⚠️ WARNING: Response may be truncated!")
+                print(f" Last 100 chars: ...{content[-100:]}")
+
+        except Exception as e:
+            elapsed = time.time() - start_time
+            error_msg = f"Exception: {str(e)}"
+            print(f"❌ Error: {error_msg}")
+            errors.append((i, error_msg))
+
+        # Small delay between requests
+        if i < num_requests:
+            time.sleep(1)
+
+    # Summary
+    print("\n" + "="*80)
+    print("MEMORY STRESS TEST SUMMARY")
+    print("="*80)
+    print(f"Total requests: {num_requests}")
+    print(f"Successful: {num_requests - len(errors)}")
+    print(f"Failed: {len(errors)}")
+
+    if errors:
+        print("\n❌ Errors:")
+        for req_num, error in errors:
+            print(f" Request {req_num}: {error}")
+
+    if times:
+        print(f"\n📊 Performance:")
+        print(f" Average time: {sum(times)/len(times):.2f}s")
+        print(f" Min time: {min(times):.2f}s")
+        print(f" Max time: {max(times):.2f}s")
+        print(f" Average tokens: {sum(token_counts)/len(token_counts):.0f}")
+
+        # Check for memory leaks (increasing response times)
+        if len(times) > 3:
+            first_half = sum(times[:len(times)//2]) / (len(times)//2)
+            second_half = sum(times[len(times)//2:]) / (len(times) - len(times)//2)
+            if second_half > first_half * 1.5:
+                print(f" ⚠️ WARNING: Response times increasing ({first_half:.2f}s → {second_half:.2f}s)")
+                print(f" This may indicate memory leak!")
+
+    return len(errors) == 0
+
+
+def test_french_language():
+    """Test if French prompts produce French answers."""
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST")
+    print("="*80)
+
+    test_questions = [
+        {
+            "name": "Simple French question",
+            "prompt": "Expliquez brièvement ce qu'est une obligation (bond).",
+            "max_tokens": 200
+        },
+        {
+            "name": "French with explicit instruction",
+            "prompt": "Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.",
+            "max_tokens": 250
+        },
+        {
+            "name": "French calculation",
+            "prompt": "Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.",
+            "max_tokens": 300
+        },
+        {
+            "name": "French finance terms",
+            "prompt": "Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.",
+            "max_tokens": 350
+        }
+    ]
+
+    results = []
+
+    for i, test in enumerate(test_questions, 1):
+        print(f"\n[Test {i}/{len(test_questions)}] {test['name']}")
+        print(f"Prompt: {test['prompt']}")
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Vous êtes un assistant financier expert. Répondez toujours en français."
+                        },
+                        {
+                            "role": "user",
+                            "content": test["prompt"]
+                        }
+                    ],
+                    "max_tokens": test["max_tokens"],
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            if response.status_code != 200:
+                print(f"❌ HTTP {response.status_code}: {response.text}")
+                results.append({"test": test["name"], "status": "error", "error": response.text})
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                print(f"❌ API Error: {data['error']['message']}")
+                results.append({"test": test["name"], "status": "error", "error": data["error"]["message"]})
+                continue
+
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+
+            # Check if answer is in French (simple heuristic)
+            # Remove reasoning tags for analysis
+            answer_only = content
+            if "<think>" in answer_only:
+                parts = answer_only.split("</think>")
+                if len(parts) > 1:
+                    answer_only = parts[-1].strip()
+
+            # Check for French words
+            french_indicators = ["est", "sont", "pour", "dans", "avec", "comme", "une", "le", "la", "les", "l'", "c'est", "qu'est", "fonctionne"]
+            english_indicators = ["is", "are", "for", "in", "with", "the", "a", "an", "it's", "what's", "works"]
+
+            french_count = sum(1 for word in french_indicators if word.lower() in answer_only.lower())
+            english_count = sum(1 for word in english_indicators if word.lower() in answer_only.lower())
+
+            is_french = french_count > english_count * 2 or french_count > 3
+
+            # Check completeness
+            is_complete = finish_reason == "stop"
+            ends_properly = answer_only.strip().endswith((".", "!", "?", "€", "$", ":"))
+
+            print(f"\n📄 Full Response (first 500 chars):")
+            print(content[:500] + ("..." if len(content) > 500 else ""))
+
+            print(f"\n📄 Answer Only (after reasoning):")
+            print(answer_only[:400] + ("..." if len(answer_only) > 400 else ""))
+
+            print(f"\n📊 Analysis:")
+            print(f" Finish reason: {finish_reason}")
+            print(f" French words found: {french_count}")
+            print(f" English words found: {english_count}")
+            print(f" Is French: {'✅ Yes' if is_french else '❌ No'}")
+            print(f" Is complete: {'✅ Yes' if is_complete and ends_properly else '❌ No'}")
+
+            if not is_french:
+                print(f" ⚠️ WARNING: Answer appears to be in English!")
+
+            results.append({
+                "test": test["name"],
+                "status": "success" if is_french and is_complete else "partial",
+                "is_french": is_french,
+                "is_complete": is_complete,
+                "content": content,
+                "answer_only": answer_only
+            })
+
+        except Exception as e:
+            print(f"❌ Exception: {str(e)}")
+            results.append({"test": test["name"], "status": "error", "error": str(e)})
+
+    # Summary
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST SUMMARY")
+    print("="*80)
+
+    french_count = sum(1 for r in results if r.get("is_french", False))
+    complete_count = sum(1 for r in results if r.get("is_complete", False))
+
+    print(f"Total tests: {len(results)}")
+    print(f"French answers: {french_count}/{len(results)}")
+    print(f"Complete answers: {complete_count}/{len(results)}")
+
+    if french_count < len(results):
+        print("\n❌ Some answers are not in French!")
+
+    return french_count == len(results) and complete_count == len(results)
+
+
+if __name__ == "__main__":
+    print("Starting comprehensive tests...\n")
+
+    # Test memory stability
+    memory_ok = test_memory_stability(num_requests=15)
+
+    # Test French language
+    french_ok = test_french_language()
+
+    # Final summary
+    print("\n" + "="*80)
+    print("FINAL SUMMARY")
+    print("="*80)
+    print(f"Memory management: {'✅ PASS' if memory_ok else '❌ FAIL'}")
+    print(f"French language: {'✅ PASS' if french_ok else '❌ FAIL'}")
+
+    sys.exit(0 if (memory_ok and french_ok) else 1)
+
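
The script can be run directly with `python test_memory_stress.py`; it targets the Space hard-coded in BASE_URL and exits non-zero if either the memory or the French test fails, so it can double as a CI gate.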