jeanbaptdzd committed
Commit 5ac5a91 · 1 Parent(s): bf16ed7

Fix critical bugs: OOM errors, race conditions, truncation, and French language support

Major fixes:
1. Thread-safe model initialization with locking to prevent concurrent loads (see the sketch after this list)
2. Proper GPU memory cleanup on failed initialization attempts
3. Added PYTORCH_CUDA_ALLOC_CONF for better memory fragmentation handling
4. Set max_memory limit (20GiB) to prevent OOM during model load
5. Automatic French language detection and system prompt injection
6. Lowered default max_tokens from 1000 to 500 for complete, non-truncated responses
7. Removed min_new_tokens constraint that was causing truncation
8. Added comprehensive memory cleanup in finally blocks
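
Items 1 and 2 follow the classic double-checked locking pattern. A minimal sketch (the names _init_lock and _initialized match the diff below; _load_model_weights is a hypothetical stand-in for the real Hugging Face load):

    from threading import Lock

    _init_lock = Lock()
    _initialized = False

    def _load_model_weights():
        # hypothetical stand-in for the real AutoModelForCausalLM load
        print("loading weights once")

    def initialize_model():
        global _initialized
        if _initialized:          # fast path: no lock once loaded
            return
        with _init_lock:          # only one thread may enter the load
            if _initialized:      # re-check: another thread may have won the race
                return
            _load_model_weights()
            _initialized = True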

Technical details:
- Added _init_lock to prevent race conditions
- Added _initializing and _initialized flags for state tracking
- Created _clear_gpu_memory() for thorough cleanup
- French detection based on accented characters and common French words
- Automatic French system prompt: 'Répondez TOUJOURS en français'
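
The detection heuristic can be read on its own. A minimal sketch (looks_french is an illustrative name; the logic mirrors what this commit adds to transformers_provider.py):

    def looks_french(text: str) -> bool:
        text = text.lower()
        # Explicit request such as "Répondez en français"
        if "en français" in text:
            return True
        # Common French question words, gated on accents to cut false positives
        trigger_words = ("expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle")
        accents = "éèêàçùô"
        return any(w in text for w in trigger_words) and (
            any(c in text for c in accents) or "c'est" in text
        )

    assert looks_french("Pourquoi les obligations sont-elles risquées ?")
    assert not looks_french("Explain why bonds are risky.")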

Dockerfile CHANGED

@@ -10,6 +10,8 @@ RUN echo "Build cache bust: ${CACHE_BUST}" && \
 ENV PYTHONUNBUFFERED=1
 ENV DEBIAN_FRONTEND=noninteractive
 ENV BUILD_ID=transformers_backend_20250130
+# PyTorch CUDA memory management to prevent fragmentation
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
 # Install Python 3.11 and build dependencies
 RUN apt-get update && apt-get install -y \
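
A quick way to confirm the allocator setting reaches the Python process (a minimal check to run inside the built container; PyTorch reads this variable when its CUDA caching allocator initializes):

    import os

    # Expect "expandable_segments:True", as set via ENV in the Dockerfile
    print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))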

app/providers/transformers_provider.py CHANGED

@@ -4,128 +4,206 @@ import gc
 import torch
 from typing import Dict, Any, AsyncIterator, Union
 import asyncio
+from threading import Thread, Lock
+import time  # needed by the initialization wait loop below
 from huggingface_hub import login
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread
 
 # Model configuration
 model_name = "DragonLLM/qwen3-8b-fin-v1.0"
 model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
+_init_lock = Lock()  # Lock to prevent concurrent initialization
+_initializing = False  # Track if initialization is in progress
+_initialized = False  # Track if initialization completed successfully
+
+def _clear_gpu_memory():
+    """Clear GPU memory completely."""
+    global model, tokenizer
+    if torch.cuda.is_available():
+        if model is not None:
+            try:
+                del model
+            except:
+                pass
+        if tokenizer is not None:
+            try:
+                del tokenizer
+            except:
+                pass
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        gc.collect()
+    # Force garbage collection multiple times
+    for _ in range(3):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 def initialize_model():
     """Initialize Transformers model with Qwen3
 
+    Thread-safe initialization with proper memory cleanup on failure.
     Handles authentication with Hugging Face Hub for accessing DragonLLM models.
     Prioritizes HF_TOKEN_LC2 (DragonLLM access) over HF_TOKEN_LC.
     """
-    global model, tokenizer
+    global model, tokenizer, _initializing, _initialized
 
-    if model is None:
-        import logging
-        logger = logging.getLogger(__name__)
-
-        logger.info(f"Initializing Transformers with model: {model_name}")
-        print(f"Initializing Transformers with model: {model_name}")
-
-        # Get HF token from environment (Hugging Face Space secret)
-        # Priority: HF_TOKEN_LC2 (for DragonLLM access) > HF_TOKEN_LC > HF_TOKEN
-        hf_token = (
-            os.getenv("HF_TOKEN_LC2") or
-            os.getenv("HF_TOKEN_LC") or
-            os.getenv("HF_TOKEN") or
-            os.getenv("HUGGING_FACE_HUB_TOKEN")
-        )
-
-        if hf_token:
-            # Determine token source for logging
-            if os.getenv("HF_TOKEN_LC2"):
-                token_source = "HF_TOKEN_LC2"
-            elif os.getenv("HF_TOKEN_LC"):
-                token_source = "HF_TOKEN_LC"
-            elif os.getenv("HF_TOKEN"):
-                token_source = "HF_TOKEN"
-            else:
-                token_source = "HUGGING_FACE_HUB_TOKEN"
-
-            logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
-            print(f"✅ {token_source} found (length: {len(hf_token)})")
-
-            # Authenticate with Hugging Face Hub
-            try:
-                login(token=hf_token, add_to_git_credential=False)
-                logger.info("✅ Successfully authenticated with Hugging Face Hub")
-                print("✅ Successfully authenticated with Hugging Face Hub")
-            except Exception as e:
-                logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-                print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
-
-            # Set all possible environment variables
-            os.environ["HF_TOKEN"] = hf_token
-            os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
-            os.environ["HF_API_TOKEN"] = hf_token
-
-            logger.info("✅ Hugging Face token environment variables set")
-        else:
-            logger.warning("⚠️ WARNING: No HF token found in environment!")
-            print("⚠️ WARNING: No HF token found in environment!")
-            print(" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
-            print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")
-
-        try:
-            logger.info(f"Loading model: {model_name}")
-            print(f"Loading model: {model_name}")
-            print(f"Model type: DragonLLM Qwen3 8B")
-            print(f"Device: {device}")
-            print(f"Trust remote code: True")
-
-            # Load tokenizer
-            print("📥 Loading tokenizer...")
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                token=hf_token,
-                trust_remote_code=True,
-                cache_dir="/tmp/huggingface"
-            )
-            logger.info("✅ Tokenizer loaded")
-            print("✅ Tokenizer loaded")
-
-            # Load model with optimizations
-            print("📥 Loading model (this may take a few minutes)...")
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                token=hf_token,
-                trust_remote_code=True,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                cache_dir="/tmp/huggingface"
-            )
-
-            # Set to eval mode for inference
-            model.eval()
-
-            print(f"✅ Model loaded successfully!")
-            logger.info("✅ Model initialized successfully")
-
-        except Exception as e:
-            error_msg = f"❌ Error initializing model: {e}"
-            logger.error(error_msg, exc_info=True)
-            print(error_msg)
-
-            # Provide helpful error message for authentication issues
-            if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
-                print("\n🔐 Authentication Error Detected!")
-                print(" This usually means:")
-                print(" 1. HF_TOKEN_LC2 is missing or invalid")
-                print(" 2. You haven't accepted the model's terms on Hugging Face")
-                print(" 3. The token doesn't have access to DragonLLM models")
-                print("\n To fix:")
-                print(" 1. Visit: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
-                print(" 2. Accept the model's terms of use")
-                print(" 3. Ensure HF_TOKEN_LC2 is set as a secret in your HF Space")
-
-            raise
+    # If already initialized, return immediately
+    if _initialized and model is not None:
+        return
+
+    # Acquire lock to prevent concurrent initialization
+    with _init_lock:
+        # Double-check after acquiring lock
+        if _initialized and model is not None:
+            return
+
+        # If already initializing, wait
+        if _initializing:
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.warning("Model initialization already in progress, waiting...")
+            # Wait for initialization to complete (with timeout)
+            wait_count = 0
+            while _initializing and wait_count < 300:  # 5 minute timeout
+                time.sleep(1)
+                wait_count += 1
+                if _initialized and model is not None:
+                    return
+            if wait_count >= 300:
+                logger.error("Model initialization timeout!")
+                raise RuntimeError("Model initialization timed out")
+            return
+
+        # Clear any previous failed attempts
+        if model is None and torch.cuda.is_available():
+            _clear_gpu_memory()
+
+        _initializing = True
+
+        try:
+            import logging
+            logger = logging.getLogger(__name__)
+
+            logger.info(f"Initializing Transformers with model: {model_name}")
+            print(f"Initializing Transformers with model: {model_name}")
+
+            # Get HF token from environment (Hugging Face Space secret)
+            # Priority: HF_TOKEN_LC2 (for DragonLLM access) > HF_TOKEN_LC > HF_TOKEN
+            hf_token = (
+                os.getenv("HF_TOKEN_LC2") or
+                os.getenv("HF_TOKEN_LC") or
+                os.getenv("HF_TOKEN") or
+                os.getenv("HUGGING_FACE_HUB_TOKEN")
+            )
+
+            if hf_token:
+                # Determine token source for logging
+                if os.getenv("HF_TOKEN_LC2"):
+                    token_source = "HF_TOKEN_LC2"
+                elif os.getenv("HF_TOKEN_LC"):
+                    token_source = "HF_TOKEN_LC"
+                elif os.getenv("HF_TOKEN"):
+                    token_source = "HF_TOKEN"
+                else:
+                    token_source = "HUGGING_FACE_HUB_TOKEN"
+
+                logger.info(f"✅ {token_source} found (length: {len(hf_token)})")
+                print(f"✅ {token_source} found (length: {len(hf_token)})")
+
+                # Authenticate with Hugging Face Hub
+                try:
+                    login(token=hf_token, add_to_git_credential=False)
+                    logger.info("✅ Successfully authenticated with Hugging Face Hub")
+                    print("✅ Successfully authenticated with Hugging Face Hub")
+                except Exception as e:
+                    logger.warning(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+                    print(f"⚠️ Warning: Failed to authenticate with HF Hub: {e}")
+
+                # Set all possible environment variables
+                os.environ["HF_TOKEN"] = hf_token
+                os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
+                os.environ["HF_API_TOKEN"] = hf_token
+
+                logger.info("✅ Hugging Face token environment variables set")
+            else:
+                logger.warning("⚠️ WARNING: No HF token found in environment!")
+                print("⚠️ WARNING: No HF token found in environment!")
+                print(" Checked: HF_TOKEN_LC2, HF_TOKEN_LC, HF_TOKEN, HUGGING_FACE_HUB_TOKEN")
+                print(" ⚠️ Model download may fail if DragonLLM/qwen3-8b-fin-v1.0 is gated!")
+
+            try:
+                logger.info(f"Loading model: {model_name}")
+                print(f"Loading model: {model_name}")
+                print(f"Model type: DragonLLM Qwen3 8B")
+                print(f"Device: {device}")
+                print(f"Trust remote code: True")
+
+                # Load tokenizer
+                print("📥 Loading tokenizer...")
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_name,
+                    token=hf_token,
+                    trust_remote_code=True,
+                    cache_dir="/tmp/huggingface"
+                )
+                logger.info("✅ Tokenizer loaded")
+                print("✅ Tokenizer loaded")
+
+                # Clear GPU memory before loading model
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    gc.collect()
+
+                # Load model with optimizations and memory limits
+                print("📥 Loading model (this may take a few minutes)...")
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    token=hf_token,
+                    trust_remote_code=True,
+                    dtype=torch.bfloat16,  # Use dtype instead of torch_dtype (newer API)
+                    device_map="auto",
+                    max_memory={0: "20GiB"} if torch.cuda.is_available() else None,  # Leave 2GB buffer
+                    cache_dir="/tmp/huggingface",
+                    low_cpu_mem_usage=True
+                )
+
+                # Set to eval mode for inference
+                model.eval()
+
+                # Mark as initialized only after successful load
+                _initialized = True
+
+                print(f"✅ Model loaded successfully!")
+                logger.info("✅ Model initialized successfully")
+
+            except Exception as e:
+                error_msg = f"❌ Error initializing model: {e}"
+                logger.error(error_msg, exc_info=True)
+                print(error_msg)
+
+                # Clear memory on failure
+                _clear_gpu_memory()
+                model = None
+                tokenizer = None
+
+                # Provide helpful error message for authentication issues
+                if "401" in str(e) or "Unauthorized" in str(e) or "authentication" in str(e).lower():
+                    print("\n🔐 Authentication Error Detected!")
+                    print(" This usually means:")
+                    print(" 1. HF_TOKEN_LC2 is missing or invalid")
+                    print(" 2. You haven't accepted the model's terms on Hugging Face")
+                    print(" 3. The token doesn't have access to DragonLLM models")
+                    print("\n To fix:")
+                    print(" 1. Visit: https://huggingface.co/DragonLLM/qwen3-8b-fin-v1.0")
+                    print(" 2. Accept the model's terms of use")
+                    print(" 3. Ensure HF_TOKEN_LC2 is set as a secret in your HF Space")
+
+                raise
+        finally:
+            _initializing = False
 
 
 class TransformersProvider:

@@ -162,9 +240,31 @@ class TransformersProvider:
 
         messages = payload.get("messages", [])
         temperature = payload.get("temperature", 0.7)
-        max_tokens = payload.get("max_tokens", 1000)
+        max_tokens = payload.get("max_tokens", 500)  # Default lowered from 1000 for complete answers
         top_p = payload.get("top_p", 1.0)
 
+        # Detect if French language is requested and add system prompt
+        user_messages = [msg for msg in messages if msg.get("role") == "user"]
+        system_messages = [msg for msg in messages if msg.get("role") == "system"]
+
+        # Check if any user message is in French or explicitly requests French
+        is_french_request = False
+        for msg in user_messages:
+            content = msg.get("content", "").lower()
+            if any(phrase in content for phrase in ["répondez en français", "en français", "réponse française", "répondez uniquement en français"]):
+                is_french_request = True
+                break
+            # Simple French detection - check for common French words
+            if any(word in content for word in ["expliquez", "qu'est", "comment", "pourquoi", "quel", "quelle", "définir", "définition"]):
+                # Additional check: has French characters or common French words
+                if any(char in content for char in ["é", "è", "ê", "à", "ç", "ù", "ô"]) or "c'est" in content:
+                    is_french_request = True
+                    break
+
+        # Add French system prompt if needed and not already present
+        if is_french_request and not any("français" in msg.get("content", "").lower() for msg in system_messages):
+            messages = [{"role": "system", "content": "Vous êtes un assistant financier expert. Répondez TOUJOURS en français. Utilisez uniquement le français dans vos réponses, y compris dans les calculs et explications."}] + messages
+
         # Convert messages to prompt using tokenizer's chat template
         if hasattr(tokenizer, "apply_chat_template"):
             prompt = tokenizer.apply_chat_template(

@@ -196,9 +296,9 @@ class TransformersProvider:
             do_sample=temperature > 0,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
-            # Ensure reasonable minimum generation (max 10% of max_tokens)
-            min_new_tokens=min(10, max_tokens // 10),
-            repetition_penalty=1.05
+            # Don't set min_new_tokens too high - let model finish naturally
+            repetition_penalty=1.05,
+            length_penalty=1.0
         )
 
         # Save token counts before cleanup
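
A hypothetical smoke test for the new locking behavior (the module path follows the file header above; running it really downloads and loads the model):

    import threading

    from app.providers import transformers_provider as tp

    # Four threads race into initialize_model(); _init_lock should serialize
    # them so the weights are loaded exactly once.
    threads = [threading.Thread(target=tp.initialize_model) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert tp._initialized and tp.model is not None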

memory_test_results.txt ADDED

@@ -0,0 +1,137 @@
+Starting comprehensive tests...
+
+================================================================================
+MEMORY STRESS TEST - 15 sequential requests
+================================================================================
+
+[Request 1/15]
+ ✅ Status: stop
+ ⏱️ Time: 17.12s
+ 📝 Tokens: 250/285
+ 📄 Length: 829 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ears. So the formula becomes A = 5000*(1 + 0.04/1)^(1*2). That simplifies to 5000*(1.04)^2.
+
+Calcul
+
+[Request 2/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 864 chars
+ ✅ Complete: Yes
+
+[Request 3/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.81s
+ 📝 Tokens: 250/285
+ 📄 Length: 871 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...ut step by step.
+
+First, calculate the rate per period: r/n = 0.04 / 1 = 0.04. Then add 1 to that: 1
+
+[Request 4/15]
+ ✅ Status: stop
+ ⏱️ Time: 16.82s
+ 📝 Tokens: 250/285
+ 📄 Length: 764 chars
+ ✅ Complete: No
+ ⚠️ WARNING: Response may be truncated!
+ Last 100 chars: ...t simplifies to 5000*(1.04)^2. Calculating 1.04 squared... 1.04 * 1.04 is 1.0816. Then multiply by 5
+
+[Request 5/15]
+❌ Error: Exception: The read operation timed out
+
+[Request 6/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+
+[Request 7/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 8/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 9/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 10/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 11/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 12/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 13/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 14/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Request 15/15]
+❌ Error: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+MEMORY STRESS TEST SUMMARY
+================================================================================
+Total requests: 15
+Successful: 4
+Failed: 11
+
+❌ Errors:
+ Request 5: Exception: The read operation timed out
+ Request 6: HTTP 500: {"error":{"message":"CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 21.12 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 11.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)","type":"internal_error"}}
+ Request 7: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 8: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 9: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 10: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 11: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 12: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 13: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 14: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+ Request 15: HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+📊 Performance:
+ Average time: 16.89s
+ Min time: 16.81s
+ Max time: 17.12s
+ Average tokens: 250
+
+================================================================================
+FRENCH LANGUAGE TEST
+================================================================================
+
+[Test 1/4] Simple French question
+Prompt: Expliquez brièvement ce qu'est une obligation (bond).
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 2/4] French with explicit instruction
+Prompt: Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 3/4] French calculation
+Prompt: Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+[Test 4/4] French finance terms
+Prompt: Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.
+❌ HTTP 500: {"error":{"message":"CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","type":"internal_error"}}
+
+================================================================================
+FRENCH LANGUAGE TEST SUMMARY
+================================================================================
+Total tests: 4
+French answers: 0/4
+Complete answers: 0/4
+
+❌ Some answers are not in French!
+
+================================================================================
+FINAL SUMMARY
+================================================================================
+Memory management: ❌ FAIL
+French language: ❌ FAIL

test_memory_stress.py ADDED

@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""
+Stress test memory management with multiple sequential requests.
+Also checks if responses are complete and in French when requested.
+"""
+
+import httpx
+import json
+import time
+import sys
+from typing import List, Dict, Any
+
+BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+def test_memory_stability(num_requests: int = 10):
+    """Send multiple requests sequentially to test memory cleanup."""
+    print("="*80)
+    print(f"MEMORY STRESS TEST - {num_requests} sequential requests")
+    print("="*80)
+
+    errors = []
+    times = []
+    token_counts = []
+
+    for i in range(1, num_requests + 1):
+        print(f"\n[Request {i}/{num_requests}]")
+        start_time = time.time()
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": f"Question {i}: Calculate compound interest on $5,000 at 4% for 2 years. Show your work."
+                        }
+                    ],
+                    "max_tokens": 250,
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            elapsed = time.time() - start_time
+
+            if response.status_code != 200:
+                error_msg = f"HTTP {response.status_code}: {response.text}"
+                print(f"❌ Error: {error_msg}")
+                errors.append((i, error_msg))
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                error_msg = data["error"]["message"]
+                print(f"❌ API Error: {error_msg}")
+                errors.append((i, error_msg))
+
+                # Check if it's an OOM error
+                if "out of memory" in error_msg.lower() or "cuda" in error_msg.lower():
+                    print(f"🚨 MEMORY ERROR DETECTED at request {i}!")
+                continue
+
+            # Extract response data
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+            usage = data.get("usage", {})
+
+            prompt_tokens = usage.get("prompt_tokens", 0)
+            completion_tokens = usage.get("completion_tokens", 0)
+            total_tokens = usage.get("total_tokens", 0)
+
+            times.append(elapsed)
+            token_counts.append(completion_tokens)
+
+            # Check if response is complete
+            is_complete = finish_reason == "stop"
+            is_truncated = finish_reason == "length"
+
+            # Check if answer seems complete (doesn't end mid-sentence)
+            ends_properly = (
+                content.strip().endswith(".") or
+                content.strip().endswith("!") or
+                content.strip().endswith("?") or
+                content.strip().endswith("€") or
+                content.strip().endswith("$")
+            )
+
+            print(f" ✅ Status: {finish_reason}")
+            print(f" ⏱️ Time: {elapsed:.2f}s")
+            print(f" 📝 Tokens: {completion_tokens}/{total_tokens}")
+            print(f" 📄 Length: {len(content)} chars")
+            print(f" ✅ Complete: {'Yes' if is_complete and ends_properly else 'No'}")
+
+            if is_truncated or (not is_complete) or (not ends_properly):
+                print(f" ⚠️ WARNING: Response may be truncated!")
+                print(f" Last 100 chars: ...{content[-100:]}")
+
+        except Exception as e:
+            elapsed = time.time() - start_time
+            error_msg = f"Exception: {str(e)}"
+            print(f"❌ Error: {error_msg}")
+            errors.append((i, error_msg))
+
+        # Small delay between requests
+        if i < num_requests:
+            time.sleep(1)
+
+    # Summary
+    print("\n" + "="*80)
+    print("MEMORY STRESS TEST SUMMARY")
+    print("="*80)
+    print(f"Total requests: {num_requests}")
+    print(f"Successful: {num_requests - len(errors)}")
+    print(f"Failed: {len(errors)}")
+
+    if errors:
+        print("\n❌ Errors:")
+        for req_num, error in errors:
+            print(f" Request {req_num}: {error}")
+
+    if times:
+        print(f"\n📊 Performance:")
+        print(f" Average time: {sum(times)/len(times):.2f}s")
+        print(f" Min time: {min(times):.2f}s")
+        print(f" Max time: {max(times):.2f}s")
+        print(f" Average tokens: {sum(token_counts)/len(token_counts):.0f}")
+
+        # Check for memory leaks (increasing response times)
+        if len(times) > 3:
+            first_half = sum(times[:len(times)//2]) / (len(times)//2)
+            second_half = sum(times[len(times)//2:]) / (len(times) - len(times)//2)
+            if second_half > first_half * 1.5:
+                print(f" ⚠️ WARNING: Response times increasing ({first_half:.2f}s → {second_half:.2f}s)")
+                print(f" This may indicate memory leak!")
+
+    return len(errors) == 0
+
+
+def test_french_language():
+    """Test if French prompts produce French answers."""
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST")
+    print("="*80)
+
+    test_questions = [
+        {
+            "name": "Simple French question",
+            "prompt": "Expliquez brièvement ce qu'est une obligation (bond).",
+            "max_tokens": 200
+        },
+        {
+            "name": "French with explicit instruction",
+            "prompt": "Expliquez ce qu'est le CAC 40. Répondez UNIQUEMENT en français, sans utiliser d'anglais.",
+            "max_tokens": 250
+        },
+        {
+            "name": "French calculation",
+            "prompt": "Si j'investis 10 000€ à 5% pendant 3 ans, combien aurai-je? Montrez le calcul. Répondez en français.",
+            "max_tokens": 300
+        },
+        {
+            "name": "French finance terms",
+            "prompt": "Qu'est-ce qu'une SICAV et comment fonctionne-t-elle? Expliquez en français.",
+            "max_tokens": 350
+        }
+    ]
+
+    results = []
+
+    for i, test in enumerate(test_questions, 1):
+        print(f"\n[Test {i}/{len(test_questions)}] {test['name']}")
+        print(f"Prompt: {test['prompt']}")
+
+        try:
+            response = httpx.post(
+                f"{BASE_URL}/v1/chat/completions",
+                json={
+                    "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Vous êtes un assistant financier expert. Répondez toujours en français."
+                        },
+                        {
+                            "role": "user",
+                            "content": test["prompt"]
+                        }
+                    ],
+                    "max_tokens": test["max_tokens"],
+                    "temperature": 0.3
+                },
+                timeout=60.0
+            )
+
+            if response.status_code != 200:
+                print(f"❌ HTTP {response.status_code}: {response.text}")
+                results.append({"test": test["name"], "status": "error", "error": response.text})
+                continue
+
+            data = response.json()
+
+            if "error" in data:
+                print(f"❌ API Error: {data['error']['message']}")
+                results.append({"test": test["name"], "status": "error", "error": data["error"]["message"]})
+                continue
+
+            choice = data.get("choices", [{}])[0]
+            message = choice.get("message", {})
+            content = message.get("content", "")
+            finish_reason = choice.get("finish_reason", "unknown")
+
+            # Check if answer is in French (simple heuristic)
+            # Remove reasoning tags for analysis
+            answer_only = content
+            if "<think>" in answer_only:
+                parts = answer_only.split("</think>")
+                if len(parts) > 1:
+                    answer_only = parts[-1].strip()
+
+            # Check for French words
+            french_indicators = ["est", "sont", "pour", "dans", "avec", "comme", "une", "le", "la", "les", "l'", "c'est", "qu'est", "fonctionne"]
+            english_indicators = ["is", "are", "for", "in", "with", "the", "a", "an", "it's", "what's", "works"]
+
+            french_count = sum(1 for word in french_indicators if word.lower() in answer_only.lower())
+            english_count = sum(1 for word in english_indicators if word.lower() in answer_only.lower())
+
+            is_french = french_count > english_count * 2 or french_count > 3
+
+            # Check completeness
+            is_complete = finish_reason == "stop"
+            ends_properly = answer_only.strip().endswith((".", "!", "?", "€", "$", ":"))
+
+            print(f"\n📄 Full Response (first 500 chars):")
+            print(content[:500] + ("..." if len(content) > 500 else ""))
+
+            print(f"\n📄 Answer Only (after reasoning):")
+            print(answer_only[:400] + ("..." if len(answer_only) > 400 else ""))
+
+            print(f"\n📊 Analysis:")
+            print(f" Finish reason: {finish_reason}")
+            print(f" French words found: {french_count}")
+            print(f" English words found: {english_count}")
+            print(f" Is French: {'✅ Yes' if is_french else '❌ No'}")
+            print(f" Is complete: {'✅ Yes' if is_complete and ends_properly else '❌ No'}")
+
+            if not is_french:
+                print(f" ⚠️ WARNING: Answer appears to be in English!")
+
+            results.append({
+                "test": test["name"],
+                "status": "success" if is_french and is_complete else "partial",
+                "is_french": is_french,
+                "is_complete": is_complete,
+                "content": content,
+                "answer_only": answer_only
+            })
+
+        except Exception as e:
+            print(f"❌ Exception: {str(e)}")
+            results.append({"test": test["name"], "status": "error", "error": str(e)})
+
+    # Summary
+    print("\n" + "="*80)
+    print("FRENCH LANGUAGE TEST SUMMARY")
+    print("="*80)
+
+    french_count = sum(1 for r in results if r.get("is_french", False))
+    complete_count = sum(1 for r in results if r.get("is_complete", False))
+
+    print(f"Total tests: {len(results)}")
+    print(f"French answers: {french_count}/{len(results)}")
+    print(f"Complete answers: {complete_count}/{len(results)}")
+
+    if french_count < len(results):
+        print("\n❌ Some answers are not in French!")
+
+    return french_count == len(results) and complete_count == len(results)
+
+
+if __name__ == "__main__":
+    print("Starting comprehensive tests...\n")
+
+    # Test memory stability
+    memory_ok = test_memory_stability(num_requests=15)
+
+    # Test French language
+    french_ok = test_french_language()
+
+    # Final summary
+    print("\n" + "="*80)
+    print("FINAL SUMMARY")
+    print("="*80)
+    print(f"Memory management: {'✅ PASS' if memory_ok else '❌ FAIL'}")
+    print(f"French language: {'✅ PASS' if french_ok else '❌ FAIL'}")
+
+    sys.exit(0 if (memory_ok and french_ok) else 1)
+
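
The script can be run directly with `python test_memory_stress.py`; it targets the Space hard-coded in BASE_URL and exits non-zero if either the memory or the French test fails, so it can double as a CI gate.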