jeanbaptdzd committed on
Commit
bedfb0c
·
1 Parent(s): 16c2a22

Set DEFAULT_MAX_TOKENS=800 to prevent timeouts


A 1500-token limit causes 90+ second timeouts.
800 tokens provides a good balance:
- Complete answers for most questions
- Reasonable response time (~20-30s)
- Users can request a higher limit for complex questions (see the sketch below)
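For illustration, here is how a client might request that higher limit on a complex question; a minimal sketch using the same endpoint and model the test scripts below exercise (the 2000-token value is just an example, not a recommended setting):

    import httpx

    # Override the 800-token default for a question that needs a long answer.
    # Expect a proportionally longer wait, as the commit message notes.
    response = httpx.post(
        "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions",
        json={
            "model": "DragonLLM/qwen3-8b-fin-v1.0",
            "messages": [{"role": "user", "content": "Explain bond duration and convexity in detail."}],
            "max_tokens": 2000,  # per-request override of DEFAULT_MAX_TOKENS
        },
        timeout=120.0,  # longer generations need a longer client timeout
    )
    print(response.json()["choices"][0]["message"]["content"])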

Files changed (4)
  1. app/utils/constants.py +1 -1
  2. quick_test.py +54 -0
  3. test_regression.py +118 -0
  4. test_space_api.py +142 -0
app/utils/constants.py CHANGED
@@ -49,7 +49,7 @@ EOS_TOKENS = [151645, 151643] # [<|im_end|>, <|endoftext|>]
  PAD_TOKEN_ID = 151643 # <|endoftext|>

  # Generation defaults
- DEFAULT_MAX_TOKENS = 1500
+ DEFAULT_MAX_TOKENS = 800 # Balanced: complete answers without timeouts
  DEFAULT_TEMPERATURE = 0.7
  DEFAULT_TOP_P = 1.0
  DEFAULT_TOP_K = 20
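For context, a sketch of how such a default is typically consumed, assuming the request handler falls back to the constant when the client omits max_tokens (the helper is hypothetical; only the import path comes from this repo):

    from app.utils.constants import DEFAULT_MAX_TOKENS

    def resolve_max_tokens(requested: int | None) -> int:
        # Hypothetical helper: use the client's value when given,
        # otherwise fall back to the 800-token default.
        return requested if requested is not None else DEFAULT_MAX_TOKENS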
quick_test.py ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env python3
+ """Quick test of Space API"""
+ import httpx
+ import sys
+
+ SPACE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+ try:
+     # Test root endpoint
+     r = httpx.get(f"{SPACE_URL}/", timeout=10)
+     if r.status_code == 200:
+         data = r.json()
+         print(f"✓ Root endpoint: {data.get('backend', 'unknown')}")
+         print(f"  Model: {data.get('model', 'unknown')}")
+     else:
+         print(f"✗ Root endpoint failed: {r.status_code}")
+         sys.exit(1)
+
+     # Test models endpoint
+     r = httpx.get(f"{SPACE_URL}/v1/models", timeout=10)
+     if r.status_code == 200:
+         data = r.json()
+         models = data.get('data', [])
+         print(f"✓ Models endpoint: {len(models)} model(s)")
+     else:
+         print(f"✗ Models endpoint failed: {r.status_code}")
+         sys.exit(1)
+
+     # Test chat completion (short)
+     r = httpx.post(
+         f"{SPACE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "Say hello"}],
+             "max_tokens": 50
+         },
+         timeout=60
+     )
+     if r.status_code == 200:
+         data = r.json()
+         content = data['choices'][0]['message']['content']
+         print(f"✓ Chat completion: {len(content)} chars")
+         print(f"  Preview: {content[:50]}...")
+     else:
+         print(f"✗ Chat completion failed: {r.status_code}")
+         print(f"  Response: {r.text[:200]}")
+         sys.exit(1)
+
+     print("\n✓ All tests passed! Space is working.")
+
+ except Exception as e:
+     print(f"✗ Error: {e}")
+     sys.exit(1)
+
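Run it locally with "python quick_test.py" (only httpx is required). The script exits non-zero on the first failing check, so it doubles as a quick smoke test in CI.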
test_regression.py ADDED
@@ -0,0 +1,118 @@
+ #!/usr/bin/env python3
+ """
+ Regression test: verify EOS token fix improves completeness without breaking anything
+ """
+ import httpx
+ import json
+ import time
+
+ BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+ print("="*80)
+ print("REGRESSION & IMPROVEMENT TEST")
+ print("="*80)
+
+ # Test 1: Basic functionality still works
+ print("\n[1] Basic functionality check")
+ try:
+     response = httpx.post(
+         f"{BASE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "What is 2+2?"}],
+             "max_tokens": 100,
+             "temperature": 0.3
+         },
+         timeout=30.0
+     )
+
+     data = response.json()
+     if "error" not in data:
+         print("✅ Basic request works")
+     else:
+         print(f"❌ Error: {data['error']['message']}")
+ except Exception as e:
+     print(f"❌ Exception: {e}")
+
+ time.sleep(3)
+
+ # Test 2: French answer with reasonable token limit
+ print("\n[2] French answer (500 tokens)")
+ try:
+     response = httpx.post(
+         f"{BASE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "Qu'est-ce qu'une obligation? Réponse courte."}],
+             "max_tokens": 500,
+             "temperature": 0.3
+         },
+         timeout=45.0
+     )
+
+     data = response.json()
+     if "error" in data:
+         print(f"❌ Error: {data['error']['message'][:100]}")
+     else:
+         content = data["choices"][0]["message"]["content"]
+         finish = data["choices"][0]["finish_reason"]
+         tokens = data.get("usage", {}).get("completion_tokens", 0)
+
+         answer = content.split("</think>")[1].strip() if "</think>" in content else content
+
+         print(f"Tokens: {tokens}/500")
+         print(f"Finish: {finish}")
+         print(f"Answer: {answer}")
+         print(f"Ends properly: {answer.rstrip().endswith(('.', '!', '?'))}")
+
+         if finish == "stop":
+             print("✅ IMPROVEMENT: Stopped naturally at EOS (was hitting length before)")
+         elif finish == "length":
+             print("⚠️ Still hitting length limit")
+
+ except Exception as e:
+     print(f"❌ Exception: {e}")
+
+ time.sleep(3)
+
+ # Test 3: Sequential requests (no OOM regression)
+ print("\n[3] Sequential requests (memory check)")
+ success = 0
+ for i in range(1, 4):
+     try:
+         response = httpx.post(
+             f"{BASE_URL}/v1/chat/completions",
+             json={
+                 "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                 "messages": [{"role": "user", "content": f"Calculate {i}+{i}"}],
+                 "max_tokens": 200,
+                 "temperature": 0.3
+             },
+             timeout=30.0
+         )
+
+         data = response.json()
+         if "error" not in data:
+             success += 1
+             print(f"  [{i}] ✅")
+         else:
+             if "out of memory" in data["error"]["message"].lower():
+                 print(f"  [{i}] ❌ OOM!")
+             else:
+                 print(f"  [{i}] ❌ Error")
+         time.sleep(2)
+     except Exception:
+         print(f"  [{i}] ❌ Timeout/Exception")
+
+ if success == 3:
+     print("✅ NO REGRESSION: Memory management still working")
+ else:
+     print(f"❌ REGRESSION: Only {success}/3 succeeded")
+
+ print("\n" + "="*80)
+ print("VERDICT")
+ print("="*80)
+ print("If Test 2 shows finish='stop' → EOS fix is working ✅")
+ print("If Test 2 shows finish='length' → Need more investigation ⚠️")
+ print("If Test 3 passes → No memory regression ✅")
+
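A note on the split("</think>") step in Test 2: Qwen3-style models emit their reasoning inside <think>...</think> tags, so the script discards everything up to the closing tag and checks only the visible answer for sentence-final punctuation.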
test_space_api.py ADDED
@@ -0,0 +1,142 @@
+ #!/usr/bin/env python3
+ """
+ Test the Hugging Face Space API to verify the refactored code works.
+ """
+
+ import os
+ import sys
+ import asyncio
+ import httpx
+ from typing import Dict, Any
+
+ # Space URL - update this if your Space has a different URL
+ SPACE_URL = os.getenv("SPACE_URL", "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1")
+ API_KEY = os.getenv("SERVICE_API_KEY")
+
+
+ async def test_endpoint(client: httpx.AsyncClient, name: str, method: str, url: str, **kwargs) -> Dict[str, Any]:
+     """Test a single API endpoint."""
+     try:
+         headers = kwargs.pop("headers", {})
+         if API_KEY:
+             headers["x-api-key"] = API_KEY
+
+         if method.upper() == "GET":
+             response = await client.get(url, headers=headers, timeout=30.0)
+         elif method.upper() == "POST":
+             response = await client.post(url, headers=headers, timeout=120.0, **kwargs)
+         else:
+             return {"name": name, "success": False, "error": f"Unsupported method: {method}"}
+
+         response.raise_for_status()
+         return {
+             "name": name,
+             "success": True,
+             "status_code": response.status_code,
+             "data": response.json() if response.headers.get("content-type", "").startswith("application/json") else response.text[:200],
+         }
+     except Exception as e:
+         return {
+             "name": name,
+             "success": False,
+             "error": str(e),
+         }
+
+
+ async def main():
+     """Run API tests."""
+     print("=" * 70)
+     print("Testing Hugging Face Space API")
+     print("=" * 70)
+     print(f"Space URL: {SPACE_URL}")
+     print()
+
+     async with httpx.AsyncClient() as client:
+         results = []
+
+         # Test 1: Root endpoint
+         print("[1/4] Testing root endpoint...")
+         result = await test_endpoint(client, "Root", "GET", SPACE_URL.replace("/v1", ""))
+         results.append(result)
+         if result["success"]:
+             print(f"  ✓ Success: {result.get('data', {}).get('status', 'ok')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 2: List models
+         print("[2/4] Testing /v1/models endpoint...")
+         result = await test_endpoint(client, "List Models", "GET", f"{SPACE_URL}/models")
+         results.append(result)
+         if result["success"]:
+             models = result.get("data", {}).get("data", [])
+             print(f"  ✓ Success: Found {len(models)} model(s)")
+             if models:
+                 print(f"    Model: {models[0].get('id', 'unknown')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 3: Chat completion (simple)
+         print("[3/4] Testing /v1/chat/completions endpoint...")
+         result = await test_endpoint(
+             client,
+             "Chat Completion",
+             "POST",
+             f"{SPACE_URL}/chat/completions",
+             json={
+                 "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                 "messages": [{"role": "user", "content": "What is compound interest? Answer in one sentence."}],
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             }
+         )
+         results.append(result)
+         if result["success"]:
+             data = result.get("data", {})
+             content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+             tokens = data.get("usage", {}).get("total_tokens", 0)
+             print(f"  ✓ Success: Generated {tokens} tokens")
+             print(f"  Response preview: {content[:100]}...")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 4: Model reload endpoint
+         print("[4/4] Testing /v1/models/reload endpoint...")
+         result = await test_endpoint(
+             client,
+             "Model Reload",
+             "POST",
+             f"{SPACE_URL}/models/reload",
+             params={"force": False}
+         )
+         results.append(result)
+         if result["success"]:
+             data = result.get("data", {})
+             print(f"  ✓ Success: {data.get('message', 'OK')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Summary
+         print("=" * 70)
+         print("Test Summary")
+         print("=" * 70)
+         passed = sum(1 for r in results if r["success"])
+         print(f"Passed: {passed}/{len(results)}")
+
+         if passed == len(results):
+             print("✓ All tests passed! The Space is working correctly.")
+             return 0
+         else:
+             print("✗ Some tests failed")
+             for r in results:
+                 if not r["success"]:
+                     print(f"  - {r['name']}: {r['error']}")
+             return 1
+
+
+ if __name__ == "__main__":
+     sys.exit(asyncio.run(main()))
+
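Like the other scripts, this one runs standalone: "python test_space_api.py". SPACE_URL and SERVICE_API_KEY are read from the environment, so you can point it at a different Space or supply the x-api-key header without editing the file, and the process exit code mirrors the pass/fail summary.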