jeanbaptdzd committed on
Commit
bedfb0c
·
1 Parent(s): 16c2a22

Set DEFAULT_MAX_TOKENS=800 to prevent timeouts


A 1500-token limit causes 90+ second timeouts.
800 tokens provides a good balance:
- Complete answers for most questions
- Reasonable response time (~20-30s)
- Users can request a higher limit for complex questions (see the sketch below)
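For illustration, here is how a client might request that higher limit on a complex question; a minimal sketch using the same endpoint and model the test scripts below exercise (the 2000-token value is just an example, not a recommended setting):

    import httpx

    # Override the 800-token default for a question that needs a long answer.
    # Expect a proportionally longer wait, as the commit message notes.
    response = httpx.post(
        "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions",
        json={
            "model": "DragonLLM/qwen3-8b-fin-v1.0",
            "messages": [{"role": "user", "content": "Explain bond duration and convexity in detail."}],
            "max_tokens": 2000,  # per-request override of DEFAULT_MAX_TOKENS
        },
        timeout=120.0,  # longer generations need a longer client timeout
    )
    print(response.json()["choices"][0]["message"]["content"])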

Files changed (4)
  1. app/utils/constants.py +1 -1
  2. quick_test.py +54 -0
  3. test_regression.py +118 -0
  4. test_space_api.py +142 -0
app/utils/constants.py CHANGED
@@ -49,7 +49,7 @@ EOS_TOKENS = [151645, 151643] # [<|im_end|>, <|endoftext|>]
  PAD_TOKEN_ID = 151643 # <|endoftext|>

  # Generation defaults
- DEFAULT_MAX_TOKENS = 1500
+ DEFAULT_MAX_TOKENS = 800 # Balanced: complete answers without timeouts
  DEFAULT_TEMPERATURE = 0.7
  DEFAULT_TOP_P = 1.0
  DEFAULT_TOP_K = 20
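For context, a sketch of how such a default is typically consumed, assuming the request handler falls back to the constant when the client omits max_tokens (the helper is hypothetical; only the import path comes from this repo):

    from app.utils.constants import DEFAULT_MAX_TOKENS

    def resolve_max_tokens(requested: int | None) -> int:
        # Hypothetical helper: use the client's value when given,
        # otherwise fall back to the 800-token default.
        return requested if requested is not None else DEFAULT_MAX_TOKENS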
quick_test.py ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env python3
+ """Quick test of Space API"""
+ import httpx
+ import sys
+
+ SPACE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+ try:
+     # Test root endpoint
+     r = httpx.get(f"{SPACE_URL}/", timeout=10)
+     if r.status_code == 200:
+         data = r.json()
+         print(f"✓ Root endpoint: {data.get('backend', 'unknown')}")
+         print(f"  Model: {data.get('model', 'unknown')}")
+     else:
+         print(f"✗ Root endpoint failed: {r.status_code}")
+         sys.exit(1)
+
+     # Test models endpoint
+     r = httpx.get(f"{SPACE_URL}/v1/models", timeout=10)
+     if r.status_code == 200:
+         data = r.json()
+         models = data.get('data', [])
+         print(f"✓ Models endpoint: {len(models)} model(s)")
+     else:
+         print(f"✗ Models endpoint failed: {r.status_code}")
+         sys.exit(1)
+
+     # Test chat completion (short)
+     r = httpx.post(
+         f"{SPACE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "Say hello"}],
+             "max_tokens": 50
+         },
+         timeout=60
+     )
+     if r.status_code == 200:
+         data = r.json()
+         content = data['choices'][0]['message']['content']
+         print(f"✓ Chat completion: {len(content)} chars")
+         print(f"  Preview: {content[:50]}...")
+     else:
+         print(f"✗ Chat completion failed: {r.status_code}")
+         print(f"  Response: {r.text[:200]}")
+         sys.exit(1)
+
+     print("\n✓ All tests passed! Space is working.")
+
+ except Exception as e:
+     print(f"✗ Error: {e}")
+     sys.exit(1)
+
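Run it locally with "python quick_test.py" (only httpx is required). The script exits non-zero on the first failing check, so it doubles as a quick smoke test in CI.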
test_regression.py ADDED
@@ -0,0 +1,118 @@
+ #!/usr/bin/env python3
+ """
+ Regression test: verify EOS token fix improves completeness without breaking anything
+ """
+ import httpx
+ import json
+ import time
+
+ BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
+
+ print("="*80)
+ print("REGRESSION & IMPROVEMENT TEST")
+ print("="*80)
+
+ # Test 1: Basic functionality still works
+ print("\n[1] Basic functionality check")
+ try:
+     response = httpx.post(
+         f"{BASE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "What is 2+2?"}],
+             "max_tokens": 100,
+             "temperature": 0.3
+         },
+         timeout=30.0
+     )
+
+     data = response.json()
+     if "error" not in data:
+         print("✅ Basic request works")
+     else:
+         print(f"❌ Error: {data['error']['message']}")
+ except Exception as e:
+     print(f"❌ Exception: {e}")
+
+ time.sleep(3)
+
+ # Test 2: French answer with reasonable token limit
+ print("\n[2] French answer (500 tokens)")
+ try:
+     response = httpx.post(
+         f"{BASE_URL}/v1/chat/completions",
+         json={
+             "model": "DragonLLM/qwen3-8b-fin-v1.0",
+             "messages": [{"role": "user", "content": "Qu'est-ce qu'une obligation? Réponse courte."}],
+             "max_tokens": 500,
+             "temperature": 0.3
+         },
+         timeout=45.0
+     )
+
+     data = response.json()
+     if "error" in data:
+         print(f"❌ Error: {data['error']['message'][:100]}")
+     else:
+         content = data["choices"][0]["message"]["content"]
+         finish = data["choices"][0]["finish_reason"]
+         tokens = data.get("usage", {}).get("completion_tokens", 0)
+
+         answer = content.split("</think>")[1].strip() if "</think>" in content else content
+
+         print(f"Tokens: {tokens}/500")
+         print(f"Finish: {finish}")
+         print(f"Answer: {answer}")
+         print(f"Ends properly: {answer.rstrip().endswith(('.', '!', '?'))}")
+
+         if finish == "stop":
+             print("✅ IMPROVEMENT: Stopped naturally at EOS (was hitting length before)")
+         elif finish == "length":
+             print("⚠️ Still hitting length limit")
+
+ except Exception as e:
+     print(f"❌ Exception: {e}")
+
+ time.sleep(3)
+
+ # Test 3: Sequential requests (no OOM regression)
+ print("\n[3] Sequential requests (memory check)")
+ success = 0
+ for i in range(1, 4):
+     try:
+         response = httpx.post(
+             f"{BASE_URL}/v1/chat/completions",
+             json={
+                 "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                 "messages": [{"role": "user", "content": f"Calculate {i}+{i}"}],
+                 "max_tokens": 200,
+                 "temperature": 0.3
+             },
+             timeout=30.0
+         )
+
+         data = response.json()
+         if "error" not in data:
+             success += 1
+             print(f"  [{i}] ✅")
+         else:
+             if "out of memory" in data["error"]["message"].lower():
+                 print(f"  [{i}] ❌ OOM!")
+             else:
+                 print(f"  [{i}] ❌ Error")
+         time.sleep(2)
+     except Exception:
+         print(f"  [{i}] ❌ Timeout/Exception")
+
+ if success == 3:
+     print("✅ NO REGRESSION: Memory management still working")
+ else:
+     print(f"❌ REGRESSION: Only {success}/3 succeeded")
+
+ print("\n" + "="*80)
+ print("VERDICT")
+ print("="*80)
+ print("If Test 2 shows finish='stop' → EOS fix is working ✅")
+ print("If Test 2 shows finish='length' → Need more investigation ⚠️")
+ print("If Test 3 passes → No memory regression ✅")
+
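A note on the split("</think>") step in Test 2: Qwen3-style models emit their reasoning inside <think>...</think> tags, so the script discards everything up to the closing tag and checks only the visible answer for sentence-final punctuation.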
test_space_api.py ADDED
@@ -0,0 +1,142 @@
+ #!/usr/bin/env python3
+ """
+ Test the Hugging Face Space API to verify the refactored code works.
+ """
+
+ import os
+ import sys
+ import asyncio
+ import httpx
+ from typing import Dict, Any
+
+ # Space URL - update this if your Space has a different URL
+ SPACE_URL = os.getenv("SPACE_URL", "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1")
+ API_KEY = os.getenv("SERVICE_API_KEY")
+
+
+ async def test_endpoint(client: httpx.AsyncClient, name: str, method: str, url: str, **kwargs) -> Dict[str, Any]:
+     """Test a single API endpoint."""
+     try:
+         headers = kwargs.pop("headers", {})
+         if API_KEY:
+             headers["x-api-key"] = API_KEY
+
+         if method.upper() == "GET":
+             response = await client.get(url, headers=headers, timeout=30.0)
+         elif method.upper() == "POST":
+             response = await client.post(url, headers=headers, timeout=120.0, **kwargs)
+         else:
+             return {"name": name, "success": False, "error": f"Unsupported method: {method}"}
+
+         response.raise_for_status()
+         return {
+             "name": name,
+             "success": True,
+             "status_code": response.status_code,
+             "data": response.json() if response.headers.get("content-type", "").startswith("application/json") else response.text[:200],
+         }
+     except Exception as e:
+         return {
+             "name": name,
+             "success": False,
+             "error": str(e),
+         }
+
+
+ async def main():
+     """Run API tests."""
+     print("=" * 70)
+     print("Testing Hugging Face Space API")
+     print("=" * 70)
+     print(f"Space URL: {SPACE_URL}")
+     print()
+
+     async with httpx.AsyncClient() as client:
+         results = []
+
+         # Test 1: Root endpoint
+         print("[1/4] Testing root endpoint...")
+         result = await test_endpoint(client, "Root", "GET", SPACE_URL.replace("/v1", ""))
+         results.append(result)
+         if result["success"]:
+             print(f"  ✓ Success: {result.get('data', {}).get('status', 'ok')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 2: List models
+         print("[2/4] Testing /v1/models endpoint...")
+         result = await test_endpoint(client, "List Models", "GET", f"{SPACE_URL}/models")
+         results.append(result)
+         if result["success"]:
+             models = result.get("data", {}).get("data", [])
+             print(f"  ✓ Success: Found {len(models)} model(s)")
+             if models:
+                 print(f"    Model: {models[0].get('id', 'unknown')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 3: Chat completion (simple)
+         print("[3/4] Testing /v1/chat/completions endpoint...")
+         result = await test_endpoint(
+             client,
+             "Chat Completion",
+             "POST",
+             f"{SPACE_URL}/chat/completions",
+             json={
+                 "model": "DragonLLM/qwen3-8b-fin-v1.0",
+                 "messages": [{"role": "user", "content": "What is compound interest? Answer in one sentence."}],
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             }
+         )
+         results.append(result)
+         if result["success"]:
+             data = result.get("data", {})
+             content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+             tokens = data.get("usage", {}).get("total_tokens", 0)
+             print(f"  ✓ Success: Generated {tokens} tokens")
+             print(f"  Response preview: {content[:100]}...")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Test 4: Model reload endpoint
+         print("[4/4] Testing /v1/models/reload endpoint...")
+         result = await test_endpoint(
+             client,
+             "Model Reload",
+             "POST",
+             f"{SPACE_URL}/models/reload",
+             params={"force": False}
+         )
+         results.append(result)
+         if result["success"]:
+             data = result.get("data", {})
+             print(f"  ✓ Success: {data.get('message', 'OK')}")
+         else:
+             print(f"  ✗ Failed: {result['error']}")
+         print()
+
+         # Summary
+         print("=" * 70)
+         print("Test Summary")
+         print("=" * 70)
+         passed = sum(1 for r in results if r["success"])
+         print(f"Passed: {passed}/{len(results)}")
+
+         if passed == len(results):
+             print("✓ All tests passed! The Space is working correctly.")
+             return 0
+         else:
+             print("✗ Some tests failed")
+             for r in results:
+                 if not r["success"]:
+                     print(f"  - {r['name']}: {r['error']}")
+             return 1
+
+
+ if __name__ == "__main__":
+     sys.exit(asyncio.run(main()))
+
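Like the other scripts, this one runs standalone: "python test_space_api.py". SPACE_URL and SERVICE_API_KEY are read from the environment, so you can point it at a different Space or supply the x-api-key header without editing the file, and the process exit code mirrors the pass/fail summary.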