#!/usr/bin/env python3
"""
Regression test: verify the EOS-token fix improves answer completeness without breaking existing behavior.
"""
import time

import httpx
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
print("="*80)
print("REGRESSION & IMPROVEMENT TEST")
print("="*80)
# Test 1: Basic functionality still works
print("\n[1] Basic functionality check")
try:
    response = httpx.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": "DragonLLM/qwen3-8b-fin-v1.0",
            "messages": [{"role": "user", "content": "What is 2+2?"}],
            "max_tokens": 100,
            "temperature": 0.3,
        },
        timeout=30.0,
    )
    data = response.json()
    if "error" not in data:
        print("βœ… Basic request works")
    else:
        print(f"❌ Error: {data['error']['message']}")
except Exception as e:
    print(f"❌ Exception: {e}")
time.sleep(3)
# Test 2: French answer with a reasonable token limit
print("\n[2] French answer (500 tokens)")
try:
    response = httpx.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": "DragonLLM/qwen3-8b-fin-v1.0",
            # Prompt (French): "What is a bond? Short answer."
            "messages": [{"role": "user", "content": "Qu'est-ce qu'une obligation? RΓ©ponse courte."}],
            "max_tokens": 500,
            "temperature": 0.3,
        },
        timeout=45.0,
    )
    data = response.json()
    if "error" in data:
        print(f"❌ Error: {data['error']['message'][:100]}")
    else:
        content = data["choices"][0]["message"]["content"]
        finish = data["choices"][0]["finish_reason"]
        tokens = data.get("usage", {}).get("completion_tokens", 0)
        # Qwen3 emits its reasoning inside <think>...</think>; the user-facing
        # answer is whatever follows the closing tag.
        answer = content.split("</think>")[1].strip() if "</think>" in content else content
        print(f"Tokens: {tokens}/500")
        print(f"Finish: {finish}")
        print(f"Answer: {answer}")
        print(f"Ends properly: {answer.rstrip().endswith(('.', '!', '?'))}")
        if finish == "stop":
            print("βœ… IMPROVEMENT: Stopped naturally at EOS (was hitting length before)")
        elif finish == "length":
            print("⚠️ Still hitting length limit")
except Exception as e:
    print(f"❌ Exception: {e}")
time.sleep(3)
# Test 3: Sequential requests (no OOM regression)
print("\n[3] Sequential requests (memory check)")
success = 0
for i in range(1, 4):
    try:
        response = httpx.post(
            f"{BASE_URL}/v1/chat/completions",
            json={
                "model": "DragonLLM/qwen3-8b-fin-v1.0",
                "messages": [{"role": "user", "content": f"Calculate {i}+{i}"}],
                "max_tokens": 200,
                "temperature": 0.3,
            },
            timeout=30.0,
        )
        data = response.json()
        if "error" not in data:
            success += 1
            print(f" [{i}] βœ…")
        elif "out of memory" in data["error"]["message"].lower():
            print(f" [{i}] ❌ OOM!")
        else:
            print(f" [{i}] ❌ Error")
        time.sleep(2)  # pace requests between iterations
    except Exception:
        print(f" [{i}] ❌ Timeout/Exception")

if success == 3:
    print("βœ… NO REGRESSION: Memory management still working")
else:
    print(f"❌ REGRESSION: Only {success}/3 succeeded")
print("\n" + "="*80)
print("VERDICT")
print("="*80)
print("If Test 2 shows finish='stop' β†’ EOS fix is working βœ…")
print("If Test 2 shows finish='length' β†’ Need more investigation ⚠️")
print("If Test 3 passes β†’ No memory regression βœ…")