"""Smoke tests for a chat-completion server exposing an OpenAI-style ``/v1`` API.

The script talks to ``BASE_URL`` over HTTP and exercises the ``/models`` and
``/chat/completions`` endpoints: plain completion, streaming, a handful of
sampling parameters, repeatability at ``temperature=0`` and the response to
a malformed request.  Run it directly; it stops at the first failed
assertion.
"""
import json
import sys
import time

import requests

BASE_URL = "http://localhost:8000/v1"
MODEL_NAME = "RWKV-GLM-4.7-Flash-Preview-v0.1"

# Seconds to wait for the server before giving up.  ``requests`` applies no
# timeout by default, so without this a stalled server hangs the script.
REQUEST_TIMEOUT = 120

JSON_HEADERS = {"Content-Type": "application/json"}

# Prefix of a server-sent-event data line; the space after the colon is optional.
_SSE_DATA_PREFIX = "data:"


# ==========================================================
# Utility
# ==========================================================

def print_section(title):
    """Print ``title`` framed above and below by a 60-character rule."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


def safe_json(resp):
    """Return the decoded JSON body of ``resp``.

    If the body is not valid JSON, print the raw text and terminate the
    process with exit status 1.
    """
    try:
        return resp.json()
    except ValueError:
        # Every JSON decode error raised by ``Response.json()`` derives from
        # ValueError; a bare ``except`` would also trap KeyboardInterrupt.
        print("❌ JSON decode failed")
        print(resp.text)
        sys.exit(1)


def _post_chat(payload, stream=False):
    """POST ``payload`` as JSON to the chat-completions endpoint."""
    return requests.post(
        f"{BASE_URL}/chat/completions",
        headers=JSON_HEADERS,
        data=json.dumps(payload),
        stream=stream,
        timeout=REQUEST_TIMEOUT,
    )


# ==========================================================
# 1️⃣ Models API
# ==========================================================

def test_models():
    """Check that ``/models`` answers 200 with a non-empty ``data`` list."""
    print_section("TEST: /v1/models")

    resp = requests.get(f"{BASE_URL}/models", timeout=REQUEST_TIMEOUT)
    assert resp.status_code == 200, "Models API failed"

    data = safe_json(resp)
    assert "data" in data, "No model list returned"
    assert len(data["data"]) > 0, "Empty model list"

    print("✅ Models endpoint OK")
    print("Available models:", [m["id"] for m in data["data"]])


# ==========================================================
# 2️⃣ Non-stream basic
# ==========================================================

def test_basic_completion():
    """Request one non-streaming completion and check the response shape."""
    print_section("TEST: Basic Non-Streaming Completion")

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 30,
        "stream": False,
    }

    resp = _post_chat(payload)
    assert resp.status_code == 200, "Completion failed"

    data = safe_json(resp)
    assert "choices" in data, "No choices returned"
    assert "usage" in data, "No usage returned"

    print("Assistant:", data["choices"][0]["message"]["content"])
    print("Usage:", data["usage"])
    print("✅ Basic completion OK")


# ==========================================================
# 3️⃣ Streaming
# ==========================================================

def test_streaming():
    """Request a streamed completion and check that some text arrives.

    Each ``data:`` line is parsed as a JSON chunk and the ``content`` of its
    first choice's delta is echoed and accumulated until ``[DONE]``.
    """
    print_section("TEST: Streaming Completion")

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "max_tokens": 50,
        "stream": True,
    }

    pieces = []

    with _post_chat(payload, stream=True) as resp:
        assert resp.status_code == 200, "Streaming failed"

        for line in resp.iter_lines():
            if not line:
                continue

            decoded = line.decode("utf-8")
            if not decoded.startswith(_SSE_DATA_PREFIX):
                continue

            content = decoded[len(_SSE_DATA_PREFIX):].strip()
            if content == "[DONE]":
                break

            chunk = json.loads(content)

            # Skip chunks that carry no choices instead of raising IndexError.
            choices = chunk.get("choices") or []
            if not choices:
                continue

            # ``content`` may be absent or null (for example on a role-only
            # delta); only real text is printed and accumulated.
            piece = (choices[0].get("delta") or {}).get("content")
            if piece:
                print(piece, end="", flush=True)
                pieces.append(piece)

    full_text = "".join(pieces)

    # Assert before reporting success so a failed run never prints "OK".
    assert len(full_text) > 0, "Streaming returned empty"
    print("\n\n✅ Streaming OK")


# ==========================================================
# 4️⃣ Sampling Variations
# ==========================================================

def test_sampling_variations():
    """Send the same prompt once per sampling parameter and expect HTTP 200."""
    print_section("TEST: Sampling Variations")

    base_payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": "Write a creative sentence about AI."}
        ],
        "max_tokens": 50,
        "stream": False,
    }

    configs = [
        {"temperature": 0.0},
        {"temperature": 0.7},
        {"top_p": 0.8},
        {"top_k": 20},
        {"repetition_penalty": 1.2},
        {"presence_penalty": 0.5},
        {"frequency_penalty": 0.5},
    ]

    for cfg in configs:
        # Build a fresh dict so one config never leaks into the next request.
        payload = {**base_payload, **cfg}

        resp = _post_chat(payload)
        assert resp.status_code == 200, f"Sampling failed: {cfg}"

        data = safe_json(resp)
        text = data["choices"][0]["message"]["content"]

        print(f"\nConfig: {cfg}")
        print("Output:", text[:120], "...")

    print("\n✅ Sampling parameter variations OK")


# ==========================================================
# 5️⃣ Deterministic Check (temperature=0)
# ==========================================================

def test_deterministic():
    """Send one ``temperature=0`` request twice and require identical text."""
    print_section("TEST: Deterministic Mode (temperature=0)")

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "user", "content": "Define gravity in one sentence."}
        ],
        "temperature": 0.0,
        "max_tokens": 50,
        "stream": False,
    }

    resp1 = _post_chat(payload)
    resp2 = _post_chat(payload)

    # Check the status first so a server error is reported as such rather
    # than as a KeyError while indexing the error body.
    assert resp1.status_code == 200, "Deterministic run 1 failed"
    assert resp2.status_code == 200, "Deterministic run 2 failed"

    out1 = safe_json(resp1)["choices"][0]["message"]["content"]
    out2 = safe_json(resp2)["choices"][0]["message"]["content"]

    print("Run1:", out1)
    print("Run2:", out2)

    assert out1 == out2, "❌ Deterministic mode not deterministic"

    print("✅ Deterministic check OK")


# ==========================================================
# 6️⃣ Error Handling
# ==========================================================

def test_error_handling():
    """Send a request without ``messages`` and report how the server reacts.

    This check only prints a verdict; it never fails the run.
    """
    print_section("TEST: Error Handling")

    payload = {
        "model": MODEL_NAME,
        # missing messages intentionally
    }

    resp = _post_chat(payload)
    status = resp.status_code

    if 400 <= status < 500:
        print("✅ Server correctly handled bad request")
    elif status >= 500:
        # A 5xx means the server failed on the input rather than rejecting it.
        print(f"⚠️ Warning: server returned HTTP {status} instead of a 4xx rejection")
    else:
        print("⚠️ Warning: server did not reject bad request")


# ==========================================================
# Main
# ==========================================================

def main():
    """Run every check in order and print the total elapsed time."""
    start = time.time()

    test_models()
    test_basic_completion()
    test_streaming()
    test_sampling_variations()
    test_deterministic()
    test_error_handling()

    print_section("ALL TESTS PASSED")
    print(f"Total time: {round(time.time() - start, 2)} sec")


if __name__ == "__main__":
    main()