"""Interactive client that streams chat completions from a local OpenAI-style API.

Reads prompts from stdin in a loop and streams each model reply to stdout
token-by-token using server-sent events (``data: {...}`` lines).
"""
import json
import time

import requests

API_URL = "http://localhost:8000/v1/chat/completions"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer 0",
}


def run_test(prompt: str, max_tokens: int = 500) -> str:
    """Send *prompt* to the chat API and stream the reply to stdout.

    Parameters
    ----------
    prompt : str
        The user message to send.
    max_tokens : int
        Upper bound on generated tokens. (Previously this parameter was
        dead: the payload dict literal contained ``"max_tokens"`` twice,
        and the second hard-coded ``1024`` silently won.)

    Returns
    -------
    str
        The full concatenated streamed content.

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status.
    """
    payload = {
        "model": "custom-model",
        "messages": [
            {"role": "system", "content": "Answer the user question about Markie Voss."},
            {"role": "user", "content": prompt},
        ],
        # BUG FIX: duplicate "max_tokens" key removed — this one now
        # actually honours the function parameter.
        "max_tokens": max_tokens,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.8,
        "eos_token_id": [151645, 151643, 151668],
        "enable_thinking": True,
        "stream": True,
    }

    print("=" * 80)
    print("Prompt:", prompt)
    print("Streaming response:\n")

    with requests.post(
        API_URL,
        headers=HEADERS,
        json=payload,
        stream=True,  # stream the HTTP response body (SSE)
        timeout=60,
    ) as r:
        print("HTTP status:", r.status_code)
        r.raise_for_status()

        full_text = ""
        for line in r.iter_lines(decode_unicode=True):
            if not line:
                continue
            # OpenAI-style streaming wraps each JSON chunk as "data: {...}".
            if not line.startswith("data:"):
                continue
            data = line[len("data:"):].strip()
            if data == "[DONE]":
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                # Tolerate keep-alives / partial garbage on the wire.
                continue
            # ROBUSTNESS FIX: some servers emit role-only or finish chunks
            # with no "delta"/"content" (or content: null); guard lookups
            # instead of raising KeyError/IndexError/TypeError mid-stream.
            choices = chunk.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta") or {}
            token = delta.get("content")
            if token:
                full_text += token
                print(token, end="", flush=True)

    print("\n\n--- END OF STREAM ---")
    print("✅ Full content repr:", repr(full_text))
    return full_text


if __name__ == "__main__":
    print("Warming up...")
    time.sleep(1)
    while True:
        try:
            p = input("User: ")
        except (EOFError, KeyboardInterrupt):
            # FIX: exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
            print()
            break
        run_test(p)