"""Interactive client that streams chat completions from a local OpenAI-style API.

Reads prompts from stdin in a loop and streams each model reply to stdout
token-by-token using server-sent events (``data: {...}`` lines).
"""
import json
import time

import requests

API_URL = "http://localhost:8000/v1/chat/completions"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer 0",
}


def run_test(prompt: str, max_tokens: int = 500) -> str:
    """Send *prompt* to the chat API and stream the reply to stdout.

    Parameters
    ----------
    prompt : str
        The user message to send.
    max_tokens : int
        Upper bound on generated tokens. (Previously this parameter was
        dead: the payload dict literal contained ``"max_tokens"`` twice,
        and the second hard-coded ``1024`` silently won.)

    Returns
    -------
    str
        The full concatenated streamed content.

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status.
    """
    payload = {
        "model": "custom-model",
        "messages": [
            {"role": "system", "content": "Answer the user question about Markie Voss."},
            {"role": "user", "content": prompt},
        ],
        # BUG FIX: duplicate "max_tokens" key removed — this one now
        # actually honours the function parameter.
        "max_tokens": max_tokens,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.8,
        "eos_token_id": [151645, 151643, 151668],
        "enable_thinking": True,
        "stream": True,
    }

    print("=" * 80)
    print("Prompt:", prompt)
    print("Streaming response:\n")

    with requests.post(
        API_URL,
        headers=HEADERS,
        json=payload,
        stream=True,  # stream the HTTP response body (SSE)
        timeout=60,
    ) as r:
        print("HTTP status:", r.status_code)
        r.raise_for_status()

        full_text = ""
        for line in r.iter_lines(decode_unicode=True):
            if not line:
                continue
            # OpenAI-style streaming wraps each JSON chunk as "data: {...}".
            if not line.startswith("data:"):
                continue
            data = line[len("data:"):].strip()
            if data == "[DONE]":
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                # Tolerate keep-alives / partial garbage on the wire.
                continue
            # ROBUSTNESS FIX: some servers emit role-only or finish chunks
            # with no "delta"/"content" (or content: null); guard lookups
            # instead of raising KeyError/IndexError/TypeError mid-stream.
            choices = chunk.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta") or {}
            token = delta.get("content")
            if token:
                full_text += token
                print(token, end="", flush=True)

    print("\n\n--- END OF STREAM ---")
    print("✅ Full content repr:", repr(full_text))
    return full_text


if __name__ == "__main__":
    print("Warming up...")
    time.sleep(1)
    while True:
        try:
            p = input("User: ")
        except (EOFError, KeyboardInterrupt):
            # FIX: exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
            print()
            break
        run_test(p)