"""
test_endpoint.py — quick check of the deployed Modal llama-server OpenAI endpoint.

Usage (PowerShell):
    $env:LLAMACPP_BASE_URL = "https://soumyaray532--adpd-llama-serve.modal.run/v1"
    $env:LLAMACPP_API_KEY  = "sk-the-value-you-gave-the-adpd-llama-secret"
    python scripts/test_endpoint.py

First call cold-starts the GPU and (on the very first run ever) downloads ~23 GB, so
it can take up to ~20 min before you see a reply. Later calls are fast.
"""
import os
import sys

from openai import OpenAI

# Endpoint configuration; each value can be overridden via an environment variable.
BASE_URL = os.environ.get(
    "LLAMACPP_BASE_URL", "https://soumyaray532--adpd-llama-serve.modal.run/v1"
)
API_KEY = os.environ.get("LLAMACPP_API_KEY")
MODEL_ID = os.environ.get("LLM_MODEL_ID", "unsloth/Nemotron-3-Nano-30B-A3B")

# Fail fast with a setup hint instead of sending a request without a key.
# An empty string is treated the same as an unset variable.
if not API_KEY:
    sys.exit(
        "Set LLAMACPP_API_KEY first (the same value you gave the `adpd-llama` Modal "
        "secret). In PowerShell:  $env:LLAMACPP_API_KEY = \"sk-...\""
    )

# 20-minute client timeout (in seconds) so a slow first response does not
# abort the request prematurely.
client = OpenAI(base_url=BASE_URL, api_key=API_KEY, timeout=20 * 60)

print(f"→ {BASE_URL}  (model: {MODEL_ID})")
print("waiting for first response (cold start can take a while) ...\n")

resp = client.chat.completions.create(
    model=MODEL_ID,
    messages=[{"role": "user", "content": "Say hello in exactly three words."}],
    temperature=0.6,
)

# A response without any choice would otherwise surface as a bare IndexError.
if not resp.choices:
    sys.exit("endpoint responded but returned no choices")

first_choice = resp.choices[0]
reply_text = first_choice.message.content
# `message.content` is optional in the OpenAI SDK; concatenating None to a str
# would raise TypeError, so substitute a diagnostic placeholder instead.
if reply_text is None:
    reply_text = (
        f"(no text content returned; finish_reason={first_choice.finish_reason})"
    )
print("reply:\n" + reply_text)