"""
test_endpoint.py — quick check of the deployed Modal llama-server OpenAI endpoint.

Usage (PowerShell):
    $env:LLAMACPP_BASE_URL = "https://soumyaray532--adpd-llama-serve.modal.run/v1"
    $env:LLAMACPP_API_KEY = "sk-the-value-you-gave-the-adpd-llama-secret"
    python scripts/test_endpoint.py

First call cold-starts the GPU and (on the very first run ever) downloads ~23 GB,
so it can take up to ~20 min before you see a reply. Later calls are fast.
"""

import os
import sys
from typing import Optional

# Endpoint configuration, overridable through environment variables.
BASE_URL = os.environ.get(
    "LLAMACPP_BASE_URL", "https://soumyaray532--adpd-llama-serve.modal.run/v1"
)
API_KEY = os.environ.get("LLAMACPP_API_KEY")
MODEL_ID = os.environ.get("LLM_MODEL_ID", "unsloth/Nemotron-3-Nano-30B-A3B")

# Client timeout in seconds (20 minutes), sized for the cold start described
# in the module docstring.
REQUEST_TIMEOUT_S = 20 * 60

# Printed in place of the reply body when the message carries no text.
NO_CONTENT_PLACEHOLDER = "(no text content in response)"


def format_reply(content: Optional[str]) -> str:
    """Return the line(s) to print for a chat completion's message content.

    Args:
        content: The ``message.content`` value of the first choice. The
            OpenAI client types this field as optional, so it may be ``None``.

    Returns:
        ``"reply:\\n"`` followed by the content, or by a placeholder when the
        content is ``None`` (plain concatenation would raise ``TypeError``).
    """
    if content is None:
        return "reply:\n" + NO_CONTENT_PLACEHOLDER
    return "reply:\n" + content


def main() -> None:
    """Send one short chat request to the endpoint and print the reply.

    Exits via ``sys.exit`` with a message when ``LLAMACPP_API_KEY`` is unset
    or when the response contains no choices.
    """
    if not API_KEY:
        sys.exit(
            "Set LLAMACPP_API_KEY first (the same value you gave the `adpd-llama` Modal "
            "secret). In PowerShell: $env:LLAMACPP_API_KEY = \"sk-...\""
        )

    # Imported here rather than at module top so the configuration check above
    # runs first and the module can be imported without the third-party
    # package installed.
    from openai import OpenAI

    client = OpenAI(base_url=BASE_URL, api_key=API_KEY, timeout=REQUEST_TIMEOUT_S)

    print(f"→ {BASE_URL} (model: {MODEL_ID})")
    print("waiting for first response (cold start can take a while) ...\n")

    resp = client.chat.completions.create(
        model=MODEL_ID,
        messages=[{"role": "user", "content": "Say hello in exactly three words."}],
        temperature=0.6,
    )

    # Guard the index below: an empty list would otherwise surface as a bare
    # IndexError with no hint about what went wrong.
    if not resp.choices:
        sys.exit("endpoint returned a response with no choices")

    print(format_reply(resp.choices[0].message.content))


if __name__ == "__main__":
    main()