AmazingDigitalPetDentures / scripts /test_endpoint.py
VirusDumb's picture
Sync from GitHub via hub-sync
ba3faf3 verified
Raw
History Blame Contribute Delete
1.34 kB
"""
test_endpoint.py — quick check of the deployed Modal llama-server OpenAI endpoint.
Usage (PowerShell):
    $env:LLAMACPP_BASE_URL = "https://soumyaray532--adpd-llama-serve.modal.run/v1"
    $env:LLAMACPP_API_KEY = "sk-the-value-you-gave-the-adpd-llama-secret"
    python scripts/test_endpoint.py
First call cold-starts the GPU and (on the very first run ever) downloads ~23 GB, so
it can take up to ~20 min before you see a reply. Later calls are fast.
"""
import os
import sys
from openai import OpenAI
# Connection settings, each read from an environment variable.
# The base URL and model id fall back to built-in defaults; the API key has
# no fallback and must be provided by whoever runs the script.
BASE_URL = os.getenv(
    "LLAMACPP_BASE_URL",
    "https://soumyaray532--adpd-llama-serve.modal.run/v1",
)
API_KEY = os.getenv("LLAMACPP_API_KEY")
MODEL_ID = os.getenv("LLM_MODEL_ID", "unsloth/Nemotron-3-Nano-30B-A3B")

# Stop right away with a hint when the key is unset or empty, so no request
# is attempted without credentials.
if API_KEY is None or API_KEY == "":
    sys.exit(
        "Set LLAMACPP_API_KEY first (the same value you gave the `adpd-llama` "
        'Modal secret). In PowerShell: $env:LLAMACPP_API_KEY = "sk-..."'
    )
# Build the client against the configured endpoint. The timeout of 20 * 60
# (presumably seconds, i.e. the ~20 min worst-case cold start mentioned in the
# module docstring) keeps the first call from being cut off early.
client = OpenAI(base_url=BASE_URL, api_key=API_KEY, timeout=20 * 60)

print(f"→ {BASE_URL} (model: {MODEL_ID})")
print("waiting for first response (cold start can take a while) ...\n")

# Send a single tiny chat request; a successful reply shows the endpoint is
# reachable and the key was accepted.
resp = client.chat.completions.create(
    model=MODEL_ID,
    messages=[{"role": "user", "content": "Say hello in exactly three words."}],
    temperature=0.6,
)

# The response may carry no choices, and a message's `content` may be None
# (the SDK types it as optional). Concatenating None onto a str would raise
# TypeError, so report both cases explicitly instead of crashing.
if not resp.choices:
    sys.exit("endpoint responded, but the completion contained no choices")

reply = resp.choices[0].message.content
if reply is None:
    reply = "(no text content in the response)"
print("reply:\n" + reply)