"""Tiny OpenAI-compatible client for smoke-testing a deployed endpoint. Usage: python modal/client.py \\ --base-url https://--google-llms-gemma-4-12b.modal.run/v1 \\ --model google/gemma-4-12B \\ --prompt "Describe a mossy ticket booth in the wood." The endpoint URL is https://---.modal.run/v1, where is the modal.App (nvidia-llms / openbmb-llms / google-llms) and is the per-model slug. --model is the served model id (the HF repo id), NOT the URL slug. The endpoints speak the OpenAI REST API, so the official ``openai`` SDK works unchanged — the engine reaches them via the LiteLLM gateway, binding each profile from ``modal/catalogue.py`` + ``MODAL_WORKSPACE`` (or a single ``MODAL_LLM_BASE_URL``). """ from __future__ import annotations import argparse import os def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--base-url", required=True, help="endpoint URL ending in /v1") parser.add_argument("--model", required=True, help="served model id") parser.add_argument("--prompt", default="Say hello in one sentence.") parser.add_argument("--max-tokens", type=int, default=256) args = parser.parse_args() from openai import OpenAI # Bearer token from the env var (set LLM_API_KEY to the value of the # `llm-api-key` Modal Secret). Any value works when the server has no auth. client = OpenAI(base_url=args.base_url, api_key=os.environ.get("LLM_API_KEY", "EMPTY")) response = client.chat.completions.create( model=args.model, messages=[{"role": "user", "content": args.prompt}], max_tokens=args.max_tokens, ) print(response.choices[0].message.content) if __name__ == "__main__": main()