| """Tiny OpenAI-compatible client for smoke-testing a deployed endpoint. |
| |
| Usage: |
| python modal/client.py \\ |
| --base-url https://<workspace>--google-llms-gemma-4-12b.modal.run/v1 \\ |
| --model google/gemma-4-12B \\ |
| --prompt "Describe a mossy ticket booth in the wood." |
| |
| The endpoint URL is https://<workspace>--<app-name>-<endpoint-name>.modal.run/v1, |
| where <app-name> is the modal.App (nvidia-llms / openbmb-llms / google-llms) and |
| <endpoint-name> is the per-model slug. --model is the served model id (the HF |
| repo id), NOT the URL slug. |
| |
| The endpoints speak the OpenAI REST API, so the official ``openai`` SDK works |
| unchanged — the engine reaches them via the LiteLLM gateway, binding each profile |
| from ``modal/catalogue.py`` + ``MODAL_WORKSPACE`` (or a single ``MODAL_LLM_BASE_URL``). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
|
|
|
|
| def main() -> None: |
| parser = argparse.ArgumentParser(description=__doc__) |
| parser.add_argument("--base-url", required=True, help="endpoint URL ending in /v1") |
| parser.add_argument("--model", required=True, help="served model id") |
| parser.add_argument("--prompt", default="Say hello in one sentence.") |
| parser.add_argument("--max-tokens", type=int, default=256) |
| args = parser.parse_args() |
|
|
| from openai import OpenAI |
|
|
| |
| |
| client = OpenAI(base_url=args.base_url, api_key=os.environ.get("LLM_API_KEY", "EMPTY")) |
|
|
| response = client.chat.completions.create( |
| model=args.model, |
| messages=[{"role": "user", "content": args.prompt}], |
| max_tokens=args.max_tokens, |
| ) |
| print(response.choices[0].message.content) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|