Spaces:

build-small-hackathon
/

multi-agent-lab

Sleeping

agharsallah

feat: unify model catalogue and self-hosted routing

9dd6dab 25 days ago

1.83 kB

	"""Tiny OpenAI-compatible client for smoke-testing a deployed endpoint.

	Usage:
	python modal/client.py \\
	--base-url https://<workspace>--google-llms-gemma-4-12b.modal.run/v1 \\
	--model google/gemma-4-12B \\
	--prompt "Describe a mossy ticket booth in the wood."

	The endpoint URL is https://<workspace>--<app-name>-<endpoint-name>.modal.run/v1,
	where <app-name> is the modal.App (nvidia-llms / openbmb-llms / google-llms) and
	<endpoint-name> is the per-model slug. --model is the served model id (the HF
	repo id), NOT the URL slug.

	The endpoints speak the OpenAI REST API, so the official ``openai`` SDK works
	unchanged — the engine reaches them via the LiteLLM gateway, binding each profile
	from ``modal/catalogue.py`` + ``MODAL_WORKSPACE`` (or a single ``MODAL_LLM_BASE_URL``).
	"""

	from __future__ import annotations

	import argparse
	import os


	def main() -> None:
	parser = argparse.ArgumentParser(description=__doc__)
	parser.add_argument("--base-url", required=True, help="endpoint URL ending in /v1")
	parser.add_argument("--model", required=True, help="served model id")
	parser.add_argument("--prompt", default="Say hello in one sentence.")
	parser.add_argument("--max-tokens", type=int, default=256)
	args = parser.parse_args()

	from openai import OpenAI

	# Bearer token from the env var (set LLM_API_KEY to the value of the
	# `llm-api-key` Modal Secret). Any value works when the server has no auth.
	client = OpenAI(base_url=args.base_url, api_key=os.environ.get("LLM_API_KEY", "EMPTY"))

	response = client.chat.completions.create(
	model=args.model,
	messages=[{"role": "user", "content": args.prompt}],
	max_tokens=args.max_tokens,
	)
	print(response.choices[0].message.content)


	# Script entry point: run the smoke test only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()