Spaces:

build-small-hackathon
/

stride-running-coach

Running

stride-running-coach / inference /app.py

nicholasg1997

feat: update model configurations and enhance README for local AI running coach

dcfc5a3 1 day ago

2.11 kB

	"""Modal app that serves a Qwen3 model with vLLM's OpenAI-compatible API.

	Deploy: uv run modal deploy inference/app.py
	Dev/test: uv run modal serve inference/app.py (hot-reloads while running)

	Right now this serves a small model to validate the whole pipeline cheaply.
	To move to the real thing later, change ONLY two lines:
	MODEL_NAME -> "Qwen/Qwen3-32B-FP8"
	gpu= -> "H100" (FP8 needs Hopper) or "A100-80GB" for a BF16 build
	Everything else — the contract, the client, the prompts — stays identical.
	"""

	import subprocess
	import os

	import modal

	# Model identifier handed to `vllm serve` inside serve().
	MODEL_NAME = "Qwen/Qwen3-14B"
	# Port the vLLM server listens on; the same value is given to modal.web_server.
	VLLM_PORT = 8000
	# Multiplier so durations read as `N * MINUTES`.
	# NOTE(review): presumably Modal takes these durations in seconds -- confirm.
	MINUTES = 60

	# Base container: Debian slim with Python 3.12.
	_base_image = modal.Image.debian_slim(python_version="3.12")

	# vllm 0.10.2 declares only a lower bound on transformers (>=4.55.2). Newer
	# transformers releases removed `all_special_tokens_extended`, which vllm
	# 0.10.2 still calls, so tokenizer init crashes. transformers is therefore
	# pinned to the version vllm 0.10.2 was built against; bump both pins
	# together (e.g. for Nemotron-Nano-v2).
	_pip_packages = (
	    "vllm==0.10.2",
	    "transformers==4.55.2",
	    "huggingface_hub[hf_transfer]",
	)

	# Final image: the packages above plus the env var that turns on hf_transfer.
	vllm_image = _base_image.pip_install(*_pip_packages).env(
	    {"HF_HUB_ENABLE_HF_TRANSFER": "1"}
	)

	# Persisted caches so we don't re-download the model on every cold start.
	# Both are looked up by name and created if they do not exist yet; serve()
	# mounts them at /root/.cache/huggingface and /root/.cache/vllm respectively.
	hf_cache = modal.Volume.from_name("hf-cache", create_if_missing=True)
	vllm_cache = modal.Volume.from_name("vllm-cache", create_if_missing=True)

	# Modal app that serve() is registered on through the @app.function decorator.
	app = modal.App("rate-my-run-llm")

	@app.function(
	    image=vllm_image,
	    gpu="A100-40GB",
	    secrets=[modal.Secret.from_name("vllm-api-key")],
	    scaledown_window=5 * MINUTES,
	    timeout=30 * MINUTES,
	    volumes={
	        "/root/.cache/huggingface": hf_cache,
	        "/root/.cache/vllm": vllm_cache,
	    },
	)
	@modal.concurrent(max_inputs=32)
	@modal.web_server(port=VLLM_PORT, startup_timeout=20 * MINUTES)
	def serve():
	    """Start the vLLM OpenAI-compatible server as a background process.

	    Spawns `vllm serve` for MODEL_NAME on VLLM_PORT and returns immediately
	    without waiting for the child process; the port matches the one passed to
	    `modal.web_server` above.

	    The API key is read from the `VLLM_API_KEY` environment variable
	    (presumably injected by the `vllm-api-key` Modal secret -- confirm).

	    Raises:
	        KeyError: if `VLLM_API_KEY` is not set in the environment.
	    """
	    # Build an argv list rather than a shell string: the API key is passed to
	    # vLLM verbatim, so whitespace or shell metacharacters in the secret can
	    # neither split the command line nor be interpreted by a shell.
	    cmd = [
	        "vllm",
	        "serve",
	        MODEL_NAME,
	        "--host",
	        "0.0.0.0",
	        "--port",
	        str(VLLM_PORT),
	        "--api-key",
	        os.environ["VLLM_API_KEY"],
	        # Our prompts are short; a small context cap keeps KV-cache needs low.
	        "--max-model-len",
	        "8192",
	    ]
	    # Fire and forget: the server keeps running after this function returns.
	    subprocess.Popen(cmd)