nicholasg1997
feat: update model configurations and enhance README for local AI running coach
dcfc5a3
"""Modal app that serves a Qwen3 model with vLLM's OpenAI-compatible API.
Deploy: uv run modal deploy inference/app.py
Dev/test: uv run modal serve inference/app.py (hot-reloads while running)
Right now this serves a small model to validate the whole pipeline cheaply.
To move to the real thing later, change ONLY two lines:
MODEL_NAME -> "Qwen/Qwen3-32B-FP8"
gpu= -> "H100" (FP8 needs Hopper) or "A100-80GB" for a BF16 build
Everything else — the contract, the client, the prompts — stays identical.
"""
import subprocess
import os
import modal
# Hugging Face model id handed to `vllm serve` in serve().
MODEL_NAME: str = "Qwen/Qwen3-14B"
# Port the vLLM server binds to; the same value is given to modal.web_server.
VLLM_PORT: int = 8000
# Multiplier so the timeouts below can be written in minutes
# (e.g. `5 * MINUTES`); presumably Modal takes these values in seconds.
MINUTES: int = 60
# Container image for the vLLM server, assembled one layer at a time:
# slim Debian base -> pinned Python packages -> environment variables.
vllm_image = modal.Image.debian_slim(python_version="3.12")

# Why transformers is pinned: vllm 0.10.2 declares only
# transformers>=4.55.2 (no upper bound), but newer transformers releases
# dropped `all_special_tokens_extended`, which vllm 0.10.2 still calls, so
# tokenizer init crashes. Keep transformers at the version vllm 0.10.2 was
# built against, and bump both pins together (e.g. for Nemotron-Nano-v2).
vllm_image = vllm_image.pip_install(
    "vllm==0.10.2",
    "transformers==4.55.2",
    "huggingface_hub[hf_transfer]",
)

# Turn on the hf_transfer download backend installed via the extra above.
vllm_image = vllm_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
# Persisted caches so we don't re-download the model on every cold start.
# Both volumes are looked up by name and created on first use
# (create_if_missing=True); serve() mounts them at
# /root/.cache/huggingface and /root/.cache/vllm respectively.
hf_cache = modal.Volume.from_name("hf-cache", create_if_missing=True)
vllm_cache = modal.Volume.from_name("vllm-cache", create_if_missing=True)
# The Modal app that the serve() function below is registered on.
app = modal.App("rate-my-run-llm")
@app.function(
    image=vllm_image,
    gpu="A100-40GB",
    secrets=[modal.Secret.from_name("vllm-api-key")],
    scaledown_window=5 * MINUTES,
    timeout=30 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache,
        "/root/.cache/vllm": vllm_cache,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=20 * MINUTES)
def serve() -> None:
    """Start `vllm serve` for MODEL_NAME as a background process.

    The server listens on 0.0.0.0:VLLM_PORT, the port that the
    `modal.web_server` decorator above is configured with. The function
    returns right after spawning the process; it does not wait for it.

    The API key is read from the VLLM_API_KEY environment variable
    (presumably injected by the `vllm-api-key` secret — verify the secret
    defines that key).

    Raises:
        KeyError: if VLLM_API_KEY is not set in the environment.
    """
    # Build an argv list instead of a shell string: the API key is a secret
    # of arbitrary content, and interpolating it into a `shell=True` command
    # would let whitespace or shell metacharacters in the key split or
    # rewrite the command line.
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["VLLM_API_KEY"],
        # our prompts are short; caps KV-cache memory
        "--max-model-len",
        "8192",
    ]
    # Fire-and-forget: the process is spawned and deliberately not waited on.
    subprocess.Popen(cmd)