"""Modal app that serves a Qwen3 model with vLLM's OpenAI-compatible API. Deploy: uv run modal deploy inference/app.py Dev/test: uv run modal serve inference/app.py (hot-reloads while running) Right now this serves a small model to validate the whole pipeline cheaply. To move to the real thing later, change ONLY two lines: MODEL_NAME -> "Qwen/Qwen3-32B-FP8" gpu= -> "H100" (FP8 needs Hopper) or "A100-80GB" for a BF16 build Everything else — the contract, the client, the prompts — stays identical. """ import subprocess import os import modal MODEL_NAME = "Qwen/Qwen3-14B" VLLM_PORT = 8000 MINUTES = 60 vllm_image = ( modal.Image.debian_slim(python_version="3.12") .pip_install( # vllm 0.10.2 only requires transformers>=4.55.2 (no upper bound), and # newer transformers removed `all_special_tokens_extended`, which 0.10.2 # still calls -> crash at tokenizer init. Pin transformers to the version # vllm 0.10.2 was built against. Bump both together for Nemotron-Nano-v2. "vllm==0.10.2", "transformers==4.55.2", "huggingface_hub[hf_transfer]", ) .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) ) # Persisted caches so we don't re-download the model on every cold start hf_cache = modal.Volume.from_name("hf-cache", create_if_missing=True) vllm_cache = modal.Volume.from_name("vllm-cache", create_if_missing=True) app = modal.App("rate-my-run-llm") @app.function( image=vllm_image, gpu="A100-40GB", secrets=[modal.Secret.from_name("vllm-api-key")], scaledown_window=5 * MINUTES, timeout=30 * MINUTES, volumes={ "/root/.cache/huggingface": hf_cache, "/root/.cache/vllm": vllm_cache, }, ) @modal.concurrent(max_inputs=32) @modal.web_server(port=VLLM_PORT, startup_timeout=20 * MINUTES) def serve(): cmd = ( f"vllm serve {MODEL_NAME} " f"--host 0.0.0.0 --port {VLLM_PORT} " f"--api-key {os.environ['VLLM_API_KEY']} " f"--max-model-len 8192" # our prompts are short; caps KV-cache memory ) subprocess.Popen(cmd, shell=True)