nicholasg1997
feat: update model configurations and enhance README for local AI running coach
dcfc5a3 | """Modal app that serves a Qwen3 model with vLLM's OpenAI-compatible API. | |
| Deploy: uv run modal deploy inference/app.py | |
| Dev/test: uv run modal serve inference/app.py (hot-reloads while running) | |
| Right now this serves a small model to validate the whole pipeline cheaply. | |
| To move to the real thing later, change ONLY two lines: | |
| MODEL_NAME -> "Qwen/Qwen3-32B-FP8" | |
| gpu= -> "H100" (FP8 needs Hopper) or "A100-80GB" for a BF16 build | |
| Everything else — the contract, the client, the prompts — stays identical. | |
| """ | |
| import subprocess | |
| import os | |
| import modal | |
# Hugging Face repo id of the model handed to `vllm serve`.
MODEL_NAME: str = "Qwen/Qwen3-14B"

# TCP port the vLLM server listens on inside the container.
VLLM_PORT: int = 8000

# Presumably a seconds-per-minute multiplier for expressing timeouts
# (e.g. `10 * MINUTES`); not referenced in the visible part of this file.
MINUTES: int = 60
# Container image for the vLLM server, built in three steps: a slim Debian
# base, the pinned Python packages, then the download-accelerator switch.
_base_image = modal.Image.debian_slim(python_version="3.12")

# vllm 0.10.2 only declares transformers>=4.55.2 (no upper bound), but newer
# transformers releases removed `all_special_tokens_extended`, which vllm
# 0.10.2 still calls -> crash at tokenizer init. So transformers is pinned to
# the version vllm 0.10.2 was built against. Bump both pins together when
# moving to Nemotron-Nano-v2.
_image_with_packages = _base_image.pip_install(
    "vllm==0.10.2",
    "transformers==4.55.2",
    "huggingface_hub[hf_transfer]",
)

# Turn on hf_transfer for Hugging Face Hub downloads inside the image.
vllm_image = _image_with_packages.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
# Named Modal volumes used as persistent caches, so the model is not
# re-downloaded on every cold start. Both are created on first use if they
# do not exist yet.
hf_cache = modal.Volume.from_name("hf-cache", create_if_missing=True)
vllm_cache = modal.Volume.from_name("vllm-cache", create_if_missing=True)

# The Modal application object for this deployment.
app = modal.App("rate-my-run-llm")
def serve():
    """Start the vLLM OpenAI-compatible server as a background subprocess.

    Launches ``vllm serve`` for ``MODEL_NAME``, bound to all interfaces on
    ``VLLM_PORT`` and protected by the API key read from the ``VLLM_API_KEY``
    environment variable. The call returns as soon as the process has been
    spawned; it does not wait for the server to become ready.

    Raises:
        KeyError: If ``VLLM_API_KEY`` is not set in the environment.
    """
    # Build an argv list rather than a single shell string: the API key comes
    # from the environment, and with shell=True any spaces or shell
    # metacharacters in it would be re-parsed by /bin/sh (mangling the key or
    # running unintended commands). A list is passed to the process verbatim.
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["VLLM_API_KEY"],
        # Our prompts are short; capping the context caps KV-cache memory.
        "--max-model-len",
        "8192",
    ]
    # Fire and forget: the server keeps running after this function returns.
    subprocess.Popen(cmd)