"""Modal app that serves a Qwen3 model with vLLM's OpenAI-compatible API.

Deploy:   uv run modal deploy inference/app.py
Dev/test: uv run modal serve inference/app.py   (hot-reloads while running)

Right now this serves a small model to validate the whole pipeline cheaply.
To move to the real thing later, change ONLY two lines:
    MODEL_NAME -> "Qwen/Qwen3-32B-FP8"
    gpu=       -> "H100"   (FP8 needs Hopper) or "A100-80GB" for a BF16 build
Everything else — the contract, the client, the prompts — stays identical.
"""

import subprocess
import os

import modal

# Model identifier handed to `vllm serve` in `serve()`.
MODEL_NAME = "Qwen/Qwen3-14B"
# Port the vLLM server binds to; the same value is given to modal.web_server.
VLLM_PORT = 8000
# Multiplier so durations can be written as `N * MINUTES`.
# NOTE(review): presumably seconds-per-minute, i.e. the Modal timeout
# parameters it is used with are expressed in seconds — confirm.
MINUTES = 60

# Python packages baked into the serving image, in install order.
#
# Why transformers is pinned: vllm 0.10.2 only requires
# transformers>=4.55.2 (no upper bound), and newer transformers removed
# `all_special_tokens_extended`, which 0.10.2 still calls -> crash at
# tokenizer init. So transformers is held at the version vllm 0.10.2 was
# built against. Bump both together for Nemotron-Nano-v2.
_PIP_PACKAGES = (
    "vllm==0.10.2",
    "transformers==4.55.2",
    "huggingface_hub[hf_transfer]",
)

# Debian-slim base on Python 3.12, then the pinned packages, then the env
# flag that turns on the hf_transfer backend installed by the extra above.
_base_image = modal.Image.debian_slim(python_version="3.12")
_image_with_deps = _base_image.pip_install(*_PIP_PACKAGES)
vllm_image = _image_with_deps.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

# Persisted caches so we don't re-download the model on every cold start.
# Both are looked up by name and created on first use (create_if_missing=True);
# they are mounted into the container through the `volumes=` mapping on `serve`.
hf_cache = modal.Volume.from_name("hf-cache", create_if_missing=True)
vllm_cache = modal.Volume.from_name("vllm-cache", create_if_missing=True)

# Modal app object that `serve` is registered on via `@app.function`.
app = modal.App("rate-my-run-llm")

@app.function(
    image=vllm_image,
    gpu="A100-40GB",
    secrets=[modal.Secret.from_name("vllm-api-key")],
    scaledown_window=5 * MINUTES,
    timeout=30 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache,
        "/root/.cache/vllm": vllm_cache,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=20 * MINUTES)
def serve() -> None:
    """Start `vllm serve` for MODEL_NAME as a background child process.

    The server is bound to 0.0.0.0:VLLM_PORT, the same port declared in the
    `modal.web_server` decorator. The function does not wait for the child:
    `Popen` returns immediately and the server keeps running afterwards.

    The API key is read from the `VLLM_API_KEY` environment variable
    (presumably injected by the "vllm-api-key" Modal secret — confirm the
    secret defines exactly that key).

    Raises:
        KeyError: if `VLLM_API_KEY` is not set in the environment.
    """
    # Build an argv list rather than a shell string: the API key is a secret
    # of arbitrary content, and interpolating it into a `shell=True` command
    # would let whitespace or shell metacharacters in the key split or alter
    # the command. With a list, each element reaches vllm verbatim.
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["VLLM_API_KEY"],
        "--max-model-len",
        "8192",  # our prompts are short; caps KV-cache memory
    ]
    # Fire-and-forget: the child must outlive this function call.
    subprocess.Popen(cmd)