import json
import os
from typing import Any

import aiohttp
import modal

app = modal.App("code-understanding")

vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
)

# Configuration
EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
VLLM_PORT = 8000
MINUTES = 60
N_GPU = 1
FAST_BOOT = True  # trade steady-state throughput for faster cold starts


@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def explain_code_batch():
    """Serve an OpenAI-compatible vLLM endpoint for code explanations."""
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EXPLANATION_MODEL,
        "--served-model-name",
        EXPLANATION_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--max-model-len",
        "40000",
    ]
    # --enforce-eager skips CUDA graph capture: boots faster, runs a bit slower.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)
    # Launch the server and return; @modal.web_server waits for the port to open.
    subprocess.Popen(" ".join(cmd), shell=True)


@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,
    timeout=10 * MINUTES,
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def generate_embeddings_batch():
    """Serve an OpenAI-compatible vLLM endpoint for embedding generation."""
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EMBEDDING_MODEL,
        "--served-model-name",
        EMBEDDING_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--task",
        "embedding",
        "--max-model-len",
        "40000",
    ]
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)
    subprocess.Popen(" ".join(cmd), shell=True)
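

# --- Smoke test --------------------------------------------------------------
# A minimal client sketch for the two servers above, using the aiohttp/json
# imports at the top of this file. Assumptions (not confirmed by the original):
# the "vllm-auth" secret's key is also available locally as VLLM_API_KEY, and
# the apps are reachable via Modal's get_web_url(). The /v1/chat/completions
# and /v1/embeddings paths are vLLM's standard OpenAI-compatible routes.
@app.local_entrypoint()
async def smoke_test():
    headers = {
        "Authorization": f"Bearer {os.environ.get('VLLM_API_KEY', '')}",
        "Content-Type": "application/json",
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        # Ask the explanation server to explain a trivial snippet.
        chat_payload: dict[str, Any] = {
            "model": EXPLANATION_MODEL,
            "messages": [
                {"role": "user", "content": "Explain: def f(x): return x * 2"}
            ],
        }
        async with session.post(
            explain_code_batch.get_web_url() + "/v1/chat/completions",
            json=chat_payload,
        ) as resp:
            print(json.dumps(await resp.json(), indent=2)[:500])

        # Embed the same snippet with the embedding server.
        embed_payload: dict[str, Any] = {
            "model": EMBEDDING_MODEL,
            "input": ["def f(x): return x * 2"],
        }
        async with session.post(
            generate_embeddings_batch.get_web_url() + "/v1/embeddings",
            json=embed_payload,
        ) as resp:
            body = await resp.json()
            print("embedding dims:", len(body["data"][0]["embedding"]))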