|
|
import os |
|
|
import modal |
|
|
|
|
|
|
|
|
# Modal application grouping the vLLM serving functions defined below.
app = modal.App("code-understanding")
|
|
|
|
|
import json |
|
|
from typing import Any |
|
|
|
|
|
import aiohttp |
|
|
|
|
|
|
|
|
# Container image for both vLLM servers: CUDA 12.8 devel base with Python 3.12,
# pinned vLLM / Hugging Face hub / FlashInfer, and high-performance HF Xet
# downloads enabled for faster model pulls at container start.
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    # Clear the base image's entrypoint so Modal controls process startup.
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        # FlashInfer provides optimized attention kernels used by vLLM.
        "flashinfer-python==0.5.2",
    )
    # Enable the high-performance Xet backend for Hugging Face downloads.
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)
|
|
|
|
|
|
|
|
|
|
|
# Model served by the explanation endpoint; overridable via environment.
EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")

# Model served by the embedding endpoint; overridable via environment.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")

# Port vLLM binds inside the container; @modal.web_server proxies to it.
VLLM_PORT = 8000

# Seconds per minute, for readable timeout/scaledown arithmetic.
MINUTES = 60

# GPUs per replica; also used as the vLLM tensor-parallel size.
N_GPU = 1

# When True, pass --enforce-eager to vLLM: faster cold boot (skips CUDA
# graph capture) at some steady-state throughput cost.
FAST_BOOT = True
|
|
|
|
|
@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,
    timeout=10 * MINUTES,
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def explain_code_batch():
    """Serve an OpenAI-compatible vLLM endpoint for the explanation model.

    Launches ``vllm serve`` as a background process inside the container;
    ``@modal.web_server`` proxies incoming requests to it on ``VLLM_PORT``.
    """
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EXPLANATION_MODEL,
        "--served-model-name",
        EXPLANATION_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--max-model-len",
        "40000",
    ]

    # FAST_BOOT trades steady-state throughput for faster container start.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)

    # Run the argv list directly (shell=False, the default). The previous
    # `" ".join(cmd)` + shell=True form broke on model names containing
    # spaces or shell metacharacters and was an injection risk, since
    # EXPLANATION_MODEL comes from the environment.
    subprocess.Popen(cmd)
|
|
|
|
|
|
|
|
@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,
    timeout=10 * MINUTES,
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def generate_embeddings_batch():
    """Serve an OpenAI-compatible vLLM embedding endpoint.

    Launches ``vllm serve`` with ``--task embedding`` as a background
    process; ``@modal.web_server`` proxies requests to it on ``VLLM_PORT``.
    """
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EMBEDDING_MODEL,
        "--served-model-name",
        EMBEDDING_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        # Run the model in embedding mode rather than text generation.
        "--task",
        "embedding",
        "--max-model-len",
        "40000",
    ]

    # FAST_BOOT trades steady-state throughput for faster container start.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)

    # Run the argv list directly (shell=False, the default). The previous
    # `" ".join(cmd)` + shell=True form broke on model names containing
    # spaces or shell metacharacters and was an injection risk, since
    # EMBEDDING_MODEL comes from the environment.
    subprocess.Popen(cmd)
|
|
|