# python_project_explainer / modal_functions.py
# (Hugging Face viewer metadata: author lafifi-24, commit 933c2fa, 2.41 kB)
# Standard library
import json
import os
from typing import Any

# Third party
import aiohttp
import modal

# Modal application that the web-server functions below register against.
app = modal.App("code-understanding")

# Container image: CUDA devel base with Python 3.12 and vLLM installed.
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])  # clear the base image's entrypoint so Modal controls startup
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
)

# Configuration (model names overridable via environment variables)
EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
VLLM_PORT = 8000
MINUTES = 60  # seconds per minute, for readable timeout arithmetic
N_GPU = 1
FAST_BOOT = True  # True -> --enforce-eager: faster boot, lower peak throughput
@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(
    max_inputs=32
)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def explain_code_batch() -> None:
    """Serve the explanation model via a vLLM HTTP server.

    Launches ``vllm serve`` as a background process inside the container;
    ``@modal.web_server`` exposes it on ``VLLM_PORT``. Returns immediately
    after spawning the server (the process keeps running in the container).
    """
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EXPLANATION_MODEL,
        "--served-model-name",
        EXPLANATION_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--max-model-len", "40000",
        # --enforce-eager skips CUDA graph capture: faster boot, slower serving.
        "--enforce-eager" if FAST_BOOT else "--no-enforce-eager",
        "--tensor-parallel-size", str(N_GPU),
    ]
    print(cmd)
    # Pass the argv list directly (shell=False). The original joined the list
    # into one string with shell=True, which would break (or allow injection)
    # if EXPLANATION_MODEL — read from the environment — contained spaces or
    # shell metacharacters.
    subprocess.Popen(cmd)
@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(
    max_inputs=32
)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def generate_embeddings_batch() -> None:
    """Serve the embedding model via a vLLM HTTP server.

    Launches ``vllm serve --task embedding`` as a background process inside
    the container; ``@modal.web_server`` exposes it on ``VLLM_PORT``. Returns
    immediately after spawning the server.
    """
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EMBEDDING_MODEL,
        "--served-model-name",
        EMBEDDING_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--task",
        "embedding",
        "--max-model-len", "40000",
        # --enforce-eager skips CUDA graph capture: faster boot, slower serving.
        "--enforce-eager" if FAST_BOOT else "--no-enforce-eager",
        "--tensor-parallel-size", str(N_GPU),
    ]
    print(cmd)
    # Pass the argv list directly (shell=False). The original joined the list
    # into one string with shell=True, which would break (or allow injection)
    # if EMBEDDING_MODEL — read from the environment — contained spaces or
    # shell metacharacters.
    subprocess.Popen(cmd)