"""Modal app that serves two vLLM OpenAI-compatible web endpoints:
a chat model for explaining code and an embedding model for code search."""

import json
import os
from typing import Any

import aiohttp
import modal

app = modal.App("code-understanding")

vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
)


# Configuration
EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
VLLM_PORT = 8000
MINUTES = 60  # seconds
N_GPU = 1  # GPUs per replica; also used as the tensor-parallel degree
FAST_BOOT = True  # eager mode boots faster but serves requests more slowly

@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,  # how long to stay up with no requests
    timeout=10 * MINUTES,  # how long to wait for container start
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)  # each replica handles up to 32 concurrent requests
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def explain_code_batch():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EXPLANATION_MODEL,
        "--served-model-name",
        EXPLANATION_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--max-model-len",
        "40000",
    ]

    # Eager mode skips CUDA graph capture: faster cold starts, lower throughput.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)

    # Launch vLLM in the background; @modal.web_server polls the port until it is up.
    subprocess.Popen(" ".join(cmd), shell=True)


@app.function(
    image=vllm_image,
    gpu=f"A10:{N_GPU}",
    scaledown_window=55 * MINUTES,  # how long to stay up with no requests
    timeout=10 * MINUTES,  # how long to wait for container start
    secrets=[modal.Secret.from_name("vllm-auth")],
)
@modal.concurrent(max_inputs=32)  # each replica handles up to 32 concurrent requests
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def generate_embeddings_batch():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        EMBEDDING_MODEL,
        "--served-model-name",
        EMBEDDING_MODEL,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--task",
        "embedding",  # run vLLM in embedding mode, serving /v1/embeddings
        "--max-model-len",
        "40000",
    ]

    # Eager mode skips CUDA graph capture: faster cold starts, lower throughput.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    print(cmd)

    # Launch vLLM in the background; @modal.web_server polls the port until it is up.
    subprocess.Popen(" ".join(cmd), shell=True)
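

# A minimal smoke-test client (an illustrative sketch, not part of the original
# deployment). It assumes the `vllm-auth` secret sets VLLM_API_KEY for the
# servers and that the same key is exported in the local environment; endpoint
# URLs are looked up with Modal's Function.get_web_url(). Run it with
# `modal run` against a deployed app; requires aiohttp installed locally.
@app.local_entrypoint()
async def smoke_test():
    api_key = os.environ.get("VLLM_API_KEY", "")
    headers = {"Authorization": f"Bearer {api_key}"}

    async with aiohttp.ClientSession(headers=headers) as session:
        # Chat completion against the explanation server.
        chat_payload: dict[str, Any] = {
            "model": EXPLANATION_MODEL,
            "messages": [
                {"role": "user", "content": "Explain what functools.lru_cache does."}
            ],
            "max_tokens": 256,
        }
        chat_url = explain_code_batch.get_web_url() + "/v1/chat/completions"
        async with session.post(chat_url, json=chat_payload) as resp:
            print(json.dumps(await resp.json(), indent=2))

        # Embedding request against the embedding server.
        embed_payload = {
            "model": EMBEDDING_MODEL,
            "input": ["def add(a, b): return a + b"],
        }
        embed_url = generate_embeddings_batch.get_web_url() + "/v1/embeddings"
        async with session.post(embed_url, json=embed_payload) as resp:
            body = await resp.json()
            print("embedding dimensions:", len(body["data"][0]["embedding"]))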