File size: 2,462 Bytes
04ffcc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
# coding: utf-8

# Author: Du Mingzhe (dumingzhex@gmail.com)
# Date: 2025-02-03

import os
import subprocess
import time
from pathlib import Path

# Make sure the directory that receives the per-model vLLM logs exists.
logs_dir = Path("./logs")
logs_dir.mkdir(exist_ok=True)

# Checkpoints to launch via vLLM, one (gpu_id, iter_num) pair per server.
#   gpu_id   -> exported to the server process as CUDA_VISIBLE_DEVICES.
#   iter_num -> zero-padded into the model name that is served.
# Ports are derived from each active entry's position in this list
# (9000 + index), so commenting an entry in or out shifts the port of every
# entry after it.
model_gpu_mapping = [
    # (0, 1000),
    # (0, 1500),
    # (1, 2000),
    # (1, 2500),
    # (2, 3000),
    (1, 3500),
    # (1, 4000),
    (1, 4500),
    # (1, 5000),
    (1, 5500),
    # (2, 6000),
    (3, 6500),
    # (3, 7000),
    (3, 7500),
]

# Bookkeeping for every server started below; each entry holds the Popen
# handle, its open log file, and the identifiers used in status messages.
launched_models = []

# Seconds to wait for a server to exit after terminate() before killing it.
SHUTDOWN_TIMEOUT_SECONDS = 30


def _stop_all_models(models):
    """Terminate every running server, wait for it to exit, and close its log.

    Args:
        models: Entries of ``launched_models`` (dicts with at least the
            ``process``, ``model_name``, ``port`` and ``log_file`` keys).
    """
    # Signal all servers first so they shut down concurrently ...
    for model in models:
        if model["process"].poll() is None:
            print(f"Stopping {model['model_name']} (port {model['port']})...")
            model["process"].terminate()
    # ... then reap each one, escalating to kill() if terminate() is ignored.
    for model in models:
        try:
            model["process"].wait(timeout=SHUTDOWN_TIMEOUT_SECONDS)
        except subprocess.TimeoutExpired:
            model["process"].kill()
            model["process"].wait()
        model["log_file"].close()


# The launch phase lives inside the same try as the monitor loop so that
# Ctrl+C (or a failed launch) still tears down servers that already started.
try:
    for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
        formatted_iter_num = f"{iter_num:07d}"
        model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
        arena_key = f"Local-Model-{iter_num:05d}"

        # One port per active mapping entry, in list order.
        port = 9000 + index
        print(f"🚀 Launching {model_name} on port {port} (GPU {gpu_id}) ...")
        log_file = open(f"./logs/vllm_{formatted_iter_num}.log", "w")

        try:
            process = subprocess.Popen(
                [
                    "python", "-m", "vllm.entrypoints.openai.api_server",
                    "--model", model_name,
                    "--port", str(port),
                    "--quantization", "bitsandbytes",
                    "--gpu-memory-utilization", "0.3",
                    "--max-model-len", "4096",
                    "--trust-remote-code",
                ],
                # Restrict the server to its assigned GPU.
                env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
                stdout=log_file,
                stderr=subprocess.STDOUT,  # stderr shares the same log file
            )
        except OSError:
            # The process never started, so nothing else will close this log.
            log_file.close()
            raise

        launched_models.append({
            "process": process,
            "model_name": model_name,
            "port": port,
            "gpu_id": gpu_id,
            "arena_key": arena_key,
            "log_file": log_file,
        })

        time.sleep(10)  # Wait for initialization

    print(f"✅ Launched {len(launched_models)} models. Check logs in ./logs/ directory.")

    # Keep the script running and monitor processes
    print("Models are running. Press Ctrl+C to stop all models.")
    stopped_ports = set()  # ports whose server exit has already been reported
    while len(stopped_ports) < len(launched_models):
        time.sleep(60)
        # Check if any processes have died; report each one only once.
        for model in launched_models:
            if model["port"] in stopped_ports:
                continue
            if model["process"].poll() is not None:
                print(f"⚠️  Model {model['model_name']} (port {model['port']}) has stopped.")
                stopped_ports.add(model["port"])
    print("No running models left to monitor.")
except KeyboardInterrupt:
    print("\n🛑 Stopping all models...")
finally:
    # Runs on Ctrl+C, on any launch/monitor error, and on normal exit, so
    # servers are always reaped and log files always closed.
    _stop_all_models(launched_models)
    print("✅ All models stopped.")