#!/bin/bash

# ---------------- Configuration Area ----------------
# 1. Inference Model Configuration
MODEL_NAME="Qwen-2.5-Omni-7B" # required, registered in ./models
EXP_MARKING="_20251024" # recommended
MODEL_PATH="/path/to/model" # required
DATASET_NAME="UNO-Bench"
# Option 1: Use a local dataset path
DATASET_LOCAL_DIR="/path/to/dataset"
# Option 2: Use the Hugging Face cache path; the dataset will be downloaded from Hugging Face
HF_CACHE_DIR="$HOME/.cache/huggingface/hub"

# Inference Backend Configuration
# Options: "hf" (local HF loading) or "vllm" (start VLLM service)
INFERENCE_BACKEND="hf" # required
TARGET_PORT=8000
TARGET_GPU_IDS="0,1"
TARGET_TP_SIZE=2

# 2. Scorer Model Configuration (UNO-Scorer)
SCORER_MODEL_PATH="/path/to/scorer" # required
SCORER_PORT=8001
SCORER_GPU_IDS="0,1"
SCORER_TP_SIZE=2
# ------------------------------------------

set -e
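
# --- Sanity check (added sketch): fail fast if a required model path above is
# still a "/path/to/..." placeholder. Uses bash indirect expansion; extend the
# list if you also require a local dataset copy.
for var in MODEL_PATH SCORER_MODEL_PATH; do
    if [[ "${!var}" == /path/to/* ]]; then
        echo "ERROR: please set $var in the Configuration Area." >&2
        exit 1
    fi
done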

# Global variables to store PIDs for cleanup
TARGET_VLLM_PID=""
SCORER_VLLM_PID=""

# === Cleanup Function ===
cleanup() {
    echo "--- [Cleanup] Checking for background processes... ---"
    if [ -n "$TARGET_VLLM_PID" ]; then
        if ps -p "$TARGET_VLLM_PID" > /dev/null; then
            echo "Stopping Target Inference VLLM (PID: $TARGET_VLLM_PID)..."
            kill "$TARGET_VLLM_PID"
            wait "$TARGET_VLLM_PID" 2>/dev/null || true
            echo "Target VLLM stopped."
        fi
    fi
    if [ -n "$SCORER_VLLM_PID" ]; then
        if ps -p "$SCORER_VLLM_PID" > /dev/null; then
            echo "Stopping Scorer VLLM (PID: $SCORER_VLLM_PID)..."
            kill "$SCORER_VLLM_PID"
            wait "$SCORER_VLLM_PID" 2>/dev/null || true
            echo "Scorer VLLM stopped."
        fi
    fi
}
trap cleanup EXIT SIGINT SIGTERM
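
# === Readiness Helper (added sketch) ===
# Polls the server's /health endpoint until it answers, so the client is not
# started before the model has finished loading. Assumes vLLM's OpenAI-
# compatible server (which exposes GET /health) and curl; the 600 s default
# timeout is an assumption, raise it for large models on slow storage.
wait_for_server() {
    local port=$1
    local timeout=${2:-600}
    local elapsed=0
    echo "Waiting for server on port $port to become ready..."
    until curl -sf "http://localhost:$port/health" > /dev/null; do
        sleep 5
        elapsed=$((elapsed + 5))
        if [ "$elapsed" -ge "$timeout" ]; then
            echo "ERROR: server on port $port not ready after ${timeout}s." >&2
            exit 1
        fi
    done
    echo "Server on port $port is ready."
}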

# ==========================================
# Stage 1: Inference
# ==========================================
echo ">>> Stage 1: Running Inference ($INFERENCE_BACKEND mode)..."

if [ "$INFERENCE_BACKEND" == "vllm" ]; then
    # --- 1.1 Set environment variables specific to Qwen2.5-Omni ---
    export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1

    echo "Starting Target Model VLLM Server..."

    # --- 1.2 Start the VLLM server with the parameters configured above ---
    CUDA_VISIBLE_DEVICES=$TARGET_GPU_IDS vllm serve "$MODEL_PATH" \
        --port $TARGET_PORT \
        --allowed-local-media-path "$HF_CACHE_DIR" \
        --limit-mm-per-prompt image=8 \
        --max-model-len 131072 \
        --tensor-parallel-size $TARGET_TP_SIZE \
        --trust-remote-code \
        > target_vllm.log 2>&1 &
    
    TARGET_VLLM_PID=$!
    echo "Target VLLM PID: $TARGET_VLLM_PID"

    # --- 1.3 Run Client for inference ---
    # Note: eval.py itself does not require a GPU; --dataset_local_dir should point to the media directory or its parent.
    CUDA_VISIBLE_DEVICES="" python3 eval.py \
        --mode inference \
        --model_name "$MODEL_NAME" \
        --model_path "$MODEL_PATH" \
        --model_api_url "http://localhost:$TARGET_PORT/v1/chat/completions" \
        --dataset_name "$DATASET_NAME" \
        --hf_cache_dir "$HF_CACHE_DIR" \
        --dataset_local_dir "$DATASET_LOCAL_DIR" \
        --exp_marking "$EXP_MARKING" \
        --batch_size 16

    echo ">>> Inference finished. Stopping Target VLLM to release GPUs..."
    
    # --- 1.4 Force release of resources (critical step) ---
    kill "$TARGET_VLLM_PID"
    wait "$TARGET_VLLM_PID" 2>/dev/null || true
    TARGET_VLLM_PID=""
    
    unset VLLM_ALLOW_LONG_MAX_MODEL_LEN
    
    echo ">>> Target GPU resources released."

else
    # --- Local HF Mode ---
    CUDA_VISIBLE_DEVICES=$TARGET_GPU_IDS python3 eval.py \
        --mode inference \
        --model_name "$MODEL_NAME" \
        --model_path "$MODEL_PATH" \
        --dataset_name "$DATASET_NAME" \
        --exp_marking "$EXP_MARKING" \
        --hf_cache_dir "$HF_CACHE_DIR" \
        --dataset_local_dir "$DATASET_LOCAL_DIR" \
        --batch_size 1
fi

# ==========================================
# Stage 2: Start Scorer Service (VLLM Scorer)
# ==========================================
echo ">>> Stage 2: Starting Scorer VLLM Server..."

# The scorer does not need the Omni-specific environment variables or parameters
CUDA_VISIBLE_DEVICES=$SCORER_GPU_IDS vllm serve "$SCORER_MODEL_PATH" \
    --port $SCORER_PORT \
    --max-model-len 32768 \
    --tensor-parallel-size $SCORER_TP_SIZE \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    > scorer_vllm.log 2>&1 &

SCORER_VLLM_PID=$!
echo "Scorer VLLM PID: $SCORER_VLLM_PID"

# ==========================================
# Stage 3: Evaluation
# ==========================================
echo ">>> Stage 3: Running Evaluation/Scoring..."

CUDA_VISIBLE_DEVICES="" python3 eval.py \
    --mode scoring \
    --model_name "$MODEL_NAME" \
    --exp_marking "$EXP_MARKING" \
    --scorer_api_url "http://localhost:$SCORER_PORT/v1/chat/completions" \
    --dataset_name "$DATASET_NAME" \
    --hf_cache_dir "$HF_CACHE_DIR" \
    --dataset_local_dir "$DATASET_LOCAL_DIR"

echo ">>> Benchmark Workflow Completed Successfully."