#!/bin/bash
# =============================================================================
# Parallel evaluation of 8 BASE (not yet fine-tuned) models on the exp1 validation set.
# Each model runs on a dedicated GPU. InternVL uses transformers; the rest use vLLM.
#
# Layout:
#   GPU 0: Qwen3.5-0.8B      (vLLM)
#   GPU 1: Qwen3.5-2B        (vLLM)
#   GPU 2: Qwen3.5-9B        (vLLM)
#   GPU 3: Qwen3-VL-2B       (vLLM)
#   GPU 4: Qwen3-VL-8B       (vLLM)
#   GPU 5: GLM-4.6V-Flash    (vLLM)
#   GPU 6: InternVL3.5-8B-HF (transformers; not compatible with vLLM)
#   GPU 7: Gemma-4-E4B-it    (vLLM)
#
# Output: <EXP_DIR>/output/base_eval_<timestamp>/<model>/eval_results_<model>.json
#         <EXP_DIR>/output/base_eval_<timestamp>/<model>/raw_errors_<model>.json
# =============================================================================
set -uo pipefail
PROJECT_DIR="/mnt/sfs_turbo_new/R11181/project_vlm"
EXP_DIR="${PROJECT_DIR}/exp_v5"
LOG_DIR="${EXP_DIR}/logs"
mkdir -p "${LOG_DIR}"
CONDA_ROOT="/mnt/sfs_turbo/R11181/miniconda3"
MODEL_DIR="${PROJECT_DIR}/model"
VAL_JSONL="${EXP_DIR}/data/exp1/exp1_val_1160.jsonl"
ts="$(date +%Y%m%d_%H%M%S)"
# Each base model gets a dedicated output subdir to keep results separate
BASE_OUT_ROOT="${EXP_DIR}/output/base_eval_${ts}"
mkdir -p "${BASE_OUT_ROOT}"
# helper for vLLM eval
run_vllm() {
    local gpu="$1" model_path="$2" name="$3" gmu="$4"
    local outdir="${BASE_OUT_ROOT}/${name}"
    local log="${LOG_DIR}/eval_base_${name}_${ts}.log"
    mkdir -p "${outdir}"
    {
        source "${CONDA_ROOT}/etc/profile.d/conda.sh"
        conda activate vllm_eval
        export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
        export PYTHONUNBUFFERED=1
        echo "[$(date '+%F %T')] ${name} (BASE) eval on GPU ${gpu} (vLLM, gmu=${gmu})"
        CUDA_VISIBLE_DEVICES="${gpu}" python3 "${PROJECT_DIR}/eval_vln_vllm.py" \
            --model_path "${model_path}" \
            --val_path "${VAL_JSONL}" \
            --output_dir "${outdir}" \
            --gpu_memory_utilization "${gmu}" \
            --batch_size 32 \
            --save_raw
        echo "[$(date '+%F %T')] ${name} DONE"
    } > "${log}" 2>&1 &
    disown
    echo "[OK] ${name} (vLLM) on GPU ${gpu}, pid=$!"
}
# helper for transformers eval (InternVL only)
run_transformers() {
    local gpu="$1" model_path="$2" name="$3"
    local outdir="${BASE_OUT_ROOT}/${name}"
    local log="${LOG_DIR}/eval_base_${name}_${ts}.log"
    mkdir -p "${outdir}"
    {
        source "${CONDA_ROOT}/etc/profile.d/conda.sh"
        conda activate vlm_train
        export PYTHONUNBUFFERED=1
        echo "[$(date '+%F %T')] ${name} (BASE) eval on GPU ${gpu} (transformers)"
        CUDA_VISIBLE_DEVICES="${gpu}" python3 "${PROJECT_DIR}/eval_vln_transformers.py" \
            --model_path "${model_path}" \
            --val_path "${VAL_JSONL}" \
            --output_dir "${outdir}" \
            --model_type internvl
        echo "[$(date '+%F %T')] ${name} DONE"
    } > "${log}" 2>&1 &
    disown
    echo "[OK] ${name} (transformers) on GPU ${gpu}, pid=$!"
}
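# Note: each lane is backgrounded and disowned, so the evals keep running even if
# this launcher shell (e.g. an SSH session) exits. Completion is signalled only by
# the "DONE" line in each per-lane log, not by this script.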
# -----------------------------------------------------------------------------
# 8 lanes
# -----------------------------------------------------------------------------
run_vllm 0 "${MODEL_DIR}/Qwen3.5-0.8B" "Qwen3.5-0.8B-base" 0.7
run_vllm 1 "${MODEL_DIR}/Qwen3.5-2B" "Qwen3.5-2B-base" 0.7
run_vllm 2 "${MODEL_DIR}/Qwen3.5-9B" "Qwen3.5-9B-base" 0.7
run_vllm 3 "${MODEL_DIR}/Qwen3-VL-2B-Instruct" "Qwen3-VL-2B-base" 0.7
run_vllm 4 "${MODEL_DIR}/Qwen3-VL-8B-Instruct" "Qwen3-VL-8B-base" 0.7
run_vllm 5 "${MODEL_DIR}/GLM-4.6V-Flash" "GLM-4.6V-Flash-base" 0.7
run_transformers 6 "${MODEL_DIR}/InternVL3_5-8B-HF" "InternVL3.5-8B-base"
run_vllm 7 "${MODEL_DIR}/Gemma-4-E4B-it" "Gemma-4-E4B-base" 0.7
echo ""
echo "============================================================"
echo "8 base eval lanes launched."
echo "Results dir: ${BASE_OUT_ROOT}"
echo "Logs: ${LOG_DIR}/eval_base_*_${ts}.log"
echo "============================================================"