#!/usr/bin/env bash
#
# Runs validation inference for two merged checkpoints, one after the other:
# first the B-attr model, then the B-lite model.
#
# Before each run the script polls nvidia-smi until two GPUs report used
# memory below FREE_MEMORY_THRESHOLD_MIB, then launches
# `python -m pipelines.run_validation_inference` inside the `lsy-agent` conda
# environment with those two GPUs exposed through CUDA_VISIBLE_DEVICES.
#
# Environment overrides (defaults in parentheses):
#   MAX_NEW_TOKENS            (10000)    passed as --max-new-tokens
#   VLLM_MAX_MODEL_LEN        (32768)    passed as --vllm-max-model-len
#   MAX_BATCH_SIZE            (2)        passed as --max-batch-size
#   FREE_MEMORY_THRESHOLD_MIB (1024)     a GPU counts as free below this usage
#   POLL_INTERVAL_SECONDS     (60)       sleep between GPU polls
#   RUN_DATE                  (20260530) suffix of the output directories
#
# Requires: nvidia-smi, conda (with an `lsy-agent` environment), awk.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
readonly PROJECT_ROOT

readonly MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-10000}"
readonly VLLM_MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-32768}"
readonly MAX_BATCH_SIZE="${MAX_BATCH_SIZE:-2}"
readonly FREE_MEMORY_THRESHOLD_MIB="${FREE_MEMORY_THRESHOLD_MIB:-1024}"
readonly POLL_INTERVAL_SECONDS="${POLL_INTERVAL_SECONDS:-60}"
# NOTE(review): the default is a fixed date rather than today's date;
# presumably intentional so reruns reuse the same output directories - confirm.
readonly RUN_DATE="${RUN_DATE:-20260530}"

readonly B_ATTR_MODEL="runs/mid/qwen3_5_35b_a3b_exp_b_attr/v1-20260331-192847/checkpoint-145-merged"
readonly B_ATTR_DATASET="runs/swift_data/qwen3_5_35b_a3b_exp_b_attr_v2/full.jsonl"
readonly B_ATTR_OUTPUT="runs/full_inference/b_attr_tp2_nothink_10000_${RUN_DATE}"

readonly B_LITE_MODEL="runs/mid/qwen3_5_35b_a3b_exp_b_lite/v0-20260504-214725/checkpoint-147-merged"
readonly B_LITE_DATASET="runs/swift_data/qwen3_5_35b_a3b_exp_b_lite_v1/full.jsonl"
readonly B_LITE_OUTPUT="runs/full_inference/b_lite_tp2_nothink_10000_${RUN_DATE}"

#######################################
# Print a timestamped error message to stderr and abort the script.
# Arguments: $* - message text
# Outputs:   the message on stderr
# Returns:   never; exits with status 1
#######################################
die() {
  printf '[%s] ERROR: %s\n' "$(date '+%F %T')" "$*" >&2
  exit 1
}

#######################################
# List up to two GPUs whose used memory is below the configured threshold.
# Globals:   FREE_MEMORY_THRESHOLD_MIB (read)
# Arguments: none
# Outputs:   one line on stdout: "", "N" or "N,M" (GPU indices)
# Returns:   0 on success; 1 if nvidia-smi fails or its output is not a
#            comma-separated list of integers
#######################################
find_free_gpus() {
  local gpu_ids
  # awk consumes the whole nvidia-smi output and does the "first two" cut and
  # the comma join itself. With `pipefail` on, a `| head -n 2` stage could
  # close the pipe early and turn an upstream SIGPIPE into a spurious failure.
  # Readings that are not plain integers (e.g. "[N/A]") are skipped rather
  # than compared as strings.
  gpu_ids="$(
    nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits 2>/dev/null \
      | awk -F ', *' -v threshold="$FREE_MEMORY_THRESHOLD_MIB" '
          $2 ~ /^[0-9]+$/ && $2 + 0 < threshold + 0 && found < 2 {
            ids = (found++ ? ids "," : "") $1
          }
          END { print ids }
        '
  )" || return 1

  # Reject anything that is not empty, "N" or "N,M" so that garbage never
  # reaches CUDA_VISIBLE_DEVICES.
  if [[ -n "$gpu_ids" && ! "$gpu_ids" =~ ^[0-9]+(,[0-9]+)?$ ]]; then
    return 1
  fi

  printf '%s\n' "$gpu_ids"
}

#######################################
# Block until two free GPUs are available, polling at a fixed interval.
# Globals:   POLL_INTERVAL_SECONDS (read)
# Arguments: none
# Outputs:   "N,M" on stdout once two GPUs are free; progress on stderr
# Returns:   0 (loops until it succeeds)
#######################################
wait_for_two_free_gpus() {
  local gpu_ids

  while true; do
    if ! gpu_ids="$(find_free_gpus)"; then
      printf '[%s] Unable to query GPU status. Retrying.\n' "$(date '+%F %T')" >&2
      sleep "$POLL_INTERVAL_SECONDS"
      continue
    fi

    # A comma in the list means find_free_gpus returned two indices.
    if [[ "$gpu_ids" == *,* ]]; then
      printf '%s\n' "$gpu_ids"
      return
    fi

    printf '[%s] Waiting for two free GPUs. Currently available: %s\n' \
      "$(date '+%F %T')" "${gpu_ids:-none}" >&2
    sleep "$POLL_INTERVAL_SECONDS"
  done
}

#######################################
# Launch one validation-inference job on the given GPUs.
# Globals:   MAX_BATCH_SIZE, MAX_NEW_TOKENS, VLLM_MAX_MODEL_LEN (read)
# Arguments: $1 - comma-separated GPU indices for CUDA_VISIBLE_DEVICES
#            $2 - model path
#            $3 - validation dataset path
#            $4 - output directory
# Outputs:   a start banner on stdout, followed by the job's own output
# Returns:   exit status of `conda run`
#######################################
run_inference() {
  local gpu_ids="$1"
  local model_path="$2"
  local dataset_path="$3"
  local output_dir="$4"

  printf '[%s] Starting inference: output=%s GPUs=%s\n' \
    "$(date '+%F %T')" "$output_dir" "$gpu_ids"

  # --no-capture-output: by default `conda run` holds the child's stdout and
  # stderr until it exits, which hides all progress of a long-running job.
  # --tensor-parallel-size 2 matches the two GPUs selected by the caller.
  CUDA_VISIBLE_DEVICES="$gpu_ids" conda run --no-capture-output -n lsy-agent \
    python -m pipelines.run_validation_inference \
    --model-path "$model_path" \
    --val-dataset-path "$dataset_path" \
    --output-dir "$output_dir" \
    --infer-backend vllm \
    --tensor-parallel-size 2 \
    --max-batch-size "$MAX_BATCH_SIZE" \
    --max-new-tokens "$MAX_NEW_TOKENS" \
    --vllm-max-model-len "$VLLM_MAX_MODEL_LEN" \
    --template-type qwen3_nothinking
}

#######################################
# Entry point: validate prerequisites, then run both jobs sequentially.
# Globals:   PROJECT_ROOT, FREE_MEMORY_THRESHOLD_MIB, B_ATTR_*, B_LITE_* (read)
# Arguments: none
# Returns:   0 when both jobs finish; the script exits non-zero on the
#            first failure (set -e)
#######################################
main() {
  # Fail fast on a missing tool. Without this check a missing nvidia-smi
  # makes the polling loop retry forever.
  command -v nvidia-smi >/dev/null 2>&1 \
    || die 'nvidia-smi not found in PATH; cannot detect free GPUs.'
  command -v conda >/dev/null 2>&1 \
    || die 'conda not found in PATH; cannot launch inference.'

  # A non-numeric threshold would otherwise be compared as a string in awk.
  [[ "$FREE_MEMORY_THRESHOLD_MIB" =~ ^[0-9]+([.][0-9]+)?$ ]] \
    || die "FREE_MEMORY_THRESHOLD_MIB must be a non-negative number, got '${FREE_MEMORY_THRESHOLD_MIB}'."

  # All model, dataset and output paths are relative to the project root.
  cd "$PROJECT_ROOT"
  mkdir -p runs/full_inference runs/logs

  local gpu_ids

  gpu_ids="$(wait_for_two_free_gpus)"
  run_inference "$gpu_ids" "$B_ATTR_MODEL" "$B_ATTR_DATASET" "$B_ATTR_OUTPUT"

  # Re-poll: the GPUs used by the first job may not be the free ones now.
  gpu_ids="$(wait_for_two_free_gpus)"
  run_inference "$gpu_ids" "$B_LITE_MODEL" "$B_LITE_DATASET" "$B_LITE_OUTPUT"

  printf '[%s] B full and B-lite inference completed.\n' "$(date '+%F %T')"
}

main "$@"