#!/usr/bin/env bash
#
# Runs validation inference for two merged checkpoints, one after the other:
# the "b_attr" model first, then the "b_lite" model. Each run waits until two
# GPUs look idle and then launches the inference job on that pair.
#
# The script must live one directory below the project root; it changes into
# the project root so that the relative runs/... paths below resolve.
#
# Environment overrides (default in parentheses):
#   MAX_NEW_TOKENS             (10000)    passed as --max-new-tokens
#   VLLM_MAX_MODEL_LEN         (32768)    passed as --vllm-max-model-len
#   MAX_BATCH_SIZE             (2)        passed as --max-batch-size
#   FREE_MEMORY_THRESHOLD_MIB  (1024)     a GPU counts as free while its used
#                                         memory is below this value
#   POLL_INTERVAL_SECONDS      (60)       pause between GPU availability checks
#   RUN_DATE                   (20260530) suffix of the output directory names
set -euo pipefail

# Resolve the project root as the parent of this script's directory and work
# from there.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_ROOT"

# Generation and scheduling knobs; each keeps a value already present in the
# environment and otherwise falls back to the default shown.
MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-10000}"
VLLM_MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-32768}"
MAX_BATCH_SIZE="${MAX_BATCH_SIZE:-2}"
FREE_MEMORY_THRESHOLD_MIB="${FREE_MEMORY_THRESHOLD_MIB:-1024}"
POLL_INTERVAL_SECONDS="${POLL_INTERVAL_SECONDS:-60}"
RUN_DATE="${RUN_DATE:-20260530}"

# NOTE: the "10000" inside the two output directory names is a literal. It
# matches the default MAX_NEW_TOKENS but does not follow an override of it.

# First job: b_attr checkpoint, its dataset, and its output directory.
B_ATTR_MODEL="runs/mid/qwen3_5_35b_a3b_exp_b_attr/v1-20260331-192847/checkpoint-145-merged"
B_ATTR_DATASET="runs/swift_data/qwen3_5_35b_a3b_exp_b_attr_v2/full.jsonl"
B_ATTR_OUTPUT="runs/full_inference/b_attr_tp2_nothink_10000_${RUN_DATE}"

# Second job: b_lite checkpoint, its dataset, and its output directory.
B_LITE_MODEL="runs/mid/qwen3_5_35b_a3b_exp_b_lite/v0-20260504-214725/checkpoint-147-merged"
B_LITE_DATASET="runs/swift_data/qwen3_5_35b_a3b_exp_b_lite_v1/full.jsonl"
B_LITE_OUTPUT="runs/full_inference/b_lite_tp2_nothink_10000_${RUN_DATE}"
|
|
#######################################
# Print the indices of at most two GPUs whose used memory is below the
# configured threshold.
# Globals:   FREE_MEMORY_THRESHOLD_MIB (read)
# Arguments: none
# Outputs:   one line on stdout: empty, "I", or "I,J" (comma-separated
#            indices, in the order nvidia-smi listed them)
# Returns:   0 on success, including when no GPU qualifies; 1 when nvidia-smi
#            fails or the collected list is not one or two plain integers
#######################################
find_free_gpus() {
  local smi_rows gpu_ids

  # Capture the query on its own so that a failing nvidia-smi is detected here
  # whether or not the caller enabled `pipefail`. stderr is dropped on purpose:
  # the non-zero return is the error signal.
  smi_rows="$(nvidia-smi --query-gpu=index,memory.used \
    --format=csv,noheader,nounits 2>/dev/null)" || return 1

  # Each row is "index, used". Keep the first two rows under the threshold and
  # join them with a comma. awk consumes all of its input, so nothing upstream
  # is cut short.
  gpu_ids="$(awk -F ', *' -v threshold="$FREE_MEMORY_THRESHOLD_MIB" '
    $2 < threshold && picked < 2 {
      if (picked > 0) ids = ids ","
      ids = ids $1
      picked++
    }
    END { print ids }
  ' <<<"$smi_rows")" || return 1

  # Reject anything that is not "N" or "N,M" (for example unexpected text in
  # the index column); an empty list is a valid "nothing free" answer.
  if [[ -n "$gpu_ids" && ! "$gpu_ids" =~ ^[0-9]+(,[0-9]+)?$ ]]; then
    return 1
  fi
  printf '%s\n' "$gpu_ids"
}
|
|
#######################################
# Poll find_free_gpus until it reports two GPUs, then print that pair.
# Globals:   POLL_INTERVAL_SECONDS (read)
# Arguments: none
# Outputs:   "I,J" on stdout once available; timestamped status messages on
#            stderr while waiting
# Returns:   status of the final printf once a pair is found; keeps looping
#            (with no timeout) until then
#######################################
wait_for_two_free_gpus() {
  local gpu_ids
  while true; do
    if gpu_ids="$(find_free_gpus)"; then
      # A comma in the list means two indices came back.
      if [[ "$gpu_ids" == *,* ]]; then
        printf '%s\n' "$gpu_ids"
        return
      fi
      printf '[%s] Waiting for two free GPUs. Currently available: %s\n' \
        "$(date '+%F %T')" "${gpu_ids:-none}" >&2
    else
      printf '[%s] Unable to query GPU status. Retrying.\n' "$(date '+%F %T')" >&2
    fi
    # Both the "query failed" and the "not enough GPUs" paths pause here
    # before the next attempt.
    sleep "$POLL_INTERVAL_SECONDS"
  done
}
|
|
#######################################
# Launch one validation-inference job on the given GPUs inside the
# `lsy-agent` conda environment.
# Globals:   MAX_BATCH_SIZE, MAX_NEW_TOKENS, VLLM_MAX_MODEL_LEN (read)
# Arguments: $1 - value for CUDA_VISIBLE_DEVICES (e.g. "0,1")
#            $2 - model path
#            $3 - validation dataset path
#            $4 - output directory
# Outputs:   a timestamped start line on stdout, followed by whatever the
#            job itself prints
# Returns:   2 when fewer than four arguments are given; otherwise the exit
#            status of `conda run`
#######################################
run_inference() {
  if (( $# < 4 )); then
    printf 'usage: run_inference GPU_IDS MODEL_PATH DATASET_PATH OUTPUT_DIR\n' >&2
    return 2
  fi

  local gpu_ids="$1"
  local model_path="$2"
  local dataset_path="$3"
  local output_dir="$4"

  printf '[%s] Starting inference: output=%s GPUs=%s\n' \
    "$(date '+%F %T')" "$output_dir" "$gpu_ids"

  # --no-capture-output: `conda run` otherwise captures the child's
  # stdout/stderr instead of passing it straight through, which hides progress
  # of a long job. Requires a conda release that supports the flag.
  # The tensor-parallel size is fixed at 2, matching the GPU pair the callers
  # in this script pass in.
  CUDA_VISIBLE_DEVICES="$gpu_ids" conda run --no-capture-output -n lsy-agent \
    python -m pipelines.run_validation_inference \
    --model-path "$model_path" \
    --val-dataset-path "$dataset_path" \
    --output-dir "$output_dir" \
    --infer-backend vllm \
    --tensor-parallel-size 2 \
    --max-batch-size "$MAX_BATCH_SIZE" \
    --max-new-tokens "$MAX_NEW_TOKENS" \
    --vllm-max-model-len "$VLLM_MAX_MODEL_LEN" \
    --template-type qwen3_nothinking
}
|
|
# Create the parent directory for inference outputs and a logs directory.
# Nothing in the visible part of this script writes to runs/logs itself.
mkdir -p runs/full_inference runs/logs

# Run the two jobs back to back. The GPU pair is selected again before each
# job, so the second run may land on different GPUs than the first. With
# `set -e` in effect, a failing first job stops the script before the second
# one starts.
GPU_IDS="$(wait_for_two_free_gpus)"
run_inference "$GPU_IDS" "$B_ATTR_MODEL" "$B_ATTR_DATASET" "$B_ATTR_OUTPUT"

GPU_IDS="$(wait_for_two_free_gpus)"
run_inference "$GPU_IDS" "$B_LITE_MODEL" "$B_LITE_DATASET" "$B_LITE_OUTPUT"

printf '[%s] B full and B-lite inference completed.\n' "$(date '+%F %T')"
|
|