SFT_Dataset / Evaluation /run_eval_checkpoints_midtrain.sh
Parsagh1383's picture
Upload folder using huggingface_hub
e6fad38 verified
#!/bin/bash
# ===============================
# Evaluation Datasets
# ===============================
# Evaluation scripts to execute, listed in the order they are run.
# Every entry is a file name that the main loop invokes from GRPO/Evaluation/.
scripts=()

# The three NeuLR evaluations are switched off; uncomment to re-enable them.
# scripts+=("evaluate_neulr_deductive_raw_vs_finetuned.py")
# scripts+=("evaluate_neulr_inductive_raw_vs_finetuned.py")
# scripts+=("evaluate_neulr_abductive_raw_vs_finetuned.py")

scripts+=("evaluate_musr_object_placements_raw_vs_finetuned.py")
scripts+=("evaluate_musr_murder_mystery_raw_vs_finetuned.py")
scripts+=("evaluate_musr_team_allocation_raw_vs_finetuned.py")
scripts+=("evaluate_medqa_raw_vs_finetuned.py")
scripts+=("evaluate_gsm8k_raw_vs_finetuned.py")
scripts+=("evaluate_aime_raw_vs_finetuned.py")
scripts+=("evaluate_aimo_raw_vs_finetuned.py")
scripts+=("evaluate_art_raw_vs_finetuned.py")
scripts+=("evaluate_copa_raw_vs_finetuned_guess_effect.py")
scripts+=("evaluate_goEmotion_raw_vs_finetuned.py")
# Batch size handed to each evaluation script via --batch_size, keyed by the
# script's file name. A script with no entry here falls back to 256 in the
# main loop.
declare -A BATCH_SIZES=()

# Batch size 4
BATCH_SIZES["evaluate_musr_object_placements_raw_vs_finetuned.py"]=4 # Note that each batch has 4 questions!

# Batch size 8
BATCH_SIZES["evaluate_neulr_deductive_raw_vs_finetuned.py"]=8
BATCH_SIZES["evaluate_neulr_inductive_raw_vs_finetuned.py"]=8
BATCH_SIZES["evaluate_neulr_abductive_raw_vs_finetuned.py"]=8
BATCH_SIZES["evaluate_musr_murder_mystery_raw_vs_finetuned.py"]=8
BATCH_SIZES["evaluate_aime_raw_vs_finetuned.py"]=8
BATCH_SIZES["evaluate_aimo_raw_vs_finetuned.py"]=8

# Batch size 16
BATCH_SIZES["evaluate_medqa_raw_vs_finetuned.py"]=16
BATCH_SIZES["evaluate_musr_team_allocation_raw_vs_finetuned.py"]=16
BATCH_SIZES["evaluate_goEmotion_raw_vs_finetuned.py"]=16

# Batch size 64
BATCH_SIZES["evaluate_gsm8k_raw_vs_finetuned.py"]=64
BATCH_SIZES["evaluate_art_raw_vs_finetuned.py"]=64

# Batch size 128
BATCH_SIZES["evaluate_copa_raw_vs_finetuned_guess_effect.py"]=128
# ============================
# Input Parameters
# ============================
# Every input is read from the environment. Each `${VAR:?message}` expansion
# below stops the script with the given message when VAR is unset or empty,
# so nothing past this section runs with a missing input.

# Passed to every evaluation script as --output_path.
: "${OUTPUT_DIR:?Error: OUTPUT_DIR must be set.}"
# Passed to create_table.py as --root.
: "${ROOT_DIR:?Error: ROOT_DIR must be set.}"
# Parent directory searched for the run's checkpoint folder; also passed to
# create_table.py as --base_result_dir.
: "${BASE_RESULTS_DIR:?Error: BASE_RESULTS_DIR must be set.}"
# Passed to every evaluation script as --raw_path.
: "${RAW_MODEL_PATH:?Error: RAW_MODEL_PATH must be set.}"
# Names the run directory under BASE_RESULTS_DIR ("Training_<RUN_NAME>" or
# "<RUN_NAME>") and is passed on as --run.
: "${RUN_NAME:?Error: RUN_NAME must be set.}"
# Entry inside the run's checkpoint directory that gets evaluated.
: "${CHKPT_NAME:?Error: CHKPT_NAME must be set.}"
# Passed to create_table.py as --base_model_name.
: "${BASE_MODEL_NAME:?Error: BASE_MODEL_NAME must be set.}"
# Passed to create_table.py as --train_data.
: "${TRAIN_DATA:?Error: TRAIN_DATA must be set.}"
# Forwarded to every evaluation script as --cuda_device (via COMMON_ARGS).
: "${CUDA_DEVICE:?Error: CUDA_DEVICE must be set.}"
# Forwarded to every evaluation script as --evaluate_checkpoints (via COMMON_ARGS).
: "${EVALUATE_CHECKPOINTS:?Error: EVALUATE_CHECKPOINTS must be set.}"

# Flags shared by all evaluation scripts, stored as one space-separated string.
# The main loop relies on word splitting to turn it back into separate
# arguments, so the values above must not contain whitespace of their own.
COMMON_ARGS="--cuda_device ${CUDA_DEVICE} --evaluate_checkpoints ${EVALUATE_CHECKPOINTS}"
# ============================
# Checkpoint Directory Lookup
# ============================
# Locate the run's checkpoint directory under BASE_RESULTS_DIR. Two layouts
# are accepted and tried in this order:
#   1. $BASE_RESULTS_DIR/Training_<RUN_NAME>/checkpoint
#   2. $BASE_RESULTS_DIR/<RUN_NAME>/checkpoint
# On success CHECKPOINT_DIR holds the matching checkpoint directory and
# TRAINING_BASE its parent run directory. If neither exists, the attempted
# paths are reported on stderr and the script exits with status 1.
TRAINING_DIR="$BASE_RESULTS_DIR/Training_${RUN_NAME}"
FINAL_DIR="$BASE_RESULTS_DIR/${RUN_NAME}"

CHECKPOINT_DIR=""
TRAINING_BASE=""
for candidate in "$TRAINING_DIR" "$FINAL_DIR"; do
  if [[ -d "$candidate/checkpoint" ]]; then
    CHECKPOINT_DIR="$candidate/checkpoint"
    TRAINING_BASE="$candidate"
    break # first match wins, preserving the Training_ prefix priority
  fi
done

if [[ -z "$CHECKPOINT_DIR" ]]; then
  # Diagnostics belong on stderr so they are not mixed into captured stdout.
  {
    echo "ERROR: Could not find checkpoint directory."
    echo "Tried:"
    echo " $TRAINING_DIR/checkpoint"
    echo " $FINAL_DIR/checkpoint"
  } >&2
  exit 1
fi
# ====================
# Main Loop
# ====================
# Runs every script listed in `scripts` against one checkpoint and refreshes
# the metrics summary after each evaluation.
#
# Reads:   scripts, BATCH_SIZES, COMMON_ARGS, CHECKPOINT_DIR, CHKPT_NAME,
#          RUN_NAME, RAW_MODEL_PATH, OUTPUT_DIR, ROOT_DIR, BASE_MODEL_NAME,
#          BASE_RESULTS_DIR, TRAIN_DATA
# Exports: TRAINING_BASE (presumably consumed by the Python scripts — verify)
# Note:    the Python entry points are addressed relative to the current
#          working directory (GRPO/Evaluation/...).
echo "Using checkpoint directory: $CHECKPOINT_DIR"
echo
export TRAINING_BASE
ckpt="$CHECKPOINT_DIR/$CHKPT_NAME"
echo "====================================="
echo "Using checkpoint: $ckpt"
echo "====================================="

# COMMON_ARGS is a whitespace-separated flag string. Split it into an array
# once, up front: this keeps the word splitting the flags rely on but avoids
# the pathname expansion an unquoted $COMMON_ARGS would also be subject to.
common_args=()
read -r -a common_args <<< "$COMMON_ARGS"

for script in "${scripts[@]}"; do
  # Scripts without an explicit entry in BATCH_SIZES use a batch size of 256.
  batch_size="${BATCH_SIZES[$script]:-256}"
  echo "Running $script with checkpoint $ckpt (batch_size=$batch_size) ..."
  if python3 GRPO/Evaluation/"$script" \
    "${common_args[@]}" \
    --batch_size "$batch_size" \
    --checkpoint_path "$ckpt" \
    --run "$RUN_NAME" \
    --raw_path "$RAW_MODEL_PATH" \
    --output_path "$OUTPUT_DIR"; then
    echo "Finished $script"
  else
    status=$?
    # Report the failure instead of claiming success, then carry on with the
    # remaining evaluations (best-effort behaviour is kept).
    echo "ERROR: $script failed with exit status $status" >&2
  fi
  echo "-------------------------------------"

  # The summary table is rebuilt after every evaluation, not once at the end.
  # NOTE(review): presumably so partial results are visible while later
  # evaluations are still running — confirm this is intended.
  # BASE_MODEL_NAME and TRAIN_DATA are quoted so each reaches create_table.py
  # as exactly one argument, like every other value passed here.
  python3 GRPO/Evaluation/create_table.py \
    --root "$ROOT_DIR" \
    --out_csv "./GRPO/Evaluation//metrics_summary.xlsx" \
    --run "$RUN_NAME" \
    --base_model_name "$BASE_MODEL_NAME" \
    --base_result_dir "$BASE_RESULTS_DIR" \
    --train_data "$TRAIN_DATA"
done