#!/bin/bash
#
# Runs create_table.py once for every checkpoint-<N> directory of a single
# training run, visiting the checkpoints in ascending numeric order.
#
# Steps:
#   1. Resolve the run's checkpoint directory, preferring
#      "$BASE_RESULTS_DIR/Training_<RUN_NAME>/checkpoint" and falling back to
#      "$BASE_RESULTS_DIR/<RUN_NAME>/checkpoint". Exits with status 1 if
#      neither exists.
#   2. Export TRAINING_BASE (the directory that contains "checkpoint") so child
#      processes can read it.
#   3. For each checkpoint-<N> subdirectory, invoke create_table.py.
#
# Exit status: 1 if no checkpoint directory is found; otherwise the status of
# the last create_table.py invocation (0 if there were no checkpoints).

# -u catches typos in variable names; pipefail surfaces failures inside
# pipelines. -e is deliberately NOT enabled: a failing create_table.py run for
# one checkpoint must not prevent the remaining checkpoints from being visited.
set -uo pipefail

# Evaluation scripts of interest. Commented-out entries are disabled.
# NOTE(review): this array is not referenced by the loop at the bottom of this
# file; presumably it is kept for a per-script evaluation step -- confirm.
scripts=(
  # "evaluate_neulr_deductive_raw_vs_finetuned.py"
  # "evaluate_neulr_inductive_raw_vs_finetuned.py"
  # "evaluate_neulr_abductive_raw_vs_finetuned.py"
  "evaluate_musr_object_placements_raw_vs_finetuned.py"
  "evaluate_musr_murder_mystery_raw_vs_finetuned.py"
  "evaluate_musr_team_allocation_raw_vs_finetuned.py"
  "evaluate_medqa_raw_vs_finetuned.py"
  "evaluate_gsm8k_raw_vs_finetuned.py"
  "evaluate_aime_raw_vs_finetuned.py"
  "evaluate_aimo_raw_vs_finetuned.py"
  "evaluate_art_raw_vs_finetuned.py"
  "evaluate_copa_raw_vs_finetuned_guess_effect.py"
  "evaluate_goEmotion_raw_vs_finetuned.py"
)

# --- Paths and run identification -------------------------------------------
# OUTPUT_DIR and RAW_MODEL_PATH are defined here but not used further down in
# this file.
OUTPUT_DIR="/home/moein_salimi/users/Parsa/AbductiveReasoning/SFT/Evaluation/14B"
ROOT_DIR="/home/moein_salimi/users/Parsa/AbductiveReasoning/SFT/Evaluation/14B"
BASE_RESULTS_DIR="/home/moein_salimi/users/Parsa/AbductiveReasoning/SFT/results_sft_14b"

# RAW_MODEL_PATH="/home/moein_salimi/PLLMS/unsloth-Qwen2.5-3B-Instruct-unsloth-bnb-4bit"
RAW_MODEL_PATH="/home/moein_salimi/PLLMS/unsloth-Qwen2.5-14B-Instruct-bnb-4bit"

# RUN_NAME="dt11.18.17:40_e20_unsloth_Qwen2.5_3B_Instruct_unsloth_bnb_4bit_bnb_4bit_lr1e-05_t0.7_ε0.2_r64_b16"
RUN_NAME="SFT_dt12.11.19:13_e6_unsloth_Qwen2.5_14B_Instruct_bnb_4bit_bnb_4bit_lr5e-06_t0.0_r64_b4_SFT_Implementation"

# Two candidate locations for the run's results directory.
TRAINING_DIR="$BASE_RESULTS_DIR/Training_${RUN_NAME}"
FINAL_DIR="$BASE_RESULTS_DIR/${RUN_NAME}"

# --- Locate the checkpoint directory ----------------------------------------
# The "Training_"-prefixed directory wins when both candidates exist.
if [[ -d "$TRAINING_DIR/checkpoint" ]]; then
  CHECKPOINT_DIR="$TRAINING_DIR/checkpoint"
  TRAINING_BASE="$TRAINING_DIR"
elif [[ -d "$FINAL_DIR/checkpoint" ]]; then
  CHECKPOINT_DIR="$FINAL_DIR/checkpoint"
  TRAINING_BASE="$FINAL_DIR"
else
  # Diagnostics belong on stderr so they are not mixed into captured stdout.
  {
    echo "ERROR: Could not find checkpoint directory."
    echo "Tried:"
    echo " $TRAINING_DIR/checkpoint"
    echo " $FINAL_DIR/checkpoint"
  } >&2
  exit 1
fi

echo "Using checkpoint directory: $CHECKPOINT_DIR"
echo

# --- Evaluation settings ----------------------------------------------------
# NOTE(review): COMMON_ARGS and BATCH_SIZES are not referenced by the loop
# below; presumably they feed a per-script evaluation step -- confirm.
COMMON_ARGS="--cuda_device 0 --evaluate_checkpoints 1"

# Batch size keyed by evaluation script name.
declare -A BATCH_SIZES=(
  ["evaluate_neulr_deductive_raw_vs_finetuned.py"]=8
  ["evaluate_neulr_inductive_raw_vs_finetuned.py"]=8
  ["evaluate_neulr_abductive_raw_vs_finetuned.py"]=8
  ["evaluate_medqa_raw_vs_finetuned.py"]=16
  ["evaluate_musr_murder_mystery_raw_vs_finetuned.py"]=8
  ["evaluate_musr_object_placements_raw_vs_finetuned.py"]=2 # Note that each batch has 4 questions!
  ["evaluate_musr_team_allocation_raw_vs_finetuned.py"]=16
  ["evaluate_gsm8k_raw_vs_finetuned.py"]=16
  ["evaluate_aime_raw_vs_finetuned.py"]=8
  ["evaluate_aimo_raw_vs_finetuned.py"]=8
  ["evaluate_art_raw_vs_finetuned.py"]=64
  ["evaluate_copa_raw_vs_finetuned_guess_effect.py"]=64
  ["evaluate_goEmotion_raw_vs_finetuned.py"]=16
)

# Make the resolved run directory visible to child processes.
export TRAINING_BASE

# --- Collect checkpoints ----------------------------------------------------
# Use a glob instead of parsing `ls` output so names are never word-split or
# glob-expanded. When nothing matches, the literal pattern is left in place and
# is discarded by the -d test.
ckpt_names=()
for ckpt_path in "$CHECKPOINT_DIR"/checkpoint-*; do
  [[ -d "$ckpt_path" ]] || continue
  ckpt_names+=("${ckpt_path##*/}")
done

if (( ${#ckpt_names[@]} == 0 )); then
  echo "WARNING: no checkpoint-* directories found in $CHECKPOINT_DIR" >&2
  exit 0
fi

# Sort on the numeric suffix after the first '-'. Only basenames are sorted,
# because the full path itself contains '-' characters (e.g. "lr5e-06") that
# would shift the sort field.
mapfile -t ckpt_names < <(printf '%s\n' "${ckpt_names[@]}" | sort -t- -k2,2n)

# --- Per-checkpoint table generation ----------------------------------------
# NOTE(review): create_table.py is called with identical arguments on every
# iteration; the checkpoint path is only echoed, not passed to it. Confirm that
# this is intended.
for ckpt_name in "${ckpt_names[@]}"; do
  ckpt="$CHECKPOINT_DIR/$ckpt_name"

  echo "====================================="
  echo "Using checkpoint: $ckpt"
  echo "====================================="

  python3 /home/moein_salimi/users/Parsa/AbductiveReasoning/SFT/Evaluation/create_table.py \
    --root "$ROOT_DIR" \
    --out_filename "./SFT/Evaluation//metrics_summary.xlsx" \
    --run "$RUN_NAME" \
    --base_model_name "qwen2.5-14B" \
    --base_result_dir "$BASE_RESULTS_DIR" \
    --train_data "UniADILR"
done