dasco / eval.sh
toilachuoituyet's picture
Upload project files
78398f6 verified
#!/usr/bin/env bash
# Evaluation driver for DASCO models.
#
# Runs eval_tools.py for three tasks in order: MATE, MASC, then MABSA.
# Checkpoints are read from ./checkpoints/<TASK>_custom and the test split
# from ./finetune_dataset/custom/test.

# Expose only GPU 0 to the evaluator; every run below passes --device cuda:0.
CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES
# ============================================
# MATE evaluation
# ============================================
CHECKPOINT_DIR="./checkpoints/MATE_custom"
TEST_DATA="./finetune_dataset/custom/test"
# Best result seen so far, indexed by the constants declared right below:
# [Correct, Label, Prediction, Accuracy, Recall, F1, Model]
best_stats_values=(0 0 0 0 0 0 "None")
declare -r COR=0 LABEL=1 PRED=2 ACC=3 REC=4 F1=5 MODEL=6

#######################################
# Print the value of the last "Name:value" token matching a pattern.
# Taking the last match keeps the result a single token even when the
# evaluator prints the same metric more than once.
# Arguments:
#   $1 - extended regex covering the whole token, e.g. 'F1:[0-9.]+'
#   $2 - text to search (captured evaluator output)
# Outputs:
#   The text after the first colon of the last match; nothing if no match.
#######################################
mate_metric() {
  local token
  token=$(grep -oE -- "$1" <<<"$2" | tail -n 1)
  printf '%s' "${token#*:}"
}

#######################################
# Decide whether a candidate score beats the current best.
# Arguments:
#   $1 - candidate score as text (may be empty or malformed)
#   $2 - current best score
# Returns:
#   0 when $1 is a well-formed decimal strictly greater than $2, else 1.
#######################################
mate_is_better() {
  local decimal_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
  [[ "$1" =~ $decimal_re ]] || return 1
  # "+ 0" forces a numeric comparison instead of a possible string one.
  awk -v cand="$1" -v best="$2" 'BEGIN { exit !((cand + 0) > (best + 0)) }'
}

for model in "${CHECKPOINT_DIR}"/*.pt; do
  [[ -f "$model" ]] || continue # Glob stays literal when no .pt files exist
  model_name=${model##*/}

  output=$(python eval_tools.py \
    --MATE_model "${model}" \
    --test_ds "${TEST_DATA}" \
    --task MATE \
    --gcn_layers 4 \
    --device cuda:0 2>&1)
  status=$?
  if ((status != 0)); then
    # stderr was folded into $output, so surface it here; otherwise a crashed
    # run would only show up as a row of N/A values. Parsing still proceeds so
    # that any metrics printed before the failure are reported as before.
    {
      printf 'Warning: eval_tools.py exited with status %d for %s\n' \
        "$status" "$model"
      printf '%s\n' "$output" | tail -n 20
    } >&2
  fi

  correct=$(mate_metric 'Correct:[0-9]+' "$output")
  label=$(mate_metric 'Label:[0-9]+' "$output")
  prediction=$(mate_metric 'Prediction:[0-9]+' "$output")
  accuracy=$(mate_metric 'Accuracy:[0-9.]+' "$output")
  recall=$(mate_metric 'Recall:[0-9.]+' "$output")
  f1=$(mate_metric 'F1:[0-9.]+' "$output")

  printf '\nModel: %s\n' "$model_name"
  printf 'Correct : %s\n' "${correct:-N/A}"
  printf 'Label : %s\n' "${label:-N/A}"
  printf 'Prediction : %s\n' "${prediction:-N/A}"
  printf 'Accuracy : %s\n' "${accuracy:-N/A}"
  printf 'Recall : %s\n' "${recall:-N/A}"
  printf 'F1 : %s\n' "${f1:-N/A}"

  # Strict ">" keeps the first checkpoint on ties and leaves the model as
  # "None" when no checkpoint scores above zero.
  if mate_is_better "$f1" "${best_stats_values[$F1]}"; then
    best_stats_values[$COR]=${correct:-0}
    best_stats_values[$LABEL]=${label:-0}
    best_stats_values[$PRED]=${prediction:-0}
    best_stats_values[$ACC]=${accuracy:-0}
    best_stats_values[$REC]=${recall:-0}
    best_stats_values[$F1]=$f1
    best_stats_values[$MODEL]=$model_name
  fi
done

printf '\n========== MATE Best Results ==========\n'
printf 'Best Model: %s\n' "${best_stats_values[$MODEL]}"
printf 'F1 : %s\n' "${best_stats_values[$F1]}"
printf 'Accuracy: %s\n' "${best_stats_values[$ACC]}"
printf 'Recall : %s\n' "${best_stats_values[$REC]}"
# ============================================
# MASC evaluation
# ============================================
CHECKPOINT_DIR="./checkpoints/MASC_custom"
TEST_DATA="./finetune_dataset/custom/test"
# Best result seen so far, indexed by the MASC_* constants declared below:
# [Correct, Label, Prediction, Accuracy, Macro_F1, Model]
masc_best_stats=(0 0 0 0 0 "None")
declare -r MASC_COR=0 MASC_LABEL=1 MASC_PRED=2 MASC_ACC=3 MASC_F1=4 MASC_MODEL=5

#######################################
# Print the value of the last "Name:value" token matching a pattern.
# Taking the last match keeps the result a single token even when the
# evaluator prints the same metric more than once.
# Arguments:
#   $1 - extended regex covering the whole token, e.g. 'Macro_f1:[0-9.]+'
#   $2 - text to search (captured evaluator output)
# Outputs:
#   The text after the first colon of the last match; nothing if no match.
#######################################
masc_metric() {
  local token
  token=$(grep -oE -- "$1" <<<"$2" | tail -n 1)
  printf '%s' "${token#*:}"
}

#######################################
# Decide whether a candidate score beats the current best.
# Arguments:
#   $1 - candidate score as text (may be empty or malformed)
#   $2 - current best score
# Returns:
#   0 when $1 is a well-formed decimal strictly greater than $2, else 1.
#######################################
masc_is_better() {
  local decimal_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
  [[ "$1" =~ $decimal_re ]] || return 1
  # "+ 0" forces a numeric comparison instead of a possible string one.
  awk -v cand="$1" -v best="$2" 'BEGIN { exit !((cand + 0) > (best + 0)) }'
}

for model in "${CHECKPOINT_DIR}"/*.pt; do
  [[ -f "$model" ]] || continue # Glob stays literal when no .pt files exist
  model_name=${model##*/}

  output=$(python eval_tools.py \
    --MASC_model "${model}" \
    --test_ds "${TEST_DATA}" \
    --task MASC \
    --gcn_layers 4 \
    --device cuda:0 2>&1)
  status=$?
  if ((status != 0)); then
    # stderr was folded into $output, so surface it here; otherwise a crashed
    # run would only show up as a row of N/A values. Parsing still proceeds so
    # that any metrics printed before the failure are reported as before.
    {
      printf 'Warning: eval_tools.py exited with status %d for %s\n' \
        "$status" "$model"
      printf '%s\n' "$output" | tail -n 20
    } >&2
  fi

  correct=$(masc_metric 'Correct:[0-9]+' "$output")
  label=$(masc_metric 'Label:[0-9]+' "$output")
  prediction=$(masc_metric 'Prediction:[0-9]+' "$output")
  accuracy=$(masc_metric 'Accuracy:[0-9.]+' "$output")
  f1=$(masc_metric 'Macro_f1:[0-9.]+' "$output")

  printf '\nModel: %s\n' "$model_name"
  printf 'Correct : %s\n' "${correct:-N/A}"
  printf 'Label : %s\n' "${label:-N/A}"
  printf 'Prediction : %s\n' "${prediction:-N/A}"
  printf 'Accuracy : %s\n' "${accuracy:-N/A}"
  printf 'Macro_f1 : %s\n' "${f1:-N/A}"

  # Strict ">" keeps the first checkpoint on ties and leaves the model as
  # "None" when no checkpoint scores above zero.
  if masc_is_better "$f1" "${masc_best_stats[$MASC_F1]}"; then
    masc_best_stats[$MASC_COR]=${correct:-0}
    masc_best_stats[$MASC_LABEL]=${label:-0}
    masc_best_stats[$MASC_PRED]=${prediction:-0}
    masc_best_stats[$MASC_ACC]=${accuracy:-0}
    masc_best_stats[$MASC_F1]=$f1
    masc_best_stats[$MASC_MODEL]=$model_name
  fi
done

printf '\n========== MASC Best Results ==========\n'
printf 'Best Model: %s\n' "${masc_best_stats[$MASC_MODEL]}"
printf 'Macro F1: %s\n' "${masc_best_stats[$MASC_F1]}"
printf 'Accuracy: %s\n' "${masc_best_stats[$MASC_ACC]}"
# ============================================
# MABSA evaluation (uses the top-scoring best_f1:*.pt checkpoint of each task)
# ============================================
#######################################
# Print the checkpoint whose file name encodes the highest score.
# Candidates are regular files named "best_f1:<score>.pt" directly inside the
# given directory. The score is compared numerically by awk, so the result
# does not depend on the locale's decimal separator or on colons elsewhere in
# the path. On equal scores the candidate that sorts last wins.
# Arguments:
#   $1 - directory to search
# Outputs:
#   Path of the winning checkpoint on stdout; nothing when there is none.
# Returns:
#   0 if a checkpoint was found, 1 otherwise.
#######################################
find_best_checkpoint() {
  local dir=$1 candidate score best_path="" best_score=0
  for candidate in "$dir"/best_f1:*.pt; do
    [[ -f "$candidate" ]] || continue # Unmatched glob or a directory
    score=${candidate##*/best_f1:}
    score=${score%.pt}
    if [[ -z "$best_path" ]] \
      || awk -v s="$score" -v b="$best_score" \
        'BEGIN { exit !((s + 0) >= (b + 0)) }'; then
      best_path=$candidate
      best_score=$score
    fi
  done
  [[ -n "$best_path" ]] || return 1
  printf '%s\n' "$best_path"
}

# Auto-detect best MATE model
BEST_MATE=$(find_best_checkpoint ./checkpoints/MATE_custom)
# Auto-detect best MASC model
BEST_MASC=$(find_best_checkpoint ./checkpoints/MASC_custom)

printf '\n========== MABSA Evaluation ==========\n'
if [[ -n "$BEST_MATE" && -n "$BEST_MASC" ]]; then
  printf 'Using MATE: %s\n' "${BEST_MATE##*/}"
  printf 'Using MASC: %s\n' "${BEST_MASC##*/}"
  # Kept as the last command of this branch so its exit status is the
  # script's exit status.
  python eval_tools.py \
    --MATE_model "$BEST_MATE" \
    --MASC_model "$BEST_MASC" \
    --test_ds ./finetune_dataset/custom/test \
    --task MABSA \
    --gcn_layers 4 \
    --device cuda:0
else
  printf 'Skipped: Need both MATE and MASC best models\n'
  if [[ -z "$BEST_MATE" ]]; then
    printf ' - Missing MATE model in ./checkpoints/MATE_custom/\n'
  fi
  if [[ -z "$BEST_MASC" ]]; then
    printf ' - Missing MASC model in ./checkpoints/MASC_custom/\n'
  fi
fi