#!/usr/bin/env bash
# Evaluation script for DASCO models
# Supports MATE, MASC, and MABSA evaluation
export CUDA_VISIBLE_DEVICES="0"
# ============================================
# MATE evaluation
# ============================================
CHECKPOINT_DIR="./checkpoints/MATE_custom"
TEST_DATA="./finetune_dataset/custom/test"
# Metrics of the best MATE checkpoint seen so far; slots are addressed through
# the read-only index constants declared on the line after the array.
best_stats_values=(0 0 0 0 0 0 "None") # [Correct, Label, Prediction, Accuracy, Recall, F1, Model]
declare -r COR=0 LABEL=1 PRED=2 ACC=3 REC=4 F1=5 MODEL=6

#######################################
# Run eval_tools.py (task MATE) on every *.pt file in CHECKPOINT_DIR, print
# the metrics parsed from its output, and record the checkpoint with the
# highest F1 in best_stats_values.
# Globals:
#   CHECKPOINT_DIR, TEST_DATA (read)
#   COR, LABEL, PRED, ACC, REC, F1, MODEL (read; array indices)
#   best_stats_values (written)
# Outputs:
#   Per-checkpoint metrics on stdout; warnings on stderr.
# Returns:
#   0 always; checkpoints that fail or yield no usable F1 are reported and
#   left out of the ranking.
#######################################
evaluate_mate_checkpoints() {
  # A plain decimal number: "12", "0.85", "1." or ".5" - but not "." or "1.2.3".
  local -r number_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
  local model name output correct label prediction accuracy recall f1
  local found=0

  for model in "${CHECKPOINT_DIR}"/*.pt; do
    [[ -f "$model" ]] || continue # Unmatched glob stays literal; skip it.
    found=1
    name=${model##*/}

    # stderr is merged into the capture, so on failure the capture is the only
    # place the diagnostics live: surface it instead of printing N/A silently.
    if ! output=$(python eval_tools.py \
      --MATE_model "${model}" \
      --test_ds "${TEST_DATA}" \
      --task MATE \
      --gcn_layers 4 \
      --device cuda:0 2>&1); then
      printf 'warning: MATE evaluation failed for %s; output was:\n%s\n' \
        "$name" "$output" >&2
      continue
    fi

    # Metrics are read from "Name:value" tokens in the captured output.
    correct=$(grep -o 'Correct:[0-9]*' <<<"$output" | cut -d':' -f2)
    label=$(grep -o 'Label:[0-9]*' <<<"$output" | cut -d':' -f2)
    prediction=$(grep -o 'Prediction:[0-9]*' <<<"$output" | cut -d':' -f2)
    accuracy=$(grep -o 'Accuracy:[0-9.]*' <<<"$output" | cut -d':' -f2)
    recall=$(grep -o 'Recall:[0-9.]*' <<<"$output" | cut -d':' -f2)
    f1=$(grep -o 'F1:[0-9.]*' <<<"$output" | cut -d':' -f2)

    printf '\nModel: %s\n' "$name"
    echo "Correct : ${correct:-N/A}"
    echo "Label : ${label:-N/A}"
    echo "Prediction : ${prediction:-N/A}"
    echo "Accuracy : ${accuracy:-N/A}"
    echo "Recall : ${recall:-N/A}"
    echo "F1 : ${f1:-N/A}"

    if [[ ! "$f1" =~ $number_re ]]; then
      printf 'warning: no usable F1 value for %s; not ranked\n' "$name" >&2
      continue
    fi

    # "+ 0" forces a numeric comparison regardless of how awk types the inputs.
    if awk -v cand="$f1" -v best="${best_stats_values[$F1]}" \
      'BEGIN { exit !(cand + 0 > best + 0) }'; then
      best_stats_values[$COR]=${correct:-0}
      best_stats_values[$LABEL]=${label:-0}
      best_stats_values[$PRED]=${prediction:-0}
      best_stats_values[$ACC]=${accuracy:-0}
      best_stats_values[$REC]=${recall:-0}
      best_stats_values[$F1]=$f1
      best_stats_values[$MODEL]=$name
    fi
  done

  if ((found == 0)); then
    printf 'warning: no .pt checkpoints found in %s\n' "$CHECKPOINT_DIR" >&2
  fi
  return 0
}

evaluate_mate_checkpoints

printf '\n========== MATE Best Results ==========\n'
echo "Best Model: ${best_stats_values[$MODEL]}"
echo "F1 : ${best_stats_values[$F1]}"
echo "Accuracy: ${best_stats_values[$ACC]}"
echo "Recall : ${best_stats_values[$REC]}"
# ============================================
# MASC evaluation
# ============================================
CHECKPOINT_DIR="./checkpoints/MASC_custom"
TEST_DATA="./finetune_dataset/custom/test"
# Metrics of the best MASC checkpoint seen so far; slots are addressed through
# the read-only index constants declared on the line after the array.
masc_best_stats=(0 0 0 0 0 "None") # [Correct, Label, Prediction, Accuracy, Macro_F1, Model]
declare -r MASC_COR=0 MASC_LABEL=1 MASC_PRED=2 MASC_ACC=3 MASC_F1=4 MASC_MODEL=5

#######################################
# Run eval_tools.py (task MASC) on every *.pt file in CHECKPOINT_DIR, print
# the metrics parsed from its output, and record the checkpoint with the
# highest Macro_f1 in masc_best_stats.
# Globals:
#   CHECKPOINT_DIR, TEST_DATA (read)
#   MASC_COR, MASC_LABEL, MASC_PRED, MASC_ACC, MASC_F1, MASC_MODEL
#     (read; array indices)
#   masc_best_stats (written)
# Outputs:
#   Per-checkpoint metrics on stdout; warnings on stderr.
# Returns:
#   0 always; checkpoints that fail or yield no usable Macro_f1 are reported
#   and left out of the ranking.
#######################################
evaluate_masc_checkpoints() {
  # A plain decimal number: "12", "0.85", "1." or ".5" - but not "." or "1.2.3".
  local -r number_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
  local model name output correct label prediction accuracy f1
  local found=0

  for model in "${CHECKPOINT_DIR}"/*.pt; do
    [[ -f "$model" ]] || continue # Unmatched glob stays literal; skip it.
    found=1
    name=${model##*/}

    # stderr is merged into the capture, so on failure the capture is the only
    # place the diagnostics live: surface it instead of printing N/A silently.
    if ! output=$(python eval_tools.py \
      --MASC_model "${model}" \
      --test_ds "${TEST_DATA}" \
      --task MASC \
      --gcn_layers 4 \
      --device cuda:0 2>&1); then
      printf 'warning: MASC evaluation failed for %s; output was:\n%s\n' \
        "$name" "$output" >&2
      continue
    fi

    # Metrics are read from "Name:value" tokens in the captured output.
    correct=$(grep -o 'Correct:[0-9]*' <<<"$output" | cut -d':' -f2)
    label=$(grep -o 'Label:[0-9]*' <<<"$output" | cut -d':' -f2)
    prediction=$(grep -o 'Prediction:[0-9]*' <<<"$output" | cut -d':' -f2)
    accuracy=$(grep -o 'Accuracy:[0-9.]*' <<<"$output" | cut -d':' -f2)
    f1=$(grep -o 'Macro_f1:[0-9.]*' <<<"$output" | cut -d':' -f2)

    printf '\nModel: %s\n' "$name"
    echo "Correct : ${correct:-N/A}"
    echo "Label : ${label:-N/A}"
    echo "Prediction : ${prediction:-N/A}"
    echo "Accuracy : ${accuracy:-N/A}"
    echo "Macro_f1 : ${f1:-N/A}"

    if [[ ! "$f1" =~ $number_re ]]; then
      printf 'warning: no usable Macro_f1 value for %s; not ranked\n' "$name" >&2
      continue
    fi

    # "+ 0" forces a numeric comparison regardless of how awk types the inputs.
    if awk -v cand="$f1" -v best="${masc_best_stats[$MASC_F1]}" \
      'BEGIN { exit !(cand + 0 > best + 0) }'; then
      masc_best_stats[$MASC_COR]=${correct:-0}
      masc_best_stats[$MASC_LABEL]=${label:-0}
      masc_best_stats[$MASC_PRED]=${prediction:-0}
      masc_best_stats[$MASC_ACC]=${accuracy:-0}
      masc_best_stats[$MASC_F1]=$f1
      masc_best_stats[$MASC_MODEL]=$name
    fi
  done

  if ((found == 0)); then
    printf 'warning: no .pt checkpoints found in %s\n' "$CHECKPOINT_DIR" >&2
  fi
  return 0
}

evaluate_masc_checkpoints

printf '\n========== MASC Best Results ==========\n'
echo "Best Model: ${masc_best_stats[$MASC_MODEL]}"
echo "Macro F1: ${masc_best_stats[$MASC_F1]}"
echo "Accuracy: ${masc_best_stats[$MASC_ACC]}"
# ============================================
# MABSA evaluation
# Picks, for each task, the checkpoint named best_f1:<score>.pt with the
# highest <score>. Selection is by file name only; the per-checkpoint
# rankings printed by the MATE/MASC sections are not consulted.
# ============================================

#######################################
# Print the path of the best_f1:<score>.pt file with the highest score.
# Arguments:
#   $1 - directory to search (not recursive)
# Outputs:
#   The winning path on stdout, or nothing when no such file exists.
#######################################
find_best_checkpoint() {
  local dir=$1 path score
  for path in "$dir"/best_f1:*.pt; do
    [[ -f "$path" ]] || continue # Unmatched glob stays literal; skip it.
    # Rank on the score taken from the file name, so a ':' elsewhere in the
    # path cannot shift the sort key.
    score=${path##*/best_f1:}
    score=${score%.pt}
    printf '%s\t%s\n' "$score" "$path"
  done | LC_ALL=C sort -t $'\t' -rn -k1,1 | head -n 1 | cut -f2-
}

#######################################
# Run the joint MABSA evaluation with the best MATE and MASC checkpoints,
# or explain which one is missing.
# Globals:
#   BEST_MATE, BEST_MASC (written)
# Outputs:
#   Banner, chosen checkpoints and the evaluator's own output on stdout.
# Returns:
#   0 when the evaluation is skipped; otherwise the evaluator's exit status.
#######################################
run_mabsa_evaluation() {
  # Auto-detect best MATE model
  BEST_MATE=$(find_best_checkpoint ./checkpoints/MATE_custom)
  # Auto-detect best MASC model
  BEST_MASC=$(find_best_checkpoint ./checkpoints/MASC_custom)

  printf '\n========== MABSA Evaluation ==========\n'

  if [[ -z "$BEST_MATE" || -z "$BEST_MASC" ]]; then
    echo "Skipped: Need both MATE and MASC best models"
    # Plain if-statements keep the exit status 0 whichever model is missing.
    if [[ -z "$BEST_MATE" ]]; then
      echo " - Missing MATE model in ./checkpoints/MATE_custom/"
    fi
    if [[ -z "$BEST_MASC" ]]; then
      echo " - Missing MASC model in ./checkpoints/MASC_custom/"
    fi
    return 0
  fi

  echo "Using MATE: ${BEST_MATE##*/}"
  echo "Using MASC: ${BEST_MASC##*/}"
  python eval_tools.py \
    --MATE_model "$BEST_MATE" \
    --MASC_model "$BEST_MASC" \
    --test_ds ./finetune_dataset/custom/test \
    --task MABSA \
    --gcn_layers 4 \
    --device cuda:0
}

run_mabsa_evaluation