#!/usr/bin/env bash
#
# Evaluation script for DASCO models.
# Supports MATE, MASC, and MABSA evaluation.
#
# Every *.pt checkpoint in the MATE and MASC checkpoint directories is run
# through eval_tools.py, its metrics are printed, and the checkpoint with the
# highest F1 (MATE) / Macro F1 (MASC) is reported. MABSA is then evaluated with
# the MATE and MASC checkpoints whose "best_f1:<score>.pt" file name carries
# the highest score.
#
# Strict mode (set -e / pipefail) is deliberately not enabled: with it, one
# failed evaluation or one metric missing from the output (grep exits
# non-zero) would abort the whole sweep instead of being reported as N/A.

export CUDA_VISIBLE_DEVICES="0"

readonly MATE_CHECKPOINT_DIR="./checkpoints/MATE_custom"
readonly MASC_CHECKPOINT_DIR="./checkpoints/MASC_custom"
readonly TEST_DATA="./finetune_dataset/custom/test"
readonly GCN_LAYERS=4
readonly DEVICE="cuda:0"

#######################################
# Succeed when $1 is numerically greater than $2.
# Both operands are forced into numeric context so they are never compared
# as strings; LC_ALL=C keeps "." as the decimal separator.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 if $1 > $2, 1 otherwise
#######################################
float_gt() {
  LC_ALL=C awk -v a="$1" -v b="$2" 'BEGIN { exit !((a + 0) > (b + 0)) }'
}

#######################################
# Succeed when $1 is a plain non-negative decimal number (e.g. 3, 0.85, .5).
# Rejects the empty string and malformed values such as "." or "1.2.3".
# Arguments: $1 - candidate string
# Returns:   0 if numeric, 1 otherwise
#######################################
is_number() {
  local re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
  [[ "$1" =~ $re ]]
}

#######################################
# Print the value that follows "<label>:" in a block of text.
# Arguments: $1 - text to search
#            $2 - metric label (text before the colon)
#            $3 - "int" to accept digits only, anything else (default
#                 "float") to accept digits and dots
# Outputs:   the value on stdout, or nothing when the label is absent
#######################################
extract_metric() {
  local text=$1 label=$2 kind=${3:-float}
  local digits='0-9.'
  if [[ "$kind" == int ]]; then
    digits='0-9'
  fi
  # When the label occurs several times only the last occurrence is kept, so
  # the result is always a single value.
  # NOTE(review): assumes the final occurrence is the summary figure -
  # confirm against the output format of eval_tools.py.
  printf '%s\n' "$text" \
    | grep -o -- "${label}:[${digits}]*" \
    | tail -n 1 \
    | cut -d':' -f2
}

#######################################
# Run eval_tools.py for one task.
# Globals:   GCN_LAYERS, DEVICE (read)
# Arguments: $1 - task name (MATE, MASC or MABSA)
#            $2 - test dataset path
#            remaining - model flags, passed through ahead of other options
# Outputs:   whatever eval_tools.py prints
# Returns:   exit status of eval_tools.py
#######################################
run_eval() {
  local task=$1 test_data=$2
  shift 2
  python eval_tools.py "$@" \
    --test_ds "$test_data" \
    --task "$task" \
    --gcn_layers "$GCN_LAYERS" \
    --device "$DEVICE"
}

#######################################
# Evaluate every MATE checkpoint in a directory and report the best by F1.
# Arguments: $1 - checkpoint directory, $2 - test dataset path
# Outputs:   per-model metrics followed by a best-results summary
#######################################
evaluate_mate() {
  local checkpoint_dir=$1 test_data=$2
  local model output correct label prediction accuracy recall f1
  local best_model="None" best_f1=0 best_acc=0 best_rec=0

  for model in "${checkpoint_dir}"/*.pt; do
    # An unmatched glob stays literal; skip it (and anything not a file).
    [[ -f "$model" ]] || continue

    output=$(run_eval MATE "$test_data" --MATE_model "$model" 2>&1)

    correct=$(extract_metric "$output" Correct int)
    label=$(extract_metric "$output" Label int)
    prediction=$(extract_metric "$output" Prediction int)
    accuracy=$(extract_metric "$output" Accuracy float)
    recall=$(extract_metric "$output" Recall float)
    f1=$(extract_metric "$output" F1 float)

    printf '\nModel: %s\n' "${model##*/}"
    printf 'Correct : %s\n' "${correct:-N/A}"
    printf 'Label : %s\n' "${label:-N/A}"
    printf 'Prediction : %s\n' "${prediction:-N/A}"
    printf 'Accuracy : %s\n' "${accuracy:-N/A}"
    printf 'Recall : %s\n' "${recall:-N/A}"
    printf 'F1 : %s\n' "${f1:-N/A}"

    # Only a well-formed F1 may take part in the ranking.
    if is_number "$f1" && float_gt "$f1" "$best_f1"; then
      best_model=${model##*/}
      best_f1=$f1
      best_acc=${accuracy:-0}
      best_rec=${recall:-0}
    fi
  done

  printf '\n========== MATE Best Results ==========\n'
  printf 'Best Model: %s\n' "$best_model"
  printf 'F1 : %s\n' "$best_f1"
  printf 'Accuracy: %s\n' "$best_acc"
  printf 'Recall : %s\n' "$best_rec"
}

#######################################
# Evaluate every MASC checkpoint in a directory and report the best by
# Macro F1.
# Arguments: $1 - checkpoint directory, $2 - test dataset path
# Outputs:   per-model metrics followed by a best-results summary
#######################################
evaluate_masc() {
  local checkpoint_dir=$1 test_data=$2
  local model output correct label prediction accuracy f1
  local best_model="None" best_f1=0 best_acc=0

  for model in "${checkpoint_dir}"/*.pt; do
    # An unmatched glob stays literal; skip it (and anything not a file).
    [[ -f "$model" ]] || continue

    output=$(run_eval MASC "$test_data" --MASC_model "$model" 2>&1)

    correct=$(extract_metric "$output" Correct int)
    label=$(extract_metric "$output" Label int)
    prediction=$(extract_metric "$output" Prediction int)
    accuracy=$(extract_metric "$output" Accuracy float)
    f1=$(extract_metric "$output" Macro_f1 float)

    printf '\nModel: %s\n' "${model##*/}"
    printf 'Correct : %s\n' "${correct:-N/A}"
    printf 'Label : %s\n' "${label:-N/A}"
    printf 'Prediction : %s\n' "${prediction:-N/A}"
    printf 'Accuracy : %s\n' "${accuracy:-N/A}"
    printf 'Macro_f1 : %s\n' "${f1:-N/A}"

    # Only a well-formed Macro F1 may take part in the ranking.
    if is_number "$f1" && float_gt "$f1" "$best_f1"; then
      best_model=${model##*/}
      best_f1=$f1
      best_acc=${accuracy:-0}
    fi
  done

  printf '\n========== MASC Best Results ==========\n'
  printf 'Best Model: %s\n' "$best_model"
  printf 'Macro F1: %s\n' "$best_f1"
  printf 'Accuracy: %s\n' "$best_acc"
}

#######################################
# Print the "best_f1:<score>.pt" checkpoint with the highest score in its
# file name. The score is read from the file name only, so colons elsewhere
# in the path cannot disturb the ranking.
# Arguments: $1 - checkpoint directory
# Outputs:   path of the best checkpoint, or nothing when none exists
# Returns:   0 always
#######################################
find_best_checkpoint() {
  local dir=$1
  local path score best_path="" best_score=""

  for path in "${dir}"/best_f1:*.pt; do
    [[ -f "$path" ]] || continue
    score=${path##*/best_f1:}
    score=${score%.pt}
    if [[ -z "$best_path" ]] || float_gt "$score" "$best_score"; then
      best_path=$path
      best_score=$score
    fi
  done

  if [[ -n "$best_path" ]]; then
    printf '%s\n' "$best_path"
  fi
  return 0
}

#######################################
# Run the MABSA evaluation with the best MATE and MASC checkpoints, chosen
# by the score embedded in their "best_f1:<score>.pt" file names (not by the
# evaluation results printed earlier).
# Arguments: $1 - MATE checkpoint directory
#            $2 - MASC checkpoint directory
#            $3 - test dataset path
# Outputs:   the models used and the eval_tools.py output, or a skip notice
# Returns:   exit status of eval_tools.py, or 0 when the run is skipped
#######################################
evaluate_mabsa() {
  local mate_dir=$1 masc_dir=$2 test_data=$3
  local best_mate best_masc

  best_mate=$(find_best_checkpoint "$mate_dir")
  best_masc=$(find_best_checkpoint "$masc_dir")

  printf '\n========== MABSA Evaluation ==========\n'

  if [[ -n "$best_mate" && -n "$best_masc" ]]; then
    printf 'Using MATE: %s\n' "${best_mate##*/}"
    printf 'Using MASC: %s\n' "${best_masc##*/}"
    run_eval MABSA "$test_data" \
      --MATE_model "$best_mate" \
      --MASC_model "$best_masc"
    return
  fi

  printf '%s\n' "Skipped: Need both MATE and MASC best models"
  if [[ -z "$best_mate" ]]; then
    printf ' - Missing MATE model in %s/\n' "$mate_dir"
  fi
  if [[ -z "$best_masc" ]]; then
    printf ' - Missing MASC model in %s/\n' "$masc_dir"
  fi
  return 0
}

main() {
  # ============================================
  # MATE evaluation
  # ============================================
  evaluate_mate "$MATE_CHECKPOINT_DIR" "$TEST_DATA"

  # ============================================
  # MASC evaluation
  # ============================================
  evaluate_masc "$MASC_CHECKPOINT_DIR" "$TEST_DATA"

  # ============================================
  # MABSA evaluation (best_f1:* checkpoints of both tasks)
  # ============================================
  evaluate_mabsa "$MATE_CHECKPOINT_DIR" "$MASC_CHECKPOINT_DIR" "$TEST_DATA"
}

main "$@"