File size: 1,890 Bytes
5faf2eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
#!/bin/bash
#
# Evaluate two models on AWS and compare the results.
#
# Compares the original model (without end token) with the v2 model
# (with the <|endofex|> end token) by running scripts/compare_models.py
# from inside the project's virtual environment. All output is shown on
# the terminal and appended to a timestamped log file in the project dir.
#
# Usage: bash scripts/aws/evaluate_models.sh
#
# Optional environment overrides (the defaults reproduce the standard run):
#   EVAL_PROJECT_DIR  project checkout containing venv/ and scripts/
#   EVAL_NUM_SAMPLES  number of samples passed to --num_samples
#   EVAL_OUTPUT_DIR   directory passed to --output_dir

set -eo pipefail

# Single source of truth for values that appear both in the messages
# printed to the user and in the arguments given to compare_models.py.
readonly PROJECT_DIR="${EVAL_PROJECT_DIR:-${HOME}/seriguela}"
readonly MODEL1="augustocsc/Se124M_700K_infix"
readonly MODEL2="augustocsc/Se124M_700K_infix_v2"
readonly NUM_SAMPLES="${EVAL_NUM_SAMPLES:-500}"
readonly OUTPUT_DIR="${EVAL_OUTPUT_DIR:-./evaluation_results/comparison}"

#######################################
# Report a failing command before `set -e` terminates the script.
# Globals:   LOG_FILE (read; may still be unset when an early step fails)
# Arguments: $1 - exit status of the failed command
#            $2 - line number where the failure happened
# Outputs:   writes a diagnostic to stderr
#######################################
on_error() {
  local status=$1
  local line=$2
  echo "[$(date)] ERROR: evaluation aborted at line ${line} (exit code ${status})." >&2
  if [[ -n "${LOG_FILE:-}" ]]; then
    echo "See log file: ${LOG_FILE}" >&2
  fi
}
trap 'on_error "$?" "$LINENO"' ERR

echo "=========================================="
echo "Model Comparison: v1 vs v2"
echo "=========================================="
echo "Model 1: ${MODEL1} (original)"
echo "Model 2: ${MODEL2} (with <|endofex|> token)"
echo "=========================================="
echo ""

# Activate virtual environment
if [[ ! -f "${PROJECT_DIR}/venv/bin/activate" ]]; then
  echo "ERROR: virtual environment not found at ${PROJECT_DIR}/venv" >&2
  exit 1
fi
# shellcheck disable=SC1091
source "${PROJECT_DIR}/venv/bin/activate"
# nounset is switched on only after activation: some activate scripts
# reference variables that may be unset and would abort under `set -u`.
set -u
cd "${PROJECT_DIR}"

# Set up logging: from here on stdout and stderr go to both the terminal
# and the log file (created relative to the project directory).
LOG_FILE="evaluation_$(date +%Y%m%d_%H%M%S).log"
exec > >(tee -a "$LOG_FILE") 2>&1
echo "[$(date)] Starting evaluation..."
echo ""

# Check GPU availability (informational only; a missing GPU is not fatal)
echo "Checking GPU..."
if nvidia-smi &> /dev/null; then
  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
  echo ""
else
  echo "WARNING: No GPU detected. Evaluation will be slow." >&2
  echo ""
fi

# Run comparison
echo "Running model comparison..."
echo "This will evaluate both models on ${NUM_SAMPLES} samples from the test set."
echo ""

# Arguments are kept in an array so values containing spaces or shell
# metacharacters (e.g. "<|endofex|>") are passed through intact.
compare_args=(
  --model1 "${MODEL1}"
  --model2 "${MODEL2}"
  --model1_name "Original (no end token)"
  --model2_name "V2 (with <|endofex|>)"
  --num_samples "${NUM_SAMPLES}"
  --dataset_repo_id augustocsc/sintetico_natural
  --data_dir 700K
  --data_column i_prompt_n
  --output_dir "${OUTPUT_DIR}"
)
python scripts/compare_models.py "${compare_args[@]}"

# Only reached when the comparison succeeded (set -e aborts otherwise).
echo ""
echo "=========================================="
echo "Evaluation Complete!"
echo "=========================================="
echo "Results saved to: ${OUTPUT_DIR}"
echo "Log file: $LOG_FILE"
echo ""
echo "To view results:"
echo " cat ${OUTPUT_DIR}/comparison_*.json | jq"
echo ""
|