#!/bin/bash
#
# Evaluate two models on AWS and compare the results.
#
# Compares the original model (without end token) with the v2 model
# (with the <|endofex|> end token) by running scripts/compare_models.py
# from the project checkout in ~/seriguela, inside that checkout's venv.
# All output after logging is set up is mirrored to a timestamped log file
# created in the project directory.
#
# Usage: bash scripts/aws/evaluate_models.sh

# Abort on the first failing command and on a failing pipeline stage.
# `-u` is switched on further down, after the venv activate script has been
# sourced, because some activate scripts read variables that may be unset.
set -eo pipefail

# Single source of truth for values that are both printed and passed on.
PROJECT_DIR=~/seriguela
readonly PROJECT_DIR
readonly VENV_ACTIVATE="${PROJECT_DIR}/venv/bin/activate"
readonly MODEL1="augustocsc/Se124M_700K_infix"
readonly MODEL2="augustocsc/Se124M_700K_infix_v2"
readonly NUM_SAMPLES=500
readonly OUTPUT_DIR="./evaluation_results/comparison"

echo "=========================================="
echo "Model Comparison: v1 vs v2"
echo "=========================================="
echo "Model 1: ${MODEL1} (original)"
echo "Model 2: ${MODEL2} (with <|endofex|> token)"
echo "=========================================="
echo ""

# Activate virtual environment (fail with a clear message if it is missing).
if [[ ! -f "${VENV_ACTIVATE}" ]]; then
  echo "ERROR: virtual environment not found: ${VENV_ACTIVATE}" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "${VENV_ACTIVATE}"
set -u

cd "${PROJECT_DIR}" || {
  echo "ERROR: cannot change directory to ${PROJECT_DIR}" >&2
  exit 1
}

# Set up logging: from here on stdout and stderr go to both the terminal
# and the log file.
LOG_FILE="evaluation_$(date +%Y%m%d_%H%M%S).log"
readonly LOG_FILE
exec > >(tee -a "${LOG_FILE}") 2>&1

echo "[$(date)] Starting evaluation..."
echo ""

# Check GPU availability. Running nvidia-smi (rather than only looking it up
# on PATH) also covers the case where the tool exists but cannot reach a GPU.
echo "Checking GPU..."
if nvidia-smi &> /dev/null; then
  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
  echo ""
else
  echo "WARNING: No GPU detected. Evaluation will be slow."
  echo ""
fi

# Run comparison
echo "Running model comparison..."
echo "This will evaluate both models on ${NUM_SAMPLES} samples from the test set."
echo ""

# Arguments are kept in an array so that values containing spaces or shell
# metacharacters (e.g. "<|endofex|>") are passed through verbatim.
compare_args=(
  --model1 "${MODEL1}"
  --model2 "${MODEL2}"
  --model1_name "Original (no end token)"
  --model2_name "V2 (with <|endofex|>)"
  --num_samples "${NUM_SAMPLES}"
  --dataset_repo_id augustocsc/sintetico_natural
  --data_dir 700K
  --data_column i_prompt_n
  --output_dir "${OUTPUT_DIR}"
)

# Report a failed run explicitly and propagate the comparison's exit status.
rc=0
python scripts/compare_models.py "${compare_args[@]}" || rc=$?
if (( rc != 0 )); then
  echo "ERROR: model comparison failed with exit code ${rc}" >&2
  exit "${rc}"
fi

echo ""
echo "=========================================="
echo "Evaluation Complete!"
echo "=========================================="
echo "Results saved to: ${OUTPUT_DIR}"
echo "Log file: ${LOG_FILE}"
echo ""
echo "To view results:"
echo " cat ${OUTPUT_DIR}/comparison_*.json | jq"
echo ""