gpt2_base_prefix_682k / scripts / aws / evaluate_models.sh

GPT-2 Base trained on prefix dataset (682K)

5faf2eb verified 4 days ago

1.89 kB

	#!/bin/bash
	# Evaluate two models on AWS and compare the results.
	#
	# Compares the original model (without end token) with the v2 model (with the
	# <|endofex|> end token) by running scripts/compare_models.py from the project
	# directory.
	#
	# Usage: bash scripts/aws/evaluate_models.sh
	#
	# Everything printed after the logging setup is mirrored into a timestamped
	# evaluation_*.log file created in the project directory.

	# -u is switched on later, once the virtualenv activate script has been
	# sourced: that script is not under our control and may read unset variables.
	set -eo pipefail

	# Single source of truth for values that appear both in messages and in the
	# compare_models.py invocation.
	readonly PROJECT_DIR="$HOME/seriguela"
	readonly VENV_ACTIVATE="$PROJECT_DIR/venv/bin/activate"
	readonly MODEL1_ID="augustocsc/Se124M_700K_infix"
	readonly MODEL2_ID="augustocsc/Se124M_700K_infix_v2"
	readonly NUM_SAMPLES=500
	readonly RESULTS_DIR="./evaluation_results/comparison"

	echo "=========================================="
	echo "Model Comparison: v1 vs v2"
	echo "=========================================="
	echo "Model 1: ${MODEL1_ID} (original)"
	# '|' needs no escaping inside double quotes; a backslash would be printed.
	echo "Model 2: ${MODEL2_ID} (with <|endofex|> token)"
	echo "=========================================="
	echo ""

	# Activate virtual environment
	if [[ ! -f "$VENV_ACTIVATE" ]]; then
	  echo "ERROR: virtualenv not found at $VENV_ACTIVATE" >&2
	  exit 1
	fi
	# shellcheck source=/dev/null
	source "$VENV_ACTIVATE"
	set -u

	cd "$PROJECT_DIR" || {
	  echo "ERROR: cannot change directory to $PROJECT_DIR" >&2
	  exit 1
	}

	# Set up logging: from here on stdout and stderr both go to the terminal and
	# are appended to the log file (relative path, so it lands in PROJECT_DIR).
	LOG_FILE="evaluation_$(date +%Y%m%d_%H%M%S).log"
	exec > >(tee -a "$LOG_FILE") 2>&1

	echo "[$(date)] Starting evaluation..."
	echo ""

	# Check GPU availability. A missing or failing nvidia-smi only produces a
	# warning; the evaluation is still started.
	echo "Checking GPU..."
	if nvidia-smi &> /dev/null; then
	  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
	  echo ""
	else
	  echo "WARNING: No GPU detected. Evaluation will be slow." >&2
	  echo ""
	fi

	# Run comparison
	echo "Running model comparison..."
	echo "This will evaluate both models on ${NUM_SAMPLES} samples from the test set."
	echo ""

	python scripts/compare_models.py \
	  --model1 "$MODEL1_ID" \
	  --model2 "$MODEL2_ID" \
	  --model1_name "Original (no end token)" \
	  --model2_name "V2 (with <|endofex|>)" \
	  --num_samples "$NUM_SAMPLES" \
	  --dataset_repo_id augustocsc/sintetico_natural \
	  --data_dir 700K \
	  --data_column i_prompt_n \
	  --output_dir "$RESULTS_DIR"

	echo ""
	echo "=========================================="
	echo "Evaluation Complete!"
	echo "=========================================="
	echo "Results saved to: ${RESULTS_DIR}"
	echo "Log file: $LOG_FILE"
	echo ""
	echo "To view results:"
	echo " cat ${RESULTS_DIR}/comparison_*.json | jq"
	echo ""