#!/bin/bash
# EXP-B: Training with GPT-2 EOS token (<|endoftext|>)
# Uses native GPT-2 EOS token (ID 50256)
#
# Changes into ~/seriguela, activates the virtualenv in ./venv, verifies the
# experiment CSVs exist, and launches scripts/train_experiment.py.
#
# Optional environment:
#   HF_TOKEN       - exported to the training process (empty if unset)
#   WANDB_API_KEY  - exported to the training process (empty if unset)

set -euo pipefail

readonly EXPERIMENT_NAME="exp_b_eos"
readonly DATA_DIR="./data/experiments/${EXPERIMENT_NAME}"
readonly TRAIN_FILE="${DATA_DIR}/train.csv"
readonly VALIDATION_FILE="${DATA_DIR}/validation.csv"
readonly OUTPUT_DIR="./output/${EXPERIMENT_NAME}"

# Print all arguments (one per line) to stderr and abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

echo "=============================================="
echo "EXP-B: EOS Token Format Training"
echo "=============================================="

# All relative paths below are resolved from the project root.
cd ~/seriguela || die "ERROR: Cannot change directory to ~/seriguela"

# Activate virtual environment
[[ -f venv/bin/activate ]] || die "ERROR: Virtual environment not found!" \
  "Expected: ./venv/bin/activate"
# Some virtualenv activate scripts read variables that may be unset (e.g.
# PS1 in a non-interactive shell), so relax nounset while sourcing.
set +u
# shellcheck disable=SC1091
source venv/bin/activate
set -u

# Check data exists
if [[ ! -f "${TRAIN_FILE}" ]]; then
  die "ERROR: Training data not found!" "Expected: ${TRAIN_FILE}"
fi

# The validation file is passed to the trainer as well, so fail early with a
# clear message here rather than after the trainer has started.
if [[ ! -f "${VALIDATION_FILE}" ]]; then
  die "ERROR: Validation data not found!" "Expected: ${VALIDATION_FILE}"
fi

# Count samples
# NOTE(review): this is a raw line count; if the CSV has a header row it is
# included in the number printed below - confirm against the data format.
train_count=$(wc -l < "${TRAIN_FILE}")
echo "Training samples: ${train_count}"

# Training configuration
export WANDB_PROJECT="seriguela_experiments"
# Pass credentials through from the caller's environment, defaulting to the
# empty string when they are not set.
export HF_TOKEN="${HF_TOKEN:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"

# Run training
echo ""
echo "Starting training..."
echo "Output: ${OUTPUT_DIR}"
echo ""

python scripts/train_experiment.py \
  --experiment_name "${EXPERIMENT_NAME}" \
  --train_file "${TRAIN_FILE}" \
  --validation_file "${VALIDATION_FILE}" \
  --output_dir "${OUTPUT_DIR}" \
  --end_marker "<|endoftext|>" \
  --use_native_eos \
  --num_train_epochs 3 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 4 \
  --learning_rate 5e-5 \
  --block_size 128 \
  --fp16 \
  --wandb_project "${WANDB_PROJECT}" \
  --wandb_run_name "${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"

echo ""
echo "=============================================="
echo "EXP-B Training Complete!"
echo "=============================================="
echo "Model saved to: ${OUTPUT_DIR}"