# Model provenance (from model card): augustocsc — GPT-2 Medium trained on prefix dataset (682K)
#!/bin/bash
# EXP-B: Training with GPT-2 EOS token (<|endoftext|>)
# Uses native GPT-2 EOS token (ID 50256)
#
# Requires: ./data/experiments/exp_b_eos/{train,validation}.csv
#           a virtualenv at ~/seriguela/venv
# Optional env: HF_TOKEN, WANDB_API_KEY (exported empty if unset so the
#               trainer sees a defined variable either way)
set -euo pipefail

echo "=============================================="
echo "EXP-B: EOS Token Format Training"
echo "=============================================="

cd "$HOME/seriguela"

# Activate virtual environment
# shellcheck disable=SC1091  # venv path only exists on the training host
source venv/bin/activate

# Check data exists — validate BOTH files up front, since validation.csv is
# also passed to the trainer and a missing file would otherwise fail late
# inside Python after startup overhead.
for csv in train.csv validation.csv; do
  if [ ! -f "./data/experiments/exp_b_eos/$csv" ]; then
    echo "ERROR: Training data not found!" >&2
    echo "Expected: ./data/experiments/exp_b_eos/$csv" >&2
    exit 1
  fi
done

# Count samples (note: raw line count — includes the CSV header row)
TRAIN_COUNT=$(wc -l < ./data/experiments/exp_b_eos/train.csv)
echo "Training samples: $TRAIN_COUNT"

# Training configuration
export WANDB_PROJECT="seriguela_experiments"
export HF_TOKEN="${HF_TOKEN:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"

# Run training
# Effective batch size = 8 (per-device) * 4 (grad accumulation) = 32
echo ""
echo "Starting training..."
echo "Output: ./output/exp_b_eos"
echo ""
python scripts/train_experiment.py \
  --experiment_name "exp_b_eos" \
  --train_file ./data/experiments/exp_b_eos/train.csv \
  --validation_file ./data/experiments/exp_b_eos/validation.csv \
  --output_dir ./output/exp_b_eos \
  --end_marker "<|endoftext|>" \
  --use_native_eos \
  --num_train_epochs 3 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 4 \
  --learning_rate 5e-5 \
  --block_size 128 \
  --fp16 \
  --wandb_project seriguela_experiments \
  --wandb_run_name "exp_b_eos_$(date +%Y%m%d_%H%M%S)"

echo ""
echo "=============================================="
echo "EXP-B Training Complete!"
echo "=============================================="
echo "Model saved to: ./output/exp_b_eos"