#!/bin/bash
# gpt2_medium_prefix_682k / scripts/aws/train_fixed_model.sh
# Origin: augustocsc — GPT-2 Medium trained on prefix dataset (682K), commit 3742716 (verified)
#
# Train model with proper end-of-expression markers.
# This script retrains the Seriguela model with <|endofex|> markers in the
# training data so the model learns to stop generation correctly.
#
# NOTE(fix): the shebang must be the first line of the file; the scraped
# page-header lines that previously preceded it were not valid shell (an
# unterminated quote in "augustocsc's" broke parsing) and are now comments.

set -euo pipefail  # exit on error, on unset variables, and on pipeline failures

echo "================================================================"
echo "SERIGUELA - Training Model with Proper End Markers"
echo "================================================================"

# --- Configuration -----------------------------------------------------------
MODEL_NAME="gpt2"                               # base HF model to fine-tune
DATASET_REPO="augustocsc/sintetico_natural"     # HF dataset repository
DATA_DIR="700K"                                 # dataset subdirectory
DATA_COLUMN="i_prompt_n"                        # or p_prompt_n for prefix
OUTPUT_DIR="./output/Se124M_700K_infix_v2"      # local checkpoint directory
HUB_MODEL_ID="augustocsc/Se124M_700K_infix_v2"  # NEW REPO NAME on the Hub

# --- Hyperparameters ---------------------------------------------------------
EPOCHS=3
BATCH_SIZE=8
LEARNING_RATE=5e-5
BLOCK_SIZE=128
LORA_R=8
LORA_ALPHA=32
LORA_DROPOUT=0.05
# Print the run configuration and hyperparameters before starting, so the
# exact settings are captured in the job log.
cat <<EOF

Configuration:
 Model: $MODEL_NAME
 Dataset: $DATASET_REPO/$DATA_DIR
 Data Column: $DATA_COLUMN
 Output: $OUTPUT_DIR
 Hub Model: $HUB_MODEL_ID

Hyperparameters:
 Epochs: $EPOCHS
 Batch Size: $BATCH_SIZE
 Learning Rate: $LEARNING_RATE
 Block Size: $BLOCK_SIZE
 LoRA r: $LORA_R
 LoRA alpha: $LORA_ALPHA
 LoRA dropout: $LORA_DROPOUT
================================================================
EOF
# --- Step 1: data preparation ------------------------------------------------
# Prepare the dataset (adding <|endofex|> end markers) unless the processed
# CSV already exists from a previous run.
echo ""
echo "[Step 1/3] Checking data preparation..."
if [ ! -f "./data/processed/700K_fixed/train_700K.csv" ]; then
  echo "Training data not found. Preparing data with end markers..."
  # FIX: run the command as the 'if' condition instead of checking $? after
  # the fact — under 'set -e' the script would have exited on failure before
  # the old '[ $? -ne 0 ]' test could ever run (dead code).
  if ! python scripts/data/prepare_training_data_fixed.py \
    --dataset_repo_id "$DATASET_REPO" \
    --data_dir "$DATA_DIR" \
    --data_column "$DATA_COLUMN" \
    --output_dir ./data/processed/700K_fixed \
    --validate; then
    echo "❌ Data preparation failed!" >&2
    exit 1
  fi
  echo "✅ Data preparation complete!"
else
  echo "✅ Training data already prepared (./data/processed/700K_fixed/)"
fi
# Optional: preview the first two lines (header + first record) of the
# prepared training data as a quick sanity check.
printf '\nSample of prepared data:\n'
head -n 2 ./data/processed/700K_fixed/train_700K.csv
printf '\n'
# --- Step 2: training --------------------------------------------------------
echo ""
echo "[Step 2/3] Starting training..."
echo "================================================================"
echo ""
# FIX: 'if ! cmd' replaces the old 'cmd; if [ $? -ne 0 ]' pattern — with
# 'set -e' active the script would abort on failure before the $? check ran,
# so the error branch was unreachable. All expansions are quoted.
if ! python scripts/train.py \
  --model_name_or_path "$MODEL_NAME" \
  --dataset_repo_id "$DATASET_REPO" \
  --data_dir "$DATA_DIR" \
  --data_column "$DATA_COLUMN" \
  --output_dir "$OUTPUT_DIR" \
  --num_train_epochs "$EPOCHS" \
  --per_device_train_batch_size "$BATCH_SIZE" \
  --learning_rate "$LEARNING_RATE" \
  --block_size "$BLOCK_SIZE" \
  --eval_strategy epoch \
  --save_strategy epoch \
  --save_total_limit 2 \
  --load_best_model_at_end \
  --lora_r "$LORA_R" \
  --lora_alpha "$LORA_ALPHA" \
  --lora_dropout "$LORA_DROPOUT" \
  --push_to_hub \
  --hub_model_id "$HUB_MODEL_ID" \
  --logging_steps 100 \
  --seed 42; then
  echo ""
  echo "❌ Training failed!" >&2
  exit 1
fi
echo ""
echo "✅ Training complete!"
echo ""
echo "βœ… Training complete!"
# Quick test generation
echo ""
echo "[Step 3/3] Testing model generation..."
echo "================================================================"
echo ""
python scripts/generate.py \
--model_path $OUTPUT_DIR \
--num_generations 5 \
--validate
if [ $? -ne 0 ]; then
echo ""
echo "⚠️ Generation test failed, but model was trained successfully"
else
echo ""
echo "βœ… Generation test passed!"
fi
# Final summary: where the artifacts landed and suggested follow-up commands.
cat <<EOF

================================================================
TRAINING COMPLETE
================================================================
Model saved to: $OUTPUT_DIR
Model pushed to: $HUB_MODEL_ID

Next steps:
 1. Evaluate the model: python scripts/evaluate.py --model_path $OUTPUT_DIR
 2. Compare with old model: python scripts/compare_models.py --model1 ./output/Se124M_700K_infix --model2 $OUTPUT_DIR
 3. Generate more samples: python scripts/generate.py --model_path $OUTPUT_DIR --num_generations 20
================================================================
EOF