# Extraction metadata (not part of the script): File size: 4,179 Bytes
# Commit: 5faf2eb
#!/bin/bash
# Train model with proper end-of-expression markers
#
# This script retrains the Seriguela model with <|endofex|> markers in the
# training data so the model learns to stop generation correctly.
#
# Usage: run with no arguments. Requires network access (pulls the dataset
# from the Hub and pushes the trained model back).
set -euo pipefail # Exit on error, on unset variables, and on pipeline failures

echo "================================================================"
echo "SERIGUELA - Training Model with Proper End Markers"
echo "================================================================"

# Configuration — readonly so the run identity cannot drift mid-script.
readonly MODEL_NAME="gpt2"
readonly DATASET_REPO="augustocsc/sintetico_natural"
readonly DATA_DIR="700K"
readonly DATA_COLUMN="i_prompt_n" # infix prompts; use p_prompt_n for prefix
readonly OUTPUT_DIR="./output/Se124M_700K_infix_v2"
readonly HUB_MODEL_ID="augustocsc/Se124M_700K_infix_v2" # NEW REPO NAME

# Hyperparameters
readonly EPOCHS=3
readonly BATCH_SIZE=8
readonly LEARNING_RATE=5e-5
readonly BLOCK_SIZE=128
readonly LORA_R=8
readonly LORA_ALPHA=32
readonly LORA_DROPOUT=0.05

# Echo the full configuration up front so a saved log identifies the run.
echo ""
echo "Configuration:"
echo " Model: $MODEL_NAME"
echo " Dataset: $DATASET_REPO/$DATA_DIR"
echo " Data Column: $DATA_COLUMN"
echo " Output: $OUTPUT_DIR"
echo " Hub Model: $HUB_MODEL_ID"
echo ""
echo "Hyperparameters:"
echo " Epochs: $EPOCHS"
echo " Batch Size: $BATCH_SIZE"
echo " Learning Rate: $LEARNING_RATE"
echo " Block Size: $BLOCK_SIZE"
echo " LoRA r: $LORA_R"
echo " LoRA alpha: $LORA_ALPHA"
echo " LoRA dropout: $LORA_DROPOUT"
echo "================================================================"
# ----------------------------------------------------------------------
# Step 1/3: Prepare training data with end markers.
# Skipped when the prepared CSV already exists, so re-runs are cheap.
# ----------------------------------------------------------------------
echo ""
echo "[Step 1/3] Checking data preparation..."
if [ ! -f "./data/processed/700K_fixed/train_700K.csv" ]; then
  echo "Training data not found. Preparing data with end markers..."
  # Run the command inside 'if !' — under 'set -e' a bare command followed by
  # 'if [ $? -ne 0 ]' would exit before the check, so the error message
  # below would never print.
  if ! python scripts/data/prepare_training_data_fixed.py \
    --dataset_repo_id "$DATASET_REPO" \
    --data_dir "$DATA_DIR" \
    --data_column "$DATA_COLUMN" \
    --output_dir ./data/processed/700K_fixed \
    --validate; then
    echo "❌ Data preparation failed!" >&2
    exit 1
  fi
  echo "✅ Data preparation complete!"
else
  echo "✅ Training data already prepared (./data/processed/700K_fixed/)"
fi

# Show a sample so the operator can eyeball the <|endofex|> markers.
echo ""
echo "Sample of prepared data:"
head -n 2 ./data/processed/700K_fixed/train_700K.csv
echo ""
# ----------------------------------------------------------------------
# Step 2/3: Fine-tune the model with LoRA and push it to the Hub.
# ----------------------------------------------------------------------
echo ""
echo "[Step 2/3] Starting training..."
echo "================================================================"
echo ""
# Run inside 'if !': under 'set -e' the original 'python ...; if [ $? -ne 0 ]'
# exits before the check, so the failure message could never print.
if ! python scripts/train.py \
  --model_name_or_path "$MODEL_NAME" \
  --dataset_repo_id "$DATASET_REPO" \
  --data_dir "$DATA_DIR" \
  --data_column "$DATA_COLUMN" \
  --output_dir "$OUTPUT_DIR" \
  --num_train_epochs "$EPOCHS" \
  --per_device_train_batch_size "$BATCH_SIZE" \
  --learning_rate "$LEARNING_RATE" \
  --block_size "$BLOCK_SIZE" \
  --eval_strategy epoch \
  --save_strategy epoch \
  --save_total_limit 2 \
  --load_best_model_at_end \
  --lora_r "$LORA_R" \
  --lora_alpha "$LORA_ALPHA" \
  --lora_dropout "$LORA_DROPOUT" \
  --push_to_hub \
  --hub_model_id "$HUB_MODEL_ID" \
  --logging_steps 100 \
  --seed 42; then
  echo ""
  echo "❌ Training failed!" >&2
  exit 1
fi

echo ""
echo "✅ Training complete!"
# ----------------------------------------------------------------------
# Step 3/3: Smoke-test generation. Deliberately NON-fatal — the model is
# already trained and pushed at this point.
# ----------------------------------------------------------------------
echo ""
echo "[Step 3/3] Testing model generation..."
echo "================================================================"
echo ""
# BUG FIX: under 'set -e' the original 'python ...; if [ $? -ne 0 ]' aborted
# the whole script when generate.py failed, making the "non-fatal" warning
# branch unreachable. Running the command as the 'if' condition keeps a
# failure from terminating the script.
if python scripts/generate.py \
  --model_path "$OUTPUT_DIR" \
  --num_generations 5 \
  --validate; then
  echo ""
  echo "✅ Generation test passed!"
else
  echo ""
  echo "⚠️ Generation test failed, but model was trained successfully"
fi

# Summary
echo ""
echo "================================================================"
echo "TRAINING COMPLETE"
echo "================================================================"
echo "Model saved to: $OUTPUT_DIR"
echo "Model pushed to: $HUB_MODEL_ID"
echo ""
echo "Next steps:"
echo " 1. Evaluate the model: python scripts/evaluate.py --model_path $OUTPUT_DIR"
echo " 2. Compare with old model: python scripts/compare_models.py --model1 ./output/Se124M_700K_infix --model2 $OUTPUT_DIR"
echo " 3. Generate more samples: python scripts/generate.py --model_path $OUTPUT_DIR --num_generations 20"
echo "================================================================"