#!/bin/bash
#
# SERIGUELA - train a model with proper end markers.
#
# Pipeline (run from the repository root, since all paths are relative):
#   1. Prepare the training CSV if it is not already present.
#   2. Fine-tune the base model via scripts/train.py (LoRA flags, push to Hub).
#   3. Smoke-test generation via scripts/generate.py (non-fatal on failure).
#
# Usage: bash <this script>     (takes no arguments)
#
# Exit status: 0 when data preparation and training succeed (a failed
# generation test only produces a warning); non-zero otherwise.

# Abort on unhandled errors, unset variables and failed pipeline stages.
# Commands whose failure needs a custom message are tested directly with
# `if ! cmd`, because under `set -e` a separate `$?` check after the command
# is never reached.
set -euo pipefail

echo "================================================================"
echo "SERIGUELA - Training Model with Proper End Markers"
echo "================================================================"

# ----------------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------------
readonly MODEL_NAME="gpt2"
readonly DATASET_REPO="augustocsc/sintetico_natural"
readonly DATA_DIR="700K"
readonly DATA_COLUMN="i_prompt_n"
readonly OUTPUT_DIR="./output/Se124M_700K_infix_v2"
readonly HUB_MODEL_ID="augustocsc/Se124M_700K_infix_v2"

# Location of the prepared (end-marker) data and the file used to detect
# whether preparation has already been done.
readonly PROCESSED_DIR="./data/processed/700K_fixed"
readonly TRAIN_FILE="${PROCESSED_DIR}/train_700K.csv"

# ----------------------------------------------------------------------------
# Hyperparameters
# ----------------------------------------------------------------------------
readonly EPOCHS=3
readonly BATCH_SIZE=8
readonly LEARNING_RATE=5e-5
readonly BLOCK_SIZE=128
readonly LORA_R=8
readonly LORA_ALPHA=32
readonly LORA_DROPOUT=0.05

#######################################
# Print the run configuration and hyperparameters to stdout.
#######################################
print_config() {
  echo ""
  echo "Configuration:"
  echo " Model: $MODEL_NAME"
  echo " Dataset: $DATASET_REPO/$DATA_DIR"
  echo " Data Column: $DATA_COLUMN"
  echo " Output: $OUTPUT_DIR"
  echo " Hub Model: $HUB_MODEL_ID"
  echo ""
  echo "Hyperparameters:"
  echo " Epochs: $EPOCHS"
  echo " Batch Size: $BATCH_SIZE"
  echo " Learning Rate: $LEARNING_RATE"
  echo " Block Size: $BLOCK_SIZE"
  echo " LoRA r: $LORA_R"
  echo " LoRA alpha: $LORA_ALPHA"
  echo " LoRA dropout: $LORA_DROPOUT"
  echo "================================================================"
}

#######################################
# Step 1: make sure the prepared training CSV exists, building it with
# scripts/data/prepare_training_data_fixed.py when it is missing, then show
# the first two lines of the file as a sanity check.
# Exits the script with status 1 if preparation fails.
#######################################
prepare_data() {
  echo ""
  echo "[Step 1/3] Checking data preparation..."

  if [[ ! -f "$TRAIN_FILE" ]]; then
    echo "Training data not found. Preparing data with end markers..."

    if ! python scripts/data/prepare_training_data_fixed.py \
      --dataset_repo_id "$DATASET_REPO" \
      --data_dir "$DATA_DIR" \
      --data_column "$DATA_COLUMN" \
      --output_dir "$PROCESSED_DIR" \
      --validate; then
      echo "❌ Data preparation failed!" >&2
      exit 1
    fi

    echo "✅ Data preparation complete!"
  else
    echo "✅ Training data already prepared (${PROCESSED_DIR}/)"
  fi

  echo ""
  echo "Sample of prepared data:"
  head -n 2 "$TRAIN_FILE"
  echo ""
}

#######################################
# Step 2: run scripts/train.py with the configured model, dataset, LoRA
# settings and Hub destination.
# Exits the script with status 1 if training fails.
#######################################
train_model() {
  echo ""
  echo "[Step 2/3] Starting training..."
  echo "================================================================"
  echo ""

  # NOTE(review): train.py receives the Hub dataset repo/dir rather than
  # PROCESSED_DIR - confirm it actually consumes the prepared data.
  if ! python scripts/train.py \
    --model_name_or_path "$MODEL_NAME" \
    --dataset_repo_id "$DATASET_REPO" \
    --data_dir "$DATA_DIR" \
    --data_column "$DATA_COLUMN" \
    --output_dir "$OUTPUT_DIR" \
    --num_train_epochs "$EPOCHS" \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --learning_rate "$LEARNING_RATE" \
    --block_size "$BLOCK_SIZE" \
    --eval_strategy epoch \
    --save_strategy epoch \
    --save_total_limit 2 \
    --load_best_model_at_end \
    --lora_r "$LORA_R" \
    --lora_alpha "$LORA_ALPHA" \
    --lora_dropout "$LORA_DROPOUT" \
    --push_to_hub \
    --hub_model_id "$HUB_MODEL_ID" \
    --logging_steps 100 \
    --seed 42; then
    echo "" >&2
    echo "❌ Training failed!" >&2
    exit 1
  fi

  echo ""
  echo "✅ Training complete!"
}

#######################################
# Step 3: generate a few samples from the trained model as a smoke test.
# A failure here is reported as a warning only; the script keeps going
# because the model itself has already been trained.
#######################################
test_generation() {
  echo ""
  echo "[Step 3/3] Testing model generation..."
  echo "================================================================"
  echo ""

  if python scripts/generate.py \
    --model_path "$OUTPUT_DIR" \
    --num_generations 5 \
    --validate; then
    echo ""
    echo "✅ Generation test passed!"
  else
    echo "" >&2
    echo "⚠️ Generation test failed, but model was trained successfully" >&2
  fi
}

#######################################
# Print where the model ended up and suggested follow-up commands.
#######################################
print_summary() {
  echo ""
  echo "================================================================"
  echo "TRAINING COMPLETE"
  echo "================================================================"
  echo "Model saved to: $OUTPUT_DIR"
  echo "Model pushed to: $HUB_MODEL_ID"
  echo ""
  echo "Next steps:"
  echo " 1. Evaluate the model: python scripts/evaluate.py --model_path $OUTPUT_DIR"
  echo " 2. Compare with old model: python scripts/compare_models.py --model1 ./output/Se124M_700K_infix --model2 $OUTPUT_DIR"
  echo " 3. Generate more samples: python scripts/generate.py --model_path $OUTPUT_DIR --num_generations 20"
  echo "================================================================"
}

print_config
prepare_data
train_model
test_generation
print_summary