#!/bin/bash
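# SERIGUELA training pipeline: prepare the dataset with proper end markers,
# fine-tune the model with LoRA, push the result to the Hub, and run a quick
# generation sanity check.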

set -e
| echo "================================================================" |
| echo "SERIGUELA - Training Model with Proper End Markers" |
| echo "================================================================" |
|
|
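
# Model, dataset, and output locations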
MODEL_NAME="gpt2"
DATASET_REPO="augustocsc/sintetico_natural"
DATA_DIR="700K"
DATA_COLUMN="i_prompt_n"
OUTPUT_DIR="./output/Se124M_700K_infix_v2"
HUB_MODEL_ID="augustocsc/Se124M_700K_infix_v2"
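
# Training and LoRA hyperparameters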
EPOCHS=3
BATCH_SIZE=8
LEARNING_RATE=5e-5
BLOCK_SIZE=128
LORA_R=8
LORA_ALPHA=32
LORA_DROPOUT=0.05
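
# Print the run configuration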
| echo "" |
| echo "Configuration:" |
| echo " Model: $MODEL_NAME" |
| echo " Dataset: $DATASET_REPO/$DATA_DIR" |
| echo " Data Column: $DATA_COLUMN" |
| echo " Output: $OUTPUT_DIR" |
| echo " Hub Model: $HUB_MODEL_ID" |
| echo "" |
| echo "Hyperparameters:" |
| echo " Epochs: $EPOCHS" |
| echo " Batch Size: $BATCH_SIZE" |
| echo " Learning Rate: $LEARNING_RATE" |
| echo " Block Size: $BLOCK_SIZE" |
| echo " LoRA r: $LORA_R" |
| echo " LoRA alpha: $LORA_ALPHA" |
| echo " LoRA dropout: $LORA_DROPOUT" |
| echo "================================================================" |
|
|
| |
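
# Step 1: prepare the training data with end markers (skipped if already done)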
| echo "" |
| echo "[Step 1/3] Checking data preparation..." |
| if [ ! -f "./data/processed/700K_fixed/train_700K.csv" ]; then |
| echo "Training data not found. Preparing data with end markers..." |
|
|
| python scripts/data/prepare_training_data_fixed.py \ |
| --dataset_repo_id $DATASET_REPO \ |
| --data_dir $DATA_DIR \ |
| --data_column $DATA_COLUMN \ |
| --output_dir ./data/processed/700K_fixed \ |
| --validate |
|
|
| if [ $? -ne 0 ]; then |
| echo "β Data preparation failed!" |
| exit 1 |
| fi |
|
|
| echo "β
Data preparation complete!" |
| else |
| echo "β
Training data already prepared (./data/processed/700K_fixed/)" |
| fi |
|
|
| |
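
# Show the first rows of the prepared file as a quick sanity check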
| echo "" |
| echo "Sample of prepared data:" |
| head -n 2 ./data/processed/700K_fixed/train_700K.csv |
| echo "" |
|
|
| |
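
# Step 2: fine-tune the model with LoRA and push it to the Hub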
| echo "" |
| echo "[Step 2/3] Starting training..." |
| echo "================================================================" |
| echo "" |
|
|
| python scripts/train.py \ |
| --model_name_or_path $MODEL_NAME \ |
| --dataset_repo_id $DATASET_REPO \ |
| --data_dir $DATA_DIR \ |
| --data_column $DATA_COLUMN \ |
| --output_dir $OUTPUT_DIR \ |
| --num_train_epochs $EPOCHS \ |
| --per_device_train_batch_size $BATCH_SIZE \ |
| --learning_rate $LEARNING_RATE \ |
| --block_size $BLOCK_SIZE \ |
| --eval_strategy epoch \ |
| --save_strategy epoch \ |
| --save_total_limit 2 \ |
| --load_best_model_at_end \ |
| --lora_r $LORA_R \ |
| --lora_alpha $LORA_ALPHA \ |
| --lora_dropout $LORA_DROPOUT \ |
| --push_to_hub \ |
| --hub_model_id $HUB_MODEL_ID \ |
| --logging_steps 100 \ |
| --seed 42 |
|
|
| if [ $? -ne 0 ]; then |
| echo "" |
| echo "β Training failed!" |
| exit 1 |
| fi |

echo ""
echo "✅ Training complete!"
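
# Step 3: quick generation test (a failure here should not abort the script)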
| echo "" |
| echo "[Step 3/3] Testing model generation..." |
| echo "================================================================" |
| echo "" |
|
|
| python scripts/generate.py \ |
| --model_path $OUTPUT_DIR \ |
| --num_generations 5 \ |
| --validate |
|
|
| if [ $? -ne 0 ]; then |
| echo "" |
| echo "β οΈ Generation test failed, but model was trained successfully" |
| else |
| echo "" |
| echo "β
Generation test passed!" |
| fi |
|
|
| |
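
# Final summary and suggested follow-up commands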
| echo "" |
| echo "================================================================" |
| echo "TRAINING COMPLETE" |
| echo "================================================================" |
| echo "Model saved to: $OUTPUT_DIR" |
| echo "Model pushed to: $HUB_MODEL_ID" |
| echo "" |
| echo "Next steps:" |
| echo " 1. Evaluate the model: python scripts/evaluate.py --model_path $OUTPUT_DIR" |
| echo " 2. Compare with old model: python scripts/compare_models.py --model1 ./output/Se124M_700K_infix --model2 $OUTPUT_DIR" |
| echo " 3. Generate more samples: python scripts/generate.py --model_path $OUTPUT_DIR --num_generations 20" |
| echo "================================================================" |
|
|