#!/bin/bash
#
# Full training workflow for Seriguela: trains the GPT-2 model family on the
# sintetico_natural dataset and evaluates the resulting checkpoints.
#
# Required env: HF_TOKEN (HuggingFace token; models are pushed to the Hub).
# See --help for the supported command-line options.

# Abort on the first unhandled error; also make a pipeline fail when any of
# its stages fails (plain `set -e` only sees the last stage's status).
set -eo pipefail

echo "=========================================="
echo "Seriguela - Full Training Workflow"
echo "=========================================="

# ANSI color codes used by the print_* helpers below. Constants, so readonly.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
|
|
# Print an informational message to stdout with a green [INFO] tag.
# Arguments: $1 - message text.
print_status() {
    local msg="$1"
    echo -e "${GREEN}[INFO]${NC} ${msg}"
}
|
|
# Print a warning message to stdout with a yellow [WARNING] tag.
# Arguments: $1 - message text.
print_warning() {
    local msg="$1"
    echo -e "${YELLOW}[WARNING]${NC} ${msg}"
}
|
|
# Print an error message with a red [ERROR] tag.
# Arguments: $1 - message text.
# Fix: diagnostics now go to stderr instead of stdout, so error messages are
# not mixed into captured/redirected standard output.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}
|
|
# Print a section banner: blank line, blue separator, title, separator,
# blank line.
# Arguments: $1 - section title.
print_header() {
    local title="$1"
    echo ""
    echo -e "${BLUE}=========================================="
    echo "${title}"
    echo -e "==========================================${NC}"
    echo ""
}
|
|
| |
# ---------------------------------------------------------------------------
# Environment setup and sanity checks
# ---------------------------------------------------------------------------

# Resolve this script's directory, then take the project root as two levels
# up (i.e. the script presumably lives in <project>/scripts/<subdir>/ --
# confirm against the repository layout).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
cd "$PROJECT_DIR"

# Activate the project virtualenv if the caller has not done so.
# NOTE(review): assumes ./venv exists at the project root (created by
# setup_aws.sh) -- confirm.
if [ -z "$VIRTUAL_ENV" ]; then
    print_warning "Virtual environment not activated. Activating..."
    source venv/bin/activate 2>/dev/null || {
        print_error "Could not activate virtual environment. Please run setup_aws.sh first."
        exit 1
    }
fi

# A HuggingFace token is required: trained models are pushed to the Hub.
if [ -z "$HF_TOKEN" ]; then
    print_error "HF_TOKEN not set. Please export HF_TOKEN='your_token'"
    exit 1
fi

# Fail fast if no NVIDIA GPU is visible; the training runs below assume one.
print_status "Checking GPU..."
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv || {
    print_error "GPU not available!"
    exit 1
}
|
|
| |
# ---------------------------------------------------------------------------
# Workflow constants (never reassigned below, hence readonly)
# ---------------------------------------------------------------------------

# HuggingFace dataset repo and the data subdirectory (dataset size) to train on.
readonly DATASET_REPO="augustocsc/sintetico_natural"
readonly DATA_DIR="700K"
# Hub namespace that trained models are pushed to.
readonly HF_USER="augustocsc"

# Experiment tracking and reproducibility settings.
readonly WANDB_PROJECT="seriguela_700K"
readonly SEED=42
readonly BLOCK_SIZE=128

# Local output locations for checkpoints and evaluation results.
readonly OUTPUT_BASE="./output"
readonly EVAL_OUTPUT="./evaluation_results"
mkdir -p "$OUTPUT_BASE" "$EVAL_OUTPUT"
|
|
| |
| |
# Model training configurations.
# Field layout per entry: "model|epochs|batch_size|grad_accum|learning_rate|run_suffix".
# Effective batch size = batch_size * grad_accum (64 for all three entries);
# larger models trade per-device batch size for more accumulation steps.
declare -a CONFIGS=(
    # GPT-2 base (the Se124M suffix presumably encodes ~124M params -- confirm)
    "gpt2|3|16|4|5e-5|Se124M"
    # GPT-2 medium (~355M params)
    "gpt2-medium|2|8|8|3e-5|Se355M"
    # GPT-2 large (~774M params)
    "gpt2-large|2|4|16|2e-5|Se774M"
)

# Dataset column / prompt-format pairs; every model is trained once per pair.
# Field layout per entry: "data_column|format" (format is passed as --approach).
declare -a DATA_COLUMNS=(
    "i_prompt_n|infix"
    "p_prompt_n|prefix"
)
|
|
| |
#######################################
# Train one model/data-column combination via scripts/train.py.
# Globals (read):
#   OUTPUT_BASE, DATA_DIR, HF_USER, DATASET_REPO, BLOCK_SIZE, SEED,
#   WANDB_PROJECT
# Arguments:
#   $1 model name or path   $2 num epochs        $3 per-device batch size
#   $4 grad accum steps     $5 learning rate     $6 run-name suffix
#   $7 dataset column       $8 prompt format (passed as --approach)
# Returns:
#   0 if training succeeded, 1 otherwise.
#######################################
run_training() {
    local model_name=$1
    local epochs=$2
    local batch_size=$3
    local grad_accum=$4
    local lr=$5
    local run_suffix=$6
    local data_column=$7
    local format=$8

    # Derived identifiers: local output dir and HuggingFace Hub model id.
    local run_name="${run_suffix}_${DATA_DIR}_${format}"
    local output_dir="${OUTPUT_BASE}/${run_name}"
    local hub_model_id="${HF_USER}/${run_name}"

    print_header "Training: $run_name"
    echo "Model: $model_name"
    echo "Epochs: $epochs"
    echo "Batch size: $batch_size"
    echo "Gradient accumulation: $grad_accum"
    echo "Effective batch size: $((batch_size * grad_accum))"
    echo "Learning rate: $lr"
    echo "Data column: $data_column"
    echo "Output: $output_dir"
    echo "Hub ID: $hub_model_id"
    echo ""

    # Run training inside `if` so a failure neither trips `set -e` nor needs
    # the fragile `cmd; [ $? -eq 0 ]` pattern (ShellCheck SC2181).
    if python scripts/train.py \
        --model_name_or_path "$model_name" \
        --dataset_repo_id "$DATASET_REPO" \
        --data_dir "$DATA_DIR" \
        --data_column "$data_column" \
        --approach "$format" \
        --output_dir "$output_dir" \
        --num_train_epochs "$epochs" \
        --per_device_train_batch_size "$batch_size" \
        --per_device_eval_batch_size "$batch_size" \
        --gradient_accumulation_steps "$grad_accum" \
        --learning_rate "$lr" \
        --weight_decay 0.01 \
        --warmup_steps 100 \
        --block_size "$BLOCK_SIZE" \
        --logging_steps 50 \
        --eval_strategy epoch \
        --save_strategy epoch \
        --save_total_limit 2 \
        --load_best_model_at_end \
        --fp16 \
        --seed "$SEED" \
        --wandb_project "$WANDB_PROJECT" \
        --wandb_run_name "$run_name" \
        --push_to_hub \
        --hub_model_id "$hub_model_id"; then
        print_status "Training completed successfully: $run_name"
        return 0
    else
        print_error "Training failed: $run_name"
        return 1
    fi
}
|
|
| |
#######################################
# Evaluate a trained model via scripts/evaluate.py.
# Evaluation failures are reported as warnings only, so one bad model does
# not abort the rest of the workflow.
# Globals (read):
#   DATASET_REPO, DATA_DIR, EVAL_OUTPUT, SEED
# Arguments:
#   $1 model path (local dir or Hub id)
#   $2 dataset column the model was trained on
#   $3 number of samples to evaluate (default 500)
#######################################
run_evaluation() {
    local model_path=$1
    local data_column=$2
    local num_samples=${3:-500}

    print_status "Evaluating model: $model_path"

    # `if` form avoids the fragile `cmd; [ $? -eq 0 ]` pattern (SC2181) and
    # keeps a failing evaluation from tripping `set -e`.
    if python scripts/evaluate.py \
        --model_path "$model_path" \
        --dataset_repo_id "$DATASET_REPO" \
        --data_dir "$DATA_DIR" \
        --data_column "$data_column" \
        --num_samples "$num_samples" \
        --output_dir "$EVAL_OUTPUT" \
        --temperature 0.7 \
        --seed "$SEED"; then
        print_status "Evaluation completed: $model_path"
    else
        print_warning "Evaluation had issues: $model_path"
    fi
}
|
|
| |
# ---------------------------------------------------------------------------
# Command-line options (defaults: full training followed by evaluation)
# ---------------------------------------------------------------------------
RUN_TEST=false
RUN_TRAINING=true
RUN_EVAL=true
SPECIFIC_MODEL=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        --test-only)
            # Smoke-test mode: skip the full training and evaluation passes.
            RUN_TEST=true; RUN_TRAINING=false; RUN_EVAL=false; shift ;;
        --no-eval)
            RUN_EVAL=false; shift ;;
        --eval-only)
            RUN_TRAINING=false; RUN_EVAL=true; shift ;;
        --model)
            # Restrict training to a single base model.
            SPECIFIC_MODEL="$2"; shift 2 ;;
        --help)
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Options:"
            echo " --test-only Run only the test training (1 epoch)"
            echo " --no-eval Skip evaluation after training"
            echo " --eval-only Run only evaluation (skip training)"
            echo " --model NAME Train only specific model (gpt2, gpt2-medium, gpt2-large)"
            echo " --help Show this help message"
            exit 0 ;;
        *)
            print_error "Unknown option: $1"
            exit 1 ;;
    esac
done
|
|
| |
# ---------------------------------------------------------------------------
# Smoke-test mode (--test-only): one quick epoch with the smallest model to
# verify the whole pipeline (GPU, data loading, training, wandb, saving)
# before committing to the multi-hour full workflow. Exits when done.
# ---------------------------------------------------------------------------
if [ "$RUN_TEST" = true ]; then
    print_header "Running Test Training (1 epoch with gpt2)"

    # Same flags as run_training's gpt2 config, but only 1 epoch, a separate
    # "<project>_test" wandb project, and no --push_to_hub.
    python scripts/train.py \
        --model_name_or_path gpt2 \
        --dataset_repo_id "$DATASET_REPO" \
        --data_dir "$DATA_DIR" \
        --data_column "i_prompt_n" \
        --approach "infix" \
        --output_dir "${OUTPUT_BASE}/test_run" \
        --num_train_epochs 1 \
        --per_device_train_batch_size 16 \
        --gradient_accumulation_steps 4 \
        --learning_rate 5e-5 \
        --block_size "$BLOCK_SIZE" \
        --logging_steps 20 \
        --eval_strategy epoch \
        --save_strategy epoch \
        --fp16 \
        --seed "$SEED" \
        --wandb_project "${WANDB_PROJECT}_test"

    # Manual verification checklist for the operator.
    print_status "Test training completed!"
    print_status "Checklist:"
    echo " [ ] GPU detected and functioning"
    echo " [ ] Dataset loaded correctly"
    echo " [ ] Training completed without errors"
    echo " [ ] Wandb received metrics"
    echo " [ ] Model saved locally"
    echo ""
    echo "Now test evaluate.py and generate.py:"
    echo " python scripts/evaluate.py --model_path ./output/test_run --num_samples 50"
    echo " python scripts/generate.py --model_path ./output/test_run --num_generations 5 --validate"
    exit 0
fi
|
|
| |
# ---------------------------------------------------------------------------
# Main training workflow
# ---------------------------------------------------------------------------

# Results collected across all model/format combinations.
# COMPLETED_MODELS entries are "hub_model_id|data_column" so the evaluation
# step below knows which dataset column each model was trained on.
declare -a COMPLETED_MODELS=()
declare -a FAILED_MODELS=()

if [ "$RUN_TRAINING" = true ]; then
    print_header "Starting Full Training Workflow"

    START_TIME=$(date +%s)

    for config in "${CONFIGS[@]}"; do
        # Unpack "model|epochs|batch_size|grad_accum|lr|run_suffix".
        IFS='|' read -r model_name epochs batch_size grad_accum lr run_suffix <<< "$config"

        # Honor --model: skip configurations for other base models.
        if [ -n "$SPECIFIC_MODEL" ] && [ "$model_name" != "$SPECIFIC_MODEL" ]; then
            continue
        fi

        for data_config in "${DATA_COLUMNS[@]}"; do
            # Unpack "data_column|format".
            IFS='|' read -r data_column format <<< "$data_config"

            run_name="${run_suffix}_${DATA_DIR}_${format}"

            print_status "Starting training: $run_name"

            # Called inside `if` so a failed run does not trip `set -e`;
            # failures are collected and reported in the summary instead.
            if run_training "$model_name" "$epochs" "$batch_size" "$grad_accum" "$lr" "$run_suffix" "$data_column" "$format"; then
                COMPLETED_MODELS+=("${HF_USER}/${run_name}|${data_column}")
            else
                FAILED_MODELS+=("$run_name")
            fi

            # Short pause between runs (presumably to let the GPU settle and
            # let wandb/Hub uploads flush -- confirm intent).
            sleep 10
        done
    done

    # Wall-clock duration of the whole training pass.
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - START_TIME))
    HOURS=$((DURATION / 3600))
    MINUTES=$(((DURATION % 3600) / 60))

    print_header "Training Summary"
    echo "Total time: ${HOURS}h ${MINUTES}m"
    echo ""
    echo "Completed models (${#COMPLETED_MODELS[@]}):"
    for model in "${COMPLETED_MODELS[@]}"; do
        # ${model%|*} strips the trailing "|data_column", leaving the hub id.
        echo " - ${model%|*}"
    done
    echo ""
    if [ ${#FAILED_MODELS[@]} -gt 0 ]; then
        echo "Failed models (${#FAILED_MODELS[@]}):"
        for model in "${FAILED_MODELS[@]}"; do
            echo " - $model"
        done
    fi
fi
|
|
| |
# ---------------------------------------------------------------------------
# Evaluation pass
# ---------------------------------------------------------------------------
if [ "$RUN_EVAL" = true ]; then
    print_header "Running Evaluations"

    if [ ${#COMPLETED_MODELS[@]} -gt 0 ]; then
        # Evaluate the models trained in this session; each entry carries the
        # data column it was trained on ("hub_model_id|data_column").
        for model_info in "${COMPLETED_MODELS[@]}"; do
            IFS='|' read -r model_path data_column <<< "$model_info"
            run_evaluation "$model_path" "$data_column" 500
        done
    else
        # Nothing trained in this session (e.g. --eval-only): evaluate the
        # previously pushed Hub models for every config/format combination.
        for config in "${CONFIGS[@]}"; do
            IFS='|' read -r model_name epochs batch_size grad_accum lr run_suffix <<< "$config"

            for data_config in "${DATA_COLUMNS[@]}"; do
                IFS='|' read -r data_column format <<< "$data_config"

                # Reconstruct the Hub id the training pass would have pushed.
                run_name="${run_suffix}_${DATA_DIR}_${format}"
                model_path="${HF_USER}/${run_name}"

                run_evaluation "$model_path" "$data_column" 500
            done
        done
    fi

    print_header "Evaluation Complete"
    echo "Results saved to: $EVAL_OUTPUT"
fi
|
|
# Closing banner: pointers to results and follow-up commands. A here-doc
# replaces the echo chain; variables expand inside the unquoted delimiter.
print_header "Workflow Complete!"
cat <<EOF

Next steps:
1. Check training results on wandb: https://wandb.ai/${WANDB_PROJECT}
2. Check models on HuggingFace Hub: https://huggingface.co/${HF_USER}
3. Review evaluation results in: $EVAL_OUTPUT

To test a model interactively:
 python scripts/generate.py --model_path ${HF_USER}/Se124M_700K_infix --interactive --validate

EOF
|