#!/bin/bash
#
# Start CodeLlama fine-tuning with chat format dataset.
#
# What this script does, in order:
#   1. Changes into the directory containing this script, so every relative
#      path below is resolved against the script location, not the caller's
#      working directory.
#   2. Activates the Python virtual environment at /venv/main.
#   3. Verifies that the training and validation JSONL files exist.
#   4. Prints the configuration and runs scripts/training/finetune_codellama.py
#      in the foreground.
#
# Exit status: non-zero if any step fails (missing dataset, failed venv
# activation, or a non-zero exit from the training script).

# Abort on the first failing command and on failures inside pipelines.
set -eo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Activate virtual environment
# shellcheck disable=SC1091  # the activate script lives outside the repository
source /venv/main/bin/activate

# nounset is enabled only after activation: the activate script is third-party
# code and is not guaranteed to run cleanly under `set -u`.
set -u

echo "======================================================================"
echo "🚀 Starting CodeLlama Fine-tuning with Chat Format Dataset"
echo "======================================================================"

# Configuration
readonly BASE_MODEL="models/base-models/CodeLlama-7B-Instruct"
readonly TRAIN_DATASET="datasets/processed/split_chat_format/train.jsonl"
readonly VAL_DATASET="datasets/processed/split_chat_format/val.jsonl"
readonly OUTPUT_DIR="training-outputs/codellama-fifo-v2-chat"

#######################################
# Abort the script unless a dataset file exists.
# Arguments: $1 - human-readable dataset label ("Training", "Validation")
#            $2 - path to the dataset file
# Outputs:   writes an error message to stderr when the file is missing
# Returns:   0 if the file exists; otherwise exits the script with status 1
#######################################
require_dataset() {
  local label=$1
  local path=$2
  if [[ ! -f "$path" ]]; then
    echo "❌ Error: $label dataset not found: $path" >&2
    exit 1
  fi
}

# Check if datasets exist
require_dataset "Training" "$TRAIN_DATASET"
require_dataset "Validation" "$VAL_DATASET"

echo "📊 Configuration:"
echo " Base Model: $BASE_MODEL"
echo " Train Dataset: $TRAIN_DATASET"
echo " Val Dataset: $VAL_DATASET"
echo " Output Directory: $OUTPUT_DIR"
echo ""

# Start training
# Note: val-dataset is auto-detected if val.jsonl exists in same directory as train.jsonl
# (which is why VAL_DATASET is checked above but not passed on the command line).
python3 scripts/training/finetune_codellama.py \
  --base-model "$BASE_MODEL" \
  --dataset "$TRAIN_DATASET" \
  --output-dir "$OUTPUT_DIR" \
  --max-length 1536 \
  --num-epochs 5 \
  --learning-rate 2e-5 \
  --batch-size 4 \
  --gradient-accumulation 4 \
  --lora-r 48 \
  --lora-alpha 96 \
  --lora-dropout 0.15 \
  --resume-from-checkpoint auto

# NOTE(review): python3 runs in the foreground and errexit is on, so this
# banner is reached only after the training command has exited successfully.
# The wording "started" may be misleading unless the trainer detaches —
# confirm against finetune_codellama.py before rewording.
echo ""
echo "======================================================================"
echo "✅ Training started!"
echo "======================================================================"