#!/bin/bash
# Start CodeLlama fine-tuning with chat format dataset

# Abort the script as soon as any command exits with a non-zero status.
set -o errexit

# Switch to the directory that contains this script, so the relative paths
# used further down (datasets/, models/, scripts/, training-outputs/) resolve
# the same way no matter where the script is invoked from.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
cd "$SCRIPT_DIR"

# Activate virtual environment
. /venv/main/bin/activate

# Opening banner, emitted as three lines on stdout.
printf '%s\n' \
    "======================================================================" \
    "🚀 Starting CodeLlama Fine-tuning with Chat Format Dataset" \
    "======================================================================"

# Configuration
# All paths are relative to the current working directory at this point.
BASE_MODEL="models/base-models/CodeLlama-7B-Instruct"
TRAIN_DATASET="datasets/processed/split_chat_format/train.jsonl"
VAL_DATASET="datasets/processed/split_chat_format/val.jsonl"
OUTPUT_DIR="training-outputs/codellama-fifo-v2-chat"

#######################################
# Abort the script unless a dataset file exists.
# Arguments: $1 - label used in the error message ("Training"/"Validation")
#            $2 - path of the dataset file to check
# Outputs:   error message on stderr when the file is missing
# Returns:   exits the script with status 1 when the file is missing
#######################################
require_dataset() {
    local label=$1
    local path=$2
    if [[ ! -f "$path" ]]; then
        # Diagnostics belong on stderr so they are not mixed into (or lost
        # with) redirected stdout.
        printf '❌ Error: %s dataset not found: %s\n' "$label" "$path" >&2
        exit 1
    fi
}

# Check if datasets exist
require_dataset "Training" "$TRAIN_DATASET"
require_dataset "Validation" "$VAL_DATASET"

# Print the effective configuration, followed by one blank line.
printf '%s\n' \
    "📊 Configuration:" \
    "   Base Model: $BASE_MODEL" \
    "   Train Dataset: $TRAIN_DATASET" \
    "   Val Dataset: $VAL_DATASET" \
    "   Output Directory: $OUTPUT_DIR" \
    ""

# Start training (runs in the foreground; the script waits for python3 to exit).
# Note: VAL_DATASET is not passed on the command line. Per the original note,
# the val dataset is auto-detected if val.jsonl exists in the same directory
# as train.jsonl — that behaviour lives in finetune_codellama.py and cannot be
# confirmed from this script.
train_args=(
    --base-model "$BASE_MODEL"
    --dataset "$TRAIN_DATASET"
    --output-dir "$OUTPUT_DIR"
    --max-length 1536
    --num-epochs 5
    --learning-rate 2e-5
    --batch-size 4
    --gradient-accumulation 4
    --lora-r 48
    --lora-alpha 96
    --lora-dropout 0.15
    --resume-from-checkpoint auto
)
python3 scripts/training/finetune_codellama.py "${train_args[@]}"

# Closing banner: one blank line followed by three banner lines.
# NOTE(review): this runs only after the preceding python3 command has
# returned, so "started" may really mean "finished" — confirm whether the
# trainer detaches before changing the wording.
printf '\n%s\n%s\n%s\n' \
    "======================================================================" \
    "✅ Training started!" \
    "======================================================================"