codellama-fine-tuning / start_training_chat_format.sh

Upload start_training_chat_format.sh with huggingface_hub

c6dbcac verified 2 months ago

1.81 kB

	#!/bin/bash
	# Start CodeLlama fine-tuning with the chat-format dataset.
	#
	# Usage: ./start_training_chat_format.sh   (takes no arguments)
	#
	# What it does:
	#   1. Changes into the directory containing this script, so every relative
	#      path below is resolved from there regardless of the caller's cwd.
	#   2. Activates the virtual environment at /venv/main.
	#   3. Verifies that the train and validation JSONL files exist.
	#   4. Runs scripts/training/finetune_codellama.py in the foreground.
	#
	# Exit status: 1 if a dataset file is missing; the failing command's status
	# if any other step fails (set -e); 0 once the training script has exited
	# successfully.

	set -e

	# Print an error message to stderr and abort with status 1.
	die() {
	  printf '%s\n' "❌ Error: $*" >&2
	  exit 1
	}

	SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
	cd "$SCRIPT_DIR"

	# Activate virtual environment
	source /venv/main/bin/activate

	echo "======================================================================"
	echo "🚀 Starting CodeLlama Fine-tuning with Chat Format Dataset"
	echo "======================================================================"

	# Configuration (paths are relative to SCRIPT_DIR)
	readonly BASE_MODEL="models/base-models/CodeLlama-7B-Instruct"
	readonly TRAIN_DATASET="datasets/processed/split_chat_format/train.jsonl"
	readonly VAL_DATASET="datasets/processed/split_chat_format/val.jsonl"
	readonly OUTPUT_DIR="training-outputs/codellama-fifo-v2-chat"

	# Fail fast if either dataset file is missing; diagnostics go to stderr so
	# they are not lost when stdout is redirected or captured.
	[[ -f "$TRAIN_DATASET" ]] || die "Training dataset not found: $TRAIN_DATASET"
	[[ -f "$VAL_DATASET" ]] || die "Validation dataset not found: $VAL_DATASET"

	echo "📊 Configuration:"
	echo " Base Model: $BASE_MODEL"
	echo " Train Dataset: $TRAIN_DATASET"
	echo " Val Dataset: $VAL_DATASET"
	echo " Output Directory: $OUTPUT_DIR"
	echo ""

	# Training arguments, kept in an array so each value stays a single word.
	# Note: val-dataset is auto-detected if val.jsonl exists in same directory as
	# train.jsonl, which is why VAL_DATASET is checked above but not passed here.
	train_args=(
	  --base-model "$BASE_MODEL"
	  --dataset "$TRAIN_DATASET"
	  --output-dir "$OUTPUT_DIR"
	  --max-length 1536
	  --num-epochs 5
	  --learning-rate 2e-5
	  --batch-size 4
	  --gradient-accumulation 4
	  --lora-r 48
	  --lora-alpha 96
	  --lora-dropout 0.15
	  --resume-from-checkpoint auto
	)

	# Runs in the foreground: under set -e the lines after this are reached only
	# once the training script has exited with status 0.
	python3 scripts/training/finetune_codellama.py "${train_args[@]}"

	echo ""
	echo "======================================================================"
	echo "✅ Training script finished successfully!"
	echo "======================================================================"