File size: 2,321 Bytes
c7ebaa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Slurm batch directives. sbatch reads the "#SBATCH" comment lines below when
# the script is submitted; bash itself treats them as ordinary comments.
# They request: job name "biorlhf_sft", the "gpu" partition, one GPU
# (--gres=gpu:1), 64G of memory, 8 CPUs per task and a 12-hour time limit.
# Stdout and stderr go to separate files under logs/; %j is the job ID
# placeholder used in the file names.
# NOTE(review): Slurm presumably opens the --output/--error files before this
# script starts, so logs/ should already exist at submit time - confirm.
#SBATCH --job-name=biorlhf_sft
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --mem=64G
#SBATCH --cpus-per-task=8
#SBATCH --time=12:00:00
#SBATCH --output=logs/sft_%j.log
#SBATCH --error=logs/sft_%j.err

# ============================================================
# BioRLHF SFT Training Job Script for Cayuga HPC
# ============================================================

# Job banner: identify the job, the node it runs on and the start time so the
# log file is self-describing.
banner_rule="============================================================"
printf '%s\n' \
    "$banner_rule" \
    "BioRLHF SFT Training" \
    "Job ID: $SLURM_JOB_ID" \
    "Node: $SLURMD_NODENAME" \
    "Start time: $(date)" \
    "$banner_rule"

# Make sure the logs directory exists (no-op when it is already there).
# NOTE(review): the #SBATCH --output/--error paths point into logs/, and Slurm
# presumably needs that directory before the job starts - confirm it is also
# created prior to submission.
mkdir -p logs

# Load modules (adjust based on Cayuga's available modules)
module purge
module load cuda/12.1  # or available CUDA version
# module load anaconda3  # if using system anaconda

# Activate conda environment
# ~/.bashrc is sourced first so the `conda` command is available in this
# batch shell (assumes conda's shell hook is installed there - TODO confirm).
source ~/.bashrc
# Fail fast when the environment cannot be activated; otherwise the training
# step would run with whatever `python` happens to be on PATH.
if ! conda activate biorlhf; then
    echo "ERROR: failed to activate conda environment 'biorlhf'" >&2
    exit 1
fi

# Verify GPU availability
# Emit a blank-line-delimited section with the CSV report from nvidia-smi
# (name, total memory, free memory) for the log.
printf '\n%s\n' "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
printf '\n'

# Set environment variables
# Keep any CUDA_VISIBLE_DEVICES value already present in the environment
# (Slurm exports one for the --gres allocation) and fall back to device 0 only
# when it is unset or empty, e.g. when the script is run outside of Slurm.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
# Cache and run directories; all paths are relative to the working directory
# the job starts in.
export TRANSFORMERS_CACHE="./cache/transformers"
export HF_HOME="./cache/huggingface"
export WANDB_DIR="./wandb"

# Create cache directories (quoted so the paths are never word-split or
# globbed; `--` stops option parsing).
mkdir -p -- "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"

# Run training
# Invokes sft_train.py with the model, dataset, output directory and
# hyperparameters (including the --lora_* settings) given below. The W&B run
# name gets a timestamp suffix so each submission has a distinct name.
echo "Starting SFT training..."
python sft_train.py \
    --model "mistralai/Mistral-7B-v0.3" \
    --dataset "kmp_sft_dataset.json" \
    --output_dir "./kmp_sft_model" \
    --epochs 3 \
    --batch_size 4 \
    --grad_accum 4 \
    --lr 2e-4 \
    --max_seq_length 2048 \
    --lora_r 32 \
    --lora_alpha 64 \
    --wandb_project "biorlhf" \
    --wandb_run "kmp_sft_$(date +%Y%m%d_%H%M%S)"
# Capture the trainer's exit status immediately: every later command
# (including the test in the `if` below) overwrites $?.
train_status=$?

# Check exit status
if [[ "$train_status" -eq 0 ]]; then
    echo ""
    echo "============================================================"
    echo "Training completed successfully!"
    echo "Model saved to: ./kmp_sft_model"
    echo "End time: $(date)"
    echo "============================================================"
else
    echo ""
    echo "============================================================"
    echo "Training failed with exit code $train_status"
    echo "Check logs/sft_${SLURM_JOB_ID}.err for details"
    echo "============================================================"
    # Propagate the trainer's real (non-zero) status as the script's exit code.
    exit "$train_status"
fi