File size: 2,637 Bytes
2145d80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# Slurm batch script for the BioGRPO Phase 4 training run.
# Resource request: one GPU, 8 CPUs per task, 96G of memory, 48-hour limit,
# on partition scu-gpu under account cayuga_0003.
# stdout and stderr are written to logs/grpo_phase4_<job id>.log / .err
# (%j is the job ID; the failure message at the end of this script builds the
# same .err path from $SLURM_JOB_ID).
# NOTE(review): the log paths are relative — presumably to the directory
# sbatch is invoked from; confirm that logs/ exists there before submitting.
#SBATCH --job-name=biogrpo_phase4
#SBATCH --partition=scu-gpu
#SBATCH --account=cayuga_0003
#SBATCH --gres=gpu:1
#SBATCH --mem=96G
#SBATCH --cpus-per-task=8
#SBATCH --time=48:00:00
#SBATCH --output=logs/grpo_phase4_%j.log
#SBATCH --error=logs/grpo_phase4_%j.err

# ============================================================
# BioGRPO Phase 4: V1-Aware V4 Calibration Fix
# V4 weight=0.45 (dominant), V1-aware confidence targeting
# ============================================================

# Scratch root for this project and the training checkout inside it; every
# other path in this script is derived from these two variables.
SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
WORKDIR="${SCRATCH}/training/BioRLHF"

# Job banner: records job ID, node, working directory and start time at the
# top of the log. One printf call emits each argument on its own line.
printf '%s\n' \
    "============================================================" \
    "BioGRPO Phase 4 Training" \
    "Job ID: ${SLURM_JOB_ID}" \
    "Node: ${SLURMD_NODENAME}" \
    "Working dir: ${WORKDIR}" \
    "Start time: $(date)" \
    "============================================================"

# All later relative paths (logs/, configs/) are resolved from WORKDIR, so
# stop immediately if it cannot be entered.
if ! cd "$WORKDIR"; then
    echo "WORKDIR not found: $WORKDIR"
    exit 1
fi
mkdir -p logs

# Start from a clean module environment, then load the CUDA 12.1 module.
module purge
module load cuda/12.1

# Define the `conda` shell function in this non-interactive shell, then
# activate the "biorlhf" environment. Both steps are checked: if either fails
# the job would otherwise continue with whatever interpreter and PATH happen
# to be active, and only fail later with a less useful error.
. /home/fs01/jak4013/miniconda3/miniconda3/etc/profile.d/conda.sh || {
    echo "Failed to source conda.sh" >&2
    exit 1
}
conda activate biorlhf || {
    echo "Failed to activate conda environment: biorlhf" >&2
    exit 1
}

# Record the allocated GPU's name and memory in the job log.
echo ""
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""

# Runtime environment for the training process.
# NOTE(review): Slurm normally sets CUDA_VISIBLE_DEVICES itself for --gres
# jobs — confirm that overriding it with 0 is intended on this cluster.
export CUDA_VISIBLE_DEVICES=0
# Keep model/dataset caches and W&B run files under WORKDIR.
# NOTE(review): recent transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME — verify before removing either.
export TRANSFORMERS_CACHE="${WORKDIR}/cache/transformers"
export HF_HOME="${WORKDIR}/cache/huggingface"
export WANDB_DIR="${WORKDIR}/wandb"
export TOKENIZERS_PARALLELISM=false

# Data paths
export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"

# Create the cache/output directories up front. Arguments are quoted so a path
# containing whitespace or glob characters is passed through intact, and `--`
# ends option parsing before the path operands.
mkdir -p -- "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"

# Symlink SFT checkpoint if not already present.
# The link is created only when nothing exists at the destination. `-e`
# follows symlinks, so a dangling link would look "absent" and make `ln -s`
# fail with "File exists"; `-L` is checked as well to tell these cases apart.
# The success message is printed only when `ln` actually succeeded. Failures
# are reported on stderr but do not abort the job here.
sft_target="${SCRATCH}/training/biorlhf/kmp_sft_model_final"
sft_link="${WORKDIR}/kmp_sft_model_final"
if [ ! -e "$sft_link" ] && [ ! -L "$sft_link" ]; then
    if ln -s "$sft_target" "$sft_link"; then
        echo "Symlinked kmp_sft_model_final"
    else
        echo "WARNING: could not symlink kmp_sft_model_final -> ${sft_target}" >&2
    fi
elif [ ! -e "$sft_link" ]; then
    # A symlink exists but its target does not resolve.
    echo "WARNING: ${sft_link} is a dangling symlink; leaving it in place" >&2
fi

# Launch training with the Phase 4 config and report the outcome.
echo "Starting BioGRPO Phase 4 training..."
biorlhf-grpo --config configs/grpo_phase4.json
# Capture the status immediately: `$?` is overwritten by every subsequent
# command, including the `[ ... ]` test below, so reading it later in the
# failure branch would always report the test's status (1) rather than the
# trainer's real exit code.
train_status=$?

if [ "$train_status" -eq 0 ]; then
    echo ""
    echo "============================================================"
    echo "BioGRPO Phase 4 training completed!"
    echo "Model saved to: ./biogrpo_phase4_model"
    echo "End time: $(date)"
    echo "============================================================"
else
    echo ""
    echo "============================================================"
    echo "BioGRPO Phase 4 training failed with exit code ${train_status}"
    echo "Check logs/grpo_phase4_${SLURM_JOB_ID}.err for details"
    echo "============================================================"
    # The job's own exit status stays 1 on any training failure, as before.
    exit 1
fi