#!/bin/bash
#SBATCH --job-name=Qwen3_1.7B_SFT_RL    # Name of the job
#SBATCH --gres=gpu:4    # Number of GPUs
#SBATCH -p a100    # Partition
#SBATCH -c 12    # Number of cores
#SBATCH --time=12:00:00    # Time limit
#SBATCH --mem=128gb    # Memory limit
#SBATCH --output=Qwen3_1.7B_SFT_RL_a100-%j.out    # Output file
#SBATCH --error=Qwen3_1.7B_SFT_RL_a100-%j.err    # Error file

## Environment Setup
# Log the toolchain settings the job starts with. These lines run before the
# conda environment is activated further down, so they show the inherited
# (pre-activation) values.
printf 'CUDA_HOME: %s\n' "$CUDA_HOME"
printf 'PATH: %s\n' "$PATH"
printf 'LD_LIBRARY_PATH: %s\n' "$LD_LIBRARY_PATH"
# `command -v` is a shell builtin; `which` is an external tool that may be
# missing or behave differently across systems.
printf 'which python: %s\n' "$(command -v python)"

## Configuration Variables
# Change these to match your setup
# Each value can be edited here, or supplied through the environment at
# submission time (e.g. `sbatch --export=ALL,OUTPUT_DIR=/path script.sh`).
# When a variable is unset or empty, the placeholder default below is used.
SFT_CHECKPOINT="${SFT_CHECKPOINT:-SFT_CHECKPOINT}"    # Change to the checkpoint of the SFT model
CACHE_DIR="${CACHE_DIR:-CACHE_DIR}"    # Change to the directory where the model weights are cached
OUTPUT_DIR="${OUTPUT_DIR:-OUTPUT_DIR}"    # Change to the directory where the model will be saved
CONDA_ENV="${CONDA_ENV:-CONDA_ENV}"    # Change to the conda environment

## Setup Environment
# `conda activate` is a shell function that non-interactive batch shells
# usually do not define, so load conda's shell integration first when the
# conda command is reachable.
if command -v conda >/dev/null 2>&1; then
    conda_base=$(conda info --base) && source "${conda_base}/etc/profile.d/conda.sh"
fi

# Abort early rather than train inside the wrong Python environment.
conda activate "$CONDA_ENV" || {
    printf 'ERROR: could not activate conda environment "%s"\n' "$CONDA_ENV" >&2
    exit 1
}

# Change to the directory containing the script; stop if it does not exist so
# the launcher never runs from an unintended working directory.
cd .../BioReason/ || {
    printf 'ERROR: could not change to the BioReason directory\n' >&2
    exit 1
}

nvidia-smi    # Check GPU status

## Dependencies
# You might need to install this from within a GPU session
# (quote the requirement so the shell does not treat the brackets as a glob):
# pip install 'trl[vllm]'

## =============================================================================
## Reinforcement Learning Training with DeepSpeed
## =============================================================================

# Run with DeepSpeed ZeRO Stage 2.
# Arguments are collected in an array so that each value stays a single word
# even if a configured path contains spaces or glob characters.
train_args=(
    --deepspeed grpo_trainer_lora_model/ds_config_stage2.json
    --num_generations 4
    --per_device_train_batch_size 2
    --bf16 true
    --ddp_find_unused_parameters false
    --sft_checkpoint "$SFT_CHECKPOINT"
    --model_name_or_path Qwen/Qwen3-1.7B
    --dna_model_name_or_path InstaDeepAI/nucleotide-transformer-v2-500m-multi-species
    --cache_dir "$CACHE_DIR"
    --output_dir "$OUTPUT_DIR"
    --save_strategy "steps"
    --save_steps 100
    --save_total_limit 2
    --use_vllm true
    --temperature 0.6
    --top_p 0.95
    --top_k 20
    --num_train_epochs 1
)

# --num_gpus must stay in step with the `#SBATCH --gres=gpu:4` request above.
# This is the last command, so its exit status becomes the job's exit status.
srun deepspeed --num_gpus=4 --num_nodes=1 \
    reason.py \
    "${train_args[@]}"