# Hugging Face page-scrape residue (uploader, commit message, commit hash),
# commented out so this file parses as shell:
# vwxyzjn's picture
# Upload folder using huggingface_hub
# af1dcbd verified
# SFT: legacy script vs. refactored script, then an eval-only pass
# (--num_train_epochs=0 --run_eval) to compare checkpoints.
poetry run accelerate launch --num_processes=8 old/sft.py
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/sft.py
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/sft.py --num_train_epochs=0 --run_eval --track
# Reward model: train, then eval-only.
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/reward.py
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/reward.py --num_train_epochs=0 --run_eval
# DPO eval-only; python -i drops into a REPL after the script finishes.
poetry run python -i summarize_from_feedback_details/dpo.py --num_train_epochs=0 --run_eval
# the PPO difference comes from datasets....
# PPO: legacy vs. refactored left-padding script, plus an eval-only pass.
poetry run accelerate launch --num_processes=8 old/ppo_left_padding.py
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/ppo_left_padding.py
poetry run accelerate launch --num_processes=8 summarize_from_feedback_details/ppo_left_padding.py --num_train_epochs=0 --run_eval
# PPO on pythia-6.9b via DeepSpeed (deepspeed.yaml), small micro-batches.
# NOTE(review): base, SFT, and reward paths all point at the raw
# EleutherAI/pythia-6.9b-deduped checkpoint rather than fine-tuned
# sft/reward models — confirm this was intentional.
poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding.py \
--local_rollout_forward_batch_size=2 \
--gradient_accumulation_steps=32 \
--local_micro_batch_size=2 \
--base_model=EleutherAI/pythia-6.9b-deduped \
--sft_model_path=EleutherAI/pythia-6.9b-deduped \
--reward_model_path=EleutherAI/pythia-6.9b-deduped \
--lr=3e-6 \
--deepspeed \
--run_eval \
--push_to_hub \
--track \
--seed=6661
# Same 6.9b PPO launch with different micro-batching; no reward model path,
# no eval/tracking/push — appears to be a throughput/memory check.
poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding.py \
--local_rollout_forward_batch_size=4 \
--gradient_accumulation_steps=64 \
--local_micro_batch_size=1 \
--base_model=EleutherAI/pythia-6.9b-deduped \
--sft_model_path=EleutherAI/pythia-6.9b-deduped \
--deepspeed
# Interactive on-policy DPO debug run against the 1b seed-44413 checkpoints.
poetry run python -i summarize_from_feedback_details/dpo_on_policy.py --sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 --reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 --base_model=EleutherAI/pythia-1b-deduped
# Per-model batch-size overrides; only the 1b model has custom values here.
case "$MODEL" in
"EleutherAI/pythia-1b-deduped")
    local_rollout_forward_batch_size=64
    gradient_accumulation_steps=4
    local_micro_batch_size=16
    local_eval_batch_size=32
    ;;
esac
# On-policy DPO for the 1b model, submitted to SLURM via the r.sbatch wrapper.
sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# Same run, single-process interactive (no sbatch, no --deepspeed flag).
poetry run python -i \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--run_eval \
--track \
--push_to_hub
# larger model
# Same interactive on-policy DPO but scored by the 6.9b reward model;
# batch sizes cut from 32 to 8 to accommodate the bigger RM.
poetry run python -i \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-6.9b-deduped/reward_model_44413 \
--local_eval_batch_size=8 \
--local_rollout_forward_batch_size=8 \
--run_eval \
--track \
--push_to_hub
# "episodes2" sweep: on-policy DPO (_new script) at 117k vs. 234k total
# episodes, 1b policy and 1b reward model, tagged in W&B via WANDB_TAGS.
WANDB_TAGS="dpo_onpolicy_episodes2" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=117000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes2" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# Same 117k/234k episode sweep but with the 6.9b reward model
# ("larger_rm"); batch sizes reduced to 8 for the bigger RM.
WANDB_TAGS="dpo_onpolicy_episodes2_larger_rm" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=117000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-6.9b-deduped/reward_model_44413 \
--local_eval_batch_size=8 \
--local_rollout_forward_batch_size=8 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes2_larger_rm" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-6.9b-deduped/reward_model_44413 \
--local_eval_batch_size=8 \
--local_rollout_forward_batch_size=8 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# "manual adjustments" run — note this goes back to dpo_on_policy.py
# (not the _new variant) while keeping --total_episodes=234000.
WANDB_TAGS="dpo_onpolicy_episodes_manual_adjustments" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# go back to what works
# NOTE(review): the next line invokes the .py file directly with no
# interpreter/launcher prefix — it only runs if the file is executable
# with a shebang; likely a pasted command missing `python`/`accelerate`.
/fsx/costa/summarize_from_feedback_details/summarize_from_feedback_details/dpo_on_policy.py --base_model=EleutherAI/pythia-1b-deduped --sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 --reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 --local_eval_batch_size=32 --local_rollout_forward_batch_size=32 --deepspeed --run_eval --track --push_to_hub
# "works_epochs" sweep, first two points: default epochs, then
# --num_train_epochs=1.
WANDB_TAGS="dpo_onpolicy_works_epochs" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_works_epochs" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--num_train_epochs=1 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# "works_epochs" sweep, remaining points: --num_train_epochs=2 and 4.
WANDB_TAGS="dpo_onpolicy_works_epochs" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--num_train_epochs=2 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_works_epochs" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--num_train_epochs=4 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# Interactive debug of the _new script: first with the 1b reward model,
# then ("larger model") with the 6.9b reward model at batch size 8.
poetry run python -i \
summarize_from_feedback_details/dpo_on_policy_new.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--run_eval \
--track \
--push_to_hub
# larger model
poetry run python -i \
summarize_from_feedback_details/dpo_on_policy_new.py \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-6.9b-deduped/reward_model_44413 \
--local_eval_batch_size=8 \
--local_rollout_forward_batch_size=8 \
--run_eval \
--track \
--push_to_hub
# Beta sweep at 234k episodes: identical runs with --beta=0.1, 0.2, 0.4.
WANDB_TAGS="dpo_onpolicy_episodes2_betas" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--beta=0.1 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes2_betas" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--beta=0.2 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes2_betas" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--beta=0.4 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# "episodes3_no_rejection_sampling": same 234k-episode run repeated for
# two checkpoint seeds (44413 and 55513).
WANDB_TAGS="dpo_onpolicy_episodes3_no_rejection_sampling" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes3_no_rejection_sampling" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_55513 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_55513 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# "episodes4": same two-seed pair of runs under a new tag (script-side
# changes are not visible from this log).
WANDB_TAGS="dpo_onpolicy_episodes4" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes4" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_55513 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_55513 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# "episodes5_longer_generation": two-seed pair again; per the tag this
# pairs with a longer generation length configured in the script.
WANDB_TAGS="dpo_onpolicy_episodes5_longer_generation" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
WANDB_TAGS="dpo_onpolicy_episodes5_longer_generation" sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/dpo_on_policy_new.py \
--total_episodes=234000 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_55513 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_55513 \
--local_eval_batch_size=32 \
--local_rollout_forward_batch_size=32 \
--deepspeed \
--run_eval \
--track \
--push_to_hub
# Interactive single-process debugging: PPO variant 1, base PPO with small
# rollout batches, and an eval-only reward-model check.
python -i summarize_from_feedback_details/ppo_left_padding1.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped
python -i summarize_from_feedback_details/ppo_left_padding.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --gradient_accumulation_steps 16 --local_rollout_forward_batch_size 5
python -i summarize_from_feedback_details/reward.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413 --num_train_epochs=0
# Compare two DeepSpeed configs for ppo_left_padding_d.py with identical
# hyperparameters: deepspeed.yaml/--deepspeed vs. deepspeed3.yaml/--deepspeed3.
poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding_d.py \
--local_rollout_forward_batch_size=16 \
--gradient_accumulation_steps=16 \
--local_micro_batch_size=4 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=EleutherAI/pythia-1b-deduped \
--lr=3e-6 \
--deepspeed
poetry run accelerate launch --config_file deepspeed3.yaml \
summarize_from_feedback_details/ppo_left_padding_d.py \
--local_rollout_forward_batch_size=16 \
--gradient_accumulation_steps=16 \
--local_micro_batch_size=4 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=EleutherAI/pythia-1b-deduped \
--lr=3e-6 \
--deepspeed3
# Interactive runs of the PPO script variants (1, base, 1 again, 3, 4, 5,
# 6, new1) against the same 1b seed-44413 checkpoints to compare behavior.
python -i summarize_from_feedback_details/ppo_left_padding1.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding1.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding3.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding4.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding5.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding6.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
python -i summarize_from_feedback_details/ppo_left_padding_new1.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
# Full SLURM runs of four PPO variants (2, 4, 5, _new) with identical
# hyperparameters and seed 44413, so their W&B curves are comparable.
sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding2.py \
--local_rollout_forward_batch_size=64 \
--gradient_accumulation_steps=4 \
--local_micro_batch_size=16 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--lr=3e-6 \
--deepspeed \
--run_eval \
--push_to_hub \
--track \
--seed=44413
sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding4.py \
--local_rollout_forward_batch_size=64 \
--gradient_accumulation_steps=4 \
--local_micro_batch_size=16 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--lr=3e-6 \
--deepspeed \
--run_eval \
--push_to_hub \
--track \
--seed=44413
sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding5.py \
--local_rollout_forward_batch_size=64 \
--gradient_accumulation_steps=4 \
--local_micro_batch_size=16 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--lr=3e-6 \
--deepspeed \
--run_eval \
--push_to_hub \
--track \
--seed=44413
sbatch r.sbatch poetry run accelerate launch --config_file deepspeed.yaml \
summarize_from_feedback_details/ppo_left_padding_new.py \
--local_rollout_forward_batch_size=64 \
--gradient_accumulation_steps=4 \
--local_micro_batch_size=16 \
--base_model=EleutherAI/pythia-1b-deduped \
--sft_model_path=models/EleutherAI/pythia-1b-deduped/sft_model_44413 \
--reward_model_path=models/EleutherAI/pythia-1b-deduped/reward_model_44413 \
--lr=3e-6 \
--deepspeed \
--run_eval \
--push_to_hub \
--track \
--seed=44413
# Environment setup for a SLURM array job that fans out over (model, seed)
# pairs. Requires: SLURM_ARRAY_TASK_ID (defaults to 0 arithmetic-wise when
# unset); optional LR override.
module load cuda/12.2
# Tag W&B runs with the experiment name plus the current git commit hash.
# Quoted so the command substitution cannot be word-split or globbed.
export WANDB_TAGS="refactor-chosen-rejected3,no-tag-$(git rev-parse --short HEAD)"
MODELS=("EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1b-deduped")
SEEDS=(44413 55513 66613 77713)
# Task id N maps to MODELS[N / 4] and SEEDS[N % 4] — four seeds per model.
MODEL_INDEX=$((SLURM_ARRAY_TASK_ID / 4))
SEED_INDEX=$((SLURM_ARRAY_TASK_ID % 4))
MODEL="${MODELS[$MODEL_INDEX]}"
SEED="${SEEDS[$SEED_INDEX]}"
echo "Running task $SLURM_ARRAY_TASK_ID with SEED: $SEED and MODEL: $MODEL"
# Fallbacks for running outside a SLURM array (manual invocation).
if [ -z "$SEED" ]; then
SEED=66613
fi
if [ -z "$MODEL" ]; then
# MODEL=EleutherAI/pythia-6.9b-deduped
# MODEL=EleutherAI/pythia-2.8b-deduped
MODEL=EleutherAI/pythia-1b-deduped
# MODEL=EleutherAI/pythia-410m-deduped
fi
if [ -z "$LR" ]; then
LR=3e-6
fi
# Checkpoint directory layout shared by all stages: models/<model>/<stage>_<seed>.
REWARD_MODEL_PATH="models/$MODEL/reward_model_$SEED"
SFT_MODEL_PATH="models/$MODEL/sft_model_$SEED"
POLICY_MODEL_PATH="models/$MODEL/policy_model_$SEED"
DPO_POLICY_MODEL_PATH="models/$MODEL/dpo_policy_model_$SEED"
# Eval-only reward-model run for the (MODEL, SEED) selected above:
# --num_train_epochs=0 skips training, --run_eval scores the eval split.
# Fixed: --local_eval_batch_size was passed twice (a hard-coded 32 and an
# unquoted $local_eval_batch_size that is only set in an unrelated branch
# and may be unset/empty); collapsed to one flag with a safe default.
poetry run accelerate launch --num_processes 8 \
summarize_from_feedback_details/reward.py \
--num_train_epochs=0 \
--run_eval \
--base_model="$MODEL" \
--sft_model_path="$SFT_MODEL_PATH" \
--lr="$LR" \
--deepspeed \
--exp_name reward_eval \
--track \
--output_dir="$REWARD_MODEL_PATH" \
--local_eval_batch_size="${local_eval_batch_size:-32}" \
--seed="$SEED"
# Token visualizations for the 6.9b SFT/PPO/DPO checkpoints.
# NOTE(review): the `>` redirect captures sbatch's local stdout (the job
# submission message), not the job's output — unless r.sbatch streams the
# job output back, these .txt files will not contain the script's results;
# confirm, or use sbatch --output instead.
sbatch r.sbatch python visualize_tokens.py > sft6.9b.txt
sbatch r.sbatch python visualize_tokens.py > ppo6.9b.txt
sbatch r.sbatch python visualize_tokens.py > dpo6.9b.txt
# PPO-LoRA experiments.
# Interactive debug run against the 1b seed-44413 checkpoints.
python -i summarize_from_feedback_details/ppo_lora.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413
# 6.9b variant launched through the DeepSpeed config with a smaller rollout
# forward batch and more gradient accumulation.
accelerate launch --config_file deepspeed.yaml summarize_from_feedback_details/ppo_lora.py --sft_model_path models/EleutherAI/pythia-6.9b-deduped/sft_model_44413 --base_model EleutherAI/pythia-6.9b-deduped --reward_model_path models/EleutherAI/pythia-6.9b-deduped/reward_model_44413 --local_rollout_forward_batch_size=4 --gradient_accumulation_steps=64
# Short smoke test (64 episodes) that pushes the result to the Hub.
# Fixed: was `--push-to-hub`; every other invocation in this file spells the
# flag `--push_to_hub`, so the dashed form was likely rejected by the parser.
python -i summarize_from_feedback_details/ppo_lora.py --sft_model_path models/EleutherAI/pythia-1b-deduped/sft_model_44413 --base_model EleutherAI/pythia-1b-deduped --reward_model_path models/EleutherAI/pythia-1b-deduped/reward_model_44413 --push_to_hub --total_episodes 64