#!/bin/bash
#
# Launches SAMA adapter training runs for the math task (module
# src.math_train) through `accelerate launch`, using the Mistral-7B config.
#
# Every run shares the same launcher flags, config file, learning rate,
# output directory, dataset split and logging cadence; only the adapter's
# col_L/row_R size, its number of unique blocks and its scaling factor vary.
# Those three values are the arguments of launch_sama_run below.
#
# Environment exported for the training processes:
#   SAMA_CONFIG             path of the YAML config passed as --config_path
#   TOKENIZERS_PARALLELISM  set to "true"
#   CPATH / CPLUS_INCLUDE_PATH  extended with the conda env include dir
#   WANDB_PROJECT           set to "SAMA_MATH"
#
# The script intentionally does NOT use `set -e`: if one run fails the next
# one is still launched. The exit status is that of the last launch.

export SAMA_CONFIG=./config/sama_math_mistral7.yaml
export TOKENIZERS_PARALLELISM=true

# CUDA Include (/cuda.h)
CUDA_INCLUDE_PATH="/home/work/miniconda3/envs/allm/include"
# Append with a ':' separator only when the variable already has a value.
# A plain "$CPATH:$dir" yields a leading empty element when CPATH is unset,
# and the compiler treats an empty element as the current directory.
export CPATH="${CPATH:+${CPATH}:}${CUDA_INCLUDE_PATH}"
export CPLUS_INCLUDE_PATH="${CPLUS_INCLUDE_PATH:+${CPLUS_INCLUDE_PATH}:}${CUDA_INCLUDE_PATH}"

export WANDB_PROJECT="SAMA_MATH"

# Shared interval used for save_steps, eval_steps and logging_steps.
STEP=500

#######################################
# Launch one training run.
# Globals:
#   SAMA_CONFIG (read), STEP (read)
# Arguments:
#   $1 - value for both --sama_adapter.col_L and --sama_adapter.row_R
#   $2 - value for both --sama_adapter.num_unique_blocks_L and ..._R
#   $3 - value for --sama_adapter.scaling
# Outputs:
#   Whatever the launcher prints; a one-line diagnostic on stderr on failure.
# Returns:
#   The exit status of `accelerate launch`, or 2 on wrong argument count.
#######################################
launch_sama_run() {
  if (( $# != 3 )); then
    printf 'usage: launch_sama_run <col_row> <unique_blocks> <scaling>\n' >&2
    return 2
  fi

  local col_row=$1
  local blocks=$2
  local scaling=$3
  local rc=0

  # Built as an array so that every argument reaches the launcher verbatim.
  # In particular 'train[:20000]' must be quoted: unquoted it is a glob
  # pattern and could be replaced by a matching file name in the cwd.
  # '"steps"' deliberately passes the double quotes through as part of the
  # value, exactly as before.
  local -a cmd=(
    accelerate launch
    --dynamo_backend=inductor
    --dynamo_mode=max-autotune
    --main_process_port 41353
    -m src.math_train
    --config_path "$SAMA_CONFIG"
    --trainer_args.learning_rate=5e-4
    --trainer_args.output_dir "./Mistral7B"
    --trainer_args.load_best_model_at_end False
    --trainer_args.save_strategy '"steps"'
    --sama_adapter.col_L "$col_row"
    --sama_adapter.row_R "$col_row"
    --trainer_args.num_train_epochs 2
    --trainer_args.report_to wandb
    --trainer_args.save_steps "$STEP"
    --trainer_args.eval_steps "$STEP"
    --trainer_args.logging_steps "$STEP"
    --sama_adapter.num_unique_blocks_L "$blocks"
    --sama_adapter.num_unique_blocks_R "$blocks"
    --sama_adapter.target_modules '["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj","up_proj","down_proj"]'
    --data.dataset_split 'train[:20000]'
    --trainer_args.eval_delay 0
    --sama_adapter.scaling "$scaling"
  )

  "${cmd[@]}" || rc=$?
  if (( rc != 0 )); then
    printf 'launch_sama_run: col_row=%s blocks=%s scaling=%s failed (exit %d)\n' \
      "$col_row" "$blocks" "$scaling" "$rc" >&2
  fi
  return "$rc"
}

date +"%F %T"

# NOTE(review): every run writes to the same --trainer_args.output_dir
# ("./Mistral7B"); confirm that later runs overwriting or mixing checkpoints
# with earlier ones is intended.

# Disabled runs kept for reference, in their original order. Arguments are
# <col_L/row_R> <num_unique_blocks_L/R> <scaling>.
#
# Smoke test. It differed from the helper's fixed settings: STEP=50,
# load_best_model_at_end True, report_to none, dataset_split train[:2000].
#   col_L/row_R 16, unique blocks 16, scaling 1
#
# launch_sama_run 16 16 1
# launch_sama_run 4 4 1
# launch_sama_run 8 4 2
# launch_sama_run 16 4 4
# launch_sama_run 32 16 2
# launch_sama_run 4 2 2
# launch_sama_run 2 2 1
# launch_sama_run 16 16 1
# launch_sama_run 4 4 1
# launch_sama_run 2 2 1
# launch_sama_run 4 2 2
# launch_sama_run 4 2 1.4142
# launch_sama_run 4 2 2.8284
# launch_sama_run 4 2 4

# Active runs.
launch_sama_run 4 2 1
launch_sama_run 2 2 0.7071

# bash scripts/math_gemma9_train.sh