#!/usr/bin/env bash
# Test training flow - 1 epoch (note: the command below actually passes --num_train_epochs 3)
# Hub commit: 2c4ca2f (verified) - augustocsc
# Fine-tune GPT-2 (--model_name_or_path gpt2, ~124M params) on the 500k split of
# the augustocsc/sintetico_natural dataset, reading prompts from the i_prompt
# column. Checkpoints are pushed to the Hub repo Se124M500KInfPrompt_EOS and
# metrics are reported to Weights & Biases under the same run name.
# Pinned to GPU 0 via CUDA_VISIBLE_DEVICES.
# Effective train batch size = 16 (per-device) * 4 (grad accumulation) = 64.
# NOTE: the trailing "\" that used to follow --save_total_limit 3 was removed;
# it continued the command onto the next (comment) line, so editing that
# comment would have silently altered this command's argument list.
CUDA_VISIBLE_DEVICES=0 python /home/augusto/symbo_repos/seringuela/scripts/train_test.py \
--dataset_repo_id augustocsc/sintetico_natural \
--data_dir 500k \
--output_dir ./output \
--push_to_hub \
--hub_model_id augustocsc/Se124M500KInfPrompt_EOS \
--source_data_column i_prompt \
--report_to wandb \
--run_name Se124M500KInfPrompt_EOS \
--model_name_or_path gpt2 \
--bf16 \
--eval_strategy steps \
--num_train_epochs 3 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 4 \
--dataloader_num_workers 8 \
--learning_rate 5e-5 \
--warmup_ratio 0.03 \
--weight_decay 0.01 \
--max_grad_norm 1.0 \
--lr_scheduler_type cosine \
--optim adamw_torch_fused \
--logging_steps 20 \
--eval_steps 500 \
--save_steps 1000 \
--save_total_limit 3
# CUDA_VISIBLE_DEVICES=1 python /home/augusto/symbo_repos/seringuela/scripts/train_test.py \
# --dataset_repo_id augustocsc/sintetico_final \
# --data_dir 100k \
# --output_dir ./output \
# --push_to_hub \
# --hub_model_id augustocsc/Se124M100KInfPrompt_NT \
# --source_data_column i_prompt \
# --report_to wandb \
# --run_name Se124M100KInfPrompt_NT \
# --bf16 \
# --eval_strategy steps \
# --num_train_epochs 3 \
# --per_device_train_batch_size 16 \
# --per_device_eval_batch_size 16 \
# --gradient_accumulation_steps 2 \
# --dataloader_num_workers 8 \
# --learning_rate 2e-5 \
# --warmup_ratio 0.03 \
# --weight_decay 0.01 \
# --max_grad_norm 1.0 \
# --lr_scheduler_type cosine \
# --optim adamw_torch_fused \
# --logging_steps 20 \
# --eval_steps 500 \
# --save_steps 1000 \
# --save_total_limit 3
# CUDA_VISIBLE_DEVICES=0 python /home/augusto/symbo_repos/seringuela/scripts/train_test.py \
# --dataset_repo_id augustocsc/sintetico_final \
# --data_dir 100k \
# --output_dir ./output \
# --push_to_hub \
# --hub_model_id augustocsc/Se124M100KInfPrompt_WT \
# --source_data_column i_prompt \
# --report_to wandb \
# --run_name Se124M100KInfPrompt_WT \
# --bf16 \
# --eval_strategy steps \
# --num_train_epochs 3 \
# --per_device_train_batch_size 16 \
# --per_device_eval_batch_size 16 \
# --gradient_accumulation_steps 2 \
# --dataloader_num_workers 8 \
# --learning_rate 2e-5 \
# --warmup_ratio 0.03 \
# --weight_decay 0.01 \
# --max_grad_norm 1.0 \
# --lr_scheduler_type cosine \
# --optim adamw_torch_fused \
# --logging_steps 20 \
# --eval_steps 500 \
# --save_steps 1000 \
# --save_total_limit 3