# Directory the script is launched from; dataset, output and parser paths
# used further down are built relative to it.
CURRENT_DIR="$(pwd)"
# Exported so that child processes (the python trainer) inherit it; a plain
# assignment would stay private to this shell and have no effect on them.
export NCCL_DEBUG=INFO
# Default GPU index; replaced by the value given with -g/--gpuid.
GPU_ID=0
#######################################
# Print a one-line usage summary and terminate the script.
# Arguments: none
# Outputs:   usage text on stderr (nothing on stdout)
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  # The option takes a value, so the synopsis shows a placeholder for it.
  printf 'Usage: %s [-g|--gpuid <gpu_id>]\n' "${0}" >&2
  exit 1
}
#######################################
# Parse command-line options into global settings.
# Globals:   GPU_ID (written when -g/--gpuid is given)
# Arguments: the script's positional parameters
# Outputs:   error/usage text on stderr for invalid input
# Returns:   0 when all arguments were consumed; otherwise the script
#            exits through usage
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "${1}" in
      -g | --gpuid)
        # Guard the arity: with the value missing, `shift 2` would fail
        # without shifting anything and this loop would never terminate.
        if [[ $# -lt 2 ]]; then
          printf 'error: option %s requires a value\n' "${1}" >&2
          usage
        fi
        GPU_ID=${2}
        shift 2
        ;;
      *)
        # Unknown argument: usage exits the script, so no shift is needed.
        usage
        ;;
    esac
  done
}

parse_args "$@"
#######################################
# Run the pre-training script with a fixed set of hyperparameters.
# Globals:   CURRENT_DIR (read) - base directory for data, outputs, parser
#            GPU_ID (read)      - value passed through as --gpu_id
# Arguments: none
# Returns:   exit status of the python command
#######################################
pretrain() {
  local -r script_path="src/pre_training/pretrain.py"

  # Arguments are collected in an array and expanded quoted, so a base
  # directory containing spaces or glob characters stays a single argument.
  local -a train_args=(
    --model_type codet5_CC
    --warmup_steps 500
    --learning_rate 3e-4
    --num_train_epochs 30
    --model_name_or_path Salesforce/codet5-base
    --tokenizer_name Salesforce/codet5-base
    --data_dir "${CURRENT_DIR}/Dataset/pre-training"
    --output_dir "${CURRENT_DIR}/outputs/models/pre-training"
    --always_save_model
    --train_batch_size 32
    --gradient_accumulation_steps 4
    --eval_batch_size 4
    --max_source_length 512
    --max_target_length 128
    --gpu_id "${GPU_ID}"
    --mask_rate 0.15
    --save_steps 6000
    --log_steps 5
    --train_steps 800000
    --treesitter_path "${CURRENT_DIR}/myParser/my-languages.so"
  )

  python "${script_path}" "${train_args[@]}"
}
# Entry point: start the pre-training run.
pretrain