#!/bin/bash
#
# Launch run_t5_mlm_flax.py for the "t5-base-dutch" project.
#
# The script exports a set of project settings and then invokes
# `python run_t5_mlm_flax.py` (resolved relative to the current working
# directory) with the training flags listed below.
#
# Required environment:
#   HOME - used to build MODEL_PATH (${HOME}/data/${HF_PROJECT}).
#
# Exit status: that of the python process (or non-zero if HOME is unset).

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, instead of continuing with bad state.
set -euo pipefail

export HF_PROJECT="t5-base-dutch"

# NOTE(review): VOCAB_SIZE, N_INPUT_SENTENCES, DATASET_SPLIT, TEXT_FIELD and
# CONFIG_TYPE are not referenced by the command below. They are kept exported
# because a child process or companion script may read them from the
# environment - TODO confirm, and drop them if nothing consumes them.
export VOCAB_SIZE="32000"
export N_INPUT_SENTENCES="1000000"
export DATASET="yhavinga/mc4_nl_cleaned"
export DATASET_CONFIG="full"
export DATASET_SPLIT="train"
export TEXT_FIELD="text"
export CONFIG_TYPE="t5-base"
export MODEL_PATH="${HOME}/data/${HF_PROJECT}"

# Arguments are collected in an array so each flag sits on its own line
# without fragile backslash continuations, and values stay correctly quoted.
train_args=(
  # MODEL_PATH is used as output directory and as the config/tokenizer
  # location - presumably both must already exist there; verify before running.
  --output_dir="${MODEL_PATH}"
  --model_type="t5"
  --config_name="${MODEL_PATH}"
  --tokenizer_name="${MODEL_PATH}"
  --preprocessing_num_workers="96"
  --do_train
  --do_eval
  --dataset_name="${DATASET}"
  --dataset_config_name="${DATASET_CONFIG}"
  --max_seq_length="512"
  --per_device_train_batch_size="16"
  --per_device_eval_batch_size="16"
  --adafactor
  --learning_rate="0.005"
  --overwrite_output_dir
  --num_train_epochs="1"
  --logging_steps="500"
  --save_steps="80000"
  --eval_steps="2500"
  --weight_decay="0.01"
  --warmup_steps="10000"
  --validation_split_count="15000"
  --push_to_hub
)

python run_t5_mlm_flax.py "${train_args[@]}"