updated training script with auth_token
Browse files
events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b2349e130dcf873a6c1b6a72c7905545522989ef1325b13371a5f71d077ff8c
|
| 3 |
+
size 77452220
|
run_mlm_flax_stream.py
CHANGED
|
@@ -129,6 +129,9 @@ class DataTrainingArguments:
|
|
| 129 |
default=None,
|
| 130 |
metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
|
| 131 |
)
|
|
|
|
|
|
|
|
|
|
| 132 |
overwrite_cache: bool = field(
|
| 133 |
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
| 134 |
)
|
|
@@ -361,6 +364,7 @@ if __name__ == "__main__":
|
|
| 361 |
#
|
| 362 |
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
| 363 |
# 'text' is found. You can easily tweak this behavior (see below).
|
|
|
|
| 364 |
if data_args.dataset_name is not None:
|
| 365 |
# Downloading and loading a dataset from the hub.
|
| 366 |
dataset = load_dataset(
|
|
@@ -368,6 +372,7 @@ if __name__ == "__main__":
|
|
| 368 |
data_args.dataset_config_name,
|
| 369 |
cache_dir=model_args.cache_dir,
|
| 370 |
streaming=True,
|
|
|
|
| 371 |
split="train",
|
| 372 |
)
|
| 373 |
|
|
|
|
| 129 |
default=None,
|
| 130 |
metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
|
| 131 |
)
|
| 132 |
+
auth_token: bool = field(
|
| 133 |
+
default=False, metadata={"help": "Use authorisation token"}
|
| 134 |
+
)
|
| 135 |
overwrite_cache: bool = field(
|
| 136 |
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
| 137 |
)
|
|
|
|
| 364 |
#
|
| 365 |
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
| 366 |
# 'text' is found. You can easily tweak this behavior (see below).
|
| 367 |
+
|
| 368 |
if data_args.dataset_name is not None:
|
| 369 |
# Downloading and loading a dataset from the hub.
|
| 370 |
dataset = load_dataset(
|
|
|
|
| 372 |
data_args.dataset_config_name,
|
| 373 |
cache_dir=model_args.cache_dir,
|
| 374 |
streaming=True,
|
| 375 |
+
use_auth_token=data_args.auth_token,
|
| 376 |
split="train",
|
| 377 |
)
|
| 378 |
|
run_recover_1350_stream.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Resume streaming-MLM (RoBERTa) training from the checkpoint state stored in
# the current directory, reading the NbAiLab/NCC2 dataset from the Hugging Face
# hub. The dataset is gated, so --auth_token True is required (consumed by the
# `auth_token` field added to DataTrainingArguments in run_mlm_flax_stream.py).
#
# NOTE(fix): the original script was missing the line-continuation backslash
# after --model_name_or_path="./", which silently truncated the command there
# and dropped every following option (dataset, LR, auth token, ...).
./run_mlm_flax_stream.py \
    --output_dir="./" \
    --model_type="roberta" \
    --config_name="./" \
    --tokenizer_name="./" \
    --model_name_or_path="./" \
    --dataset_name="NbAiLab/NCC2" \
    --max_seq_length="128" \
    --weight_decay="0.01" \
    --per_device_train_batch_size="128" \
    --per_device_eval_batch_size="128" \
    --learning_rate="3e-4" \
    --warmup_steps="0" \
    --overwrite_output_dir \
    --cache_dir /mnt/disks/flaxdisk/cache/ \
    --num_train_steps="1150000" \
    --adam_beta1="0.9" \
    --adam_beta2="0.98" \
    --logging_steps="10000" \
    --save_steps="100000" \
    --eval_steps="50000" \
    --preprocessing_num_workers 96 \
    --auth_token True \
    --adafactor \
    --push_to_hub