updated training script with auth_token
Browse files
events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b2349e130dcf873a6c1b6a72c7905545522989ef1325b13371a5f71d077ff8c
|
| 3 |
+
size 77452220
|
run_mlm_flax_stream.py
CHANGED
|
@@ -129,6 +129,9 @@ class DataTrainingArguments:
|
|
| 129 |
default=None,
|
| 130 |
metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
|
| 131 |
)
|
|
|
|
|
|
|
|
|
|
| 132 |
overwrite_cache: bool = field(
|
| 133 |
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
| 134 |
)
|
|
@@ -361,6 +364,7 @@ if __name__ == "__main__":
|
|
| 361 |
#
|
| 362 |
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
| 363 |
# 'text' is found. You can easily tweak this behavior (see below).
|
|
|
|
| 364 |
if data_args.dataset_name is not None:
|
| 365 |
# Downloading and loading a dataset from the hub.
|
| 366 |
dataset = load_dataset(
|
|
@@ -368,6 +372,7 @@ if __name__ == "__main__":
|
|
| 368 |
data_args.dataset_config_name,
|
| 369 |
cache_dir=model_args.cache_dir,
|
| 370 |
streaming=True,
|
|
|
|
| 371 |
split="train",
|
| 372 |
)
|
| 373 |
|
|
|
|
| 129 |
default=None,
|
| 130 |
metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
|
| 131 |
)
|
| 132 |
+
auth_token: bool = field(
|
| 133 |
+
default=False, metadata={"help": "Use authorisation token"}
|
| 134 |
+
)
|
| 135 |
overwrite_cache: bool = field(
|
| 136 |
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
| 137 |
)
|
|
|
|
| 364 |
#
|
| 365 |
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
| 366 |
# 'text' is found. You can easily tweak this behavior (see below).
|
| 367 |
+
|
| 368 |
if data_args.dataset_name is not None:
|
| 369 |
# Downloading and loading a dataset from the hub.
|
| 370 |
dataset = load_dataset(
|
|
|
|
| 372 |
data_args.dataset_config_name,
|
| 373 |
cache_dir=model_args.cache_dir,
|
| 374 |
streaming=True,
|
| 375 |
+
use_auth_token=data_args.auth_token,
|
| 376 |
split="train",
|
| 377 |
)
|
| 378 |
|
run_recover_1350_stream.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Resume streaming-MLM (RoBERTa) training from the checkpoint state stored in
# the current directory, reading the NbAiLab/NCC2 dataset from the Hugging Face
# hub. The dataset is gated, so --auth_token True is required (consumed by the
# `auth_token` field added to DataTrainingArguments in run_mlm_flax_stream.py).
#
# NOTE(fix): the original script was missing the line-continuation backslash
# after --model_name_or_path="./", which silently truncated the command there
# and dropped every following option (dataset, LR, auth token, ...).
./run_mlm_flax_stream.py \
    --output_dir="./" \
    --model_type="roberta" \
    --config_name="./" \
    --tokenizer_name="./" \
    --model_name_or_path="./" \
    --dataset_name="NbAiLab/NCC2" \
    --max_seq_length="128" \
    --weight_decay="0.01" \
    --per_device_train_batch_size="128" \
    --per_device_eval_batch_size="128" \
    --learning_rate="3e-4" \
    --warmup_steps="0" \
    --overwrite_output_dir \
    --cache_dir /mnt/disks/flaxdisk/cache/ \
    --num_train_steps="1150000" \
    --adam_beta1="0.9" \
    --adam_beta2="0.98" \
    --logging_steps="10000" \
    --save_steps="100000" \
    --eval_steps="50000" \
    --preprocessing_num_workers 96 \
    --auth_token True \
    --adafactor \
    --push_to_hub