another attempt with train files locally
Browse files
- run_flax.sh +2 -1
- run_mlm_flax.py +1 -2
run_flax.sh
CHANGED
|
@@ -3,7 +3,8 @@
|
|
| 3 |
--model_type="roberta" \
|
| 4 |
--config_name="./" \
|
| 5 |
--tokenizer_name="./" \
|
| 6 |
-
--
|
|
|
|
| 7 |
--max_seq_length="128" \
|
| 8 |
--weight_decay="0.01" \
|
| 9 |
--per_device_train_batch_size="232" \
|
|
|
|
| 3 |
--model_type="roberta" \
|
| 4 |
--config_name="./" \
|
| 5 |
--tokenizer_name="./" \
|
| 6 |
+
--train_file="/mnt/disks/flaxdisk/smallcorpus/train-shard-0001-of-0001.json" \
|
| 7 |
+
--validation_file="/mnt/disks/flaxdisk/smallcorpus/validation-shard-0001-of-0001.json" \
|
| 8 |
--max_seq_length="128" \
|
| 9 |
--weight_decay="0.01" \
|
| 10 |
--per_device_train_batch_size="232" \
|
run_mlm_flax.py
CHANGED
|
@@ -317,10 +317,9 @@ if __name__ == "__main__":
|
|
| 317 |
#
|
| 318 |
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
| 319 |
# download the dataset.
|
| 320 |
-
chunksize = 10<<20
|
| 321 |
if data_args.dataset_name is not None:
|
| 322 |
# Downloading and loading a dataset from the hub.
|
| 323 |
-
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
|
| 324 |
|
| 325 |
if "validation" not in datasets.keys():
|
| 326 |
datasets["validation"] = load_dataset(
|
|
|
|
| 317 |
#
|
| 318 |
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
| 319 |
# download the dataset.
|
|
|
|
| 320 |
if data_args.dataset_name is not None:
|
| 321 |
# Downloading and loading a dataset from the hub.
|
| 322 |
+
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
|
| 323 |
|
| 324 |
if "validation" not in datasets.keys():
|
| 325 |
datasets["validation"] = load_dataset(
|