first submit
Files changed:
- run_128.sh: +1 -0
- run_mlm_flax.py: +15 -9
run_128.sh
CHANGED
@@ -19,6 +19,7 @@ python run_mlm_flax.py \
     --logging_steps="1000" \
     --save_steps="1000" \
     --eval_steps="1000" \
+    --auth_token="True" \
     --do_train \
     --do_eval \
     --dtype="bfloat16" \
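Note: the new `--auth_token="True"` flag is consumed by the `auth_token: bool` field added to `DataTrainingArguments` in run_mlm_flax.py below. A minimal sketch of how the string value becomes a boolean, assuming the script's usual HfArgumentParser setup (only the relevant field is reproduced here, not the full argument classes):

# Minimal sketch (not part of the commit): how `--auth_token="True"` from run_128.sh
# ends up as a Python bool. Assumes transformers' HfArgumentParser, which accepts
# string values like "True"/"False" for bool-typed dataclass fields.
from dataclasses import dataclass, field
from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    auth_token: bool = field(
        default=False, metadata={"help": "Use authorisation token"}
    )


parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses(args=["--auth_token", "True"])
assert data_args.auth_token is True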
run_mlm_flax.py
CHANGED
@@ -224,6 +224,10 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
     )
+
+    auth_token: bool = field(
+        default=False, metadata={"help": "Use authorisation token"}
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -376,14 +380,14 @@ def main():
     set_seed(training_args.seed)
 
     # Handle the repository creation
-    if training_args.push_to_hub:
-        if training_args.hub_model_id is None:
-            repo_name = get_full_repo_name(
-                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
-            )
-        else:
-            repo_name = training_args.hub_model_id
-        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # if training_args.push_to_hub:
+    #     if training_args.hub_model_id is None:
+    #         repo_name = get_full_repo_name(
+    #             Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+    #         )
+    #     else:
+    #         repo_name = training_args.hub_model_id
+    #     repo = Repository(training_args.output_dir, clone_from=repo_name)
 
     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
@@ -396,7 +400,7 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, use_auth_token=data_args.auth_token, cache_dir=model_args.cache_dir)
 
         if "validation" not in datasets.keys():
             datasets["validation"] = load_dataset(
@@ -404,12 +408,14 @@ def main():
             data_args.dataset_config_name,
             split=f"train[:{data_args.validation_split_percentage}%]",
             cache_dir=model_args.cache_dir,
+            use_auth_token=data_args.auth_token,
         )
         datasets["train"] = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
             split=f"train[{data_args.validation_split_percentage}%:]",
             cache_dir=model_args.cache_dir,
+            use_auth_token=data_args.auth_token,
         )
     else:
         data_files = {}
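With these changes, every `load_dataset` call receives `use_auth_token=data_args.auth_token`, so a user who has stored a Hugging Face token locally (for example via `huggingface-cli login`) can train on a private or gated dataset. A minimal standalone sketch of the same call pattern follows; the dataset id is a placeholder, not one used by this repo, and newer versions of `datasets` use `token=` instead of `use_auth_token=`:

# Minimal sketch of the call pattern introduced above: pass use_auth_token=True so the
# `datasets` library sends the locally stored Hugging Face token when downloading.
# "your-username/private-dataset" is a hypothetical dataset id.
from datasets import load_dataset

auth_token = True  # corresponds to data_args.auth_token set by --auth_token="True"
datasets = load_dataset(
    "your-username/private-dataset",  # hypothetical private dataset
    None,                             # dataset config name (none in this sketch)
    split="train[:5%]",
    use_auth_token=auth_token,
)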