pere committed
Commit d7792f5 · Parent: 313936e
run_mlm_flax_stream_tpunew.py CHANGED
@@ -180,7 +180,7 @@ class DataTrainingArguments:
         default=10000, metadata={"help": "The number of examples to pre-load for shuffling."}
     )
     num_train_steps: int = field(default=50000, metadata={"help": "The number of training steps."})
-    num_eval_samples: int = field(default=50, metadata={"help": "The number of samples to be used for evaluation"})
+    num_eval_samples: int = field(default=10000, metadata={"help": "The number of samples to be used for evaluation"})
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
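Note (not part of the commit): in the Flax example scripts such dataclass fields are exposed as command-line flags via transformers.HfArgumentParser, with metadata["help"] becoming the flag's help text. The parser wiring below is an assumption about the surrounding script, shown only as a minimal sketch:

    from dataclasses import dataclass, field
    from transformers import HfArgumentParser

    @dataclass
    class DataTrainingArguments:
        # Each field becomes an argparse option, e.g. --num_eval_samples.
        num_train_steps: int = field(default=50000, metadata={"help": "The number of training steps."})
        num_eval_samples: int = field(default=10000, metadata={"help": "The number of samples to be used for evaluation"})

    parser = HfArgumentParser(DataTrainingArguments)
    (data_args,) = parser.parse_args_into_dataclasses(["--num_eval_samples", "2000"])
    print(data_args.num_eval_samples)  # -> 2000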
@@ -269,11 +269,12 @@ class FlaxDataCollatorForLanguageModeling:
 def generate_batch_splits(samples_idx: np.ndarray, batch_size: int) -> np.ndarray:
     num_samples = len(samples_idx)
     samples_to_remove = num_samples % batch_size
-
+
     if samples_to_remove != 0:
         samples_idx = samples_idx[:-samples_to_remove]
     sections_split = num_samples // batch_size
     batch_idx = np.split(samples_idx, sections_split)
+
     return batch_idx
 
 def advance_iter_and_group_samples(train_iterator, num_samples, max_seq_length):
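As a standalone illustration of the helper above (not part of the commit): the tail that does not fill a complete batch is dropped, so np.split always receives a length that divides evenly — num_samples // batch_size equals the trimmed length divided by batch_size, making the split exact.

    import numpy as np

    # 10 sample indices, batch_size 3 -> one leftover index is dropped.
    samples_idx = np.arange(10)
    batch_size = 3
    samples_to_remove = len(samples_idx) % batch_size   # 1
    if samples_to_remove != 0:
        samples_idx = samples_idx[:-samples_to_remove]  # 9 indices remain
    sections_split = 10 // batch_size                   # 3 even sections
    batch_idx = np.split(samples_idx, sections_split)
    # -> [array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]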
@@ -561,13 +562,9 @@ if __name__ == "__main__":
     train_metrics = []
     eval_metrics = []
 
-    if num_of_hosts > 1:
-        training_iter = iter(torch.utils.data.DataLoader(tokenized_datasets.with_format("torch"), batch_size=1, shuffle=False, num_workers=dataset.n_shards, collate_fn=lambda x: x))
-    else:
-        training_iter = iter(tokenized_datasets)
+    training_iter = iter(torch.utils.data.DataLoader(tokenized_datasets.with_format("torch"), batch_size=1, shuffle=False, num_workers=dataset.n_shards, collate_fn=lambda x: x))
 
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
     eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
 
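The dropped branch fell back to iter(tokenized_datasets) on a single host; after this change the streaming dataset is always wrapped in a torch DataLoader, so shards are read by parallel workers. A minimal standalone sketch of the pattern, assuming a 🤗 streaming dataset (the dataset name is taken from the launch script below; everything else is illustrative, not the script's exact code):

    import torch
    from datasets import load_dataset

    # Streaming IterableDataset; n_shards is the number of underlying files.
    dataset = load_dataset("NbAiLab/scandinavian", split="train", streaming=True)

    training_iter = iter(
        torch.utils.data.DataLoader(
            dataset.with_format("torch"),
            batch_size=1,                   # yield one example per step
            shuffle=False,
            num_workers=dataset.n_shards,   # one worker per shard
            collate_fn=lambda x: x,         # identity: skip tensor collation
        )
    )
    example = next(training_iter)  # a list containing one example dict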
run_nb_roberta_base_scandi_tpunew8.sh ADDED
@@ -0,0 +1,23 @@
+python run_mlm_flax_stream_tpunew.py \
+    --output_dir="../nb-roberta-base-scandi-tpunew8" \
+    --hub_model_id="NbAiLab/nb-roberta-base-scandi-tpunew8" \
+    --hub_private_repo=True \
+    --model_name_or_path="xlm-roberta-base" \
+    --config_name="./config_base.json" \
+    --tokenizer_name="./" \
+    --dataset_name="NbAiLab/scandinavian" \
+    --max_seq_length="512" \
+    --weight_decay="0.01" \
+    --per_device_train_batch_size="62" \
+    --per_device_eval_batch_size="62" \
+    --learning_rate="4e-4" \
+    --warmup_steps="1000" \
+    --overwrite_output_dir \
+    --num_train_steps="10000" \
+    --adam_beta1="0.9" \
+    --adam_beta2="0.98" \
+    --logging_steps="50" \
+    --save_steps="50" \
+    --eval_steps="50" \
+    --dtype="bfloat16" \
+    --push_to_hub