acul3
/

roberta-base-indo

@@ -308,7 +308,7 @@ def advance_iter_and_group_samples(train_iterator, num_samples, max_seq_length):
     while i < num_total_tokens:
         tokenized_samples = next(train_iterator)
         i += len(tokenized_samples["input_ids"])
-        print(tokenized_samples)
         # concatenate tokenized samples to list
         samples = {k: samples[k] + tokenized_samples[k] for k in tokenized_samples.keys()}
@@ -451,30 +451,13 @@ if __name__ == "__main__":
     # 'text' is found. You can easily tweak this behavior (see below).
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        filepaths = {}
-        if data_args.train_file:
-            filepaths["train"] = data_args.train_file
-        if data_args.validation_file:
-            filepaths["validation"] = data_args.validation_file
-        try:
-            dataset = load_dataset(
-                data_args.dataset_name,
-                data_args.dataset_config_name,
-                cache_dir=model_args.cache_dir,
-                streaming=True,
-                split="train",
-            )
-        except Exception as exc:
-            logger.warning(
-                f"Unable to load local dataset with perplexity sampling support. Using huggingface.co/datasets/{data_args.dataset_name}: {exc}"
-            )
-            dataset = load_dataset(
-                data_args.dataset_name,
-                data_args.dataset_config_name,
-                cache_dir=model_args.cache_dir,
-                streaming=True,
-                split="train",
-            )
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
@@ -505,13 +488,13 @@ if __name__ == "__main__":
         return tokenizer(
             examples[data_args.text_column_name],
             max_length=512,
-            truncation=True,
             return_special_tokens_mask=True
         )
     tokenized_datasets = dataset.map(
         tokenize_function,
         batched=True,
     )
     shuffle_seed = training_args.seed
@@ -524,8 +507,8 @@ if __name__ == "__main__":
             # Enable Weight&Biases
             import wandb
             wandb.init(
-                entity='munggok',
-                project='roberta-indo-base',
                 sync_tensorboard=True,
             )
             wandb.config.update(training_args)

     while i < num_total_tokens:
         tokenized_samples = next(train_iterator)
         i += len(tokenized_samples["input_ids"])
         # concatenate tokenized samples to list
         samples = {k: samples[k] + tokenized_samples[k] for k in tokenized_samples.keys()}
     # 'text' is found. You can easily tweak this behavior (see below).
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
+            streaming=True,
+            split="train",
+        )
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
         return tokenizer(
             examples[data_args.text_column_name],
             max_length=512,
             return_special_tokens_mask=True
         )
     tokenized_datasets = dataset.map(
         tokenize_function,
         batched=True,
+        remove_columns=list(dataset.features.keys()),
     )
     shuffle_seed = training_args.seed
             # Enable Weight&Biases
             import wandb
             wandb.init(
+                entity='wandb',
+                project='hf-flax-bertin-roberta-es',
                 sync_tensorboard=True,
             )
             wandb.config.update(training_args)