Spaces:

Vishwas1
/

LLMTrainingPro

Sleeping

Vishwas1 committed on Sep 18, 2024

Commit

791abc9

verified ·

1 Parent(s): 3266823

Update train_model.py

Files changed (1) hide show

train_model.py CHANGED Viewed

@@ -68,18 +68,18 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
             # Check if dataset_name includes config
             if '/' in dataset_name:
                 dataset, config = dataset_name.split('/', 1)
-                dataset = load_dataset(dataset, config, split='train[:1%]', use_auth_token=True)
             else:
-                dataset = load_dataset(dataset_name, split='train[:1%]', use_auth_token=True)
             logging.info("Dataset loaded successfully for generation task.")
             def tokenize_function(examples):
                 return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
         elif task == "classification":
             if '/' in dataset_name:
                 dataset, config = dataset_name.split('/', 1)
-                dataset = load_dataset(dataset, config, split='train[:1%]', use_auth_token=True)
             else:
-                dataset = load_dataset(dataset_name, split='train[:1%]', use_auth_token=True)
             logging.info("Dataset loaded successfully for classification task.")
             # Assuming the dataset has 'text' and 'label' columns
             def tokenize_function(examples):

             # Check if dataset_name includes config
             if '/' in dataset_name:
                 dataset, config = dataset_name.split('/', 1)
+                dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train[:1%]', use_auth_token=True)
             else:
+                dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split='train[:1%]', use_auth_token=True)
             logging.info("Dataset loaded successfully for generation task.")
             def tokenize_function(examples):
                 return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
         elif task == "classification":
             if '/' in dataset_name:
                 dataset, config = dataset_name.split('/', 1)
+                dataset = load_dataset("stanfordnlp/imdb", split='train[:1%]', use_auth_token=True)
             else:
+                dataset = load_dataset("stanfordnlp/imdb", split='train[:1%]', use_auth_token=True)
             logging.info("Dataset loaded successfully for classification task.")
             # Assuming the dataset has 'text' and 'label' columns
             def tokenize_function(examples):