Spaces:

Vishwas1
/

LLMTrainingPro

Sleeping

App Files Community

Vishwas1 committed on Sep 18, 2024

Commit

8955717

verified ·

1 Parent(s): 4a9e5f8

Update train_model.py

Browse files

Files changed (1) hide show

train_model.py +17 -4

train_model.py CHANGED Viewed

@@ -68,16 +68,29 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
         # Log some examples to check dataset structure
         logging.info(f"Example data from the dataset: {dataset[:5]}")
         def tokenize_function(examples):
             try:
                 # Tokenize with truncation and padding
                 tokens = tokenizer(
                     examples['text'],
                     truncation=True,
                     max_length=sequence_length,
-                    padding='max_length',  # Force padding to max length for debugging
-                    return_tensors=None  # Let the collator handle tensor conversion
                 )
                 # Log the tokens for debugging
                 logging.info(f"Tokenized example: {tokens}")
@@ -87,7 +100,7 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
                 logging.error(f"Problematic example: {examples}")
                 raise e
-        # Tokenize the dataset
         tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
         logging.info("Dataset tokenization complete.")
         return tokenized_datasets
@@ -215,7 +228,7 @@ def main():
     if args.task == "generation":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif args.task == "classification":
-        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # Handle padding dynamically during batching
     else:
         logging.error("Unsupported task type for data collator.")
         raise ValueError("Unsupported task type for data collator.")

         # Log some examples to check dataset structure
         logging.info(f"Example data from the dataset: {dataset[:5]}")
+        def clean_text(text):
+            # Ensure each text is a string
+            if isinstance(text, list):
+                return " ".join([str(t) for t in text])
+            return str(text)
         def tokenize_function(examples):
             try:
+                # Clean text to ensure correct format
+                examples['text'] = [clean_text(text) for text in examples['text']]
+                # Log the type and structure of text to debug
+                logging.info(f"Type of examples['text']: {type(examples['text'])}")
+                logging.info(f"First example type: {type(examples['text'][0])}")
                 # Tokenize with truncation and padding
                 tokens = tokenizer(
                     examples['text'],
                     truncation=True,
                     max_length=sequence_length,
+                    padding=False,  # Defer padding to data collator
+                    return_tensors=None  # Let the data collator handle tensor creation
                 )
                 # Log the tokens for debugging
                 logging.info(f"Tokenized example: {tokens}")
                 logging.error(f"Problematic example: {examples}")
                 raise e
+        # Tokenize the dataset using the modified tokenize_function
         tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
         logging.info("Dataset tokenization complete.")
         return tokenized_datasets
     if args.task == "generation":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif args.task == "classification":
+        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')  # Handle padding dynamically during batching
     else:
         logging.error("Unsupported task type for data collator.")
         raise ValueError("Unsupported task type for data collator.")