Upload folder using huggingface_hub

run_transformers_training.py   CHANGED   (+113 -24)
@@ -337,6 +337,31 @@ def load_dataset_with_mapping(dataset_config):
         if len(dataset) == 0:
             raise ValueError(f"Dataset {dataset_name} (split {dataset_split}) is empty (contains 0 examples)")
 
+        # Verify conversations field specifically - this is critical for training
+        if "conversations" not in dataset.column_names:
+            raise ValueError(f"Dataset {dataset_name} missing required 'conversations' column")
+
+        # Check a sample of conversation entries to validate structure
+        logger.info("Validating conversation structure...")
+        for i in range(min(5, len(dataset))):
+            conv = dataset[i].get("conversations")
+            if conv is None:
+                logger.warning(f"Example {i} has None as 'conversations' value")
+            elif not isinstance(conv, list):
+                logger.warning(f"Example {i} has non-list 'conversations': {type(conv)}")
+            elif len(conv) == 0:
+                logger.warning(f"Example {i} has empty conversations list")
+            else:
+                # Look at the first conversation entry
+                first_entry = conv[0]
+                logger.info(f"Sample conversation: {str(first_entry)[:100]}...")
+
+                # Make sure content field exists
+                if isinstance(first_entry, dict) and "content" in first_entry:
+                    logger.info(f"Content field example: {str(first_entry['content'])[:50]}...")
+                else:
+                    logger.warning(f"Example {i} missing 'content' key in conversation")
+
     except Exception as dataset_error:
         logger.error(f"Failed to load dataset {dataset_name}: {str(dataset_error)}")
         logger.error("Make sure the dataset exists and you have proper access permissions")

@@ -478,32 +503,59 @@ class SimpleDataCollator:
         for example in features:
             try:
                 # Get ID
-                paper_id = example.get("id", "")
+                paper_id = example.get("article_id", example.get("id", ""))
 
+                # Get conversations
+                raw_conversations = example.get("conversations", [])
+                if not raw_conversations:
+                    logger.warning(f"Empty conversations for example {paper_id}")
                     self.stats["skipped"] += 1
                     continue
 
+                # Extract only the 'content' field from each conversation item
+                # This simplifies the structure and avoids potential NoneType errors
                 try:
+                    # Convert conversations to the simple format with only content
+                    simplified_conversations = []
+                    for item in raw_conversations:
+                        if isinstance(item, dict) and "content" in item:
+                            # Keep only the content field
+                            content = item["content"]
+                            simplified_conversations.append({"role": "user", "content": content})
+                        elif isinstance(item, str):
+                            # If it's just a string, treat it as content
+                            simplified_conversations.append({"role": "user", "content": item})
+                        else:
+                            logger.warning(f"Skipping invalid conversation item: {item}")
+
+                    # Skip if no valid conversations after filtering
+                    if not simplified_conversations:
+                        logger.warning(f"No valid conversations after filtering for example {paper_id}")
+                        self.stats["skipped"] += 1
+                        continue
+
+                    # Log the simplified content for debugging
+                    if len(simplified_conversations) > 0:
+                        first_content = simplified_conversations[0]["content"]
+                        logger.debug(f"First content: {first_content[:50]}...")
+
+                    # Let tokenizer handle the simplified conversations
                     inputs = self.tokenizer.apply_chat_template(
+                        simplified_conversations,
                         return_tensors=None,
                         add_generation_prompt=False
                     )
                 except Exception as chat_error:
                     # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)}")
 
+                    # Create a basic representation of just the content
                     conversation_text = ""
+                    for msg in raw_conversations:
                         if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg['content'] + "\n\n"
+                        elif isinstance(msg, str):
+                            conversation_text += msg + "\n\n"
 
                     # Basic tokenization
                     inputs = self.tokenizer(
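
The collator delegates formatting to the tokenizer's chat template. A minimal sketch of that call in isolation; the checkpoint name is an assumption, and any tokenizer that ships a chat template behaves the same way:

from transformers import AutoTokenizer

# Illustrative checkpoint; substitute the tokenizer actually used for training.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

simplified_conversations = [{"role": "user", "content": "Explain the methodology."}]

# With return_tensors=None the template is applied and tokenized into a plain
# Python list of token ids, which the collator can pad and batch later.
token_ids = tokenizer.apply_chat_template(
    simplified_conversations,
    return_tensors=None,
    add_generation_prompt=False,
)
print(len(token_ids))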

@@ -537,7 +589,7 @@ class SimpleDataCollator:
                     logger.info(f"Example {self.stats['processed']}:")
                     logger.info(f"Paper ID: {paper_id}")
                     logger.info(f"Token count: {len(inputs)}")
+                    logger.info(f"Conversation entries: {len(raw_conversations)}")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:

@@ -1004,6 +1056,14 @@ def main():
         """Custom dataloader that preserves original dataset order"""
         log_info("Creating sequential dataloader to maintain original dataset order")
 
+        # Safety check - make sure dataset exists and is not None
+        if dataset is None:
+            raise ValueError("Dataset is None - cannot create dataloader")
+
+        # Make sure dataset is not empty
+        if len(dataset) == 0:
+            raise ValueError("Dataset is empty - cannot create dataloader")
+
         # Create a simple sequential sampler
         sequential_sampler = torch.utils.data.SequentialSampler(dataset)
 
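
SequentialSampler is what provides the ordering guarantee here; a self-contained sketch using only torch:

import torch
from torch.utils.data import DataLoader, SequentialSampler

data = list(range(10))  # stand-in for the processed dataset
loader = DataLoader(data, batch_size=4, sampler=SequentialSampler(data))

for batch in loader:
    print(batch)  # tensor([0..3]), tensor([4..7]), tensor([8, 9]) - original order preserved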

@@ -1018,10 +1078,16 @@ def main():
         # Log our approach clearly
         log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
 
-        # Verify column order
+        # Verify column order and check for 'conversations' field
         expected_order = ["prompt_number", "article_id", "conversations"]
         if hasattr(dataset, 'column_names'):
             actual_order = dataset.column_names
+
+            # Verify all required fields exist
+            missing_fields = [field for field in ["conversations"] if field not in actual_order]
+            if missing_fields:
+                raise ValueError(f"Dataset missing critical fields: {missing_fields}")
+
             if actual_order == expected_order:
                 log_info(f"Confirmed dataset columns are in expected order: {', '.join(expected_order)}")
             else:
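
If the required columns were present but merely misordered, the datasets library could reorder them; a hypothetical fix-up, not part of this commit, assuming Dataset.select_columns is available (datasets >= 2.7):

# Hypothetical recovery path: select_columns returns a new dataset whose
# columns follow the requested order.
expected_order = ["prompt_number", "article_id", "conversations"]
if set(expected_order).issubset(dataset.column_names):
    dataset = dataset.select_columns(expected_order)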

@@ -1030,6 +1096,16 @@ def main():
 
         log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
 
+        # Validate a few samples before proceeding
+        for i in range(min(3, len(dataset))):
+            sample = dataset[i]
+            if "conversations" not in sample:
+                log_info(f"WARNING: Sample {i} missing 'conversations' field")
+            elif sample["conversations"] is None:
+                log_info(f"WARNING: Sample {i} has None 'conversations' field")
+            elif not isinstance(sample["conversations"], list):
+                log_info(f"WARNING: Sample {i} has non-list 'conversations' field: {type(sample['conversations'])}")
+
         # Calculate batch size based on device availability
         if getattr(training_args, "no_cuda", False):
             batch_size = training_args.per_device_train_batch_size

@@ -1038,16 +1114,29 @@ def main():
 
         log_info(f"Using sequential sampler with batch size {batch_size}")
 
-        # Return DataLoader with sequential sampler
+        # Return DataLoader with sequential sampler and extra error handling
+        try:
+            return torch.utils.data.DataLoader(
+                dataset,
+                batch_size=batch_size,
+                sampler=sequential_sampler,  # Always use sequential sampler
+                collate_fn=data_collator,
+                drop_last=training_args.dataloader_drop_last,
+                num_workers=training_args.dataloader_num_workers,
+                pin_memory=training_args.dataloader_pin_memory,
+            )
+        except Exception as e:
+            log_info(f"Error creating DataLoader: {str(e)}")
+            # Try again with minimal settings
+            log_info("Attempting to create DataLoader with minimal settings")
+            return torch.utils.data.DataLoader(
+                dataset,
+                batch_size=1,  # Minimal batch size
+                sampler=sequential_sampler,
+                collate_fn=data_collator,
+                num_workers=0,  # No parallel workers
+                pin_memory=False,
+            )
 
     # Override the get_train_dataloader method
     trainer.get_train_dataloader = custom_get_train_dataloader
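
The assignment on the last line swaps the loader in at the instance level: Trainer calls self.get_train_dataloader() inside train(), and an instance attribute shadows the class method. A minimal sketch of the pattern, assuming trainer, its train_dataset, and its data_collator already exist:

from torch.utils.data import DataLoader, SequentialSampler

def custom_get_train_dataloader():
    # Zero-argument function: it closes over `trainer` rather than taking self.
    return DataLoader(
        trainer.train_dataset,
        batch_size=trainer.args.per_device_train_batch_size,
        sampler=SequentialSampler(trainer.train_dataset),
        collate_fn=trainer.data_collator,
    )

# The instance attribute shadows Trainer.get_train_dataloader; the next
# trainer.train() call builds batches with the sequential loader above.
trainer.get_train_dataloader = custom_get_train_dataloader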