Spaces:
Upload folder using huggingface_hub
run_transformers_training.py  (+240 -69)
CHANGED
@@ -285,7 +285,7 @@ def load_model_and_tokenizer(config):
         raise
 
 def load_dataset_with_mapping(dataset_config):
-    """Load and
+    """Load dataset and apply appropriate column mappings."""
     try:
         # Load dataset
         dataset_name = dataset_config.get("dataset", {}).get("name", "")
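The hunk above only rewrites the docstring, but it shows how the loader is driven by a nested config dict. A minimal sketch of the implied flow, assuming the Hugging Face `datasets` loader and an illustrative dataset id (neither the id nor the `load_dataset` call itself is shown in the diff):

# Sketch only: the dataset id, the split, and the load_dataset call are
# assumptions based on the "# Load dataset" comment above.
from datasets import load_dataset

dataset_config = {"dataset": {"name": "your-org/your-dataset"}}
dataset_name = dataset_config.get("dataset", {}).get("name", "")

dataset = load_dataset(dataset_name, split="train")
print(len(dataset), dataset.column_names)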
@@ -319,6 +319,45 @@ def load_dataset_with_mapping(dataset_config):
             if source != target:  # Only rename if names are different
                 dataset = dataset.rename_column(source, target)
 
+        # Add prompt_number field that increments based on original order
+        def add_prompt_numbers(examples, indices):
+            # Defensive check to ensure indices is not None
+            if indices is None:
+                logger.warning("Warning: indices is None in add_prompt_numbers, using empty list")
+                indices = []
+
+            # Create a new field with the dataset index as the prompt number, starting at 1
+            examples["prompt_number"] = [idx + 1 for idx in indices]  # Adding 1 to make it 1-indexed
+            return examples
+
+        # Add prompt numbers to the dataset based on original order
+        logger.info("Adding prompt numbers based on original dataset order (starting at 1)")
+        try:
+            dataset = dataset.map(
+                add_prompt_numbers,
+                with_indices=True,
+                desc="Adding prompt numbers"
+            )
+            logger.info(f"Successfully added prompt_number field to dataset")
+        except Exception as e:
+            logger.error(f"Error adding prompt numbers: {e}")
+            # Create a fallback implementation that doesn't rely on with_indices
+            logger.info("Attempting fallback method for adding prompt numbers")
+
+            def add_prompt_numbers_fallback(example, idx):
+                example["prompt_number"] = idx + 1
+                return example
+
+            # Process each example one by one with explicit indices
+            updated_examples = []
+            for i, example in enumerate(dataset):
+                updated_examples.append(add_prompt_numbers_fallback(dict(example), i))
+
+            # Create a new dataset with the updated examples
+            from datasets import Dataset
+            dataset = Dataset.from_list(updated_examples)
+            logger.info(f"Successfully added prompt_number field using fallback method")
+
         # Verify expected columns exist
         expected_columns = {"id", "conversations"}
         for col in expected_columns:
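The hunk above numbers every example by its original position using `Dataset.map(..., with_indices=True)`, with a per-example fallback if the mapped call fails. A self-contained sketch of the same idea on toy data; note that `batched=True` is what makes `indices` arrive as a list matching the plural `examples` signature, whereas in non-batched mode (as in the fallback path) `map` passes a single example and an integer index:

# Toy data; only the prompt_number logic mirrors the diff.
from datasets import Dataset

ds = Dataset.from_dict({"id": ["0", "1", "2"], "conversations": [[], [], []]})

def add_prompt_numbers(batch, indices):
    # indices is a list of original row positions when batched=True and with_indices=True
    batch["prompt_number"] = [i + 1 for i in indices]  # 1-indexed
    return batch

ds = ds.map(add_prompt_numbers, with_indices=True, batched=True, desc="Adding prompt numbers")
print(ds["prompt_number"])  # [1, 2, 3]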
@@ -369,40 +408,105 @@ def load_dataset_with_mapping(dataset_config):
 
         # Verify the IDs are in sequential order if they're numeric
         try:
+            if len(dataset) > 1:
+                # Check prompt numbers are sequential
+                sample_indices = range(min(10, len(dataset)))
+                sample_prompt_numbers = []
+
+                # Defensive collection of prompt numbers
+                for i in sample_indices:
+                    try:
+                        if i < len(dataset) and "prompt_number" in dataset[i]:
+                            sample_prompt_numbers.append(dataset[i]["prompt_number"])
+                        else:
+                            # If prompt_number doesn't exist, use index+1 as fallback
+                            sample_prompt_numbers.append(i + 1)
+                            logger.warning(f"Sample at index {i} missing prompt_number, using {i+1} as fallback")
+                    except Exception as e:
+                        logger.warning(f"Error accessing sample at index {i}: {e}")
+                        sample_prompt_numbers.append(i + 1)  # Use fallback
+
+                logger.info(f"Verifying sequential integrity with prompt numbers: {sample_prompt_numbers}")
 
+                # Check if prompt numbers are sequential (1-indexed)
+                if sample_prompt_numbers:
+                    is_sequential = all(sample_prompt_numbers[i] == i + 1 for i in range(len(sample_prompt_numbers)))
+                    if not is_sequential:
+                        logger.warning("WARNING: Prompt numbers are not in sequential order!")
                         logger.warning("This may indicate that data sequence is not preserved.")
                     else:
+                        logger.info("Prompt numbers verify that samples are in sequential order.")
+                else:
+                    logger.warning("Could not verify sequential integrity: no prompt numbers collected")
+
+                # Also check original IDs as a backup if numeric
+                try:
+                    sample_examples = []
+                    for i in sample_indices:
+                        try:
+                            if i < len(dataset):
+                                sample_examples.append(dataset[i])
+                        except Exception as e:
+                            logger.warning(f"Error accessing dataset at index {i}: {e}")
+
+                    if sample_examples:
+                        if all(isinstance(example.get('id', ''), (int, str)) for example in sample_examples):
+                            sample_ids = [example.get('id', '') for example in sample_examples if 'id' in example]
+
+                            if sample_ids and all(isinstance(id, int) or (isinstance(id, str) and id.isdigit()) for id in sample_ids):
+                                numeric_ids = [int(id) if isinstance(id, str) else id for id in sample_ids]
+                                if len(numeric_ids) > 1:
+                                    is_ordered = all(numeric_ids[i] <= numeric_ids[i+1] for i in range(len(numeric_ids)-1))
+                                    if not is_ordered:
+                                        logger.warning("WARNING: Sample IDs are not in sequential order.")
+                                    else:
+                                        logger.info("Sample IDs appear to be in sequential order.")
+                except Exception as e:
+                    logger.warning(f"Error checking ID sequence: {e}")
         except Exception as e:
             logger.warning(f"Could not verify sequential integrity: {e}")
 
+        # Log examples without printing full content - with defensive coding
         if "conversations" in dataset.column_names:
+            try:
+                # Safely get first few samples
+                first_few_indices = range(min(5, len(dataset)))
+                sample_prompt_numbers = []
+                sample_ids = []
+
+                for i in first_few_indices:
+                    try:
+                        example = dataset[i]
+                        if 'prompt_number' in example:
+                            sample_prompt_numbers.append(example['prompt_number'])
+                        if 'id' in example:
+                            sample_ids.append(example['id'])
+                    except Exception as e:
+                        logger.warning(f"Error accessing sample at index {i}: {e}")
+
+                logger.info(f"First few samples - Prompt numbers: {sample_prompt_numbers}, IDs: {sample_ids}")
 
+                # Log conversation structure without full content
+                if len(dataset) > 0:
+                    try:
+                        sample_conv_structure = []
+                        first_example = dataset[0]
+
+                        if 'conversations' in first_example and first_example['conversations'] is not None:
+                            for msg in first_example['conversations']:
+                                if isinstance(msg, dict):
+                                    content = msg.get('content', '')
+                                    preview = content[:50] + "..." if len(content) > 50 else content
+                                    sample_conv_structure.append({
+                                        "role": msg.get('role', ''),
+                                        "content_length": len(content),
+                                        "preview": preview
+                                    })
+                        logger.info(f"Conversation structure: {sample_conv_structure}")
+                    except Exception as e:
+                        logger.warning(f"Error logging conversation structure: {e}")
+            except Exception as e:
+                logger.warning(f"Error logging sample examples: {e}")
 
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
         logger.info(f"Dataset columns: {dataset.column_names}")
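The verification above boils down to checking that the collected prompt numbers are exactly 1, 2, 3, … in order. A standalone sketch of that check (the sample data is illustrative):

def is_one_indexed_sequence(prompt_numbers):
    # True only if the list is exactly 1, 2, 3, ... in order
    return all(n == i + 1 for i, n in enumerate(prompt_numbers))

print(is_one_indexed_sequence([1, 2, 3, 4]))  # True
print(is_one_indexed_sequence([1, 3, 2, 4]))  # False: order not preserved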
@@ -597,39 +701,88 @@ class LoggingCallback(TrainerCallback):
         if self.verify_sequence is True and state.global_step % 100 == 0 and self.sequence_samples:
             try:
                 # Get a batch of data without disturbing the training
-                # Compare current samples with our reference samples from training start
-                is_sequence_maintained = True
-                for i, (orig_idx, orig_sample) in enumerate(zip(self.sample_indices, self.sequence_samples)):
-                    # Check if sample IDs still match our reference
-                    if orig_idx < len(current_samples):
-                        current_sample = current_samples[i]
-
-                        # Compare IDs if available
-                        if 'id' in orig_sample and 'id' in current_sample:
-                            if orig_sample['id'] != current_sample['id']:
-                                log_info(f"WARNING: Sequence integrity compromised! Sample {i} ID changed from {orig_sample['id']} to {current_sample['id']}")
-                                is_sequence_maintained = False
-
-                        # Compare input fingerprints
-                        if 'conversations' in orig_sample and 'conversations' in current_sample:
-                            orig_len = len(orig_sample['conversations'])
-                            curr_len = len(current_sample['conversations'])
-                            if orig_len != curr_len:
-                                log_info(f"WARNING: Sequence integrity compromised! Sample {i} conversation length changed from {orig_len} to {curr_len}")
-                                is_sequence_maintained = False
-
-                if is_sequence_maintained:
-                    log_info("Data sequence integrity check: OK")
+                train_dataloader = trainer.get_train_dataloader()
+                if train_dataloader is None:
+                    log_info("Warning: Could not get train dataloader for verification")
+                else:
+                    batch_iterator = iter(train_dataloader)
+                    if batch_iterator is None:
+                        log_info("Warning: Could not get batch iterator for verification")
                     else:
+                        try:
+                            batch = next(batch_iterator)
+                            if batch is None:
+                                log_info("Warning: Could not get batch for verification")
+                            elif 'input_ids' in batch and 'labels' in batch:
+                                log_info("Verifying data sequence integrity...")
+
+                                # Check if we can access some of our reference samples
+                                if not hasattr(trainer, 'train_dataset') or trainer.train_dataset is None:
+                                    log_info("Warning: Train dataset is not available")
+                                else:
+                                    # Get current samples defensively
+                                    current_samples = []
+                                    current_indices = list(range(min(3, len(trainer.train_dataset))))
+
+                                    for idx in current_indices:
+                                        try:
+                                            if idx < len(trainer.train_dataset):
+                                                current_samples.append(trainer.train_dataset[idx])
+                                        except Exception as e:
+                                            log_info(f"Warning: Error accessing dataset at index {idx}: {e}")
+
+                                    # Only proceed if we have samples to compare
+                                    if current_samples and self.sequence_samples:
+                                        # Compare current samples with our reference samples from training start
+                                        is_sequence_maintained = True
+
+                                        for i, (orig_idx, orig_sample) in enumerate(zip(self.sample_indices, self.sequence_samples)):
+                                            # Check if sample index is valid
+                                            if i < len(current_samples):
+                                                current_sample = current_samples[i]
+
+                                                # Compare prompt numbers if available
+                                                if ('prompt_number' in orig_sample and
+                                                        'prompt_number' in current_sample and
+                                                        orig_sample['prompt_number'] is not None and
+                                                        current_sample['prompt_number'] is not None):
+
+                                                    if orig_sample['prompt_number'] != current_sample['prompt_number']:
+                                                        log_info(f"WARNING: Sequence integrity compromised! Sample {i} prompt number changed from {orig_sample['prompt_number']} to {current_sample['prompt_number']}")
+                                                        is_sequence_maintained = False
+
+                                                # Also compare IDs as a backup check
+                                                elif ('id' in orig_sample and
+                                                        'id' in current_sample and
+                                                        orig_sample['id'] is not None and
+                                                        current_sample['id'] is not None):
+
+                                                    if orig_sample['id'] != current_sample['id']:
+                                                        log_info(f"WARNING: Sequence integrity compromised! Sample {i} ID changed from {orig_sample['id']} to {current_sample['id']}")
+                                                        is_sequence_maintained = False
+
+                                                # Compare input fingerprints
+                                                if ('conversations' in orig_sample and
+                                                        'conversations' in current_sample and
+                                                        orig_sample['conversations'] is not None and
+                                                        current_sample['conversations'] is not None):
+
+                                                    orig_len = len(orig_sample['conversations'])
+                                                    curr_len = len(current_sample['conversations'])
+                                                    if orig_len != curr_len:
+                                                        log_info(f"WARNING: Sequence integrity compromised! Sample {i} conversation length changed from {orig_len} to {curr_len}")
+                                                        is_sequence_maintained = False
+
+                                        if is_sequence_maintained:
+                                            log_info("Data sequence integrity check: OK")
+                                        else:
+                                            log_info("CRITICAL WARNING: Data sequence integrity check FAILED!")
+                                    else:
+                                        log_info("Warning: Not enough samples available for sequence verification")
+                        except StopIteration:
+                            log_info("Warning: No batches available in the dataloader")
+                        except Exception as e:
+                            log_info(f"Warning: Error iterating through dataloader: {e}")
             except Exception as e:
                 log_info(f"Warning: Couldn't verify sequence integrity: {e}")
 
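The periodic check above compares reference samples captured at training start against the current head of the training dataset. A simplified sketch of that comparison outside the Trainer machinery; the field names follow the diff, while the sample dicts are illustrative:

def check_sequence_integrity(reference_samples, current_samples):
    ok = True
    for i, (ref, cur) in enumerate(zip(reference_samples, current_samples)):
        if ref.get("prompt_number") != cur.get("prompt_number"):
            print(f"Sample {i}: prompt_number changed {ref.get('prompt_number')} -> {cur.get('prompt_number')}")
            ok = False
        elif ref.get("id") != cur.get("id"):
            print(f"Sample {i}: id changed {ref.get('id')} -> {cur.get('id')}")
            ok = False
    return ok

reference = [{"prompt_number": 1, "id": "0"}, {"prompt_number": 2, "id": "1"}]
current = [{"prompt_number": 1, "id": "0"}, {"prompt_number": 5, "id": "7"}]
print(check_sequence_integrity(reference, current))  # False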
@@ -666,16 +819,33 @@ class LoggingCallback(TrainerCallback):
             log_info("Sequence integrity verification enabled during training")
 
             # Save actual samples for later verification
-            if trainer and trainer.train_dataset:
-                # Get some reference samples from the beginning of the dataset
+            if trainer and hasattr(trainer, 'train_dataset') and trainer.train_dataset is not None:
+                # Get some reference samples from the beginning of the dataset defensively
+                self.sample_indices = []
+                self.sequence_samples = []
+
+                max_samples = min(5, len(trainer.train_dataset))
+                for i in range(max_samples):
+                    try:
+                        if i < len(trainer.train_dataset):
+                            self.sample_indices.append(i)
+                            self.sequence_samples.append(trainer.train_dataset[i])
+                    except Exception as e:
+                        log_info(f"Warning: Error capturing reference sample at index {i}: {e}")
 
+                if self.sequence_samples:
+                    log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
+
+                    # Log sample prompt numbers for debugging
+                    sample_prompt_numbers = []
+                    for s in self.sequence_samples:
+                        if isinstance(s, dict) and 'prompt_number' in s and s['prompt_number'] is not None:
+                            sample_prompt_numbers.append(s.get('prompt_number'))
+
+                    if sample_prompt_numbers:
+                        log_info(f"Reference sample prompt numbers: {sample_prompt_numbers}")
+                else:
+                    log_info("Warning: No reference samples were captured")
             else:
                 log_info("Warning: Could not capture reference samples - verification will be limited")
         except Exception as e:
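The hunk above snapshots a handful of reference samples when verification is enabled. A minimal sketch of the same pattern as a standalone TrainerCallback; here the dataset is handed to the callback at construction, since the diff does not show how the real script exposes the trainer inside the callback:

from transformers import TrainerCallback

class SequenceIntegrityCallback(TrainerCallback):
    def __init__(self, train_dataset, num_reference_samples=5):
        self.train_dataset = train_dataset
        self.sample_indices = []
        self.sequence_samples = []
        self.num_reference_samples = num_reference_samples

    def on_train_begin(self, args, state, control, **kwargs):
        # Snapshot the first few samples as a fixed reference for later checks.
        for i in range(min(self.num_reference_samples, len(self.train_dataset))):
            self.sample_indices.append(i)
            self.sequence_samples.append(self.train_dataset[i])
        return control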
@@ -685,7 +855,8 @@ class LoggingCallback(TrainerCallback):
         log_info("=== Training is starting ===")
 
         # Log important training parameters for visibility
+        total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
         log_info(f"Learning rate: {args.learning_rate}")
         log_info(f"Epochs: {args.num_train_epochs}")
 
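A worked example of the effective batch size logged above, with assumed values (per-device batch size 4, 8 accumulation steps, 2 GPUs):

# Assumed values for illustration only.
per_device_train_batch_size = 4
gradient_accumulation_steps = 8
NUM_GPUS = 2

total_batch_size = per_device_train_batch_size * gradient_accumulation_steps * NUM_GPUS
print(total_batch_size)  # 4 × 8 × 2 = 64 examples per optimizer step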