Spaces:

George-API
/

phi4training

Sleeping

App Files Files Community

George-API committed on Mar 10

Commit

dc055e5

verified ·

1 Parent(s): 5f730a4

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

run_transformers_training.py +127 -240
transformers_config.json +5 -2

run_transformers_training.py CHANGED Viewed

@@ -337,10 +337,29 @@ def load_dataset_with_mapping(dataset_config):
             if len(dataset) == 0:
                 raise ValueError(f"Dataset {dataset_name} (split {dataset_split}) is empty (contains 0 examples)")
-            # Verify conversations field specifically - this is critical for training
             if "conversations" not in dataset.column_names:
                 raise ValueError(f"Dataset {dataset_name} missing required 'conversations' column")
             # Check a sample of conversation entries to validate structure
             logger.info("Validating conversation structure...")
             for i in range(min(5, len(dataset))):
@@ -354,9 +373,6 @@ def load_dataset_with_mapping(dataset_config):
                 else:
                     # Look at the first conversation entry
                     first_entry = conv[0]
-                    logger.info(f"Sample conversation: {str(first_entry)[:100]}...")
-                    # Make sure content field exists
                     if isinstance(first_entry, dict) and "content" in first_entry:
                         logger.info(f"Content field example: {str(first_entry['content'])[:50]}...")
                     else:
@@ -368,71 +384,6 @@ def load_dataset_with_mapping(dataset_config):
             logger.error("This could be due to authentication issues with your HF_TOKEN")
             raise
-        # Apply minimal processing since the dataset has already been properly structured
-        # Just perform validation to ensure required fields exist
-        # Check for required fields
-        required_fields = ["prompt_number", "article_id", "conversations"]
-        missing_fields = [field for field in required_fields if field not in dataset.column_names]
-        if missing_fields:
-            logger.warning(f"Dataset is missing required fields: {missing_fields}")
-            logger.warning("This may cause issues with sequence integrity and metadata management")
-        else:
-            logger.info(f"Dataset has all required fields: {required_fields}")
-        # Verify that column order matches our expectation
-        expected_order = ["prompt_number", "article_id", "conversations"]
-        actual_order = dataset.column_names
-        if actual_order == expected_order:
-            logger.info("Dataset column order matches expected order (prompt_number, article_id, conversations)")
-        else:
-            logger.warning(f"Dataset column order ({', '.join(actual_order)}) differs from expected order ({', '.join(expected_order)})")
-            logger.warning("This should not affect processing but is noted for debugging purposes")
-        # Log a few samples for verification
-        if len(dataset) > 0:
-            sample_indices = range(min(5, len(dataset)))
-            sample_records = []
-            for i in sample_indices:
-                record = {}
-                record["prompt_number"] = dataset[i].get("prompt_number", "N/A")
-                record["article_id"] = dataset[i].get("article_id", "N/A")
-                # Safely get conversations length with None check
-                conversations = dataset[i].get("conversations")
-                if conversations is not None and isinstance(conversations, list):
-                    record["conversations_length"] = len(conversations)
-                else:
-                    record["conversations_length"] = 0
-                    logger.warning(f"Invalid conversations for sample {i}: {type(conversations)}")
-                sample_records.append(record)
-            logger.info(f"Sample records: {sample_records}")
-        # Verify sequential integrity
-        if "prompt_number" in dataset.column_names and len(dataset) > 1:
-            first_prompt_numbers = [dataset[i]["prompt_number"] for i in range(min(10, len(dataset)))]
-            is_sequential = all(first_prompt_numbers[i] == i + 1 for i in range(len(first_prompt_numbers)))
-            if is_sequential:
-                logger.info("Dataset prompt numbers are sequential (1-indexed) - sequence integrity preserved")
-            else:
-                logger.warning("Dataset prompt numbers are not sequential - sequence integrity may be compromised")
-                logger.info(f"First few prompt numbers: {first_prompt_numbers}")
-        logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
-        logger.info(f"Dataset columns: {dataset.column_names}")
-        # Data loading configuration - ensure shuffle is disabled
-        data_loading_config = dataset_config.get("data_loading", {})
-        if data_loading_config.get("shuffle", False):
-            logger.error("CRITICAL: shuffle is enabled in the dataset config!")
-            logger.error("This will RANDOMIZE your dataset and break sequential order.")
-            logger.error("Setting shuffle to False to preserve order")
-            data_loading_config["shuffle"] = False
         return dataset
     except Exception as e:
@@ -447,42 +398,35 @@ def format_phi_chat(messages, dataset_config):
     roles = dataset_config.get("data_formatting", {}).get("roles", {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
-        "user": "Human: {content}\n\n",
         "assistant": "Assistant: {content}\n\n"
     })
-    # Handle research introduction metadata first
-    metadata = next((msg for msg in messages if isinstance(msg, dict) and
-                    "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
-    if metadata:
-        system_template = roles.get("system", "System: {content}\n\n")
-        formatted_chat = system_template.format(content=metadata['content'])
-        messages = [msg for msg in messages if msg != metadata]
-    # Process remaining messages
     for message in messages:
         if not isinstance(message, dict) or "content" not in message:
             logger.warning(f"Skipping invalid message format: {message}")
             continue
-        role = message.get("role", "").lower()
-        content = message.get("content", "")
-        # Format based on role
-        if role == "human" or role == "user":
-            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
-            formatted_chat += template.format(content=content)
-        elif role == "assistant" or role == "bot":
-            template = roles.get("assistant", "Assistant: {content}\n\n")
-            formatted_chat += template.format(content=content)
-        elif role == "system":
-            # For system messages, prepend them
             template = roles.get("system", "System: {content}\n\n")
             formatted_chat = template.format(content=content) + formatted_chat
         else:
-            # Default to system for unknown roles
-            logger.warning(f"Unknown role '{role}' - treating as system message")
-            template = roles.get("system", "System: {content}\n\n")
             formatted_chat += template.format(content=content)
     return formatted_chat.strip()
@@ -506,7 +450,7 @@ class SimpleDataCollator:
                 paper_id = example.get("article_id", "unknown")
                 prompt_num = example.get("prompt_number", "unknown")
-                # Get the conversations list - should be a single item
                 conversations = example.get("conversations", [])
                 # Skip if no conversations
@@ -515,27 +459,17 @@ class SimpleDataCollator:
                     self.stats["skipped"] += 1
                     continue
-                # Get the first conversation item (should be the only one)
-                conv_item = conversations[0]
-                # Skip if invalid format
-                if not isinstance(conv_item, dict) or "content" not in conv_item:
-                    logger.warning(f"Invalid conversation format for paper_id {paper_id}, prompt {prompt_num}")
                     self.stats["skipped"] += 1
                     continue
-                # Get the pre-tokenized content
-                content = conv_item["content"]
-                # Skip if empty content
-                if not content:
-                    logger.warning(f"Empty content for paper_id {paper_id}, prompt {prompt_num}")
-                    self.stats["skipped"] += 1
-                    continue
-                # Create input IDs and attention mask directly from the content
-                # The content is already pre-tokenized and properly chunked
-                input_ids = self.tokenizer.encode(content, add_special_tokens=False)
                 # Truncate if needed
                 if len(input_ids) > self.max_seq_length:
@@ -553,6 +487,11 @@ class SimpleDataCollator:
                 self.stats["processed"] += 1
                 self.stats["total_tokens"] += len(input_ids)
             except Exception as e:
                 logger.warning(f"Error processing example {paper_id}, prompt {prompt_num}: {str(e)}")
                 self.stats["skipped"] += 1
@@ -588,31 +527,30 @@ class SimpleDataCollator:
         return batch
 class LoggingCallback(TrainerCallback):
-    def __init__(self):
         super().__init__()
         self.training_started = time.time()
         self.last_log_time = time.time()
         self.last_step = 0
-        self.verify_sequence = None
-        self.sequence_samples = None
-        self.sample_indices = None
     def on_train_begin(self, args, state, control, **kwargs):
         log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
-        log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
-        # Disable sequence verification
-        self.verify_sequence = False
-        log_info("=== Training is starting ===")
         # Log important training parameters for visibility
         total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
-        total_steps = int(len(dataset) / (args.per_device_train_batch_size * NUM_GPUS * args.gradient_accumulation_steps) * args.num_train_epochs)
-        log_info(f"Training plan: {len(dataset)} examples over {args.num_train_epochs} epochs ≈ {total_steps} steps")
         log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
-        log_info(f"Learning rate: {args.learning_rate}")
-        log_info(f"Epochs: {args.num_train_epochs}")
         # Log memory information in compact format
         if CUDA_AVAILABLE:
@@ -621,85 +559,63 @@ class LoggingCallback(TrainerCallback):
                 allocated = torch.cuda.memory_allocated(i) / 1024**2
                 max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
-    def on_step_end(self, args, state, control, **kwargs):
-        # Log every 50 steps or every 5 minutes, whichever comes first
-        current_time = time.time()
-        # Sequence verification removed
-        # Log progress at regular intervals
-        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
-            if state.log_history:
-                loss = state.log_history[-1].get('loss', 'N/A')
-                # Use simple formatting for better Space log compatibility
-                log_info(f"Step {state.global_step}: Loss {loss}")
-            else:
-                log_info(f"Step {state.global_step}: No loss data available")
-            self.last_log_time = current_time
-    def on_train_end(self, args, state, control, **kwargs):
-        training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - self.training_started))
-        log_info(f"=== Training completed in {training_time} ===")
-        # Log final memory usage
-        if CUDA_AVAILABLE:
-            for i in range(NUM_GPUS):
-                max_mem = torch.cuda.max_memory_allocated(i) / 1024**3  # GB
-                log_info(f"GPU {i} max memory: {max_mem:.2f} GB")
-            # Clear GPU memory
-            torch.cuda.empty_cache()
-            log_info("GPU memory cleared")
-        log_info(f"Total steps: {state.global_step}")
-        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
 def check_dependencies():
     """Check if all required dependencies are installed and in the correct order."""
     missing_packages = []
     order_issues = []
-    # Check critical packages in the required order
-    # 1. First check for unsloth as it should be imported before transformers
-    if not unsloth_available:
-        missing_packages.append("unsloth>=2024.3")
-    # 2. Check transformers (imported at module level)
-    try:
-        import transformers
-        logger.info(f"Using transformers version {transformers.__version__}")
-    except ImportError:
-        missing_packages.append("transformers>=4.38.0")
-    # 3. Check for peft
-    if not peft_available:
-        missing_packages.append("peft>=0.9.0")
-    # 4. Check for accelerate
-    try:
-        import accelerate
-        logger.info(f"Using accelerate version {accelerate.__version__}")
-    except ImportError:
-        missing_packages.append("accelerate>=0.27.0")
-    # Check for order-specific issues
     try:
         import sys
-        modules = sys.modules.keys()
-        # Unsloth should be imported before transformers for optimal performance
         if 'transformers' in modules and 'unsloth' in modules:
-            if modules.index('transformers') < modules.index('unsloth'):
-                order_issues.append("For optimal performance, unsloth should be imported before transformers")
-    except Exception:
-        # If we can't check order, just skip this check
-        pass
-    # If critical packages are missing, exit with instructions
     if missing_packages:
         logger.error("Critical dependencies missing:")
         for pkg in missing_packages:
@@ -712,35 +628,6 @@ def check_dependencies():
     for issue in order_issues:
         logger.warning(issue)
-    # Optional packages - moved to the end
-    if find_spec("flash_attn"):
-        logger.info("flash-attn found. Flash attention will be used for faster training.")
-    else:
-        logger.warning("flash-attn not found. Training will work but may be slower.")
-        logger.warning("Attempting to install flash-attn automatically...")
-        try:
-            import subprocess
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"])
-            logger.info("Successfully installed flash-attn!")
-            # Try to import it now that it's installed
-            try:
-                import flash_attn
-                logger.info("flash-attn imported successfully after installation.")
-            except ImportError:
-                logger.warning("flash-attn installed but import failed - may require restart.")
-        except Exception as e:
-            logger.warning(f"Failed to install flash-attn: {str(e)}")
-            logger.warning("To manually install flash attention, run: pip install flash-attn --no-build-isolation")
-    # Additional optional packages that improve performance
-    if find_spec("bitsandbytes"):
-        logger.info("bitsandbytes found. Quantization will be available.")
-    else:
-        logger.warning("bitsandbytes not found. Quantization may not be available.")
-        logger.warning("To use quantization, install with: pip install bitsandbytes")
     return True
 def update_huggingface_space():
@@ -981,27 +868,28 @@ def main():
             # Set up training arguments
             log_info("Setting up training arguments")
-            # Validate FSDP config before using it
             fsdp_args = None
-            if fsdp_config is not None and is_distributed and multi_gpu_strategy == "fsdp":
-                try:
-                    # Convert FSDP config to proper format expected by TrainingArguments
-                    fsdp_args = {
-                        "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", []),
-                        "fsdp_offload_params": fsdp_config.get("fsdp_offload_params", False),
-                        "fsdp_backward_prefetch": fsdp_config.get("fsdp_backward_prefetch", "BACKWARD_PRE"),
-                        "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1e6),
-                        "fsdp_sharding_strategy": fsdp_config.get("fsdp_sharding_strategy", 1),
-                    }
-                    log_info("FSDP config validated and prepared")
-                except Exception as e:
-                    log_info(f"Error preparing FSDP config: {str(e)}, disabling FSDP")
-                    fsdp_args = None
             # Check if we're running in a Space
             is_space = bool(os.environ.get("SPACE_ID"))
-            # Create training arguments with validated FSDP config
             training_args = TrainingArguments(
                 output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
                 num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
@@ -1020,7 +908,6 @@ def main():
                 max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
                 push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
                 hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
-                # Don't set hub_token when running in a Space - it will use Space secrets automatically
                 hub_token=None if is_space else os.environ.get("HF_TOKEN", None),
                 report_to="tensorboard",
                 remove_unused_columns=False,  # Keep all columns
@@ -1031,7 +918,7 @@ def main():
                 dataloader_drop_last=False,  # Process all examples
                 dataloader_num_workers=dataloader_workers,
                 no_cuda=False if CUDA_AVAILABLE else True,  # Use CUDA if available
-                fsdp=fsdp_args,  # Use validated FSDP config
             )
             log_info("Training arguments created successfully")
@@ -1049,9 +936,9 @@ def main():
             trainer = Trainer(
                 model=model,
                 args=training_args,
-                train_dataset=dataset,  # We'll override this with our custom dataloader
                 data_collator=data_collator,
-                callbacks=[LoggingCallback()],
             )
             # Then override the get_train_dataloader method
@@ -1153,7 +1040,7 @@ def main():
                     log_info("Cleared CUDA cache before training")
                 # Display compact training info
-                total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
                 log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
                 trainer.train()

             if len(dataset) == 0:
                 raise ValueError(f"Dataset {dataset_name} (split {dataset_split}) is empty (contains 0 examples)")
+            # Verify conversations field specifically
             if "conversations" not in dataset.column_names:
                 raise ValueError(f"Dataset {dataset_name} missing required 'conversations' column")
+            # Validate conversation structure
+            if len(dataset) > 0:
+                sample = dataset[0]
+                conversations = sample.get("conversations", [])
+                if conversations:
+                    first_conv = conversations[0]
+                    if isinstance(first_conv, dict):
+                        # Check actual fields
+                        fields = list(first_conv.keys())
+                        logger.info(f"Conversation fields: {fields}")
+                        # Verify only 'content' field exists
+                        if fields == ["content"]:
+                            logger.info("Confirmed conversations have correct format with only 'content' field")
+                        else:
+                            logger.warning(f"Unexpected conversation fields: {fields}")
+                            logger.warning("Expected only 'content' field")
             # Check a sample of conversation entries to validate structure
             logger.info("Validating conversation structure...")
             for i in range(min(5, len(dataset))):
                 else:
                     # Look at the first conversation entry
                     first_entry = conv[0]
                     if isinstance(first_entry, dict) and "content" in first_entry:
                         logger.info(f"Content field example: {str(first_entry['content'])[:50]}...")
                     else:
             logger.error("This could be due to authentication issues with your HF_TOKEN")
             raise
         return dataset
     except Exception as e:
     roles = dataset_config.get("data_formatting", {}).get("roles", {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
         "assistant": "Assistant: {content}\n\n"
     })
+    # Handle each message in the conversation
     for message in messages:
         if not isinstance(message, dict) or "content" not in message:
             logger.warning(f"Skipping invalid message format: {message}")
             continue
+        content = message.get("content", "").strip()
+        # Skip empty content
+        if not content:
+            continue
+        # Infer role based on content patterns
+        if "[RESEARCH INTRODUCTION]" in content:
+            # System message
             template = roles.get("system", "System: {content}\n\n")
             formatted_chat = template.format(content=content) + formatted_chat
         else:
+            # Alternate between human and assistant for regular conversation turns
+            # In phi-4 format, human messages come first, followed by assistant responses
+            if len(formatted_chat.split("Human:")) == len(formatted_chat.split("Assistant:")):
+                # If equal numbers of Human and Assistant messages, next is Human
+                template = roles.get("human", "Human: {content}\n\n")
+            else:
+                # Otherwise, next is Assistant
+                template = roles.get("assistant", "Assistant: {content}\n\n")
             formatted_chat += template.format(content=content)
     return formatted_chat.strip()
                 paper_id = example.get("article_id", "unknown")
                 prompt_num = example.get("prompt_number", "unknown")
+                # Get the conversations list
                 conversations = example.get("conversations", [])
                 # Skip if no conversations
                     self.stats["skipped"] += 1
                     continue
+                # Format the conversation using phi chat template
+                formatted_chat = format_phi_chat(conversations, self.dataset_config)
+                # Skip if formatting resulted in empty content
+                if not formatted_chat:
+                    logger.warning(f"Empty formatted chat for paper_id {paper_id}, prompt {prompt_num}")
                     self.stats["skipped"] += 1
                     continue
+                # Create input IDs and attention mask
+                input_ids = self.tokenizer.encode(formatted_chat, add_special_tokens=False)
                 # Truncate if needed
                 if len(input_ids) > self.max_seq_length:
                 self.stats["processed"] += 1
                 self.stats["total_tokens"] += len(input_ids)
+                # Log first few examples for verification
+                if self.stats["processed"] <= 3:
+                    logger.info(f"Sample {self.stats['processed']} formatted chat:")
+                    logger.info(f"{formatted_chat[:200]}...")
             except Exception as e:
                 logger.warning(f"Error processing example {paper_id}, prompt {prompt_num}: {str(e)}")
                 self.stats["skipped"] += 1
         return batch
 class LoggingCallback(TrainerCallback):
+    def __init__(self, model=None, dataset=None):
         super().__init__()
         self.training_started = time.time()
         self.last_log_time = time.time()
         self.last_step = 0
+        self.model = model
+        self.dataset = dataset
     def on_train_begin(self, args, state, control, **kwargs):
         log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
+        # Log model info if available
+        if self.model is not None:
+            log_info(f"Model parameters: {sum(p.numel() for p in self.model.parameters())/1e6:.2f}M")
+        # Log dataset info if available
+        if self.dataset is not None:
+            log_info(f"Dataset size: {len(self.dataset)} examples")
         # Log important training parameters for visibility
         total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
+        total_steps = int(len(self.dataset or []) / (args.per_device_train_batch_size * NUM_GPUS * args.gradient_accumulation_steps) * args.num_train_epochs)
+        log_info(f"Training plan: {len(self.dataset or [])} examples over {args.num_train_epochs} epochs ≈ {total_steps} steps")
         log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
         # Log memory information in compact format
         if CUDA_AVAILABLE:
                 allocated = torch.cuda.memory_allocated(i) / 1024**2
                 max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
 def check_dependencies():
     """Check if all required dependencies are installed and in the correct order."""
     missing_packages = []
     order_issues = []
+    # Define required packages with versions
+    required_packages = {
+        "unsloth": ">=2024.3",
+        "transformers": ">=4.38.0",
+        "peft": ">=0.9.0",
+        "accelerate": ">=0.27.0"
+    }
+    # Check for required packages
+    for package, version in required_packages.items():
+        try:
+            if package == "unsloth" and not unsloth_available:
+                missing_packages.append(f"{package}{version}")
+            elif package == "peft" and not peft_available:
+                missing_packages.append(f"{package}{version}")
+            else:
+                module = __import__(package)
+                logger.info(f"Using {package} version {getattr(module, '__version__', 'unknown')}")
+        except ImportError:
+            missing_packages.append(f"{package}{version}")
+    # Check import order
     try:
         import sys
+        modules = list(sys.modules.keys())
         if 'transformers' in modules and 'unsloth' in modules:
+            try:
+                transformers_idx = modules.index('transformers')
+                unsloth_idx = modules.index('unsloth')
+                if transformers_idx < unsloth_idx:
+                    order_issues.append("For optimal performance, unsloth should be imported before transformers")
+            except ValueError:
+                pass
+    except Exception as e:
+        logger.warning(f"Could not check module import order: {str(e)}")
+    # Check optional dependencies
+    optional_packages = {
+        "flash_attn": "Flash attention support",
+        "bitsandbytes": "4-bit quantization support"
+    }
+    for package, feature in optional_packages.items():
+        if find_spec(package):
+            logger.info(f"Found {package} - {feature} enabled")
+        else:
+            logger.warning(f"{package} not found - {feature} will not be available")
+    # Report missing required packages
     if missing_packages:
         logger.error("Critical dependencies missing:")
         for pkg in missing_packages:
     for issue in order_issues:
         logger.warning(issue)
     return True
 def update_huggingface_space():
             # Set up training arguments
             log_info("Setting up training arguments")
+            # Handle FSDP configuration
+            fsdp_config = transformers_config.get("distributed_training", {}).get("fsdp_config", {})
+            fsdp_enabled = fsdp_config.get("enabled", False)
+            # Only set FSDP args if explicitly enabled
             fsdp_args = None
+            if fsdp_enabled and is_distributed and NUM_GPUS > 1:
+                fsdp_args = {
+                    "fsdp": ["full_shard", "auto_wrap"],
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                    "fsdp_offload_params": fsdp_config.get("offload_params", False),
+                    "fsdp_state_dict_type": "FULL_STATE_DICT",
+                    "fsdp_sharding_strategy": 1,  # FULL_SHARD
+                }
+                log_info("FSDP configuration enabled")
+            else:
+                log_info("FSDP disabled, using standard data parallel")
             # Check if we're running in a Space
             is_space = bool(os.environ.get("SPACE_ID"))
+            # Create training arguments
             training_args = TrainingArguments(
                 output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
                 num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
                 max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
                 push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
                 hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
                 hub_token=None if is_space else os.environ.get("HF_TOKEN", None),
                 report_to="tensorboard",
                 remove_unused_columns=False,  # Keep all columns
                 dataloader_drop_last=False,  # Process all examples
                 dataloader_num_workers=dataloader_workers,
                 no_cuda=False if CUDA_AVAILABLE else True,  # Use CUDA if available
+                **({} if fsdp_args is None else fsdp_args)  # Only include FSDP args if configured
             )
             log_info("Training arguments created successfully")
             trainer = Trainer(
                 model=model,
                 args=training_args,
+                train_dataset=dataset,
                 data_collator=data_collator,
+                callbacks=[LoggingCallback(model=model, dataset=dataset)],
             )
             # Then override the get_train_dataloader method
                     log_info("Cleared CUDA cache before training")
                 # Display compact training info
+                total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
                 log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
                 trainer.train()

transformers_config.json CHANGED Viewed

@@ -136,11 +136,14 @@
     },
     "data_formatting": {
       "chat_template": "phi",
       "roles": {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
-        "assistant": "Assistant: {content}\n\n",
-        "user": "Human: {content}\n\n"
       }
     },
     "data_loading": {

     },
     "data_formatting": {
       "chat_template": "phi",
+      "conversation_structure": {
+        "system_identifier": "[RESEARCH INTRODUCTION]",
+        "turn_order": ["human", "assistant"]
+      },
       "roles": {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
+        "assistant": "Assistant: {content}\n\n"
       }
     },
     "data_loading": {