Spaces:

George-API
/

phi4training

Sleeping

App Files Files Community

George-API committed on Mar 9

Commit

7e5a6ad

verified ·

1 Parent(s): 0364d5c

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

run_transformers_training.py +94 -133

run_transformers_training.py CHANGED Viewed

@@ -297,20 +297,27 @@ def load_dataset_with_mapping(dataset_config):
                 else:
                     logger.warning(f"Expected column '{col}' not found in dataset")
-        # Sort dataset if required
-        sort_by_id = dataset_config.get("dataset", {}).get("processing", {}).get("sort_by_id", False)
-        if sort_by_id and "id" in dataset.column_names:
-            logger.info("Sorting dataset by ID")
-            dataset = dataset.sort("id")
-            # Log the first few IDs to verify sorting
             sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
-            logger.info(f"First few IDs after sorting: {sample_ids}")
-            # Log example of conversations structure to verify format
-            if "conversations" in dataset.column_names:
-                sample_conv = dataset["conversations"][0] if len(dataset) > 0 else []
-                logger.info(f"Example conversation structure: {sample_conv}")
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
         logger.info(f"Dataset columns: {dataset.column_names}")
@@ -374,142 +381,91 @@ class SimpleDataCollator:
         self.dataset_config = dataset_config
         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
-        self.paper_counters = {}
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
-        self.include_metadata = False  # Disable automatic metadata inclusion as it's already in content
-        self.roles = dataset_config.get("data_formatting", {}).get("roles", {})
-        logger.info(f"SimpleDataCollator initialized - using phi-4 chat format with max_seq_length={self.max_seq_length}")
-        logger.info("Metadata handling disabled - using metadata from content field")
         # Check if we're on GPU
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"SimpleDataCollator using device: {self.device}")
-    def normalize_conversation(self, conversation):
-        """Normalize conversation format to ensure consistent structure."""
-        normalized = []
-        # Handle non-list or empty inputs
-        if not isinstance(conversation, list):
-            logger.warning(f"Conversation is not a list: {type(conversation)}")
-            if hasattr(conversation, 'items'):  # It's a dict-like object
-                conversation = [conversation]
-            else:
-                return []
-        # Get introductory message if present (should be first and without chunk number)
-        intro_msg = None
-        for i, turn in enumerate(conversation):
-            if isinstance(turn, dict) and turn.get('content') and "[RESEARCH INTRODUCTION]" in turn.get('content', ''):
-                intro_msg = turn
-                break
-        # Process introduction message first if found
-        if intro_msg:
-            normalized.append({
-                "role": "system",
-                "content": intro_msg.get('content', '')
-            })
-            # Remove intro from further processing
-            conversation = [t for t in conversation if t != intro_msg]
-        # Process remaining messages
-        for turn in conversation:
-            # Skip empty or None entries
-            if not turn:
-                continue
-            # Handle string entries (convert to user message)
-            if isinstance(turn, str):
-                normalized.append({"role": "user", "content": turn})
-                continue
-            # Handle dict-like entries
-            if not isinstance(turn, dict) and hasattr(turn, 'get'):
-                # Convert to dict
-                turn = {k: turn.get(k) for k in ['role', 'content'] if hasattr(turn, 'get') and turn.get(k) is not None}
-            # Ensure both role and content exist
-            if not isinstance(turn, dict) or 'role' not in turn or 'content' not in turn:
-                logger.warning(f"Skipping malformatted conversation turn: {turn}")
-                continue
-            # Normalize role field
-            role = turn.get('role', '').lower()
-            if role == 'user' or role == 'human':
-                role = 'user'
-            elif role == 'assistant' or role == 'bot':
-                role = 'assistant'
-            # Add normalized turn
-            normalized.append({
-                "role": role,
-                "content": str(turn.get('content', ''))
-            })
-        return normalized
     def __call__(self, features):
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
         for example in features:
             try:
-                # Get ID and conversation fields
                 paper_id = example.get("id", "")
-                # Handle conversation field - could be under 'conversations' or 'text'
-                conversation = example.get("conversations", example.get("text", []))
-                # Normalize conversation format
-                conversation = self.normalize_conversation(conversation)
-                if not conversation:
                     self.stats["skipped"] += 1
                     continue
-                # Format conversation with research introduction and chunk info
-                formatted_content = format_phi_chat(conversation, self.dataset_config)
-                # Tokenize with the model's chat template
-                inputs = self.tokenizer(
-                    formatted_content,
-                    add_special_tokens=True,
-                    truncation=True,
-                    max_length=self.max_seq_length,
-                    return_tensors=None,
-                    padding=False,  # Don't pad here, we'll pad the batch later
-                )
-                if len(inputs["input_ids"]) > 0:
                     # For causal language modeling, labels are the same as inputs
-                    labels = inputs["input_ids"].copy()
-                    batch["input_ids"].append(inputs["input_ids"])
-                    batch["attention_mask"].append(inputs["attention_mask"])
                     batch["labels"].append(labels)
                     self.stats["processed"] += 1
-                    self.stats["total_tokens"] += len(inputs["input_ids"])
                     # Debug logging for first few examples
                     log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
                     if self.stats["processed"] <= log_samples:
-                        logger.info(f"Example {self.stats['processed']} format:")
                         logger.info(f"Paper ID: {paper_id}")
-                        logger.info(f"Token count: {len(inputs['input_ids'])}")
-                        logger.info(f"Content preview:\n{formatted_content[:500]}...")
-                        logger.info(f"Conversation structure: {conversation[:2]}...")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
-                logger.warning(f"Problematic example: {str(example)[:200]}...")
                 self.stats["skipped"] += 1
                 continue
         if not batch["input_ids"]:
             logger.warning("Empty batch, returning dummy tensors")
-            # Return tensors on the right device
             return {
                 "input_ids": torch.zeros((1, 1), dtype=torch.long),
                 "attention_mask": torch.zeros((1, 1), dtype=torch.long),
@@ -526,7 +482,7 @@ class SimpleDataCollator:
                 batch["attention_mask"][i].extend([0] * padding_length)
                 batch["labels"][i].extend([-100] * padding_length)
-        # Convert to tensors on CPU first
         batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
         # Log stats periodically
@@ -534,8 +490,7 @@ class SimpleDataCollator:
         if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
             logger.info(f"Data collator stats: processed={self.stats['processed']}, "
                        f"skipped={self.stats['skipped']}, "
-                       f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}, "
-                       f"unique_papers={len(self.paper_counters)}")
         return batch
@@ -731,21 +686,35 @@ def main():
             no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
         )
-        # Custom dataloader to ensure no shuffling of dataset
-        # This preserves the order of chunks in papers
-        def get_train_dataloader_no_shuffle():
-            logger.info("Creating data loader with sequential sampler to maintain paper order")
             if getattr(training_args, "no_cuda", False):
                 batch_size = training_args.per_device_train_batch_size
             else:
-                batch_size = max(training_args.per_device_train_batch_size * torch.cuda.device_count(), 1)
-            # Use sequential sampler to preserve order
-            sequential_sampler = torch.utils.data.SequentialSampler(dataset["train"])
-            logger.info(f"Using sequential sampler for batch size {batch_size}")
             return torch.utils.data.DataLoader(
-                dataset["train"],
                 batch_size=batch_size,
                 sampler=sequential_sampler,
                 collate_fn=data_collator,
@@ -754,16 +723,8 @@ def main():
                 pin_memory=training_args.dataloader_pin_memory,
             )
-        # Set up trainer with custom dataloader
-        logger.info("Initializing Trainer")
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            get_train_dataloader=get_train_dataloader_no_shuffle,
-            tokenizer=tokenizer,
-            data_collator=data_collator,
-            callbacks=[LoggingCallback()]
-        )
         # Start training
         logger.info("Starting training process")

                 else:
                     logger.warning(f"Expected column '{col}' not found in dataset")
+        # Note: Explicitly NOT sorting the dataset to preserve original order
+        logger.info("Preserving original dataset order (no sorting)")
+        # Log examples without printing full content
+        if "conversations" in dataset.column_names:
             sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
+            logger.info(f"First few IDs: {sample_ids}")
+            # Log conversation structure without full content
+            if len(dataset) > 0:
+                sample_conv_structure = []
+                for msg in dataset["conversations"][0]:
+                    if isinstance(msg, dict):
+                        content = msg.get('content', '')
+                        preview = content[:50] + "..." if len(content) > 50 else content
+                        sample_conv_structure.append({
+                            "role": msg.get('role', ''),
+                            "content_length": len(content),
+                            "preview": preview
+                        })
+                logger.info(f"Conversation structure: {sample_conv_structure}")
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
         logger.info(f"Dataset columns: {dataset.column_names}")
         self.dataset_config = dataset_config
         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
+        logger.info("Using exact dataset structure without reformatting")
         # Check if we're on GPU
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"SimpleDataCollator using device: {self.device}")
     def __call__(self, features):
+        """Process examples preserving exact JSONL structure"""
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
         for example in features:
             try:
+                # Get ID
                 paper_id = example.get("id", "")
+                # Get conversations - these should already contain role and content
+                conversations = example.get("conversations", [])
+                if not conversations:
                     self.stats["skipped"] += 1
                     continue
+                # Directly use the conversations array as input to the model's chat template
+                # This preserves the exact structure with roles and content as they are
+                try:
+                    # Let tokenizer handle the content with the model's chat template
+                    inputs = self.tokenizer.apply_chat_template(
+                        conversations,
+                        return_tensors=None,
+                        add_generation_prompt=False
+                    )
+                except:
+                    # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}, using basic tokenization")
+                    # Create a basic representation of the conversation
+                    conversation_text = ""
+                    for msg in conversations:
+                        if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg.get('content', '') + "\n\n"
+                    # Basic tokenization
+                    inputs = self.tokenizer(
+                        conversation_text,
+                        add_special_tokens=True,
+                        return_tensors=None
+                    )
+                # Apply length cap if needed (shouldn't be necessary for pre-audited data)
+                if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                    logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                    inputs = inputs[:self.max_seq_length]
+                # Create attention mask (1 for all tokens)
+                attention_mask = [1] * len(inputs)
+                if len(inputs) > 0:
                     # For causal language modeling, labels are the same as inputs
+                    labels = inputs.copy()
+                    batch["input_ids"].append(inputs)
+                    batch["attention_mask"].append(attention_mask)
                     batch["labels"].append(labels)
                     self.stats["processed"] += 1
+                    self.stats["total_tokens"] += len(inputs)
                     # Debug logging for first few examples
                     log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
                     if self.stats["processed"] <= log_samples:
+                        logger.info(f"Example {self.stats['processed']}:")
                         logger.info(f"Paper ID: {paper_id}")
+                        logger.info(f"Token count: {len(inputs)}")
+                        logger.info(f"Conversation entries: {len(conversations)}")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
+                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
                 self.stats["skipped"] += 1
                 continue
         if not batch["input_ids"]:
             logger.warning("Empty batch, returning dummy tensors")
             return {
                 "input_ids": torch.zeros((1, 1), dtype=torch.long),
                 "attention_mask": torch.zeros((1, 1), dtype=torch.long),
                 batch["attention_mask"][i].extend([0] * padding_length)
                 batch["labels"][i].extend([-100] * padding_length)
+        # Convert to tensors
         batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
         # Log stats periodically
         if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
             logger.info(f"Data collator stats: processed={self.stats['processed']}, "
                        f"skipped={self.stats['skipped']}, "
+                       f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
         return batch
             no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
         )
+        # Create sequential sampler to maintain original dataset order
+        sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+        # Initialize trainer first
+        logger.info("Initializing Trainer")
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=dataset,  # We'll override this with our custom dataloader
+            data_collator=data_collator,
+            callbacks=[LoggingCallback()],
+        )
+        # Then override the get_train_dataloader method
+        def custom_get_train_dataloader():
+            """Custom dataloader that preserves original dataset order"""
+            logger.info("Creating sequential dataloader to maintain original dataset order")
+            # Calculate batch size based on device availability
             if getattr(training_args, "no_cuda", False):
                 batch_size = training_args.per_device_train_batch_size
             else:
+                batch_size = max(training_args.per_device_train_batch_size * max(1, torch.cuda.device_count()), 1)
+            logger.info(f"Using sequential sampler with batch size {batch_size}")
+            # Return DataLoader with sequential sampler
             return torch.utils.data.DataLoader(
+                dataset,
                 batch_size=batch_size,
                 sampler=sequential_sampler,
                 collate_fn=data_collator,
                 pin_memory=training_args.dataloader_pin_memory,
             )
+        # Override the get_train_dataloader method
+        trainer.get_train_dataloader = custom_get_train_dataloader
         # Start training
         logger.info("Starting training process")