Spaces:

George-API
/

phi4training

Sleeping

App Files Community

George-API committed on Mar 10

Commit

b033a7b

verified ·

1 Parent(s): a843fab

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

run_transformers_training.py +46 -107

run_transformers_training.py CHANGED Viewed

@@ -494,144 +494,84 @@ class SimpleDataCollator:
         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
-        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
-        logger.info("Using exact dataset structure without reformatting")
-        # Check if we're on GPU
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"SimpleDataCollator using device: {self.device}")
     def __call__(self, features):
-        """Process examples preserving exact JSONL structure"""
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
         for example in features:
             try:
                 # Get ID for logging
-                paper_id = example.get("article_id", example.get("id", "unknown"))
-                # Safely get conversations with explicit None check
-                raw_conversations = example.get("conversations")
-                if raw_conversations is None:
-                    logger.warning(f"Conversations is None for example {paper_id}")
                     self.stats["skipped"] += 1
                     continue
-                # Ensure conversations is a list
-                if not isinstance(raw_conversations, list):
-                    logger.warning(f"Conversations is not a list for example {paper_id} (type: {type(raw_conversations)})")
                     self.stats["skipped"] += 1
                     continue
-                # Check for empty conversations list
-                if not raw_conversations:
-                    logger.warning(f"Empty conversations list for example {paper_id}")
                     self.stats["skipped"] += 1
                     continue
-                # Extract only the 'content' field from each conversation item
                 try:
-                    # Convert conversations to the simple format with only content
-                    simplified_conversations = []
-                    for item in raw_conversations:
-                        # Skip None items
-                        if item is None:
-                            logger.warning(f"Skipping None conversation item in example {paper_id}")
-                            continue
-                        if isinstance(item, dict):
-                            # Get content with explicit None check
-                            content = item.get("content")
-                            if content is not None:
-                                simplified_conversations.append({"role": "user", "content": content})
-                            else:
-                                logger.warning(f"Skipping conversation item with None content in example {paper_id}")
-                        elif isinstance(item, str):
-                            # If it's just a string, treat it as content
-                            simplified_conversations.append({"role": "user", "content": item})
-                        else:
-                            logger.warning(f"Skipping invalid conversation item type: {type(item)} in example {paper_id}")
-                    # Skip if no valid conversations after filtering
-                    if not simplified_conversations:
-                        logger.warning(f"No valid conversations after filtering for example {paper_id}")
-                        self.stats["skipped"] += 1
-                        continue
-                    # Log the simplified content for debugging
-                    if len(simplified_conversations) > 0:
-                        first_content = simplified_conversations[0].get("content", "")
-                        if first_content:
-                            logger.debug(f"First content: {first_content[:50]}...")
-                    # Let tokenizer handle the simplified conversations
-                    try:
-                        inputs = self.tokenizer.apply_chat_template(
-                            simplified_conversations,
-                            return_tensors=None,
-                            add_generation_prompt=False
-                        )
-                    except Exception as chat_error:
-                        # Fallback if apply_chat_template fails
-                        logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)}")
-                        # Create a basic representation of just the content
-                        conversation_text = ""
-                        for msg in simplified_conversations:
-                            if isinstance(msg, dict) and msg.get("content"):
-                                conversation_text += msg["content"] + "\n\n"
-                        if not conversation_text:
-                            logger.warning(f"No valid content to tokenize in example {paper_id}")
-                            self.stats["skipped"] += 1
-                            continue
-                        # Basic tokenization
-                        inputs = self.tokenizer(
-                            conversation_text,
-                            add_special_tokens=True,
-                            return_tensors=None
-                        )
-                    # Apply length cap if needed
-                    if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
-                        logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
-                        inputs = inputs[:self.max_seq_length]
-                    # Create attention mask (1 for all tokens)
-                    attention_mask = [1] * len(inputs)
-                    if len(inputs) > 0:
-                        # For causal language modeling, labels are the same as inputs
-                        labels = inputs.copy()
-                        batch["input_ids"].append(inputs)
                         batch["attention_mask"].append(attention_mask)
-                        batch["labels"].append(labels)
                         self.stats["processed"] += 1
-                        self.stats["total_tokens"] += len(inputs)
                     else:
-                        logger.warning(f"Empty inputs after tokenization for example {paper_id}")
                         self.stats["skipped"] += 1
                 except Exception as e:
-                    logger.warning(f"Error processing conversations in example {paper_id}: {str(e)}")
                     self.stats["skipped"] += 1
                     continue
             except Exception as e:
-                logger.warning(f"Error processing example: {str(e)[:100]}...")
-                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
                 self.stats["skipped"] += 1
                 continue
         if not batch["input_ids"]:
             logger.warning("Empty batch, returning dummy tensors")
             return {
-                "input_ids": torch.zeros((1, 1), dtype=torch.long),
-                "attention_mask": torch.zeros((1, 1), dtype=torch.long),
-                "labels": torch.zeros((1, 1), dtype=torch.long)
             }
         # Pad the batch
@@ -642,17 +582,16 @@ class SimpleDataCollator:
             if padding_length > 0:
                 batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
                 batch["attention_mask"][i].extend([0] * padding_length)
-                batch["labels"][i].extend([-100] * padding_length)
         # Convert to tensors
-        batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
         # Log stats periodically
-        log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
-        if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
-            logger.info(f"Data collator stats: processed={self.stats['processed']}, "
                        f"skipped={self.stats['skipped']}, "
-                       f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
         return batch

         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+        logger.info(f"SimpleDataCollator initialized with max_seq_length={self.max_seq_length}")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def __call__(self, features):
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
         for example in features:
             try:
                 # Get ID for logging
+                paper_id = example.get("article_id", "unknown")
+                # Get conversations - we expect a list with a single dict containing 'content'
+                conversations = example.get("conversations", [])
+                # Skip if conversations is None or empty
+                if not conversations:
+                    logger.warning(f"Empty conversations for paper_id {paper_id}")
                     self.stats["skipped"] += 1
                     continue
+                # Get the first (and should be only) conversation item
+                conv_item = conversations[0] if conversations else None
+                # Skip if no valid conversation item
+                if not isinstance(conv_item, dict):
+                    logger.warning(f"Invalid conversation format for paper_id {paper_id}")
                     self.stats["skipped"] += 1
                     continue
+                # Get the content directly
+                content = conv_item.get("content", "")
+                # Skip if no content
+                if not content:
+                    logger.warning(f"Empty content for paper_id {paper_id}")
                     self.stats["skipped"] += 1
                     continue
+                # Tokenize the content directly
                 try:
+                    inputs = self.tokenizer(
+                        content,
+                        add_special_tokens=True,
+                        return_tensors=None,
+                        truncation=True,
+                        max_length=self.max_seq_length
+                    )
+                    input_ids = inputs["input_ids"]
+                    attention_mask = inputs["attention_mask"]
+                    if len(input_ids) > 0:
+                        batch["input_ids"].append(input_ids)
                         batch["attention_mask"].append(attention_mask)
+                        batch["labels"].append(input_ids.copy())  # For causal LM, labels = input_ids
                         self.stats["processed"] += 1
+                        self.stats["total_tokens"] += len(input_ids)
                     else:
+                        logger.warning(f"Empty tokenization output for paper_id {paper_id}")
                         self.stats["skipped"] += 1
                 except Exception as e:
+                    logger.warning(f"Tokenization failed for paper_id {paper_id}: {str(e)}")
                     self.stats["skipped"] += 1
                     continue
             except Exception as e:
+                logger.warning(f"Error processing example: {str(e)}")
                 self.stats["skipped"] += 1
                 continue
         if not batch["input_ids"]:
             logger.warning("Empty batch, returning dummy tensors")
             return {
+                "input_ids": torch.zeros((1, 1), dtype=torch.long, device=self.device),
+                "attention_mask": torch.zeros((1, 1), dtype=torch.long, device=self.device),
+                "labels": torch.zeros((1, 1), dtype=torch.long, device=self.device)
             }
         # Pad the batch
             if padding_length > 0:
                 batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
                 batch["attention_mask"][i].extend([0] * padding_length)
+                batch["labels"][i].extend([-100] * padding_length)  # -100 is the ignore index for loss
         # Convert to tensors
+        batch = {k: torch.tensor(v, dtype=torch.long, device=self.device) for k, v in batch.items()}
         # Log stats periodically
+        if self.stats["processed"] % 100 == 0:
+            logger.info(f"Collator stats: processed={self.stats['processed']}, "
                        f"skipped={self.stats['skipped']}, "
+                       f"avg_tokens={self.stats['total_tokens']/max(1, self.stats['processed']):.1f}")
         return batch