Spaces: Sleeping
Upload folder using huggingface_hub

run_transformers_training.py  CHANGED  +35 -43
@@ -494,7 +494,7 @@ class SimpleDataCollator:
         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
-        logger.info(f"SimpleDataCollator initialized with max_seq_length={self.max_seq_length}")
+        logger.info(f"SimpleDataCollator initialized - using pre-tokenized chunks with max_seq_length={self.max_seq_length}")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"

     def __call__(self, features):
@@ -504,65 +504,57 @@ class SimpleDataCollator:
             try:
                 # Get ID for logging
                 paper_id = example.get("article_id", "unknown")
+                prompt_num = example.get("prompt_number", "unknown")

-                # Get conversations
+                # Get the conversations list - should be a single item
                 conversations = example.get("conversations", [])

-                # Skip if conversations
+                # Skip if no conversations
                 if not conversations:
-                    logger.warning(f"Empty conversations for paper_id {paper_id}")
+                    logger.warning(f"Empty conversations for paper_id {paper_id}, prompt {prompt_num}")
                     self.stats["skipped"] += 1
                     continue

-                # Get the first conversation item
-                conv_item = conversations[0]
+                # Get the first conversation item (should be the only one)
+                conv_item = conversations[0]

-                # Skip if invalid
-                if not isinstance(conv_item, dict):
-                    logger.warning(f"Invalid conversation format for paper_id {paper_id}")
+                # Skip if invalid format
+                if not isinstance(conv_item, dict) or "content" not in conv_item:
+                    logger.warning(f"Invalid conversation format for paper_id {paper_id}, prompt {prompt_num}")
                     self.stats["skipped"] += 1
                     continue

-                # Get the content
-                content = conv_item
+                # Get the pre-tokenized content
+                content = conv_item["content"]

-                # Skip if empty
+                # Skip if empty content
                 if not content:
-                    logger.warning(f"Empty content for paper_id {paper_id}")
+                    logger.warning(f"Empty content for paper_id {paper_id}, prompt {prompt_num}")
                     self.stats["skipped"] += 1
                     continue

-                # … (deleted tokenization block; its body is not preserved in this view)
-                    else:
-                        logger.warning(f"Empty tokenization output for paper_id {paper_id}")
-                        self.stats["skipped"] += 1
-
-                except Exception as e:
-                    logger.warning(f"Tokenization failed for paper_id {paper_id}: {str(e)}")
-                    self.stats["skipped"] += 1
-                    continue
-
+                # Create input IDs and attention mask directly from the content
+                # The content is already pre-tokenized and properly chunked
+                input_ids = self.tokenizer.encode(content, add_special_tokens=False)
+
+                # Truncate if needed
+                if len(input_ids) > self.max_seq_length:
+                    input_ids = input_ids[:self.max_seq_length]
+                    logger.warning(f"Truncated sequence for paper_id {paper_id}, prompt {prompt_num}")
+
+                # Create attention mask (1s for all tokens)
+                attention_mask = [1] * len(input_ids)
+
+                # Add to batch
+                batch["input_ids"].append(input_ids)
+                batch["attention_mask"].append(attention_mask)
+                batch["labels"].append(input_ids.copy())  # For causal LM, labels = input_ids
+
+                self.stats["processed"] += 1
+                self.stats["total_tokens"] += len(input_ids)
+
             except Exception as e:
-                logger.warning(f"Error processing example: {str(e)}")
+                logger.warning(f"Error processing example {paper_id}, prompt {prompt_num}: {str(e)}")
                 self.stats["skipped"] += 1
                 continue
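For orientation, below is a minimal, runnable sketch of the collation path the new code follows: skip malformed examples, encode the pre-chunked content, truncate to max_seq_length, and accumulate input_ids / attention_mask / labels. The toy whitespace tokenizer, the standalone collate() wrapper, and the final padding pass are illustrative assumptions, not code from this commit; the diff above only shows the per-example loop body inside SimpleDataCollator.__call__.

# Sketch of the new collation path (assumptions: the hunk above runs inside a
# loop over `features`, and padding happens after the loop - not shown in the
# commit). toy_encode() stands in for self.tokenizer.encode().

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH = 8   # stands in for self.max_seq_length
PAD_TOKEN_ID = 0     # stands in for self.pad_token_id

def toy_encode(text):
    """Stand-in for self.tokenizer.encode(content, add_special_tokens=False)."""
    return [hash(tok) % 1000 + 1 for tok in text.split()]

def collate(features):
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    stats = {"processed": 0, "skipped": 0, "total_tokens": 0}

    for example in features:
        paper_id = example.get("article_id", "unknown")
        prompt_num = example.get("prompt_number", "unknown")

        conversations = example.get("conversations", [])
        if not conversations:
            logger.warning(f"Empty conversations for paper_id {paper_id}, prompt {prompt_num}")
            stats["skipped"] += 1
            continue

        conv_item = conversations[0]
        if not isinstance(conv_item, dict) or "content" not in conv_item:
            stats["skipped"] += 1
            continue

        content = conv_item["content"]
        if not content:
            stats["skipped"] += 1
            continue

        input_ids = toy_encode(content)[:MAX_SEQ_LENGTH]  # encode + truncate
        batch["input_ids"].append(input_ids)
        batch["attention_mask"].append([1] * len(input_ids))
        batch["labels"].append(input_ids.copy())  # causal LM: labels mirror inputs
        stats["processed"] += 1
        stats["total_tokens"] += len(input_ids)

    # Assumed post-processing: right-pad every sequence to the batch maximum.
    longest = max((len(ids) for ids in batch["input_ids"]), default=0)
    for ids, mask, labels in zip(batch["input_ids"], batch["attention_mask"], batch["labels"]):
        pad = longest - len(ids)
        ids.extend([PAD_TOKEN_ID] * pad)
        mask.extend([0] * pad)
        labels.extend([-100] * pad)  # -100 is ignored by the HF causal-LM loss
    return batch, stats

if __name__ == "__main__":
    demo = [
        {"article_id": "A1", "prompt_number": 1,
         "conversations": [{"content": "alpha beta gamma"}]},
        {"article_id": "A2", "prompt_number": 2, "conversations": []},  # skipped
    ]
    batch, stats = collate(demo)
    print(stats)  # {'processed': 1, 'skipped': 1, 'total_tokens': 3}

Padding labels with -100 follows the usual Hugging Face convention of masking pad positions out of the causal-LM loss; the real collator presumably pads with the pad_token_id captured in __init__, which sits outside the hunk shown here.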