Upload folder using huggingface_hub
run_transformers_training.py  CHANGED  (+83 -93)
@@ -11,13 +11,25 @@ from datetime import datetime
 import time
 import warnings
 from importlib.util import find_spec
+import multiprocessing

 # Check hardware capabilities first
-
-CUDA_AVAILABLE = torch.cuda.is_available()
+CUDA_AVAILABLE = "CUDA_VISIBLE_DEVICES" in os.environ or os.environ.get("NVIDIA_VISIBLE_DEVICES") != ""
 NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
 DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"

+# Set the multiprocessing start method to 'spawn' for CUDA compatibility
+if CUDA_AVAILABLE:
+    try:
+        multiprocessing.set_start_method('spawn', force=True)
+        print("Set multiprocessing start method to 'spawn' for CUDA compatibility")
+    except RuntimeError:
+        # Method already set, which is fine
+        print("Multiprocessing start method already set")
+
+# Now import the rest of the modules
+import torch
+
 # Configure logging early
 logging.basicConfig(
     level=logging.INFO,
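Note on the start-method block added above: with the default 'fork' start method on Linux, DataLoader worker processes inherit an already-initialized CUDA context and typically fail with "Cannot re-initialize CUDA in forked subprocess"; switching to 'spawn' before any CUDA work avoids that. The following self-contained sketch is not part of the commit (the toy dataset and worker count are illustrative only) and just shows the same pattern in isolation:

import multiprocessing

import torch
from torch.utils.data import DataLoader, Dataset


class ToyDataset(Dataset):
    # Tiny in-memory dataset, used only to exercise worker processes.
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor([idx], dtype=torch.long)


def main():
    # Same idea as the training script: switch to 'spawn' before CUDA is touched,
    # so DataLoader workers start in clean processes without a forked CUDA context.
    try:
        multiprocessing.set_start_method("spawn", force=True)
    except RuntimeError:
        pass  # start method was already set elsewhere

    device = "cuda" if torch.cuda.is_available() else "cpu"
    loader = DataLoader(ToyDataset(), batch_size=4, num_workers=2)
    for batch in loader:
        print(batch.to(device))


if __name__ == "__main__":
    main()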
@@ -435,104 +447,82 @@ def format_phi_chat(messages, dataset_config)
 class SimpleDataCollator:
     def __init__(self, tokenizer, dataset_config):
         self.tokenizer = tokenizer
-        self.
-        self.stats = {
-
-
-
-
-
+        self.max_seq_length = min(dataset_config.get("max_seq_length", 2048), tokenizer.model_max_length)
+        self.stats = {
+            "processed": 0,
+            "skipped": 0,
+            "total_tokens": 0
+        }
+        logger.info(f"Initialized SimpleDataCollator with max_seq_length={self.max_seq_length}")
+
     def __call__(self, features):
-
+        # Initialize tensors on CPU to save GPU memory
+        batch = {
+            "input_ids": [],
+            "attention_mask": [],
+            "labels": []
+        }

-        for
-
-
-
-
-
-
-
-
-            # Skip if no conversations
-            if not conversations:
-                logger.warning(f"Empty conversations for paper_id {paper_id}, prompt {prompt_num}")
-                self.stats["skipped"] += 1
-                continue
-
-            # Get the pre-tokenized content directly
-            # The content should already be properly tokenized and formatted
-            content = conversations[0].get("content", "")
-            if not content:
-                logger.warning(f"Empty content for paper_id {paper_id}, prompt {prompt_num}")
-                self.stats["skipped"] += 1
-                continue
-
-            # Convert string of numbers to list of integers if needed
-            if isinstance(content, str):
-                try:
-                    # Assuming content is space-separated numbers
-                    input_ids = [int(x) for x in content.split()]
-                except ValueError:
-                    logger.warning(f"Invalid pre-tokenized content format for paper_id {paper_id}, prompt {prompt_num}")
-                    self.stats["skipped"] += 1
-                    continue
-            else:
-                input_ids = content
-
-            # Truncate if needed
-            if len(input_ids) > self.max_seq_length:
-                input_ids = input_ids[:self.max_seq_length]
-                logger.warning(f"Truncated sequence for paper_id {paper_id}, prompt {prompt_num}")
-
-            # Create attention mask (1s for all tokens)
-            attention_mask = [1] * len(input_ids)
-
-            # Add to batch
-            batch["input_ids"].append(input_ids)
-            batch["attention_mask"].append(attention_mask)
-            batch["labels"].append(input_ids.copy())  # For causal LM, labels = input_ids
-
-            self.stats["processed"] += 1
-            self.stats["total_tokens"] += len(input_ids)
-
-            # Log first few examples for verification
-            if self.stats["processed"] <= 3:
-                logger.info(f"Sample {self.stats['processed']} token count: {len(input_ids)}")
+        for feature in features:
+            paper_id = feature.get("article_id", "unknown")
+            prompt_num = feature.get("prompt_number", 0)
+            conversations = feature.get("conversations", [])
+
+            if not conversations:
+                logger.warning(f"No conversations for paper_id {paper_id}, prompt {prompt_num}")
+                self.stats["skipped"] += 1
+                continue

-
-
+            # Get the content directly
+            content = conversations[0].get("content", "")
+            if not content:
+                logger.warning(f"Empty content for paper_id {paper_id}, prompt {prompt_num}")
                 self.stats["skipped"] += 1
                 continue
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Process the content string by tokenizing it
+            if isinstance(content, str):
+                # Tokenize the content string
+                input_ids = self.tokenizer.encode(content, add_special_tokens=True)
+            else:
+                # If somehow the content is already tokenized (not a string), use it directly
+                input_ids = content
+
+            # Truncate if needed
+            if len(input_ids) > self.max_seq_length:
+                input_ids = input_ids[:self.max_seq_length]
+                logger.warning(f"Truncated sequence for paper_id {paper_id}, prompt {prompt_num}")
+
+            # Create attention mask (1s for all tokens)
+            attention_mask = [1] * len(input_ids)
+
+            # Add to batch
+            batch["input_ids"].append(input_ids)
+            batch["attention_mask"].append(attention_mask)
+            batch["labels"].append(input_ids.copy())  # For causal LM, labels = input_ids
+
+            self.stats["processed"] += 1
+            self.stats["total_tokens"] += len(input_ids)
+
+            # Log statistics periodically
             if self.stats["processed"] % 100 == 0:
-
-
-                            f"
+                avg_tokens = self.stats["total_tokens"] / max(1, self.stats["processed"])
+                logger.info(f"Data collation stats: processed={self.stats['processed']}, "
+                            f"skipped={self.stats['skipped']}, avg_tokens={avg_tokens:.1f}")

-
+        # Convert to tensors or pad sequences (PyTorch will handle)
+        if batch["input_ids"]:
+            # Pad sequences to max length in batch using the tokenizer
+            batch = self.tokenizer.pad(
+                batch,
+                padding="max_length",
+                max_length=self.max_seq_length,
+                return_tensors="pt"
+            )
+            return batch
+        else:
+            # Return empty batch if no valid examples
+            return {k: [] for k in batch}

 class LoggingCallback(TrainerCallback):
     def __init__(self, model=None, dataset=None):
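For reference, a minimal usage sketch of the reworked collator, not part of the commit. It assumes SimpleDataCollator can be imported from the script (or that the class is copied alongside the snippet), and uses a gpt2 tokenizer plus a plain dict as stand-ins for the project's actual tokenizer and dataset_config; the feature dicts and their ids are hypothetical, mirroring the keys the collator reads (article_id, prompt_number, conversations):

from transformers import AutoTokenizer

from run_transformers_training import SimpleDataCollator  # assumes the module imports without side effects

# Stand-in tokenizer; gpt2 has no pad token by default, so reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# The collator only reads "max_seq_length" from dataset_config in this version.
collator = SimpleDataCollator(tokenizer, dataset_config={"max_seq_length": 32})

features = [
    {
        "article_id": "demo-0001",  # hypothetical ids, for illustration only
        "prompt_number": 1,
        "conversations": [{"role": "user", "content": "Summarize the key findings."}],
    },
    {
        "article_id": "demo-0002",
        "prompt_number": 2,
        "conversations": [],  # empty -> skipped and counted in stats["skipped"]
    },
]

batch = collator(features)
print(batch["input_ids"].shape)  # torch.Size([1, 32]): one valid example, padded to max_seq_length
print(collator.stats)            # {'processed': 1, 'skipped': 1, 'total_tokens': ...}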