hf-train-frontend

Paused

App Files Community

George-API committed on Mar 10

Commit

71642d9

verified ·

1 Parent(s): 90530d1

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

requirements.txt +1 -1
run_transformers_training.py +44 -158
update_space.py +12 -4

requirements.txt CHANGED Viewed

@@ -4,7 +4,6 @@ bitsandbytes>=0.41.0
 datasets>=2.15.0
 einops>=0.7.0
 filelock>=3.13.1
-flash-attn==2.5.2
 gradio>=5.17.0
 huggingface-hub>=0.19.0
 matplotlib>=3.7.0
@@ -23,3 +22,4 @@ tqdm>=4.65.0
 transformers>=4.36.0
 typing-extensions>=4.8.0
 unsloth>=2024.3

 datasets>=2.15.0
 einops>=0.7.0
 filelock>=3.13.1
 gradio>=5.17.0
 huggingface-hub>=0.19.0
 matplotlib>=3.7.0
 transformers>=4.36.0
 typing-extensions>=4.8.0
 unsloth>=2024.3
+flash-attn==2.5.2

run_transformers_training.py CHANGED Viewed

@@ -158,38 +158,13 @@ def load_model_and_tokenizer(config):
         logger.info("Using Unsloth optimizations with pre-quantized model")
-        # Check for flash attention
-        use_flash_attention = config.get("use_flash_attention", True)
-        if use_flash_attention and not find_spec("flash_attn"):
-            logger.warning("flash-attn not found. Will continue without flash attention.")
-            logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
-            use_flash_attention = False
         # First detect if we have a GPU
         if torch.cuda.is_available():
             gpu_count = torch.cuda.device_count()
-            logger.info(f"CUDA available, found {gpu_count} GPU(s)")
-            # Log GPU info
-            for i in range(gpu_count):
-                logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-                logger.info(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
-            # Create an optimized device map for better balance
-            if gpu_count > 1:
-                logger.info(f"Creating balanced device map for {gpu_count} GPUs")
-                # Use auto mapping but with memory tracking
-                device_map = "auto"
-                # Set max memory for better balancing
-                max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.85 / 1024**3)}GiB" for i in range(gpu_count)}
-                logger.info(f"Max memory settings: {max_memory}")
-            else:
-                device_map = "auto"
-                max_memory = None
         else:
-            logger.warning("No CUDA available, falling back to CPU")
-            device_map = {"": "cpu"}  # Force CPU placement
-            max_memory = None
         # Set default dtype for better numerics
         if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
@@ -205,6 +180,13 @@ def load_model_and_tokenizer(config):
             dtype = None
             logger.info("Using default precision (CPU)")
         # Load model with proper error handling for out-of-memory
         try:
             # Improved memory settings for multi-GPU setup
@@ -300,6 +282,16 @@ def load_dataset_with_mapping(dataset_config):
         else:
             logger.info(f"Dataset has all required fields: {required_fields}")
         # Log a few samples for verification
         if len(dataset) > 0:
             sample_indices = range(min(5, len(dataset)))
@@ -524,54 +516,15 @@ class LoggingCallback(TrainerCallback):
         log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
         log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
-        # Set up sequence verification with actual sample capturing
-        try:
-            self.verify_sequence = dataset_config.get("validation", {}).get("verify_sequence_integrity", False)
-            if self.verify_sequence:
-                log_info("Sequence integrity verification enabled during training")
-                # Save actual samples for later verification
-                if trainer and hasattr(trainer, 'train_dataset') and trainer.train_dataset is not None:
-                    # Get some reference samples from the beginning of the dataset defensively
-                    self.sample_indices = []
-                    self.sequence_samples = []
-                    max_samples = min(5, len(trainer.train_dataset))
-                    for i in range(max_samples):
-                        try:
-                            if i < len(trainer.train_dataset):
-                                self.sample_indices.append(i)
-                                self.sequence_samples.append(trainer.train_dataset[i])
-                        except Exception as e:
-                            log_info(f"Warning: Error capturing reference sample at index {i}: {e}")
-                    if self.sequence_samples:
-                        log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
-                        # Log sample prompt numbers for debugging
-                        sample_prompt_numbers = []
-                        for s in self.sequence_samples:
-                            if isinstance(s, dict) and 'prompt_number' in s and s['prompt_number'] is not None:
-                                sample_prompt_numbers.append(s.get('prompt_number'))
-                        if sample_prompt_numbers:
-                            log_info(f"Reference sample prompt numbers: {sample_prompt_numbers}")
-                            if sample_prompt_numbers == list(range(1, len(sample_prompt_numbers) + 1)):
-                                log_info("Prompt numbers are sequential (1-indexed) - sequence integrity confirmed")
-                            else:
-                                log_info("Prompt numbers are not in expected sequence - will verify during training")
-                    else:
-                        log_info("Warning: No reference samples were captured")
-                else:
-                    log_info("Warning: Could not capture reference samples - verification will be limited")
-        except Exception as e:
-            log_info(f"Warning: Could not set up sequence integrity verification: {e}")
-            self.verify_sequence = False
         log_info("=== Training is starting ===")
         # Log important training parameters for visibility
         total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
         log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
         log_info(f"Learning rate: {args.learning_rate}")
         log_info(f"Epochs: {args.num_train_epochs}")
@@ -585,90 +538,12 @@ class LoggingCallback(TrainerCallback):
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
     def on_step_end(self, args, state, control, **kwargs):
         # Log every 50 steps or every 5 minutes, whichever comes first
         current_time = time.time()
-        # Perform actual sequence integrity verification if enabled
-        if self.verify_sequence is True and state.global_step % 100 == 0 and self.sequence_samples:
-            try:
-                # Get a batch of data without disturbing the training
-                train_dataloader = trainer.get_train_dataloader()
-                if train_dataloader is None:
-                    log_info("Warning: Could not get train dataloader for verification")
-                else:
-                    batch_iterator = iter(train_dataloader)
-                    if batch_iterator is None:
-                        log_info("Warning: Could not get batch iterator for verification")
-                    else:
-                        try:
-                            batch = next(batch_iterator)
-                            if batch is None:
-                                log_info("Warning: Could not get batch for verification")
-                            elif 'input_ids' in batch and 'labels' in batch:
-                                log_info("Verifying data sequence integrity...")
-                                # Check if we can access some of our reference samples
-                                if not hasattr(trainer, 'train_dataset') or trainer.train_dataset is None:
-                                    log_info("Warning: Train dataset is not available")
-                                else:
-                                    # Get current samples defensively
-                                    current_samples = []
-                                    current_indices = list(range(min(3, len(trainer.train_dataset))))
-                                    for idx in current_indices:
-                                        try:
-                                            if idx < len(trainer.train_dataset):
-                                                current_samples.append(trainer.train_dataset[idx])
-                                        except Exception as e:
-                                            log_info(f"Warning: Error accessing dataset at index {idx}: {e}")
-                                    # Only proceed if we have samples to compare
-                                    if current_samples and self.sequence_samples:
-                                        # Compare current samples with our reference samples from training start
-                                        is_sequence_maintained = True
-                                        for i, (orig_idx, orig_sample) in enumerate(zip(self.sample_indices, self.sequence_samples)):
-                                            # Check if sample index is valid
-                                            if i < len(current_samples):
-                                                current_sample = current_samples[i]
-                                                # Compare prompt numbers if available - this is our primary check now
-                                                if ('prompt_number' in orig_sample and
-                                                    'prompt_number' in current_sample and
-                                                    orig_sample['prompt_number'] is not None and
-                                                    current_sample['prompt_number'] is not None):
-                                                    if orig_sample['prompt_number'] != current_sample['prompt_number']:
-                                                        log_info(f"WARNING: Sequence integrity compromised! Sample {i} prompt number changed from {orig_sample['prompt_number']} to {current_sample['prompt_number']}")
-                                                        is_sequence_maintained = False
-                                                    else:
-                                                        # This is now our primary verification
-                                                        log_info(f"Prompt number match confirmed for sample {i}: {orig_sample['prompt_number']}")
-                                                # Also compare article_id as a backup check
-                                                elif ('article_id' in orig_sample and
-                                                      'article_id' in current_sample and
-                                                      orig_sample['article_id'] is not None and
-                                                      current_sample['article_id'] is not None):
-                                                    if orig_sample['article_id'] != current_sample['article_id']:
-                                                        log_info(f"WARNING: Sequence integrity compromised! Sample {i} article_id changed from {orig_sample['article_id']} to {current_sample['article_id']}")
-                                                        is_sequence_maintained = False
-                                        if is_sequence_maintained:
-                                            log_info("Data sequence integrity check: OK - prompt numbers preserved")
-                                        else:
-                                            log_info("CRITICAL WARNING: Data sequence integrity check FAILED!")
-                                    else:
-                                        log_info("Warning: Not enough samples available for sequence verification")
-                        except StopIteration:
-                            log_info("Warning: No batches available in the dataloader")
-                        except Exception as e:
-                            log_info(f"Warning: Error iterating through dataloader: {e}")
-            except Exception as e:
-                log_info(f"Warning: Couldn't verify sequence integrity: {e}")
         # Log progress at regular intervals
         if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
@@ -708,13 +583,6 @@ def check_dependencies():
     if not peft_available:
         missing_packages.append("peft>=0.9.0")
-    # Optional packages - don't add to missing list, just log
-    if find_spec("flash_attn"):
-        logger.info("flash-attn found. Flash attention will be used for faster training.")
-    else:
-        logger.warning("flash-attn not found. Training will work but may be slower.")
-        logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
     # If critical packages are missing, exit with instructions
     if missing_packages:
         logger.error("Critical dependencies missing:")
@@ -723,6 +591,13 @@ def check_dependencies():
         logger.error("Please ensure the space has these packages in requirements.txt")
         return False
     return True
 def main():
@@ -934,6 +809,17 @@ def main():
             # Log our approach clearly
             log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
             log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
             # Calculate batch size based on device availability

         logger.info("Using Unsloth optimizations with pre-quantized model")
         # First detect if we have a GPU
         if torch.cuda.is_available():
             gpu_count = torch.cuda.device_count()
+            logger.info(f"Found {gpu_count} CUDA devices")
         else:
+            logger.warning("No CUDA devices detected. Training will be slow on CPU!")
+            gpu_count = 0
         # Set default dtype for better numerics
         if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
             dtype = None
             logger.info("Using default precision (CPU)")
+        # Check for flash attention as the last dependency check
+        use_flash_attention = config.get("use_flash_attention", True)
+        if use_flash_attention and not find_spec("flash_attn"):
+            logger.warning("flash-attn not found. Will continue without flash attention.")
+            logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
+            use_flash_attention = False
         # Load model with proper error handling for out-of-memory
         try:
             # Improved memory settings for multi-GPU setup
         else:
             logger.info(f"Dataset has all required fields: {required_fields}")
+        # Verify that column order matches our expectation
+        expected_order = ["prompt_number", "article_id", "conversations"]
+        actual_order = dataset.column_names
+        if actual_order == expected_order:
+            logger.info("Dataset column order matches expected order (prompt_number, article_id, conversations)")
+        else:
+            logger.warning(f"Dataset column order ({', '.join(actual_order)}) differs from expected order ({', '.join(expected_order)})")
+            logger.warning("This should not affect processing but is noted for debugging purposes")
         # Log a few samples for verification
         if len(dataset) > 0:
             sample_indices = range(min(5, len(dataset)))
         log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
         log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
+        # Disable sequence verification
+        self.verify_sequence = False
         log_info("=== Training is starting ===")
         # Log important training parameters for visibility
         total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
+        total_steps = int(len(dataset) / (args.per_device_train_batch_size * NUM_GPUS * args.gradient_accumulation_steps) * args.num_train_epochs)
+        log_info(f"Training plan: {len(dataset)} examples over {args.num_train_epochs} epochs ≈ {total_steps} steps")
         log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
         log_info(f"Learning rate: {args.learning_rate}")
         log_info(f"Epochs: {args.num_train_epochs}")
                 memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
             log_info(f"Initial memory usage - {', '.join(memory_info)}")
     def on_step_end(self, args, state, control, **kwargs):
         # Log every 50 steps or every 5 minutes, whichever comes first
         current_time = time.time()
+        # Sequence verification removed
         # Log progress at regular intervals
         if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
     if not peft_available:
         missing_packages.append("peft>=0.9.0")
     # If critical packages are missing, exit with instructions
     if missing_packages:
         logger.error("Critical dependencies missing:")
         logger.error("Please ensure the space has these packages in requirements.txt")
         return False
+    # Optional packages - moved to the end
+    if find_spec("flash_attn"):
+        logger.info("flash-attn found. Flash attention will be used for faster training.")
+    else:
+        logger.warning("flash-attn not found. Training will work but may be slower.")
+        logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
     return True
 def main():
             # Log our approach clearly
             log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
+            # Verify column order
+            expected_order = ["prompt_number", "article_id", "conversations"]
+            if hasattr(dataset, 'column_names'):
+                actual_order = dataset.column_names
+                if actual_order == expected_order:
+                    log_info(f"Confirmed dataset columns are in expected order: {', '.join(expected_order)}")
+                else:
+                    log_info(f"Note: Dataset columns ({', '.join(actual_order)}) are not in expected order ({', '.join(expected_order)})")
+                    log_info("This is handled correctly by field-based access, but noting for clarity")
             log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
             # Calculate batch size based on device availability

update_space.py CHANGED Viewed

@@ -121,17 +121,25 @@ def update_requirements():
     # Add new requirements
     updated_requirements = existing_requirements.union(required_packages)
-    # Write updated requirements with torch first
     with open(req_path, 'w') as f:
         # Ensure torch is first
         torch_req = next((req for req in updated_requirements if req.startswith("torch")), "torch>=2.0.0")
         f.write(f"{torch_req}\n")
-        # Write remaining requirements
-        for req in sorted(r for r in updated_requirements if not r.startswith("torch")):
             f.write(f"{req}\n")
-    logger.info("Updated requirements.txt with necessary packages (torch listed first)")
 def create_space(username, space_name):
     """Create or get a Hugging Face Space."""

     # Add new requirements
     updated_requirements = existing_requirements.union(required_packages)
+    # Write updated requirements with torch first and flash-attn last
     with open(req_path, 'w') as f:
         # Ensure torch is first
         torch_req = next((req for req in updated_requirements if req.startswith("torch")), "torch>=2.0.0")
         f.write(f"{torch_req}\n")
+        # Extract flash-attn to add it last
+        flash_attn_req = next((req for req in updated_requirements if req.startswith("flash-attn")), None)
+        # Write all other requirements (excluding torch and flash-attn)
+        for req in sorted(r for r in updated_requirements
+                         if not r.startswith("torch") and not r.startswith("flash-attn")):
             f.write(f"{req}\n")
+        # Add flash-attn as the very last package
+        if flash_attn_req:
+            f.write(f"{flash_attn_req}\n")
+    logger.info("Updated requirements.txt with torch listed first and flash-attn listed last")
 def create_space(username, space_name):
     """Create or get a Hugging Face Space."""