Spaces:

George-API
/

qwen4bit

Sleeping

App Files Files Community

George-API committed on Mar 11

Commit

862c3c6

verified ·

1 Parent(s): c58ed8b

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +30 -9

run_cloud_training.py CHANGED Viewed

@@ -412,6 +412,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
@@ -428,6 +431,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     # Skip Unsloth and use standard HuggingFace loading
     logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
     # Load with standard HuggingFace
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
@@ -442,10 +449,14 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
-        device_map="auto",
         torch_dtype=dtype or torch.float16,
         quantization_config=bnb_config,
         trust_remote_code=True,
@@ -465,6 +476,9 @@ def train(config_path, dataset_name, output_dir):
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
@@ -510,6 +524,12 @@ def train(config_path, dataset_name, output_dir):
         logger.info(f"Output directory: {output_dir}")
         logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
         # Load and prepare the dataset
         dataset = load_and_prepare_dataset(dataset_name, config)
@@ -524,9 +544,9 @@ def train(config_path, dataset_name, output_dir):
         # Initialize model
         logger.info("Initializing model (preserving 4-bit quantization)")
-        # Reduce max sequence length to avoid memory issues
-        max_seq_length = min(training_config.get("max_seq_length", 2048), 1024)
-        logger.info(f"Using reduced max sequence length: {max_seq_length} to avoid memory issues")
         # Create LoRA config directly
         logger.info("Creating LoRA configuration")
@@ -582,10 +602,10 @@ def train(config_path, dataset_name, output_dir):
             reports = ["none"]
             logger.warning("No reporting backends available - training metrics won't be logged")
-        # Set up training arguments with correct parameters
-        # REDUCE BATCH SIZE to avoid memory issues with attention
-        per_device_train_batch_size = 1  # Reduced from default of 2
-        logger.info(f"Using reduced batch size: {per_device_train_batch_size} to avoid memory issues")
         training_args_dict = {
             "output_dir": output_dir,
@@ -607,7 +627,8 @@ def train(config_path, dataset_name, output_dir):
             "logging_first_step": training_config.get("logging_first_step", True),
             "disable_tqdm": training_config.get("disable_tqdm", False),
             "remove_unused_columns": False,
-            "seed": 42
         }
         # Create TrainingArguments with validated parameters

     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
     # Skip Unsloth and use standard HuggingFace loading
     logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
+    # Check available GPUs
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available")
     # Load with standard HuggingFace
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    # Use auto device mapping for multi-GPU setup
+    device_map = "auto" if gpu_count > 1 else "auto"
+    logger.info(f"Using device_map={device_map} for model distribution")
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
+        device_map=device_map,
         torch_dtype=dtype or torch.float16,
         quantization_config=bnb_config,
         trust_remote_code=True,
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
         logger.info(f"Output directory: {output_dir}")
         logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
+        # Check GPU availability
+        gpu_count = torch.cuda.device_count()
+        logger.info(f"Found {gpu_count} GPU(s) available")
+        for i in range(gpu_count):
+            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
         # Load and prepare the dataset
         dataset = load_and_prepare_dataset(dataset_name, config)
         # Initialize model
         logger.info("Initializing model (preserving 4-bit quantization)")
+        # Use full sequence length of 2048 as required for pre-tokenized dataset
+        max_seq_length = training_config.get("max_seq_length", 2048)
+        logger.info(f"Using sequence length: {max_seq_length} as required for pre-tokenized dataset")
         # Create LoRA config directly
         logger.info("Creating LoRA configuration")
             reports = ["none"]
             logger.warning("No reporting backends available - training metrics won't be logged")
+        # Optimize batch size for multi-GPU setup
+        # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
+        per_device_train_batch_size = 4 if gpu_count >= 4 else 2
+        logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
         training_args_dict = {
             "output_dir": output_dir,
             "logging_first_step": training_config.get("logging_first_step", True),
             "disable_tqdm": training_config.get("disable_tqdm", False),
             "remove_unused_columns": False,
+            "seed": 42,
+            "dataloader_num_workers": 4  # Use multiple workers for data loading
         }
         # Create TrainingArguments with validated parameters