Spaces:

George-API
/

qwen4bit

Sleeping

App Files Files Community

George-API committed on Mar 15

Commit

3f693be

verified ·

1 Parent(s): f3ab403

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +15 -51

run_cloud_training.py CHANGED Viewed

@@ -18,8 +18,8 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
-# Explicitly disable DeepSpeed MPI requirement
-os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
 import json
 import logging
@@ -46,24 +46,16 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-# Set up DeepSpeed without requiring MPI
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
-# Try to import deepspeed, with fallback for environments without MPI
 deepspeed_available = False
-try:
-    import deepspeed
-    deepspeed_available = True
-    logger.info("DeepSpeed successfully imported")
-except ImportError as e:
-    logger.warning(f"Failed to import DeepSpeed: {e}")
-    logger.warning("Will continue without DeepSpeed support")
-    # Set a flag to disable DeepSpeed
-    os.environ["DISABLE_DEEPSPEED"] = "1"
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -611,35 +603,11 @@ def train(config_path, dataset_name, output_dir):
             logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
         # Check if DeepSpeed config is available and if DeepSpeed is available
-        deepspeed_config = config.get("deepspeed_config", None)
-        if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
-            logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
-            # Create a temporary DeepSpeed config file
-            ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
-            # Update DeepSpeed config with dynamic values
-            if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
-                deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
-            if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
-                deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
-            # Write the DeepSpeed config to a file
-            with open(ds_config_path, 'w') as f:
-                json.dump(deepspeed_config, f, indent=2)
-            logger.info(f"Created DeepSpeed config at {ds_config_path}")
-            # Set using_deepspeed flag
-            using_deepspeed = True
-        elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
-            logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-            ds_config_path = None
-            using_deepspeed = False
-        else:
-            logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-            ds_config_path = None
-            using_deepspeed = False
         # Initialize model with our safe loading function
         logger.info("Loading pre-quantized model with eager attention")
@@ -707,22 +675,18 @@ def train(config_path, dataset_name, output_dir):
         }
         # Add DeepSpeed config path if available and enabled
-        if using_deepspeed and ds_config_path:
-            logger.info("Adding DeepSpeed configuration to training arguments")
-            training_args_dict["deepspeed"] = ds_config_path
-        else:
-            logger.info("DeepSpeed is disabled - using standard distributed training")
         # Create TrainingArguments with validated parameters
         try:
             training_args = TrainingArguments(**training_args_dict)
         except Exception as e:
-            logger.error(f"Failed to create training arguments with DeepSpeed: {e}")
             if "deepspeed" in training_args_dict:
-                logger.warning("Removing DeepSpeed configuration and trying again")
                 del training_args_dict["deepspeed"]
-                training_args = TrainingArguments(**training_args_dict)
-                using_deepspeed = False
         # Create trainer with pre-tokenized collator
         trainer = Trainer(

 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
+# Completely disable DeepSpeed for Hugging Face Spaces to avoid compatibility issues
+os.environ["DISABLE_DEEPSPEED"] = "1"
 import json
 import logging
 )
 logger = logging.getLogger(__name__)
+# Set up environment variables
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
+# DeepSpeed is disabled for Hugging Face Spaces due to compatibility issues
+logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
 deepspeed_available = False
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
             logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
         # Check if DeepSpeed config is available and if DeepSpeed is available
+        # Note: DeepSpeed is now disabled by default for HF Spaces
+        deepspeed_config = None
+        logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
+        ds_config_path = None
+        using_deepspeed = False
         # Initialize model with our safe loading function
         logger.info("Loading pre-quantized model with eager attention")
         }
         # Add DeepSpeed config path if available and enabled
+        # DeepSpeed is disabled for Hugging Face Spaces
+        logger.info("DeepSpeed is disabled - using standard training")
         # Create TrainingArguments with validated parameters
         try:
             training_args = TrainingArguments(**training_args_dict)
         except Exception as e:
+            logger.error(f"Failed to create training arguments: {e}")
             if "deepspeed" in training_args_dict:
+                logger.warning("Removing any DeepSpeed configuration")
                 del training_args_dict["deepspeed"]
+            training_args = TrainingArguments(**training_args_dict)
         # Create trainer with pre-tokenized collator
         trainer = Trainer(