Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files - run_cloud_training.py +29 -54
run_cloud_training.py
CHANGED
|
@@ -6,6 +6,7 @@ Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unslo
|
|
| 6 |
RESEARCH TRAINING PHASE ONLY - No output generation
|
| 7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
| 8 |
OPTIMIZED FOR L40S GPU (48GB VRAM)
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
# Set critical environment variables before any imports
|
|
@@ -17,6 +18,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
| 17 |
# L40S-specific CUDA optimization
|
| 18 |
os.environ["CUDA_AUTO_BOOST"] = "1"
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
import json
|
| 21 |
import logging
|
| 22 |
import argparse
|
|
@@ -31,32 +35,35 @@ from transformers.data.data_collator import DataCollatorMixin
|
|
| 31 |
from peft import LoraConfig
|
| 32 |
from unsloth import FastLanguageModel
|
| 33 |
|
| 34 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
os.environ["MASTER_ADDR"] = "localhost"
|
| 36 |
os.environ["MASTER_PORT"] = "9994"
|
| 37 |
os.environ["RANK"] = "0"
|
| 38 |
os.environ["LOCAL_RANK"] = "0"
|
| 39 |
os.environ["WORLD_SIZE"] = "1"
|
| 40 |
|
| 41 |
-
# Try to import deepspeed,
|
|
|
|
| 42 |
try:
|
| 43 |
import deepspeed
|
|
|
|
|
|
|
| 44 |
except ImportError as e:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py"])
|
| 50 |
-
import deepspeed
|
| 51 |
-
logger.info("Successfully installed mpi4py and imported deepspeed")
|
| 52 |
-
except Exception as install_error:
|
| 53 |
-
logger.warning(f"Failed to install mpi4py: {install_error}")
|
| 54 |
-
logger.warning("Continuing without DeepSpeed MPI support")
|
| 55 |
-
# Set a flag to disable DeepSpeed later
|
| 56 |
-
os.environ["DISABLE_DEEPSPEED_MPI"] = "1"
|
| 57 |
-
else:
|
| 58 |
-
logger.error(f"Failed to import deepspeed: {e}")
|
| 59 |
-
raise
|
| 60 |
|
| 61 |
# Disable all attention optimizations that might cause issues
|
| 62 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
@@ -84,17 +91,6 @@ class XFormersBlocker:
|
|
| 84 |
# Add our import blocker to sys.meta_path
|
| 85 |
sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
|
| 86 |
|
| 87 |
-
# Configure logging first
|
| 88 |
-
logging.basicConfig(
|
| 89 |
-
level=logging.INFO,
|
| 90 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 91 |
-
handlers=[
|
| 92 |
-
logging.StreamHandler(),
|
| 93 |
-
logging.FileHandler("training.log")
|
| 94 |
-
]
|
| 95 |
-
)
|
| 96 |
-
logger = logging.getLogger(__name__)
|
| 97 |
-
|
| 98 |
# Make sure torch is installed and available before proceeding
|
| 99 |
try:
|
| 100 |
logger.info("Importing torch...")
|
|
@@ -614,9 +610,9 @@ def train(config_path, dataset_name, output_dir):
|
|
| 614 |
per_device_train_batch_size = 2
|
| 615 |
logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
|
| 616 |
|
| 617 |
-
# Check if DeepSpeed config is available and if
|
| 618 |
deepspeed_config = config.get("deepspeed_config", None)
|
| 619 |
-
if deepspeed_config and os.environ.get("
|
| 620 |
logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
|
| 621 |
|
| 622 |
# Create a temporary DeepSpeed config file
|
|
@@ -629,40 +625,19 @@ def train(config_path, dataset_name, output_dir):
|
|
| 629 |
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
| 630 |
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
| 631 |
|
| 632 |
-
# L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
|
| 633 |
-
if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
|
| 634 |
-
logger.info("Configuring DeepSpeed specifically for L40S GPU")
|
| 635 |
-
# Adjust ZeRO stage for L40S (48GB VRAM)
|
| 636 |
-
deepspeed_config["zero_optimization"]["stage"] = 2
|
| 637 |
-
# Enable CPU offloading for optimizer states to save GPU memory
|
| 638 |
-
deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
| 639 |
-
# Adjust communication efficiency for single high-end GPU
|
| 640 |
-
deepspeed_config["reduce_bucket_size"] = 1e9
|
| 641 |
-
deepspeed_config["allgather_bucket_size"] = 1e9
|
| 642 |
-
|
| 643 |
-
# Ensure communication backend is set to avoid MPI
|
| 644 |
-
if "communication_data_type" not in deepspeed_config:
|
| 645 |
-
deepspeed_config["communication_data_type"] = "fp16"
|
| 646 |
-
|
| 647 |
# Write the DeepSpeed config to a file
|
| 648 |
with open(ds_config_path, 'w') as f:
|
| 649 |
json.dump(deepspeed_config, f, indent=2)
|
| 650 |
|
| 651 |
logger.info(f"Created DeepSpeed config at {ds_config_path}")
|
| 652 |
-
logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
|
| 653 |
-
|
| 654 |
-
# Enable CPU offloading if configured
|
| 655 |
-
if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
|
| 656 |
-
logger.info("DeepSpeed CPU offloading enabled for optimizer states")
|
| 657 |
-
|
| 658 |
# Set using_deepspeed flag
|
| 659 |
using_deepspeed = True
|
| 660 |
-
elif os.environ.get("
|
| 661 |
-
logger.warning("DeepSpeed
|
| 662 |
ds_config_path = None
|
| 663 |
using_deepspeed = False
|
| 664 |
else:
|
| 665 |
-
logger.warning("
|
| 666 |
ds_config_path = None
|
| 667 |
using_deepspeed = False
|
| 668 |
|
|
|
|
| 6 |
RESEARCH TRAINING PHASE ONLY - No output generation
|
| 7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
| 8 |
OPTIMIZED FOR L40S GPU (48GB VRAM)
|
| 9 |
+
SUPPORTS ENVIRONMENTS WITHOUT MPI
|
| 10 |
"""
|
| 11 |
|
| 12 |
# Set critical environment variables before any imports
|
|
|
|
| 18 |
# L40S-specific CUDA optimization
|
| 19 |
os.environ["CUDA_AUTO_BOOST"] = "1"
|
| 20 |
|
| 21 |
+
# Explicitly disable DeepSpeed MPI requirement
|
| 22 |
+
os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
|
| 23 |
+
|
| 24 |
import json
|
| 25 |
import logging
|
| 26 |
import argparse
|
|
|
|
| 35 |
from peft import LoraConfig
|
| 36 |
from unsloth import FastLanguageModel
|
| 37 |
|
| 38 |
+
# Configure logging first (before any potential errors with imports)
|
| 39 |
+
logging.basicConfig(
|
| 40 |
+
level=logging.INFO,
|
| 41 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 42 |
+
handlers=[
|
| 43 |
+
logging.StreamHandler(),
|
| 44 |
+
logging.FileHandler("training.log")
|
| 45 |
+
]
|
| 46 |
+
)
|
| 47 |
+
logger = logging.getLogger(__name__)
|
| 48 |
+
|
| 49 |
+
# Set up DeepSpeed without requiring MPI
|
| 50 |
os.environ["MASTER_ADDR"] = "localhost"
|
| 51 |
os.environ["MASTER_PORT"] = "9994"
|
| 52 |
os.environ["RANK"] = "0"
|
| 53 |
os.environ["LOCAL_RANK"] = "0"
|
| 54 |
os.environ["WORLD_SIZE"] = "1"
|
| 55 |
|
| 56 |
+
# Try to import deepspeed, with fallback for environments without MPI
|
| 57 |
+
deepspeed_available = False
|
| 58 |
try:
|
| 59 |
import deepspeed
|
| 60 |
+
deepspeed_available = True
|
| 61 |
+
logger.info("DeepSpeed successfully imported")
|
| 62 |
except ImportError as e:
|
| 63 |
+
logger.warning(f"Failed to import DeepSpeed: {e}")
|
| 64 |
+
logger.warning("Will continue without DeepSpeed support")
|
| 65 |
+
# Set a flag to disable DeepSpeed
|
| 66 |
+
os.environ["DISABLE_DEEPSPEED"] = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# Disable all attention optimizations that might cause issues
|
| 69 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
|
|
| 91 |
# Add our import blocker to sys.meta_path
|
| 92 |
sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
# Make sure torch is installed and available before proceeding
|
| 95 |
try:
|
| 96 |
logger.info("Importing torch...")
|
|
|
|
| 610 |
per_device_train_batch_size = 2
|
| 611 |
logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
|
| 612 |
|
| 613 |
+
# Check if DeepSpeed config is available and if DeepSpeed is available
|
| 614 |
deepspeed_config = config.get("deepspeed_config", None)
|
| 615 |
+
if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
|
| 616 |
logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
|
| 617 |
|
| 618 |
# Create a temporary DeepSpeed config file
|
|
|
|
| 625 |
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
| 626 |
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
| 627 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
# Write the DeepSpeed config to a file
|
| 629 |
with open(ds_config_path, 'w') as f:
|
| 630 |
json.dump(deepspeed_config, f, indent=2)
|
| 631 |
|
| 632 |
logger.info(f"Created DeepSpeed config at {ds_config_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
# Set using_deepspeed flag
|
| 634 |
using_deepspeed = True
|
| 635 |
+
elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
|
| 636 |
+
logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
|
| 637 |
ds_config_path = None
|
| 638 |
using_deepspeed = False
|
| 639 |
else:
|
| 640 |
+
logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
|
| 641 |
ds_config_path = None
|
| 642 |
using_deepspeed = False
|
| 643 |
|