Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files- run_cloud_training.py +31 -8
run_cloud_training.py
CHANGED
|
@@ -2,17 +2,20 @@
|
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
|
| 4 |
"""
|
| 5 |
-
Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
|
| 6 |
RESEARCH TRAINING PHASE ONLY - No output generation
|
| 7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
# Set critical environment variables before any imports
|
| 11 |
import os
|
| 12 |
-
# Configure PyTorch memory allocator for better memory management with
|
| 13 |
-
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 14 |
os.environ["XFORMERS_DISABLED"] = "1"
|
| 15 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
|
|
|
|
|
| 16 |
|
| 17 |
import json
|
| 18 |
import logging
|
|
@@ -597,10 +600,19 @@ def train(config_path, dataset_name, output_dir):
|
|
| 597 |
# Initialize ds_config_path to None before checking
|
| 598 |
ds_config_path = None
|
| 599 |
|
| 600 |
-
# Optimize batch size for
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
|
| 605 |
# Check if DeepSpeed config is available and if MPI is disabled
|
| 606 |
deepspeed_config = config.get("deepspeed_config", None)
|
|
@@ -617,6 +629,17 @@ def train(config_path, dataset_name, output_dir):
|
|
| 617 |
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
| 618 |
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
| 619 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
# Ensure communication backend is set to avoid MPI
|
| 621 |
if "communication_data_type" not in deepspeed_config:
|
| 622 |
deepspeed_config["communication_data_type"] = "fp16"
|
|
@@ -764,7 +787,7 @@ def train(config_path, dataset_name, output_dir):
|
|
| 764 |
remove_training_marker()
|
| 765 |
|
| 766 |
if __name__ == "__main__":
|
| 767 |
-
parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
|
| 768 |
parser.add_argument("--config", type=str, default="transformers_config.json",
|
| 769 |
help="Path to the transformers config JSON file")
|
| 770 |
parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
|
|
|
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
|
| 4 |
"""
|
| 5 |
+
Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
|
| 6 |
RESEARCH TRAINING PHASE ONLY - No output generation
|
| 7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
| 8 |
+
OPTIMIZED FOR L40S GPU (48GB VRAM)
|
| 9 |
"""
|
| 10 |
|
| 11 |
# Set critical environment variables before any imports
|
| 12 |
import os
|
| 13 |
+
# Configure PyTorch memory allocator for better memory management with L40S GPU
|
| 14 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
|
| 15 |
os.environ["XFORMERS_DISABLED"] = "1"
|
| 16 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
| 17 |
+
# L40S-specific CUDA optimization
|
| 18 |
+
os.environ["CUDA_AUTO_BOOST"] = "1"
|
| 19 |
|
| 20 |
import json
|
| 21 |
import logging
|
|
|
|
| 600 |
# Initialize ds_config_path to None before checking
|
| 601 |
ds_config_path = None
|
| 602 |
|
| 603 |
+
# Optimize batch size for L40S GPU
|
| 604 |
+
gpu_info = torch.cuda.get_device_properties(0)
|
| 605 |
+
logger.info(f"GPU Model: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
|
| 606 |
+
|
| 607 |
+
# For an L40S GPU (or any GPU with more than 40 GB of VRAM), we can use a larger batch size and keep the model on the single GPU
|
| 608 |
+
if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9: # Check if it's L40S (>40GB VRAM)
|
| 609 |
+
logger.info("Detected L40S GPU - optimizing for high-memory GPU")
|
| 610 |
+
per_device_train_batch_size = training_config.get("per_device_train_batch_size", 6)
|
| 611 |
+
logger.info(f"Using optimized batch size for L40S: {per_device_train_batch_size}")
|
| 612 |
+
else:
|
| 613 |
+
# Default to a smaller batch size for other GPUs
|
| 614 |
+
per_device_train_batch_size = 2
|
| 615 |
+
logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
|
| 616 |
|
| 617 |
# Check if DeepSpeed config is available and if MPI is disabled
|
| 618 |
deepspeed_config = config.get("deepspeed_config", None)
|
|
|
|
| 629 |
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
| 630 |
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
| 631 |
|
| 632 |
+
# L40S-specific optimization: enable ZeRO stage 2 with CPU offloading of optimizer states
|
| 633 |
+
if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
|
| 634 |
+
logger.info("Configuring DeepSpeed specifically for L40S GPU")
|
| 635 |
+
# Adjust ZeRO stage for L40S (48GB VRAM)
|
| 636 |
+
deepspeed_config["zero_optimization"]["stage"] = 2
|
| 637 |
+
# Enable CPU offloading for optimizer states to save GPU memory
|
| 638 |
+
deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
| 639 |
+
# Set the reduce/allgather bucket sizes for a single high-end GPU
|
| 640 |
+
deepspeed_config["reduce_bucket_size"] = 1e9
|
| 641 |
+
deepspeed_config["allgather_bucket_size"] = 1e9
|
| 642 |
+
|
| 643 |
# Ensure communication backend is set to avoid MPI
|
| 644 |
if "communication_data_type" not in deepspeed_config:
|
| 645 |
deepspeed_config["communication_data_type"] = "fp16"
|
|
|
|
| 787 |
remove_training_marker()
|
| 788 |
|
| 789 |
if __name__ == "__main__":
|
| 790 |
+
parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit model (RESEARCH ONLY)")
|
| 791 |
parser.add_argument("--config", type=str, default="transformers_config.json",
|
| 792 |
help="Path to the transformers config JSON file")
|
| 793 |
parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
|