Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files
- run_cloud_training.py +15 -51
run_cloud_training.py
CHANGED
|
@@ -18,8 +18,8 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
| 18 |
# L40S-specific CUDA optimization
|
| 19 |
os.environ["CUDA_AUTO_BOOST"] = "1"
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
os.environ["
|
| 23 |
|
| 24 |
import json
|
| 25 |
import logging
|
|
@@ -46,24 +46,16 @@ logging.basicConfig(
|
|
| 46 |
)
|
| 47 |
logger = logging.getLogger(__name__)
|
| 48 |
|
| 49 |
-
# Set up
|
| 50 |
os.environ["MASTER_ADDR"] = "localhost"
|
| 51 |
os.environ["MASTER_PORT"] = "9994"
|
| 52 |
os.environ["RANK"] = "0"
|
| 53 |
os.environ["LOCAL_RANK"] = "0"
|
| 54 |
os.environ["WORLD_SIZE"] = "1"
|
| 55 |
|
| 56 |
-
#
|
|
|
|
| 57 |
deepspeed_available = False
|
| 58 |
-
try:
|
| 59 |
-
import deepspeed
|
| 60 |
-
deepspeed_available = True
|
| 61 |
-
logger.info("DeepSpeed successfully imported")
|
| 62 |
-
except ImportError as e:
|
| 63 |
-
logger.warning(f"Failed to import DeepSpeed: {e}")
|
| 64 |
-
logger.warning("Will continue without DeepSpeed support")
|
| 65 |
-
# Set a flag to disable DeepSpeed
|
| 66 |
-
os.environ["DISABLE_DEEPSPEED"] = "1"
|
| 67 |
|
| 68 |
# Disable all attention optimizations that might cause issues
|
| 69 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
@@ -611,35 +603,11 @@ def train(config_path, dataset_name, output_dir):
|
|
| 611 |
logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
|
| 612 |
|
| 613 |
# Check if DeepSpeed config is available and if DeepSpeed is available
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
|
| 620 |
-
|
| 621 |
-
# Update DeepSpeed config with dynamic values
|
| 622 |
-
if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
|
| 623 |
-
deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
|
| 624 |
-
|
| 625 |
-
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
| 626 |
-
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
| 627 |
-
|
| 628 |
-
# Write the DeepSpeed config to a file
|
| 629 |
-
with open(ds_config_path, 'w') as f:
|
| 630 |
-
json.dump(deepspeed_config, f, indent=2)
|
| 631 |
-
|
| 632 |
-
logger.info(f"Created DeepSpeed config at {ds_config_path}")
|
| 633 |
-
# Set using_deepspeed flag
|
| 634 |
-
using_deepspeed = True
|
| 635 |
-
elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
|
| 636 |
-
logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
|
| 637 |
-
ds_config_path = None
|
| 638 |
-
using_deepspeed = False
|
| 639 |
-
else:
|
| 640 |
-
logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
|
| 641 |
-
ds_config_path = None
|
| 642 |
-
using_deepspeed = False
|
| 643 |
|
| 644 |
# Initialize model with our safe loading function
|
| 645 |
logger.info("Loading pre-quantized model with eager attention")
|
|
@@ -707,22 +675,18 @@ def train(config_path, dataset_name, output_dir):
|
|
| 707 |
}
|
| 708 |
|
| 709 |
# Add DeepSpeed config path if available and enabled
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
training_args_dict["deepspeed"] = ds_config_path
|
| 713 |
-
else:
|
| 714 |
-
logger.info("DeepSpeed is disabled - using standard distributed training")
|
| 715 |
|
| 716 |
# Create TrainingArguments with validated parameters
|
| 717 |
try:
|
| 718 |
training_args = TrainingArguments(**training_args_dict)
|
| 719 |
except Exception as e:
|
| 720 |
-
logger.error(f"Failed to create training arguments
|
| 721 |
if "deepspeed" in training_args_dict:
|
| 722 |
-
logger.warning("Removing DeepSpeed configuration
|
| 723 |
del training_args_dict["deepspeed"]
|
| 724 |
-
|
| 725 |
-
using_deepspeed = False
|
| 726 |
|
| 727 |
# Create trainer with pre-tokenized collator
|
| 728 |
trainer = Trainer(
|
|
|
|
| 18 |
# L40S-specific CUDA optimization
|
| 19 |
os.environ["CUDA_AUTO_BOOST"] = "1"
|
| 20 |
|
| 21 |
+
# Completely disable DeepSpeed for Hugging Face Spaces to avoid compatibility issues
|
| 22 |
+
os.environ["DISABLE_DEEPSPEED"] = "1"
|
| 23 |
|
| 24 |
import json
|
| 25 |
import logging
|
|
|
|
| 46 |
)
|
| 47 |
logger = logging.getLogger(__name__)
|
| 48 |
|
| 49 |
+
# Set up environment variables
|
| 50 |
os.environ["MASTER_ADDR"] = "localhost"
|
| 51 |
os.environ["MASTER_PORT"] = "9994"
|
| 52 |
os.environ["RANK"] = "0"
|
| 53 |
os.environ["LOCAL_RANK"] = "0"
|
| 54 |
os.environ["WORLD_SIZE"] = "1"
|
| 55 |
|
| 56 |
+
# DeepSpeed is disabled for Hugging Face Spaces due to compatibility issues
|
| 57 |
+
logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
|
| 58 |
deepspeed_available = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# Disable all attention optimizations that might cause issues
|
| 61 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
|
|
| 603 |
logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
|
| 604 |
|
| 605 |
# Check if DeepSpeed config is available and if DeepSpeed is available
|
| 606 |
+
# Note: DeepSpeed is now disabled by default for HF Spaces
|
| 607 |
+
deepspeed_config = None
|
| 608 |
+
logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
|
| 609 |
+
ds_config_path = None
|
| 610 |
+
using_deepspeed = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
# Initialize model with our safe loading function
|
| 613 |
logger.info("Loading pre-quantized model with eager attention")
|
|
|
|
| 675 |
}
|
| 676 |
|
| 677 |
# Add DeepSpeed config path if available and enabled
|
| 678 |
+
# DeepSpeed is disabled for Hugging Face Spaces
|
| 679 |
+
logger.info("DeepSpeed is disabled - using standard training")
|
|
|
|
|
|
|
|
|
|
| 680 |
|
| 681 |
# Create TrainingArguments with validated parameters
|
| 682 |
try:
|
| 683 |
training_args = TrainingArguments(**training_args_dict)
|
| 684 |
except Exception as e:
|
| 685 |
+
logger.error(f"Failed to create training arguments: {e}")
|
| 686 |
if "deepspeed" in training_args_dict:
|
| 687 |
+
logger.warning("Removing any DeepSpeed configuration")
|
| 688 |
del training_args_dict["deepspeed"]
|
| 689 |
+
training_args = TrainingArguments(**training_args_dict)
|
|
|
|
| 690 |
|
| 691 |
# Create trainer with pre-tokenized collator
|
| 692 |
trainer = Trainer(
|