Upload folder using huggingface_hub

run_transformers_training.py (changed: +32 -3)
@@ -980,6 +980,28 @@ def main():
 
     # Set up training arguments
     log_info("Setting up training arguments")
+
+    # Validate FSDP config before using it
+    fsdp_args = None
+    if fsdp_config is not None and is_distributed and multi_gpu_strategy == "fsdp":
+        try:
+            # Convert FSDP config to proper format expected by TrainingArguments
+            fsdp_args = {
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", []),
+                "fsdp_offload_params": fsdp_config.get("fsdp_offload_params", False),
+                "fsdp_backward_prefetch": fsdp_config.get("fsdp_backward_prefetch", "BACKWARD_PRE"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1e6),
+                "fsdp_sharding_strategy": fsdp_config.get("fsdp_sharding_strategy", 1),
+            }
+            log_info("FSDP config validated and prepared")
+        except Exception as e:
+            log_info(f"Error preparing FSDP config: {str(e)}, disabling FSDP")
+            fsdp_args = None
+
+    # Check if we're running in a Space
+    is_space = bool(os.environ.get("SPACE_ID"))
+
+    # Create training arguments with validated FSDP config
     training_args = TrainingArguments(
         output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
         num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
@@ -998,7 +1020,8 @@ def main():
         max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
         push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
         hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
-        hub_token=…
+        # Don't set hub_token when running in a Space - it will use Space secrets automatically
+        hub_token=None if is_space else os.environ.get("HF_TOKEN", None),
         report_to="tensorboard",
         remove_unused_columns=False, # Keep all columns
         gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
@@ -1008,12 +1031,18 @@ def main():
         dataloader_drop_last=False, # Process all examples
         dataloader_num_workers=dataloader_workers,
         no_cuda=False if CUDA_AVAILABLE else True, # Use CUDA if available
-        # …
-        fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
+        fsdp=fsdp_args, # Use validated FSDP config
     )
+
+    log_info("Training arguments created successfully")
+
+    # Validate dataset before creating sampler
+    if dataset is None:
+        raise ValueError("Dataset is None - cannot create sampler")
 
     # Create sequential sampler to maintain original dataset order
     sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+    log_info("Sequential sampler created")
 
     # Initialize trainer first
     log_info("Initializing Trainer")
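
For reference, the new validation block in the first hunk only reads five keys from fsdp_config. A hypothetical config dict that would pass through it unchanged is sketched below; the key names come straight from the diff, while the values (in particular the transformer layer class name) are illustrative assumptions, not taken from this repository.

# Hypothetical fsdp_config consumed by the validation block above.
# Key names mirror the .get() calls in the diff; values are illustrative only.
fsdp_config = {
    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],  # assumed model-specific layer class
    "fsdp_offload_params": False,
    "fsdp_backward_prefetch": "BACKWARD_PRE",
    "fsdp_min_num_params": 1e6,
    "fsdp_sharding_strategy": 1,  # assumed to correspond to FULL_SHARD
}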
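
The hub-token change in the second hunk can also be read in isolation. A minimal sketch, assuming (as the diff does) that a Space is detected via the SPACE_ID environment variable and that a user token may be supplied via HF_TOKEN:

import os

# Inside a Hugging Face Space, leave hub_token unset so the Space's own secrets
# are used for Hub pushes; otherwise fall back to an HF_TOKEN env var, if any.
is_space = bool(os.environ.get("SPACE_ID"))
hub_token = None if is_space else os.environ.get("HF_TOKEN", None)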