Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files — run_cloud_training.py (+5 −5)
run_cloud_training.py
CHANGED
|
@@ -571,6 +571,11 @@ def train(config_path, dataset_name, output_dir):
|
|
| 571 |
# Initialize ds_config_path to None before checking
|
| 572 |
ds_config_path = None
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
# Check if DeepSpeed config is available
|
| 575 |
deepspeed_config = config.get("deepspeed_config", None)
|
| 576 |
if deepspeed_config:
|
|
@@ -644,11 +649,6 @@ def train(config_path, dataset_name, output_dir):
|
|
| 644 |
reports = ["none"]
|
| 645 |
logger.warning("No reporting backends available - training metrics won't be logged")
|
| 646 |
|
| 647 |
-
# Optimize batch size for multi-GPU setup
|
| 648 |
-
# For 4x L4 GPUs (24GB each), we can safely use a larger batch size
|
| 649 |
-
per_device_train_batch_size = 4 if gpu_count >= 4 else 2
|
| 650 |
-
logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
|
| 651 |
-
|
| 652 |
training_args_dict = {
|
| 653 |
"output_dir": output_dir,
|
| 654 |
"num_train_epochs": training_config.get("num_train_epochs", 3),
|
|
|
|
| 571 |
# Initialize ds_config_path to None before checking
|
| 572 |
ds_config_path = None
|
| 573 |
|
| 574 |
+
# Optimize batch size for multi-GPU setup
|
| 575 |
+
# For 4x L4 GPUs (24GB each), we can safely use a larger batch size
|
| 576 |
+
per_device_train_batch_size = 4 if gpu_count >= 4 else 2
|
| 577 |
+
logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
|
| 578 |
+
|
| 579 |
# Check if DeepSpeed config is available
|
| 580 |
deepspeed_config = config.get("deepspeed_config", None)
|
| 581 |
if deepspeed_config:
|
|
|
|
| 649 |
reports = ["none"]
|
| 650 |
logger.warning("No reporting backends available - training metrics won't be logged")
|
| 651 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
training_args_dict = {
|
| 653 |
"output_dir": output_dir,
|
| 654 |
"num_train_epochs": training_config.get("num_train_epochs", 3),
|