Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- run_transformers_training.py +32 -42
- transformers_config.json +5 -2
run_transformers_training.py
CHANGED
|
@@ -7,6 +7,7 @@ import json
|
|
| 7 |
import argparse
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
|
|
|
| 10 |
|
| 11 |
# Import Unsloth first, before other ML imports
|
| 12 |
try:
|
|
@@ -618,61 +619,50 @@ def main():
|
|
| 618 |
# Simple logging callback
|
| 619 |
class LoggingCallback(TrainerCallback):
|
| 620 |
def __init__(self):
|
| 621 |
-
self.last_log_time =
|
| 622 |
-
self.training_start_time = datetime.now()
|
| 623 |
|
| 624 |
def on_step_end(self, args, state, control, **kwargs):
|
| 625 |
# Log every 50 steps or every 5 minutes, whichever comes first
|
| 626 |
-
current_time =
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
if state.global_step % 50 == 0 or time_diff > 300: # 300 seconds = 5 minutes
|
| 631 |
-
loss = state.log_history[-1]['loss'] if state.log_history else 'N/A'
|
| 632 |
-
lr = state.log_history[-1]['learning_rate'] if state.log_history else 'N/A'
|
| 633 |
-
|
| 634 |
-
if isinstance(loss, float):
|
| 635 |
-
loss_str = f"{loss:.4f}"
|
| 636 |
-
else:
|
| 637 |
-
loss_str = str(loss)
|
| 638 |
-
|
| 639 |
-
if isinstance(lr, float):
|
| 640 |
-
lr_str = f"{lr:.8f}"
|
| 641 |
-
else:
|
| 642 |
-
lr_str = str(lr)
|
| 643 |
-
|
| 644 |
-
logger.info(f"Step: {state.global_step} | Loss: {loss_str} | LR: {lr_str} | Elapsed: {elapsed_time:.2f} min")
|
| 645 |
self.last_log_time = current_time
|
| 646 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
# Set up training arguments
|
| 648 |
logger.info("Setting up training arguments")
|
| 649 |
training_args = TrainingArguments(
|
| 650 |
-
output_dir=model_config.get("output_dir", "./results"),
|
| 651 |
-
num_train_epochs=model_config.get("num_train_epochs", 3),
|
| 652 |
-
per_device_train_batch_size=model_config.get("
|
| 653 |
-
gradient_accumulation_steps=model_config.get("gradient_accumulation_steps",
|
| 654 |
-
learning_rate=model_config.get("learning_rate",
|
| 655 |
-
weight_decay=model_config.get("weight_decay", 0.01),
|
| 656 |
-
warmup_ratio=model_config.get("warmup_ratio", 0.
|
| 657 |
-
lr_scheduler_type=model_config.get("lr_scheduler_type", "cosine"),
|
| 658 |
-
logging_steps=model_config.get("logging_steps", 10),
|
| 659 |
-
save_strategy=model_config.get("save_strategy", "steps"),
|
| 660 |
-
save_steps=model_config.get("save_steps", 100),
|
| 661 |
-
save_total_limit=model_config.get("save_total_limit", 3),
|
| 662 |
-
fp16=
|
| 663 |
-
bf16=
|
| 664 |
-
max_grad_norm=model_config.get("max_grad_norm", 1.0),
|
| 665 |
-
push_to_hub=model_config.get("push_to_hub", False),
|
| 666 |
-
hub_model_id=model_config.get("hub_model_id", None),
|
| 667 |
hub_token=os.environ.get("HF_TOKEN", None),
|
| 668 |
report_to="tensorboard",
|
| 669 |
-
remove_unused_columns=False, # Keep
|
| 670 |
-
gradient_checkpointing=model_config.get("gradient_checkpointing", True),
|
| 671 |
dataloader_pin_memory=False, # Reduce memory usage
|
| 672 |
-
optim=model_config.get("optim", "adamw_torch"),
|
| 673 |
ddp_find_unused_parameters=False, # Improve distributed training efficiency
|
| 674 |
dataloader_drop_last=False, # Process all examples
|
| 675 |
-
dataloader_num_workers=
|
| 676 |
)
|
| 677 |
|
| 678 |
# Create a sequential sampler to ensure dataset is processed in order
|
|
|
|
| 7 |
import argparse
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
| 10 |
+
import time
|
| 11 |
|
| 12 |
# Import Unsloth first, before other ML imports
|
| 13 |
try:
|
|
|
|
| 619 |
# Simple logging callback
|
| 620 |
class LoggingCallback(TrainerCallback):
|
| 621 |
def __init__(self):
|
| 622 |
+
self.last_log_time = time.time()
|
|
|
|
| 623 |
|
| 624 |
def on_step_end(self, args, state, control, **kwargs):
|
| 625 |
# Log every 50 steps or every 5 minutes, whichever comes first
|
| 626 |
+
current_time = time.time()
|
| 627 |
+
if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
|
| 628 |
+
logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
self.last_log_time = current_time
|
| 630 |
|
| 631 |
+
# Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
|
| 632 |
+
use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
|
| 633 |
+
use_fp16 = model_config.get("fp16", False) and not use_bf16 # Only use fp16 if bf16 is not set
|
| 634 |
+
|
| 635 |
+
logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
|
| 636 |
+
|
| 637 |
# Set up training arguments
|
| 638 |
logger.info("Setting up training arguments")
|
| 639 |
training_args = TrainingArguments(
|
| 640 |
+
output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
|
| 641 |
+
num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
|
| 642 |
+
per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
|
| 643 |
+
gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
|
| 644 |
+
learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
|
| 645 |
+
weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
|
| 646 |
+
warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
|
| 647 |
+
lr_scheduler_type=model_config.get("training", {}).get("lr_scheduler_type", "cosine"),
|
| 648 |
+
logging_steps=model_config.get("training", {}).get("logging_steps", 10),
|
| 649 |
+
save_strategy=model_config.get("checkpointing", {}).get("save_strategy", "steps"),
|
| 650 |
+
save_steps=model_config.get("checkpointing", {}).get("save_steps", 100),
|
| 651 |
+
save_total_limit=model_config.get("checkpointing", {}).get("save_total_limit", 3),
|
| 652 |
+
fp16=use_fp16,
|
| 653 |
+
bf16=use_bf16,
|
| 654 |
+
max_grad_norm=model_config.get("training", {}).get("max_grad_norm", 1.0),
|
| 655 |
+
push_to_hub=model_config.get("huggingface_hub", {}).get("push_to_hub", False),
|
| 656 |
+
hub_model_id=model_config.get("huggingface_hub", {}).get("hub_model_id", None),
|
| 657 |
hub_token=os.environ.get("HF_TOKEN", None),
|
| 658 |
report_to="tensorboard",
|
| 659 |
+
remove_unused_columns=False, # Keep all columns
|
| 660 |
+
gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
|
| 661 |
dataloader_pin_memory=False, # Reduce memory usage
|
| 662 |
+
optim=model_config.get("training", {}).get("optim", "adamw_torch"),
|
| 663 |
ddp_find_unused_parameters=False, # Improve distributed training efficiency
|
| 664 |
dataloader_drop_last=False, # Process all examples
|
| 665 |
+
dataloader_num_workers=4, # Sequential data loading
|
| 666 |
)
|
| 667 |
|
| 668 |
# Create a sequential sampler to ensure dataset is processed in order
|
transformers_config.json
CHANGED
|
@@ -29,7 +29,9 @@
|
|
| 29 |
"warmup_ratio": 0.05,
|
| 30 |
"weight_decay": 0.01,
|
| 31 |
"max_grad_norm": 1.0,
|
| 32 |
-
"neftune_noise_alpha": 5
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
|
| 35 |
"checkpointing": {
|
|
@@ -83,5 +85,6 @@
|
|
| 83 |
"model_revision": "main",
|
| 84 |
"use_flash_attention": true,
|
| 85 |
"torch_dtype": "bfloat16",
|
| 86 |
-
"bf16": true
|
|
|
|
| 87 |
}
|
|
|
|
| 29 |
"warmup_ratio": 0.05,
|
| 30 |
"weight_decay": 0.01,
|
| 31 |
"max_grad_norm": 1.0,
|
| 32 |
+
"neftune_noise_alpha": 5,
|
| 33 |
+
"fp16": false,
|
| 34 |
+
"bf16": true
|
| 35 |
},
|
| 36 |
|
| 37 |
"checkpointing": {
|
|
|
|
| 85 |
"model_revision": "main",
|
| 86 |
"use_flash_attention": true,
|
| 87 |
"torch_dtype": "bfloat16",
|
| 88 |
+
"bf16": true,
|
| 89 |
+
"fp16": false
|
| 90 |
}
|