Spaces: Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files
- run_cloud_training.py +30 -9
run_cloud_training.py
CHANGED
|
@@ -412,6 +412,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
|
|
| 412 |
os.environ["XFORMERS_DISABLED"] = "1"
|
| 413 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
| 414 |
|
|
|
|
|
|
|
|
|
|
| 415 |
# Create BitsAndBytesConfig for 4-bit quantization
|
| 416 |
from transformers import BitsAndBytesConfig
|
| 417 |
bnb_config = BitsAndBytesConfig(
|
|
@@ -428,6 +431,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
|
|
| 428 |
# Skip Unsloth and use standard HuggingFace loading
|
| 429 |
logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
|
| 430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
# Load with standard HuggingFace
|
| 432 |
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
| 433 |
|
|
@@ -442,10 +449,14 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
|
|
| 442 |
|
| 443 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
model = AutoModelForCausalLM.from_pretrained(
|
| 446 |
model_name,
|
| 447 |
config=config,
|
| 448 |
-
device_map=
|
| 449 |
torch_dtype=dtype or torch.float16,
|
| 450 |
quantization_config=bnb_config,
|
| 451 |
trust_remote_code=True,
|
|
@@ -465,6 +476,9 @@ def train(config_path, dataset_name, output_dir):
|
|
| 465 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
| 466 |
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 467 |
|
|
|
|
|
|
|
|
|
|
| 468 |
# Try to unload xformers if it's loaded
|
| 469 |
if 'xformers' in sys.modules:
|
| 470 |
logger.info("Removing xformers from sys.modules")
|
|
@@ -510,6 +524,12 @@ def train(config_path, dataset_name, output_dir):
|
|
| 510 |
logger.info(f"Output directory: {output_dir}")
|
| 511 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
# Load and prepare the dataset
|
| 514 |
dataset = load_and_prepare_dataset(dataset_name, config)
|
| 515 |
|
|
@@ -524,9 +544,9 @@ def train(config_path, dataset_name, output_dir):
|
|
| 524 |
# Initialize model
|
| 525 |
logger.info("Initializing model (preserving 4-bit quantization)")
|
| 526 |
|
| 527 |
-
#
|
| 528 |
-
max_seq_length =
|
| 529 |
-
logger.info(f"Using
|
| 530 |
|
| 531 |
# Create LoRA config directly
|
| 532 |
logger.info("Creating LoRA configuration")
|
|
@@ -582,10 +602,10 @@ def train(config_path, dataset_name, output_dir):
|
|
| 582 |
reports = ["none"]
|
| 583 |
logger.warning("No reporting backends available - training metrics won't be logged")
|
| 584 |
|
| 585 |
-
#
|
| 586 |
-
#
|
| 587 |
-
per_device_train_batch_size =
|
| 588 |
-
logger.info(f"Using
|
| 589 |
|
| 590 |
training_args_dict = {
|
| 591 |
"output_dir": output_dir,
|
|
@@ -607,7 +627,8 @@ def train(config_path, dataset_name, output_dir):
|
|
| 607 |
"logging_first_step": training_config.get("logging_first_step", True),
|
| 608 |
"disable_tqdm": training_config.get("disable_tqdm", False),
|
| 609 |
"remove_unused_columns": False,
|
| 610 |
-
"seed": 42
|
|
|
|
| 611 |
}
|
| 612 |
|
| 613 |
# Create TrainingArguments with validated parameters
|
|
|
|
| 412 |
os.environ["XFORMERS_DISABLED"] = "1"
|
| 413 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
| 414 |
|
| 415 |
+
# Configure PyTorch memory allocator for better memory management with multiple GPUs
|
| 416 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 417 |
+
|
| 418 |
# Create BitsAndBytesConfig for 4-bit quantization
|
| 419 |
from transformers import BitsAndBytesConfig
|
| 420 |
bnb_config = BitsAndBytesConfig(
|
|
|
|
| 431 |
# Skip Unsloth and use standard HuggingFace loading
|
| 432 |
logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
|
| 433 |
|
| 434 |
+
# Check available GPUs
|
| 435 |
+
gpu_count = torch.cuda.device_count()
|
| 436 |
+
logger.info(f"Found {gpu_count} GPU(s) available")
|
| 437 |
+
|
| 438 |
# Load with standard HuggingFace
|
| 439 |
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
| 440 |
|
|
|
|
| 449 |
|
| 450 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 451 |
|
| 452 |
+
# Use auto device mapping; "auto" is selected regardless of GPU count (single- or multi-GPU)
|
| 453 |
+
device_map = "auto" if gpu_count > 1 else "auto"
|
| 454 |
+
logger.info(f"Using device_map={device_map} for model distribution")
|
| 455 |
+
|
| 456 |
model = AutoModelForCausalLM.from_pretrained(
|
| 457 |
model_name,
|
| 458 |
config=config,
|
| 459 |
+
device_map=device_map,
|
| 460 |
torch_dtype=dtype or torch.float16,
|
| 461 |
quantization_config=bnb_config,
|
| 462 |
trust_remote_code=True,
|
|
|
|
| 476 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
| 477 |
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 478 |
|
| 479 |
+
# Configure PyTorch memory allocator for better memory management with multiple GPUs
|
| 480 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 481 |
+
|
| 482 |
# Try to unload xformers if it's loaded
|
| 483 |
if 'xformers' in sys.modules:
|
| 484 |
logger.info("Removing xformers from sys.modules")
|
|
|
|
| 524 |
logger.info(f"Output directory: {output_dir}")
|
| 525 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
| 526 |
|
| 527 |
+
# Check GPU availability
|
| 528 |
+
gpu_count = torch.cuda.device_count()
|
| 529 |
+
logger.info(f"Found {gpu_count} GPU(s) available")
|
| 530 |
+
for i in range(gpu_count):
|
| 531 |
+
logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
|
| 532 |
+
|
| 533 |
# Load and prepare the dataset
|
| 534 |
dataset = load_and_prepare_dataset(dataset_name, config)
|
| 535 |
|
|
|
|
| 544 |
# Initialize model
|
| 545 |
logger.info("Initializing model (preserving 4-bit quantization)")
|
| 546 |
|
| 547 |
+
# Use full sequence length of 2048 as required for pre-tokenized dataset
|
| 548 |
+
max_seq_length = training_config.get("max_seq_length", 2048)
|
| 549 |
+
logger.info(f"Using sequence length: {max_seq_length} as required for pre-tokenized dataset")
|
| 550 |
|
| 551 |
# Create LoRA config directly
|
| 552 |
logger.info("Creating LoRA configuration")
|
|
|
|
| 602 |
reports = ["none"]
|
| 603 |
logger.warning("No reporting backends available - training metrics won't be logged")
|
| 604 |
|
| 605 |
+
# Optimize batch size for multi-GPU setup
|
| 606 |
+
# For 4x L4 GPUs (24GB each), we can safely use a larger batch size
|
| 607 |
+
per_device_train_batch_size = 4 if gpu_count >= 4 else 2
|
| 608 |
+
logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
|
| 609 |
|
| 610 |
training_args_dict = {
|
| 611 |
"output_dir": output_dir,
|
|
|
|
| 627 |
"logging_first_step": training_config.get("logging_first_step", True),
|
| 628 |
"disable_tqdm": training_config.get("disable_tqdm", False),
|
| 629 |
"remove_unused_columns": False,
|
| 630 |
+
"seed": 42,
|
| 631 |
+
"dataloader_num_workers": 4 # Use multiple workers for data loading
|
| 632 |
}
|
| 633 |
|
| 634 |
# Create TrainingArguments with validated parameters
|