Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files
- run_cloud_training.py +24 -18
run_cloud_training.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
"""
|
| 4 |
Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
|
| 5 |
-
- Optimized for
|
| 6 |
- Research training only (no inference)
|
| 7 |
- CLOUD BASED TRAINING - Hugging Face Spaces
|
| 8 |
"""
|
|
@@ -21,9 +21,9 @@ from peft import LoraConfig, get_peft_model
|
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
from huggingface_hub import HfApi, upload_folder
|
| 23 |
|
| 24 |
-
# Basic environment setup for
|
| 25 |
-
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:
|
| 26 |
-
os.environ["
|
| 27 |
|
| 28 |
# Force GPU mode in Space if we're using a pre-quantized model
|
| 29 |
os.environ["FORCE_GPU"] = "1"
|
|
@@ -469,13 +469,17 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 469 |
use_4bit = False
|
| 470 |
logger.warning("Using CPU mode without quantization")
|
| 471 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
# For pre-quantized models, always use device_map="auto"
|
| 473 |
if is_pre_quantized and is_gpu_available():
|
| 474 |
logger.info("Loading pre-quantized model with GPU support")
|
| 475 |
model = AutoModelForCausalLM.from_pretrained(
|
| 476 |
model_name,
|
| 477 |
device_map="auto",
|
| 478 |
-
torch_dtype=
|
| 479 |
trust_remote_code=True,
|
| 480 |
use_cache=model_config.get("use_cache", False)
|
| 481 |
)
|
|
@@ -484,9 +488,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 484 |
logger.info(f"Loading model with 4-bit quantization")
|
| 485 |
|
| 486 |
# Create quantization config for GPU
|
|
|
|
| 487 |
bnb_config = BitsAndBytesConfig(
|
| 488 |
load_in_4bit=True,
|
| 489 |
-
bnb_4bit_compute_dtype=
|
| 490 |
bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
|
| 491 |
bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
|
| 492 |
)
|
|
@@ -496,10 +501,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 496 |
model_name,
|
| 497 |
quantization_config=bnb_config,
|
| 498 |
device_map="auto",
|
| 499 |
-
torch_dtype=
|
| 500 |
trust_remote_code=True,
|
| 501 |
use_cache=model_config.get("use_cache", False),
|
| 502 |
-
attn_implementation=hardware_config.get("attn_implementation", "
|
| 503 |
)
|
| 504 |
else:
|
| 505 |
# CPU fallback (or non-quantized GPU) mode
|
|
@@ -571,14 +576,14 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 571 |
gpu_info = torch.cuda.get_device_properties(0)
|
| 572 |
logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
|
| 573 |
|
| 574 |
-
# Check if it's an
|
| 575 |
-
if "
|
| 576 |
-
logger.info("Detected
|
| 577 |
-
per_device_train_batch_size = training_config.get("per_device_train_batch_size",
|
| 578 |
else:
|
| 579 |
# Use a smaller batch size for other GPUs
|
| 580 |
per_device_train_batch_size = 2
|
| 581 |
-
logger.info(f"Using conservative batch size for non-
|
| 582 |
else:
|
| 583 |
# Use minimal batch size for CPU
|
| 584 |
per_device_train_batch_size = 1
|
|
@@ -587,9 +592,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 587 |
# Use full training parameters for pre-quantized models or GPU mode
|
| 588 |
if is_pre_quantized or can_use_4bit or not is_running_in_space():
|
| 589 |
num_train_epochs = training_config.get("num_train_epochs", 3)
|
| 590 |
-
gradient_accumulation_steps = training_config.get("gradient_accumulation_steps",
|
| 591 |
-
fp16 = torch.cuda.is_available() and hardware_config.get("fp16",
|
| 592 |
-
bf16 = torch.cuda.is_available() and hardware_config.get("bf16",
|
| 593 |
# Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
|
| 594 |
gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
|
| 595 |
dataloader_workers = training_config.get("dataloader_num_workers", 4)
|
|
@@ -633,14 +638,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
|
|
| 633 |
logging_steps=training_config.get("logging_steps", 10),
|
| 634 |
save_steps=training_config.get("save_steps", 200),
|
| 635 |
save_total_limit=training_config.get("save_total_limit", 3),
|
| 636 |
-
eval_strategy=eval_strategy,
|
| 637 |
load_best_model_at_end=load_best_model_at_end,
|
| 638 |
report_to=reports,
|
| 639 |
logging_first_step=training_config.get("logging_first_step", True),
|
| 640 |
disable_tqdm=training_config.get("disable_tqdm", False),
|
| 641 |
remove_unused_columns=False,
|
| 642 |
gradient_checkpointing=gradient_checkpointing,
|
| 643 |
-
dataloader_num_workers=dataloader_workers
|
|
|
|
| 644 |
)
|
| 645 |
|
| 646 |
# Create trainer with pre-tokenized collator
|
|
|
|
| 2 |
|
| 3 |
"""
|
| 4 |
Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
|
| 5 |
+
- Optimized for A100 GPU with pre-tokenized datasets
|
| 6 |
- Research training only (no inference)
|
| 7 |
- CLOUD BASED TRAINING - Hugging Face Spaces
|
| 8 |
"""
|
|
|
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
from huggingface_hub import HfApi, upload_folder
|
| 23 |
|
| 24 |
+
# Basic environment setup for A100
|
| 25 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
|
| 26 |
+
os.environ["NCCL_P2P_DISABLE"] = "1" # Can help with A100 multi-GPU setups
|
| 27 |
|
| 28 |
# Force GPU mode in Space if we're using a pre-quantized model
|
| 29 |
os.environ["FORCE_GPU"] = "1"
|
|
|
|
| 469 |
use_4bit = False
|
| 470 |
logger.warning("Using CPU mode without quantization")
|
| 471 |
|
| 472 |
+
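# Determine compute dtype based on hardware config (NOTE(review): this falls back to float16 when "bf16" is unset, while the training arguments further down default bf16 to True - confirm the two defaults are meant to differ)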
|
| 473 |
+
compute_dtype = torch.bfloat16 if hardware_config.get("bf16", False) else torch.float16
|
| 474 |
+
logger.info(f"Using compute dtype: {compute_dtype}")
|
| 475 |
+
|
| 476 |
# For pre-quantized models, always use device_map="auto"
|
| 477 |
if is_pre_quantized and is_gpu_available():
|
| 478 |
logger.info("Loading pre-quantized model with GPU support")
|
| 479 |
model = AutoModelForCausalLM.from_pretrained(
|
| 480 |
model_name,
|
| 481 |
device_map="auto",
|
| 482 |
+
torch_dtype=compute_dtype,
|
| 483 |
trust_remote_code=True,
|
| 484 |
use_cache=model_config.get("use_cache", False)
|
| 485 |
)
|
|
|
|
| 488 |
logger.info(f"Loading model with 4-bit quantization")
|
| 489 |
|
| 490 |
# Create quantization config for GPU
|
| 491 |
+
bnb_compute_dtype = torch.bfloat16 if quant_config.get("bnb_4bit_compute_dtype", "float16") == "bfloat16" else torch.float16
|
| 492 |
bnb_config = BitsAndBytesConfig(
|
| 493 |
load_in_4bit=True,
|
| 494 |
+
bnb_4bit_compute_dtype=bnb_compute_dtype,
|
| 495 |
bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
|
| 496 |
bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
|
| 497 |
)
|
|
|
|
| 501 |
model_name,
|
| 502 |
quantization_config=bnb_config,
|
| 503 |
device_map="auto",
|
| 504 |
+
torch_dtype=compute_dtype,
|
| 505 |
trust_remote_code=True,
|
| 506 |
use_cache=model_config.get("use_cache", False),
|
| 507 |
+
attn_implementation=hardware_config.get("attn_implementation", "flash_attention_2")
|
| 508 |
)
|
| 509 |
else:
|
| 510 |
# CPU fallback (or non-quantized GPU) mode
|
|
|
|
| 576 |
gpu_info = torch.cuda.get_device_properties(0)
|
| 577 |
logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
|
| 578 |
|
| 579 |
+
# Check if it's an A100, an A10G, or any other GPU reporting more than 40e9 bytes of memory
|
| 580 |
+
if "A100" in gpu_info.name or "A10G" in gpu_info.name or gpu_info.total_memory > 40e9:
|
| 581 |
+
logger.info("Detected A100 GPU - optimizing for A100")
|
| 582 |
+
per_device_train_batch_size = training_config.get("per_device_train_batch_size", 3)
|
| 583 |
else:
|
| 584 |
# Use a smaller batch size for other GPUs
|
| 585 |
per_device_train_batch_size = 2
|
| 586 |
+
logger.info(f"Using conservative batch size for non-A100 GPU: {per_device_train_batch_size}")
|
| 587 |
else:
|
| 588 |
# Use minimal batch size for CPU
|
| 589 |
per_device_train_batch_size = 1
|
|
|
|
| 592 |
# Use full training parameters for pre-quantized models or GPU mode
|
| 593 |
if is_pre_quantized or can_use_4bit or not is_running_in_space():
|
| 594 |
num_train_epochs = training_config.get("num_train_epochs", 3)
|
| 595 |
+
gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 2)
|
| 596 |
+
fp16 = torch.cuda.is_available() and hardware_config.get("fp16", False)
|
| 597 |
+
bf16 = torch.cuda.is_available() and hardware_config.get("bf16", True)
|
| 598 |
# Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
|
| 599 |
gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
|
| 600 |
dataloader_workers = training_config.get("dataloader_num_workers", 4)
|
|
|
|
| 638 |
logging_steps=training_config.get("logging_steps", 10),
|
| 639 |
save_steps=training_config.get("save_steps", 200),
|
| 640 |
save_total_limit=training_config.get("save_total_limit", 3),
|
| 641 |
+
eval_strategy=eval_strategy,
|
| 642 |
load_best_model_at_end=load_best_model_at_end,
|
| 643 |
report_to=reports,
|
| 644 |
logging_first_step=training_config.get("logging_first_step", True),
|
| 645 |
disable_tqdm=training_config.get("disable_tqdm", False),
|
| 646 |
remove_unused_columns=False,
|
| 647 |
gradient_checkpointing=gradient_checkpointing,
|
| 648 |
+
dataloader_num_workers=dataloader_workers,
|
| 649 |
+
group_by_length=training_config.get("group_by_length", True)
|
| 650 |
)
|
| 651 |
|
| 652 |
# Create trainer with pre-tokenized collator
|