Spaces:

George-API
/

qwen4bit

Sleeping

App Files Community

George-API committed on Mar 16

Commit

23c5657

verified ·

1 Parent(s): 493e679

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +24 -18

run_cloud_training.py CHANGED Viewed

@@ -2,7 +2,7 @@
 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for L40S GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -21,9 +21,9 @@ from peft import LoraConfig, get_peft_model
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, upload_folder
-# Basic environment setup for L40S
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
-os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
@@ -469,13 +469,17 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             use_4bit = False
             logger.warning("Using CPU mode without quantization")
         # For pre-quantized models, always use device_map="auto"
         if is_pre_quantized and is_gpu_available():
             logger.info("Loading pre-quantized model with GPU support")
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 device_map="auto",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 trust_remote_code=True,
                 use_cache=model_config.get("use_cache", False)
             )
@@ -484,9 +488,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             logger.info(f"Loading model with 4-bit quantization")
             # Create quantization config for GPU
             bnb_config = BitsAndBytesConfig(
                 load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
                 bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
                 bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
             )
@@ -496,10 +501,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
                 model_name,
                 quantization_config=bnb_config,
                 device_map="auto",
-                torch_dtype=torch.float16,
                 trust_remote_code=True,
                 use_cache=model_config.get("use_cache", False),
-                attn_implementation=hardware_config.get("attn_implementation", "eager")
             )
         else:
             # CPU fallback (or non-quantized GPU) mode
@@ -571,14 +576,14 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
                 gpu_info = torch.cuda.get_device_properties(0)
                 logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
-                # Check if it's an L40S or high-memory GPU
-                if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-                    logger.info("Detected L40S GPU - optimizing for high-memory GPU")
-                    per_device_train_batch_size = training_config.get("per_device_train_batch_size", 2)
                 else:
                     # Use a smaller batch size for other GPUs
                     per_device_train_batch_size = 2
-                    logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
             else:
                 # Use minimal batch size for CPU
                 per_device_train_batch_size = 1
@@ -587,9 +592,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         # Use full training parameters for pre-quantized models or GPU mode
         if is_pre_quantized or can_use_4bit or not is_running_in_space():
             num_train_epochs = training_config.get("num_train_epochs", 3)
-            gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 4)
-            fp16 = torch.cuda.is_available() and hardware_config.get("fp16", True)
-            bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
             # Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
             gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
             dataloader_workers = training_config.get("dataloader_num_workers", 4)
@@ -633,14 +638,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             logging_steps=training_config.get("logging_steps", 10),
             save_steps=training_config.get("save_steps", 200),
             save_total_limit=training_config.get("save_total_limit", 3),
-            eval_strategy=eval_strategy,  # Updated from evaluation_strategy
             load_best_model_at_end=load_best_model_at_end,
             report_to=reports,
             logging_first_step=training_config.get("logging_first_step", True),
             disable_tqdm=training_config.get("disable_tqdm", False),
             remove_unused_columns=False,
             gradient_checkpointing=gradient_checkpointing,
-            dataloader_num_workers=dataloader_workers
         )
         # Create trainer with pre-tokenized collator

 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
+- Optimized for A100 GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, upload_folder
+# Basic environment setup for A100
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
+os.environ["NCCL_P2P_DISABLE"] = "1"  # Can help with A100 multi-GPU setups
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
             use_4bit = False
             logger.warning("Using CPU mode without quantization")
+        # Determine compute dtype based on hardware config
+        compute_dtype = torch.bfloat16 if hardware_config.get("bf16", False) else torch.float16
+        logger.info(f"Using compute dtype: {compute_dtype}")
         # For pre-quantized models, always use device_map="auto"
         if is_pre_quantized and is_gpu_available():
             logger.info("Loading pre-quantized model with GPU support")
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 device_map="auto",
+                torch_dtype=compute_dtype,
                 trust_remote_code=True,
                 use_cache=model_config.get("use_cache", False)
             )
             logger.info(f"Loading model with 4-bit quantization")
             # Create quantization config for GPU
+            bnb_compute_dtype = torch.bfloat16 if quant_config.get("bnb_4bit_compute_dtype", "float16") == "bfloat16" else torch.float16
             bnb_config = BitsAndBytesConfig(
                 load_in_4bit=True,
+                bnb_4bit_compute_dtype=bnb_compute_dtype,
                 bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
                 bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
             )
                 model_name,
                 quantization_config=bnb_config,
                 device_map="auto",
+                torch_dtype=compute_dtype,
                 trust_remote_code=True,
                 use_cache=model_config.get("use_cache", False),
+                attn_implementation=hardware_config.get("attn_implementation", "flash_attention_2")
             )
         else:
             # CPU fallback (or non-quantized GPU) mode
                 gpu_info = torch.cuda.get_device_properties(0)
                 logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+                # Check if it's an A100 or high-memory GPU
+                if "A100" in gpu_info.name or "A10G" in gpu_info.name or gpu_info.total_memory > 40e9:
+                    logger.info("Detected A100 GPU - optimizing for A100")
+                    per_device_train_batch_size = training_config.get("per_device_train_batch_size", 3)
                 else:
                     # Use a smaller batch size for other GPUs
                     per_device_train_batch_size = 2
+                    logger.info(f"Using conservative batch size for non-A100 GPU: {per_device_train_batch_size}")
             else:
                 # Use minimal batch size for CPU
                 per_device_train_batch_size = 1
         # Use full training parameters for pre-quantized models or GPU mode
         if is_pre_quantized or can_use_4bit or not is_running_in_space():
             num_train_epochs = training_config.get("num_train_epochs", 3)
+            gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 2)
+            fp16 = torch.cuda.is_available() and hardware_config.get("fp16", False)
+            bf16 = torch.cuda.is_available() and hardware_config.get("bf16", True)
             # Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
             gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
             dataloader_workers = training_config.get("dataloader_num_workers", 4)
             logging_steps=training_config.get("logging_steps", 10),
             save_steps=training_config.get("save_steps", 200),
             save_total_limit=training_config.get("save_total_limit", 3),
+            eval_strategy=eval_strategy,
             load_best_model_at_end=load_best_model_at_end,
             report_to=reports,
             logging_first_step=training_config.get("logging_first_step", True),
             disable_tqdm=training_config.get("disable_tqdm", False),
             remove_unused_columns=False,
             gradient_checkpointing=gradient_checkpointing,
+            dataloader_num_workers=dataloader_workers,
+            group_by_length=training_config.get("group_by_length", True)
         )
         # Create trainer with pre-tokenized collator