Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py CHANGED (+32 -3)
@@ -28,6 +28,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
 
+# Disable tokenizers parallelism warning
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 # Create triton directory to avoid warning
 os.makedirs(os.path.expanduser("~/.triton/autotune"), exist_ok=True)
 
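Note: TOKENIZERS_PARALLELISM only takes effect if it is set before the Rust tokenizers backend is first used, which is why the commit places it at module import time. A minimal sketch of the behavior, with an illustrative checkpoint name:

    import os

    # Must run before any tokenizer call; "false" suppresses the
    # "huggingface/tokenizers: The current process just got forked..."
    # warning when DataLoader workers fork the process later.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
    print(tokenizer("hello world")["input_ids"])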
@@ -524,12 +527,24 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
 
     # Create LoRA config
     logger.info("Creating LoRA configuration")
+
+    # For pre-quantized models, we need proper target modules
+    default_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
+
+    # For pre-quantized models, especially Unsloth ones, we need to be careful with the target modules
+    if is_pre_quantized:
+        # For Unsloth models, use special configuration
+        if "unsloth" in model_name.lower():
+            default_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+            logger.info("Using Unsloth-specific LoRA target modules")
+
     lora_config_obj = LoraConfig(
         r=lora_config.get("r", 16),
         lora_alpha=lora_config.get("lora_alpha", 32),
         lora_dropout=lora_config.get("lora_dropout", 0.05),
         bias=lora_config.get("bias", "none"),
-
+        task_type="CAUSAL_LM",  # Explicitly set the task type
+        target_modules=lora_config.get("target_modules", default_target_modules)
     )
 
     # Apply LoRA to model
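As a sanity check on the chosen target-module names, the linear layers of a Llama-style checkpoint can be listed directly. A sketch under the assumption of a Llama-architecture model; the model id is illustrative:

    import torch
    from transformers import AutoModelForCausalLM

    # Illustrative model id; any Llama-style checkpoint works. bitsandbytes
    # Linear4bit layers subclass torch.nn.Linear, so this also covers
    # pre-quantized checkpoints.
    model = AutoModelForCausalLM.from_pretrained("unsloth/llama-2-7b-bnb-4bit")
    linear_names = sorted({name.rsplit(".", 1)[-1]
                           for name, module in model.named_modules()
                           if isinstance(module, torch.nn.Linear)})
    print(linear_names)  # expect q/k/v/o_proj plus gate/up/down_proj (and lm_head)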
@@ -537,6 +552,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     model = get_peft_model(model, lora_config_obj)
     logger.info("Successfully applied LoRA")
 
+    # Ensure model parameters that need gradients are properly set
+    if is_pre_quantized:
+        logger.info("Verifying gradient settings for pre-quantized model")
+        for name, param in model.named_parameters():
+            if 'lora' in name:  # Only LoRA parameters should be trained
+                if not param.requires_grad:
+                    logger.warning(f"LoRA parameter {name} doesn't have requires_grad=True, fixing...")
+                    param.requires_grad = True
+
     # Always use minimal batch size for HF Space CPU
     if is_running_in_space() and not can_use_4bit and not is_pre_quantized:
         per_device_train_batch_size = 1
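PEFT also ships a built-in summary that complements the manual requires_grad loop above; a short sketch of both checks on the wrapped model:

    # After get_peft_model(), PEFT can report what is actually trainable. On a
    # correctly wrapped model only the LoRA adapters should count, printing
    # something like "trainable params: ... || all params: ... || trainable%: 0.06".
    model.print_trainable_parameters()

    # Equivalent manual count, mirroring the loop in the diff:
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable} / {total} ({100 * trainable / total:.4f}%)")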
@@ -566,10 +590,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 4)
         fp16 = torch.cuda.is_available() and hardware_config.get("fp16", True)
         bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
-
+        # Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
+        gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
-        eval_strategy = training_config.get("eval_strategy", "no")
+        eval_strategy = training_config.get("eval_strategy", "no")
         load_best_model_at_end = False  # Must be False when eval_strategy is "no"
+
+        if is_pre_quantized:
+            logger.info("Disabled gradient checkpointing for pre-quantized model to avoid gradient issues")
+
         logger.info("Using full training parameters for GPU mode")
     else:
         # For Space CPU training mode, use minimal parameters
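For context, these flags are the kind of values that would typically feed into TrainingArguments further down the function. A sketch assuming the script uses the standard Trainer API; output_dir is illustrative, the other names are the variables computed in the diff, and transformers releases older than 4.41 spell eval_strategy as evaluation_strategy:

    from transformers import TrainingArguments

    training_args = TrainingArguments(
        output_dir="./outputs",                        # illustrative path
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
        bf16=bf16,
        gradient_checkpointing=gradient_checkpointing,  # False for pre-quantized models
        dataloader_num_workers=dataloader_workers,
        eval_strategy=eval_strategy,
        load_best_model_at_end=load_best_model_at_end,  # must stay False with eval_strategy="no"
    )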