Spaces:

George-API
/

qwen4bit

Sleeping

App Files Files Community

George-API committed on Mar 11

Commit

f3c357b

verified ·

1 Parent(s): 6b2c2bc

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +92 -3

run_cloud_training.py CHANGED Viewed

@@ -14,6 +14,7 @@ import argparse
 import numpy as np
 from dotenv import load_dotenv
 import torch
 from datasets import load_dataset
 import transformers
 from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
@@ -26,6 +27,21 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 os.environ["XFORMERS_DISABLED"] = "1"
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
@@ -391,6 +407,41 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["XFORMERS_DISABLED"] = "1"
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
@@ -412,7 +463,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 max_seq_length=max_seq_length,
                 dtype=dtype,
                 quantization_config=bnb_config,
-                attn_implementation="eager"  # Force eager attention
             )
             logger.info("Model loaded successfully with unsloth")
@@ -424,6 +477,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                     model.config.use_flash_attention = False
                 if hasattr(model.config, 'use_flash_attention_2'):
                     model.config.use_flash_attention_2 = False
             return model, tokenizer
@@ -432,7 +487,11 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             logger.info("Falling back to standard Hugging Face loading...")
             # We'll try with HF loading
-            attn_params = {"attn_implementation": "eager"}  # Always use eager
             # Approach 1: Using attn_implementation parameter (newer method)
             try:
@@ -446,6 +505,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                     config.use_flash_attention = False
                 if hasattr(config, 'use_flash_attention_2'):
                     config.use_flash_attention_2 = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -476,6 +537,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                     config.use_flash_attention = False
                 if hasattr(config, 'use_flash_attention_2'):
                     config.use_flash_attention_2 = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -486,7 +549,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                     device_map="auto",
                     torch_dtype=dtype or torch.float16,
                     quantization_config=bnb_config,
-                    trust_remote_code=True
                 )
                 logger.info("Model loaded successfully with basic HF loading")
                 return model, tokenizer
@@ -513,6 +579,28 @@ def train(config_path, dataset_name, output_dir):
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
     # Update flash attention setting to always use eager
     global flash_attention_available
     flash_attention_available = False
@@ -521,6 +609,7 @@ def train(config_path, dataset_name, output_dir):
     # Update hardware config to ensure eager attention
     hardware_config["attn_implementation"] = "eager"
     hardware_config["use_flash_attention"] = False
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)

 import numpy as np
 from dotenv import load_dotenv
 import torch
+import sys
 from datasets import load_dataset
 import transformers
 from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 os.environ["XFORMERS_DISABLED"] = "1"
+# Completely disable xformers by removing it from sys.modules if it's loaded
+if 'xformers' in sys.modules:
+    del sys.modules['xformers']
+if 'xformers.ops' in sys.modules:
+    del sys.modules['xformers.ops']
+# Patch transformers to prevent xformers import
+def prevent_xformers_import(name, *args, **kwargs):
+    if 'xformers' in name:
+        raise ImportError(f"Import of {name} prevented")
+    return original_import(name, *args, **kwargs)
+original_import = __import__
+__builtins__['__import__'] = prevent_xformers_import
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["XFORMERS_DISABLED"] = "1"
+    # Patch transformers attention implementation
+    try:
+        # Try to patch transformers attention implementation to avoid xformers
+        import transformers.models.llama.modeling_llama as llama_modeling
+        # Store original attention implementation
+        if not hasattr(llama_modeling, '_original_forward'):
+            # Only patch if not already patched
+            logger.info("Patching LLaMA attention implementation to avoid xformers")
+            # Store original implementation
+            if hasattr(llama_modeling.LlamaAttention, 'forward'):
+                llama_modeling._original_forward = llama_modeling.LlamaAttention.forward
+                # Define a new forward method that doesn't use xformers
+                def safe_attention_forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False):
+                    logger.info("Using safe attention implementation (no xformers)")
+                    # Force use_flash_attention to False
+                    self._attn_implementation = "eager"
+                    if hasattr(self, 'use_flash_attention'):
+                        self.use_flash_attention = False
+                    if hasattr(self, 'use_flash_attention_2'):
+                        self.use_flash_attention_2 = False
+                    # Call original implementation with flash attention disabled
+                    return llama_modeling._original_forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
+                # Replace the forward method
+                llama_modeling.LlamaAttention.forward = safe_attention_forward
+                logger.info("Successfully patched LLaMA attention implementation")
+    except Exception as e:
+        logger.warning(f"Failed to patch attention implementation: {e}")
+        logger.info("Will try to proceed with standard loading")
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
                 max_seq_length=max_seq_length,
                 dtype=dtype,
                 quantization_config=bnb_config,
+                attn_implementation="eager",  # Force eager attention
+                use_flash_attention=False,    # Explicitly disable flash attention
+                use_xformers_attention=False  # Explicitly disable xformers
             )
             logger.info("Model loaded successfully with unsloth")
                     model.config.use_flash_attention = False
                 if hasattr(model.config, 'use_flash_attention_2'):
                     model.config.use_flash_attention_2 = False
+                if hasattr(model.config, 'use_xformers_attention'):
+                    model.config.use_xformers_attention = False
             return model, tokenizer
             logger.info("Falling back to standard Hugging Face loading...")
             # We'll try with HF loading
+            attn_params = {
+                "attn_implementation": "eager",  # Always use eager
+                "use_flash_attention": False,    # Explicitly disable flash attention
+                "use_xformers_attention": False  # Explicitly disable xformers
+            }
             # Approach 1: Using attn_implementation parameter (newer method)
             try:
                     config.use_flash_attention = False
                 if hasattr(config, 'use_flash_attention_2'):
                     config.use_flash_attention_2 = False
+                if hasattr(config, 'use_xformers_attention'):
+                    config.use_xformers_attention = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                     config.use_flash_attention = False
                 if hasattr(config, 'use_flash_attention_2'):
                     config.use_flash_attention_2 = False
+                if hasattr(config, 'use_xformers_attention'):
+                    config.use_xformers_attention = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                     device_map="auto",
                     torch_dtype=dtype or torch.float16,
                     quantization_config=bnb_config,
+                    trust_remote_code=True,
+                    attn_implementation="eager",
+                    use_flash_attention=False,
+                    use_xformers_attention=False
                 )
                 logger.info("Model loaded successfully with basic HF loading")
                 return model, tokenizer
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+    # Monkey patch torch.nn.functional to disable memory_efficient_attention
+    try:
+        import torch.nn.functional as F
+        if hasattr(F, 'scaled_dot_product_attention'):
+            logger.info("Monkey patching torch.nn.functional.scaled_dot_product_attention")
+            original_sdpa = F.scaled_dot_product_attention
+            def safe_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
+                # Force disable memory efficient attention
+                logger.info("Using safe scaled_dot_product_attention (no xformers)")
+                return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+            F.scaled_dot_product_attention = safe_sdpa
+    except Exception as e:
+        logger.warning(f"Failed to patch scaled_dot_product_attention: {e}")
+    # Completely remove xformers from sys.modules if it's loaded
+    for module_name in list(sys.modules.keys()):
+        if 'xformers' in module_name:
+            logger.info(f"Removing {module_name} from sys.modules")
+            del sys.modules[module_name]
     # Update flash attention setting to always use eager
     global flash_attention_available
     flash_attention_available = False
     # Update hardware config to ensure eager attention
     hardware_config["attn_implementation"] = "eager"
     hardware_config["use_flash_attention"] = False
+    hardware_config["use_xformers_attention"] = False
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)