Spaces:

George-API
/

qwen4bit

Sleeping

App Files Files Community

George-API committed on Mar 11

Commit

00a06ef

verified ·

1 Parent(s): 2457cec

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +95 -67

run_cloud_training.py CHANGED Viewed

@@ -21,12 +21,14 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-# Disable flash attention globally
-os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Configure logging first
 logging.basicConfig(
     level=logging.INFO,
@@ -211,7 +213,7 @@ class PreTokenizedCollator(DataCollatorMixin):
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for string conversion
     def __call__(self, features):
         # Print a sample feature to understand structure
@@ -221,66 +223,73 @@ class PreTokenizedCollator(DataCollatorMixin):
         # Extract input_ids from conversations if needed
         processed_features = []
         for feature in features:
             # If input_ids is not directly available, try to extract from conversations
             if 'input_ids' not in feature and 'conversations' in feature:
                 # Extract from conversations based on your dataset structure
                 conversations = feature['conversations']
-                # Debug the conversations structure
-                logger.info(f"Conversations type: {type(conversations)}")
-                if isinstance(conversations, list) and len(conversations) > 0:
-                    logger.info(f"First conversation type: {type(conversations[0])}")
-                    logger.info(f"First conversation: {conversations[0]}")
                 # Try different approaches to extract input_ids
                 if isinstance(conversations, list) and len(conversations) > 0:
-                    # Case 1: If conversations is a list of dicts with 'content' field
-                    if isinstance(conversations[0], dict) and 'content' in conversations[0]:
-                        content = conversations[0]['content']
-                        logger.info(f"Found content field: {type(content)}")
-                        # If content is a string, tokenize it
-                        if isinstance(content, str) and self.tokenizer:
-                            logger.info(f"Tokenizing string content: {content[:50]}...")
-                            feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
-                        # If content is already a list of integers, use it directly
-                        elif isinstance(content, list) and all(isinstance(x, int) for x in content):
-                            feature['input_ids'] = content
-                        # If content is already tokenized in some other format
-                        else:
-                            logger.warning(f"Unexpected content format: {type(content)}")
-                    # Case 2: If conversations is a list of dicts with 'input_ids' field
-                    elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
                         feature['input_ids'] = conversations[0]['input_ids']
-                    # Case 3: If conversations itself contains the input_ids
                     elif all(isinstance(x, int) for x in conversations):
                         feature['input_ids'] = conversations
-                    # Case 4: If conversations is a list of strings
-                    elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
-                        # Join all strings and tokenize
-                        full_text = " ".join(conversations)
-                        feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)
             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
-                # If input_ids is a string, tokenize it
-                if isinstance(feature['input_ids'], str) and self.tokenizer:
-                    logger.info(f"Converting string input_ids to tokens: {feature['input_ids'][:50]}...")
-                    feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
                         feature['input_ids'] = list(feature['input_ids'])
                     except:
                         logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
             processed_features.append(feature)
         # If we still don't have input_ids, log an error
-        if len(processed_features) > 0 and 'input_ids' not in processed_features[0]:
             logger.error(f"Could not find input_ids in features. Available keys: {list(processed_features[0].keys())}")
             if 'conversations' in processed_features[0]:
                 logger.error(f"Conversations structure: {processed_features[0]['conversations'][:1]}")
@@ -344,6 +353,11 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     """
     global flash_attention_available
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
@@ -364,37 +378,42 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 model_name=model_name,
                 max_seq_length=max_seq_length,
                 dtype=dtype,
-                quantization_config=bnb_config
             )
             logger.info("Model loaded successfully with unsloth")
             return model, tokenizer
         except Exception as e:
             logger.warning(f"Unsloth loading failed: {e}")
             logger.info("Falling back to standard Hugging Face loading...")
-            # We'll try two approaches with HF loading
-            attn_params = {}
-            # If flash attention is available, try to use it
-            if flash_attention_available:
-                logger.info("Flash Attention is available - setting appropriate parameters")
-                # For newer models that support attn_implementation parameter
-                attn_params = {"attn_implementation": "eager"}  # Default to eager for compatibility
-                # Try to use flash attention if available
-                try:
-                    # Try importing flash attention to confirm it's available
-                    import flash_attn
-                    logger.info(f"Using Flash Attention version {flash_attn.__version__}")
-                    attn_params = {"attn_implementation": "flash_attention_2"}
-                except Exception as flash_error:
-                    logger.warning(f"Flash Attention import failed: {flash_error}")
             # Approach 1: Using attn_implementation parameter (newer method)
             try:
                 logger.info(f"Trying HF loading with attention parameters: {attn_params}")
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                 # The proper way to set attention implementation in newer transformers
@@ -416,6 +435,15 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 # Approach 2: Complete fallback with minimal parameters
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                 # Most basic loading without any attention parameters
@@ -447,19 +475,19 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
-    # Update flash attention setting based on availability
     global flash_attention_available
-    if flash_attention_available:
-        logger.info("Flash Attention is available - updating configuration")
-        # If flash attention is available, set attn_implementation to flash_attention_2
-        hardware_config["attn_implementation"] = "flash_attention_2"
-    else:
-        logger.info("Flash Attention not available - setting to eager attention")
-        hardware_config["attn_implementation"] = "eager"
-    # Override flash attention setting to disable it if there are compatibility issues
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-    logger.info("Flash attention has been DISABLED globally via environment variable")
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)

 from peft import LoraConfig
 from unsloth import FastLanguageModel
+# Disable all attention optimizations that might cause issues
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["XFORMERS_DISABLED"] = "1"
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # Configure logging first
 logging.basicConfig(
     level=logging.INFO,
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for debugging only
     def __call__(self, features):
         # Print a sample feature to understand structure
         # Extract input_ids from conversations if needed
         processed_features = []
         for feature in features:
+            # If input_ids is directly available, use it without tokenization
+            if 'input_ids' in feature and isinstance(feature['input_ids'], list):
+                # Already tokenized, no processing needed
+                processed_features.append(feature)
+                continue
             # If input_ids is not directly available, try to extract from conversations
             if 'input_ids' not in feature and 'conversations' in feature:
                 # Extract from conversations based on your dataset structure
                 conversations = feature['conversations']
+                # Debug the conversations structure (logged in every batch until its first example is accepted)
+                if len(processed_features) == 0:
+                    logger.info(f"Conversations type: {type(conversations)}")
+                    if isinstance(conversations, list) and len(conversations) > 0:
+                        logger.info(f"First conversation type: {type(conversations[0])}")
                 # Try different approaches to extract input_ids
                 if isinstance(conversations, list) and len(conversations) > 0:
+                    # Case 1: If conversations is a list of dicts with 'input_ids' field (pre-tokenized)
+                    if isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
                         feature['input_ids'] = conversations[0]['input_ids']
+                    # Case 2: If conversations itself contains the input_ids (pre-tokenized)
                     elif all(isinstance(x, int) for x in conversations):
                         feature['input_ids'] = conversations
+                    # Case 3: If conversations is a list of dicts with 'content' field
+                    # This should be avoided for pre-tokenized datasets
+                    elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                        content = conversations[0]['content']
+                        # If content is already a list of integers, use it directly
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            feature['input_ids'] = content
+                        # AVOID TOKENIZATION: Log warning if content is a string
+                        elif isinstance(content, str):
+                            logger.warning("Found string content in pre-tokenized dataset. This should not happen.")
+                            logger.warning("Skipping this example to avoid tokenization.")
+                            continue
             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
+                # AVOID TOKENIZATION: Skip string input_ids
+                if isinstance(feature['input_ids'], str):
+                    logger.warning("Found string input_ids in pre-tokenized dataset. This should not happen.")
+                    logger.warning("Skipping this example to avoid tokenization.")
+                    continue
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
                         feature['input_ids'] = list(feature['input_ids'])
                     except:
                         logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
+                        continue
+            else:
+                logger.warning("No input_ids found in this example. Skipping.")
+                continue
             processed_features.append(feature)
         # If we still don't have input_ids, log an error
+        if len(processed_features) == 0:
+            logger.error("No valid examples found in batch. Check dataset format.")
+            raise ValueError("No valid examples found. Please check dataset structure.")
+        if 'input_ids' not in processed_features[0]:
             logger.error(f"Could not find input_ids in features. Available keys: {list(processed_features[0].keys())}")
             if 'conversations' in processed_features[0]:
                 logger.error(f"Conversations structure: {processed_features[0]['conversations'][:1]}")
     """
     global flash_attention_available
+    # Force disable flash attention and xformers
+    flash_attention_available = False
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    os.environ["XFORMERS_DISABLED"] = "1"
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
                 model_name=model_name,
                 max_seq_length=max_seq_length,
                 dtype=dtype,
+                quantization_config=bnb_config,
+                attn_implementation="eager"  # Force eager attention
             )
             logger.info("Model loaded successfully with unsloth")
+            # Explicitly disable flash attention in model config
+            if hasattr(model, 'config'):
+                if hasattr(model.config, 'attn_implementation'):
+                    model.config.attn_implementation = "eager"
+                if hasattr(model.config, 'use_flash_attention'):
+                    model.config.use_flash_attention = False
+                if hasattr(model.config, 'use_flash_attention_2'):
+                    model.config.use_flash_attention_2 = False
             return model, tokenizer
         except Exception as e:
             logger.warning(f"Unsloth loading failed: {e}")
             logger.info("Falling back to standard Hugging Face loading...")
+            # We'll try with HF loading
+            attn_params = {"attn_implementation": "eager"}  # Always use eager
             # Approach 1: Using attn_implementation parameter (newer method)
             try:
                 logger.info(f"Trying HF loading with attention parameters: {attn_params}")
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                # Disable flash attention in config
+                if hasattr(config, 'attn_implementation'):
+                    config.attn_implementation = "eager"
+                if hasattr(config, 'use_flash_attention'):
+                    config.use_flash_attention = False
+                if hasattr(config, 'use_flash_attention_2'):
+                    config.use_flash_attention_2 = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                 # The proper way to set attention implementation in newer transformers
                 # Approach 2: Complete fallback with minimal parameters
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                # Disable flash attention in config
+                if hasattr(config, 'attn_implementation'):
+                    config.attn_implementation = "eager"
+                if hasattr(config, 'use_flash_attention'):
+                    config.use_flash_attention = False
+                if hasattr(config, 'use_flash_attention_2'):
+                    config.use_flash_attention_2 = False
                 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                 # Most basic loading without any attention parameters
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
+    # Force disable flash attention and xformers
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    os.environ["XFORMERS_DISABLED"] = "1"
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+    # Update flash attention setting to always use eager
     global flash_attention_available
+    flash_attention_available = False
+    logger.info("Flash Attention has been DISABLED globally")
+    # Update hardware config to ensure eager attention
+    hardware_config["attn_implementation"] = "eager"
+    hardware_config["use_flash_attention"] = False
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)