Upload folder using huggingface_hub

run_transformers_training.py  CHANGED  (+329 -62)
@@ -10,6 +10,7 @@ import logging
 from datetime import datetime
 import time
 import warnings
+import traceback
 from importlib.util import find_spec
 import multiprocessing
 import torch
@@ -31,64 +32,36 @@ if CUDA_AVAILABLE:
         # Method already set, which is fine
         print("Multiprocessing start method already set")
 
-#
-
-
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    handlers=[logging.StreamHandler(sys.stdout)]
-)
-logger = logging.getLogger(__name__)
-
-# Set other loggers to WARNING to reduce noise and ensure our logs are visible
-logging.getLogger("transformers").setLevel(logging.WARNING)
-logging.getLogger("datasets").setLevel(logging.WARNING)
-logging.getLogger("accelerate").setLevel(logging.WARNING)
-logging.getLogger("torch").setLevel(logging.WARNING)
-logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
-
-# Import Unsloth first, before other ML imports
-try:
-    from unsloth import FastLanguageModel
-    from unsloth.chat_templates import get_chat_template
-    unsloth_available = True
-    logger.info("Unsloth successfully imported")
-except ImportError:
-    unsloth_available = False
-    logger.warning("Unsloth not available. Please install with: pip install unsloth")
+# Import order is important: unsloth should be imported before transformers
+# Check for libraries without importing them
+unsloth_available = find_spec("unsloth") is not None
+if unsloth_available:
+    import unsloth
 
-#
-try:
+# Import torch first, then transformers if available
+import torch
+transformers_available = find_spec("transformers") is not None
+if transformers_available:
     import transformers
-    from transformers import (
-        AutoTokenizer,
-        TrainingArguments,
-        Trainer,
-        TrainerCallback,
-        set_seed,
-        BitsAndBytesConfig
-    )
-    logger.info(f"Transformers version: {transformers.__version__}")
-except ImportError:
-    logger.error("Transformers not available. This is a critical dependency.")
+    from transformers import AutoTokenizer, TrainingArguments, Trainer, set_seed
+    from torch.utils.data import DataLoader
 
-# Check availability of libraries
 peft_available = find_spec("peft") is not None
 if peft_available:
     import peft
-    logger.info(f"PEFT version: {peft.__version__}")
-else:
-    logger.warning("PEFT not available. Parameter-efficient fine-tuning will not be used.")
 
-#
-
+# Only import HF datasets if available
+datasets_available = find_spec("datasets") is not None
+if datasets_available:
     from datasets import load_dataset
-
-
-
+
+# Set up the logger
+logger = logging.getLogger(__name__)
+log_handler = logging.StreamHandler()
+log_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+log_handler.setFormatter(log_format)
+logger.addHandler(log_handler)
+logger.setLevel(logging.INFO)
 
 # Define a clean logging function for HF Space compatibility
 def log_info(message):
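The rewritten import block above probes for optional packages with importlib.util.find_spec instead of wrapping every import in try/except, so nothing heavy is imported unless it is actually installed, and unsloth can still be imported ahead of transformers. A minimal standalone sketch of the same pattern (the package names are only examples):

```python
from importlib.util import find_spec

# Probe for optional dependencies without importing them.
unsloth_available = find_spec("unsloth") is not None
peft_available = find_spec("peft") is not None

# Import only what is actually present.
if unsloth_available:
    import unsloth  # imported first so its patches apply before transformers loads

if peft_available:
    import peft
    print(f"PEFT version: {peft.__version__}")
```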
@@ -243,6 +216,17 @@ def load_model_and_tokenizer(config):
     chat_template = get_config_value(tokenizer_config, "chat_template", None)
     padding_side = get_config_value(tokenizer_config, "padding_side", "right")
 
+    # Check for flash attention
+    use_flash_attention = get_config_value(config, "use_flash_attention", False)
+    flash_attention_available = False
+    try:
+        import flash_attn
+        flash_attention_available = True
+        log_info(f"Flash Attention detected (version: {flash_attn.__version__})")
+    except ImportError:
+        if use_flash_attention:
+            log_info("Flash Attention requested but not available")
+
     log_info(f"Loading model: {model_name} (revision: {model_revision})")
     log_info(f"Max sequence length: {max_seq_length}")
 
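Every setting in this hunk is read through get_config_value(container, key, default), and later hunks pass dotted keys such as "training.gradient_checkpointing". The helper itself is defined elsewhere in the script; the sketch below is only a plausible reconstruction of what such a helper does, not the actual implementation:

```python
def get_config_value(config, key, default=None):
    """Fetch a possibly nested config value, falling back to a default.

    Supports dotted paths like "training.gradient_checkpointing".
    """
    value = config
    for part in key.split("."):
        if not isinstance(value, dict) or part not in value:
            return default
        value = value[part]
    return value

# Example usage:
cfg = {"training": {"gradient_checkpointing": True}}
assert get_config_value(cfg, "training.gradient_checkpointing", False) is True
assert get_config_value(cfg, "tokenizer.padding_side", "right") == "right"
```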
@@ -257,7 +241,7 @@ def load_model_and_tokenizer(config):
             dtype=get_config_value(config, "torch_dtype", "bfloat16"),
             revision=model_revision,
             trust_remote_code=trust_remote_code,
-            use_flash_attention_2=
+            use_flash_attention_2=use_flash_attention and flash_attention_available
         )
 
         # Configure tokenizer settings
@@ -294,11 +278,23 @@ def load_model_and_tokenizer(config):
             max_seq_length=max_seq_length,
             modules_to_save=None
         )
+
+        if use_flash_attention and flash_attention_available:
+            log_info("🚀 Using Flash Attention for faster training")
+        elif use_flash_attention and not flash_attention_available:
+            log_info("⚠️ Flash Attention requested but not available - using standard attention")
+
     else:
         # Standard HuggingFace loading
         log_info("Using standard HuggingFace model loading (Unsloth not available or disabled)")
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
+        # Check if flash attention should be enabled in config
+        use_attn_implementation = None
+        if use_flash_attention and flash_attention_available:
+            use_attn_implementation = "flash_attention_2"
+            log_info("🚀 Using Flash Attention for faster training")
+
         # Load tokenizer first
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
@@ -327,7 +323,8 @@ def load_model_and_tokenizer(config):
             trust_remote_code=trust_remote_code,
             revision=model_revision,
             torch_dtype=torch.bfloat16 if get_config_value(config, "torch_dtype", "bfloat16") == "bfloat16" else torch.float16,
-            device_map="auto" if CUDA_AVAILABLE else None
+            device_map="auto" if CUDA_AVAILABLE else None,
+            attn_implementation=use_attn_implementation
         )
 
         # Apply PEFT/LoRA if enabled but using standard loading
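In the standard (non-Unsloth) branch, Flash Attention is requested through the attn_implementation argument of from_pretrained rather than a separate flag. A self-contained sketch of that selection logic; the model id is a placeholder, flash_attention_2 additionally needs a CUDA GPU and fp16/bf16 weights, and device_map="auto" needs accelerate installed:

```python
import torch
from transformers import AutoModelForCausalLM

# Fall back to a standard attention implementation when flash-attn is missing.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "sdpa"  # PyTorch scaled-dot-product attention as a safe default

model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-causal-lm",  # placeholder model id
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    attn_implementation=attn_impl,
    device_map="auto" if torch.cuda.is_available() else None,
)
```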
@@ -760,6 +757,63 @@ class LoggingCallback(TrainerCallback):
         """Called at the beginning of a step"""
         pass
 
+def install_flash_attention():
+    """
+    Attempt to install Flash Attention for improved performance.
+    Returns True if installation was successful, False otherwise.
+    """
+    log_info("Attempting to install Flash Attention...")
+
+    # Check for CUDA before attempting installation
+    if not CUDA_AVAILABLE:
+        log_info("❌ Cannot install Flash Attention: CUDA not available")
+        return False
+
+    try:
+        # Check CUDA version to determine correct installation command
+        cuda_version = torch.version.cuda
+        if cuda_version is None:
+            log_info("❌ Cannot determine CUDA version for Flash Attention installation")
+            return False
+
+        import subprocess
+
+        # Use --no-build-isolation for better compatibility
+        install_cmd = [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            "flash-attn",
+            "--no-build-isolation"
+        ]
+
+        log_info(f"Running: {' '.join(install_cmd)}")
+        result = subprocess.run(
+            install_cmd,
+            capture_output=True,
+            text=True,
+            check=False
+        )
+
+        if result.returncode == 0:
+            log_info("✅ Flash Attention installed successfully!")
+            # Attempt to import to verify installation
+            try:
+                import flash_attn
+                log_info(f"✅ Flash Attention version {flash_attn.__version__} is now available")
+                return True
+            except ImportError:
+                log_info("⚠️ Flash Attention installed but import failed")
+                return False
+        else:
+            log_info(f"❌ Flash Attention installation failed with error: {result.stderr}")
+            return False
+
+    except Exception as e:
+        log_info(f"❌ Error installing Flash Attention: {str(e)}")
+        return False
+
 def check_dependencies():
     """
     Check for required and optional dependencies, ensuring proper versions and import order.
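install_flash_attention() shells out to pip via subprocess and verifies the wheel by importing it afterwards; the actual call site is added to check_dependencies() further down. A sketch of how such a helper is meant to be driven, assuming the function and CUDA_AVAILABLE from this script are already in scope:

```python
from importlib.util import find_spec

if CUDA_AVAILABLE and find_spec("flash_attn") is None:
    if install_flash_attention():
        import flash_attn
        print(f"flash-attn {flash_attn.__version__} is ready")
    else:
        print("Proceeding without Flash Attention (standard attention will be used)")
```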
@@ -785,6 +839,7 @@ def check_dependencies():
     missing_packages = []
     package_versions = {}
     order_issues = []
+    missing_optional = []
 
     # Check required packages
     log_info("Checking required dependencies...")
@@ -822,6 +877,7 @@
                 log_info(f"✅ {package} - {feature} available")
             except ImportError:
                 log_info(f"⚠️ {package} - {feature} not available")
+                missing_optional.append(package)
 
     # Check import order for optimal performance
     if "transformers" in package_versions and "unsloth" in package_versions:
@@ -835,11 +891,19 @@
                 order_issue = "⚠️ For optimal performance, import unsloth before transformers"
                 order_issues.append(order_issue)
                 log_info(order_issue)
+                log_info("This might cause performance issues but won't prevent training")
             else:
                 log_info("✅ Import order: unsloth before transformers (optimal)")
         except (ValueError, IndexError) as e:
             log_info(f"⚠️ Could not verify import order: {str(e)}")
 
+    # Try to install missing optional packages
+    if "flash_attn" in missing_optional and CUDA_AVAILABLE:
+        log_info("\nFlash Attention is missing but would improve performance.")
+        install_result = install_flash_attention()
+        if install_result:
+            missing_optional.remove("flash_attn")
+
     # Report missing required packages
     if missing_packages:
         log_info("\n❌ Critical dependencies missing:")
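The import-order warning exists because unsloth patches transformers at import time, so importing it first matters even though the wrong order still trains. A standalone sketch of the ordering test that main() performs later in this diff; it relies on dict insertion order, which sys.modules preserves on Python 3.7+:

```python
import sys

def imported_before(first: str, second: str) -> bool:
    """True if `first` was imported before `second` in this process."""
    names = list(sys.modules.keys())
    return (
        first in names
        and second in names
        and names.index(first) < names.index(second)
    )

if imported_before("transformers", "unsloth"):
    print("⚠️ transformers was imported before unsloth; Unsloth's patches may not fully apply")
```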
@@ -990,10 +1054,22 @@ def setup_environment(args):
         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:128,expandable_segments:True"
         log_info(f"Set CUDA memory allocation limit to expandable with max_split_size_mb:128")
 
-    # Check dependencies
+    # Check dependencies and install optional ones if needed
    if not check_dependencies():
        raise RuntimeError("Critical dependencies missing")
 
+    # Check if flash attention was successfully installed
+    flash_attention_available = False
+    try:
+        import flash_attn
+        flash_attention_available = True
+        log_info(f"Flash Attention will be used (version: {flash_attn.__version__})")
+        # Update config to use flash attention
+        if "use_flash_attention" not in transformers_config:
+            transformers_config["use_flash_attention"] = True
+    except ImportError:
+        log_info("Flash Attention not available, will use standard attention mechanism")
+
     return transformers_config, seed
 
 def setup_model_and_tokenizer(config):
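setup_environment() tunes the CUDA caching allocator through the PYTORCH_CUDA_ALLOC_CONF environment variable; the setting only takes effect if it is in place before the first CUDA allocation. A minimal sketch of that ordering:

```python
import os

# Must be set before torch performs any CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"

import torch

if torch.cuda.is_available():
    x = torch.zeros(1, device="cuda")  # first allocation picks up the allocator config
```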
@@ -1001,21 +1077,206 @@ def setup_model_and_tokenizer(config):
     Load and configure the model and tokenizer.
 
     Args:
-        config: Complete configuration dictionary
+        config (dict): Complete configuration dictionary
 
     Returns:
         tuple: (model, tokenizer) - The loaded model and tokenizer
     """
-
-
-    if model is
-
-
-
-
-
+    # Extract model configuration
+    model_config = get_config_value(config, "model", {})
+    model_name = get_config_value(model_config, "name", "unsloth/phi-4-unsloth-bnb-4bit")
+    use_fast_tokenizer = get_config_value(model_config, "use_fast_tokenizer", True)
+    trust_remote_code = get_config_value(model_config, "trust_remote_code", True)
+    model_revision = get_config_value(config, "model_revision", "main")
+
+    # Detect if model is already pre-quantized (includes '4bit', 'bnb', or 'int4' in name)
+    is_prequantized = any(q in model_name.lower() for q in ['4bit', 'bnb', 'int4', 'quant'])
+    if is_prequantized:
+        log_info("⚠️ Detected pre-quantized model. No additional quantization will be applied.")
+
+    # Unsloth configuration
+    unsloth_config = get_config_value(config, "unsloth", {})
+    unsloth_enabled = get_config_value(unsloth_config, "enabled", True)
+
+    # Tokenizer configuration
+    tokenizer_config = get_config_value(config, "tokenizer", {})
+    max_seq_length = min(
+        get_config_value(tokenizer_config, "max_seq_length", 2048),
+        4096  # Maximum supported by most models
+    )
+    add_eos_token = get_config_value(tokenizer_config, "add_eos_token", True)
+    chat_template = get_config_value(tokenizer_config, "chat_template", None)
+    padding_side = get_config_value(tokenizer_config, "padding_side", "right")
+
+    # Check for flash attention
+    use_flash_attention = get_config_value(config, "use_flash_attention", False)
+    flash_attention_available = False
+    try:
+        import flash_attn
+        flash_attention_available = True
+        log_info(f"Flash Attention detected (version: {flash_attn.__version__})")
+    except ImportError:
+        if use_flash_attention:
+            log_info("Flash Attention requested but not available")
+
+    log_info(f"Loading model: {model_name} (revision: {model_revision})")
+    log_info(f"Max sequence length: {max_seq_length}")
 
+    try:
+        if unsloth_enabled and unsloth_available:
+            log_info("Using Unsloth for LoRA fine-tuning")
+            if is_prequantized:
+                log_info("Using pre-quantized model - no additional quantization will be applied")
+            else:
+                log_info("Using 4-bit quantization for efficient training")
+
+            # Load using Unsloth
+            from unsloth import FastLanguageModel
+            model, tokenizer = FastLanguageModel.from_pretrained(
+                model_name=model_name,
+                max_seq_length=max_seq_length,
+                dtype=get_config_value(config, "torch_dtype", "bfloat16"),
+                revision=model_revision,
+                trust_remote_code=trust_remote_code,
+                use_flash_attention_2=use_flash_attention and flash_attention_available
+            )
+
+            # Configure tokenizer settings
+            tokenizer.padding_side = padding_side
+            if add_eos_token and tokenizer.eos_token is None:
+                log_info("Setting EOS token")
+                tokenizer.add_special_tokens({"eos_token": "</s>"})
+
+            # Set chat template if specified
+            if chat_template:
+                log_info(f"Setting chat template: {chat_template}")
+                if hasattr(tokenizer, "chat_template"):
+                    tokenizer.chat_template = chat_template
+                else:
+                    log_info("Tokenizer does not support chat templates, using default formatting")
+
+            # Apply LoRA
+            lora_r = get_config_value(unsloth_config, "r", 16)
+            lora_alpha = get_config_value(unsloth_config, "alpha", 32)
+            lora_dropout = get_config_value(unsloth_config, "dropout", 0)
+            target_modules = get_config_value(unsloth_config, "target_modules",
+                ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])
+
+            log_info(f"Applying LoRA with r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
+            model = FastLanguageModel.get_peft_model(
+                model,
+                r=lora_r,
+                target_modules=target_modules,
+                lora_alpha=lora_alpha,
+                lora_dropout=lora_dropout,
+                bias="none",
+                use_gradient_checkpointing=get_config_value(config, "training.gradient_checkpointing", True),
+                random_state=0,
+                max_seq_length=max_seq_length,
+                modules_to_save=None
+            )
+
+            if use_flash_attention and flash_attention_available:
+                log_info("🚀 Using Flash Attention for faster training")
+            elif use_flash_attention and not flash_attention_available:
+                log_info("⚠️ Flash Attention requested but not available - using standard attention")
+
+        else:
+            # Standard HuggingFace loading
+            log_info("Using standard HuggingFace model loading (Unsloth not available or disabled)")
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+
+            # Check if flash attention should be enabled in config
+            use_attn_implementation = None
+            if use_flash_attention and flash_attention_available:
+                use_attn_implementation = "flash_attention_2"
+                log_info("🚀 Using Flash Attention for faster training")
+
+            # Load tokenizer first
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+                use_fast=use_fast_tokenizer,
+                revision=model_revision,
+                padding_side=padding_side
+            )
+
+            # Configure tokenizer settings
+            if add_eos_token and tokenizer.eos_token is None:
+                log_info("Setting EOS token")
+                tokenizer.add_special_tokens({"eos_token": "</s>"})
+
+            # Set chat template if specified
+            if chat_template:
+                log_info(f"Setting chat template: {chat_template}")
+                if hasattr(tokenizer, "chat_template"):
+                    tokenizer.chat_template = chat_template
+                else:
+                    log_info("Tokenizer does not support chat templates, using default formatting")
+
+            # Only apply quantization config if model is not already pre-quantized
+            quantization_config = None
+            if not is_prequantized and CUDA_AVAILABLE:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    log_info("Using 4-bit quantization (BitsAndBytes) for efficient training")
+                    quantization_config = BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_quant_type="nf4",
+                        bnb_4bit_compute_dtype=torch.float16,
+                        bnb_4bit_use_double_quant=True
+                    )
+                except ImportError:
+                    log_info("BitsAndBytes not available - quantization disabled")
+
+            # Now load model with updated tokenizer
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+                revision=model_revision,
+                torch_dtype=torch.bfloat16 if get_config_value(config, "torch_dtype", "bfloat16") == "bfloat16" else torch.float16,
+                device_map="auto" if CUDA_AVAILABLE else None,
+                attn_implementation=use_attn_implementation,
+                quantization_config=quantization_config
+            )
+
+            # Apply PEFT/LoRA if enabled but using standard loading
+            if peft_available and get_config_value(unsloth_config, "enabled", True):
+                log_info("Applying standard PEFT/LoRA configuration")
+                from peft import LoraConfig, get_peft_model
+
+                lora_r = get_config_value(unsloth_config, "r", 16)
+                lora_alpha = get_config_value(unsloth_config, "alpha", 32)
+                lora_dropout = get_config_value(unsloth_config, "dropout", 0)
+                target_modules = get_config_value(unsloth_config, "target_modules",
+                    ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])
+
+                log_info(f"Applying LoRA with r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
+                lora_config = LoraConfig(
+                    r=lora_r,
+                    lora_alpha=lora_alpha,
+                    target_modules=target_modules,
+                    lora_dropout=lora_dropout,
+                    bias="none",
+                    task_type="CAUSAL_LM"
+                )
+                model = get_peft_model(model, lora_config)
+
+        # Print model summary
+        log_info(f"Model loaded successfully: {model.__class__.__name__}")
+        if hasattr(model, "print_trainable_parameters"):
+            model.print_trainable_parameters()
+        else:
+            total_params = sum(p.numel() for p in model.parameters())
+            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+            log_info(f"Model has {total_params:,} parameters, {trainable_params:,} trainable ({trainable_params/total_params:.2%})")
+
+        return model, tokenizer
+
+    except Exception as e:
+        log_info(f"Error loading model: {str(e)}")
+        traceback.print_exc()
+        return None, None
 
 def setup_dataset_and_collator(config, tokenizer):
     """
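The expanded setup_model_and_tokenizer() pulls everything from one nested config dict and returns (None, None) on failure instead of raising. A hypothetical minimal config covering the keys the function reads; the real schema lives in the Space's YAML configuration, so treat the exact shape and values as an assumption:

```python
config = {
    "model": {
        "name": "unsloth/phi-4-unsloth-bnb-4bit",
        "use_fast_tokenizer": True,
        "trust_remote_code": True,
    },
    "model_revision": "main",
    "use_flash_attention": True,
    "unsloth": {"enabled": True, "r": 16, "alpha": 32, "dropout": 0},
    "tokenizer": {
        "max_seq_length": 2048,
        "add_eos_token": True,
        "chat_template": None,
        "padding_side": "right",
    },
    "training": {"gradient_checkpointing": True},
}

model, tokenizer = setup_model_and_tokenizer(config)
if model is None:
    raise RuntimeError("Model loading failed; see the log output above for the cause")
```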
@@ -1229,6 +1490,12 @@ def main():
     logger.info("Starting training process")
 
     try:
+        # Check for potential import order issue and warn early
+        if "transformers" in sys.modules and "unsloth" in sys.modules:
+            if list(sys.modules.keys()).index("transformers") < list(sys.modules.keys()).index("unsloth"):
+                log_info("⚠️ Warning: transformers was imported before unsloth. This may affect performance.")
+                log_info("   For optimal performance in future runs, import unsloth first.")
+
         # Parse command line arguments
         args = parse_args()
 