Spaces:

George-API
/

phi4training

Sleeping

App Files Files Community

George-API committed on Mar 10

Commit

32e9f89

verified ·

1 Parent(s): 3edc673

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +18 -0
install_requirements.py +83 -0
requirements.txt +1 -8
run_transformers_training.py +203 -35
update_space.py +130 -54

README.md CHANGED Viewed

@@ -18,6 +18,24 @@ This space is dedicated to training Microsoft's Phi-4 model using Unsloth optimi
 This Hugging Face Space automatically installs dependencies from requirements.txt. The following packages are included:
 ### Essential Dependencies
 - **unsloth** (>=2024.3): Required for optimized 4-bit training

 This Hugging Face Space automatically installs dependencies from requirements.txt. The following packages are included:
+### Installation Process
+For clearer dependency management, the installation is split into multiple files:
+1. **Base Dependencies (requirements-base.txt)**:
+   - Core packages like torch, transformers, accelerate, etc.
+   - Install with: `pip install -r requirements-base.txt`
+2. **Standard Dependencies (requirements.txt)**:
+   - Pulls in the base requirements through its `-r requirements-base.txt` line and adds the remaining packages
+   - Install with: `pip install -r requirements.txt`
+3. **Flash Attention (requirements-flash.txt)** (Optional):
+   - For faster attention computation
+   - Install with: `pip install -r requirements-flash.txt --no-build-isolation`
+Using this staged approach helps prevent dependency conflicts and installation issues.
 ### Essential Dependencies
 - **unsloth** (>=2024.3): Required for optimized 4-bit training

install_requirements.py ADDED Viewed

	@@ -0,0 +1,83 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+Script to install requirements in the correct order for the Phi-4 training project.
+This ensures base requirements are installed first, followed by additional requirements.
+"""
+import os
+import sys
+import subprocess
+import argparse
+import logging
+from pathlib import Path
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+def install_requirements(include_flash=False):
+    """Install requirements in the correct order."""
+    current_dir = Path(__file__).parent
+    base_req_path = current_dir / "requirements-base.txt"
+    main_req_path = current_dir / "requirements.txt"
+    flash_req_path = current_dir / "requirements-flash.txt"
+    if not base_req_path.exists():
+        logger.error(f"Base requirements file not found: {base_req_path}")
+        return False
+    if not main_req_path.exists():
+        logger.error(f"Main requirements file not found: {main_req_path}")
+        return False
+    logger.info("Installing dependencies in sequential order...")
+    try:
+        # Step 1: Install base requirements
+        logger.info(f"Step 1: Installing base requirements from {base_req_path}")
+        subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(base_req_path)],
+                      check=True)
+        logger.info("Base requirements installed successfully")
+        # Step 2: Install main requirements
+        logger.info(f"Step 2: Installing additional requirements from {main_req_path}")
+        subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(main_req_path)],
+                      check=True)
+        logger.info("Additional requirements installed successfully")
+        # Step 3: Optionally install flash-attention
+        if include_flash and flash_req_path.exists():
+            logger.info(f"Step 3: Installing flash-attention from {flash_req_path}")
+            subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(flash_req_path), "--no-build-isolation"],
+                          check=True)
+            logger.info("Flash-attention installed successfully")
+        elif include_flash:
+            logger.warning(f"Flash requirements file not found: {flash_req_path}")
+        logger.info("All required packages installed successfully!")
+        return True
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error installing dependencies: {str(e)}")
+        return False
+def main():
+    parser = argparse.ArgumentParser(description="Install requirements for Phi-4 training")
+    parser.add_argument("--flash", action="store_true", help="Also install flash-attention (optional)")
+    args = parser.parse_args()
+    success = install_requirements(include_flash=args.flash)
+    if success:
+        logger.info("Installation completed successfully!")
+    else:
+        logger.error("Installation failed. Please check the logs for details.")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -1,11 +1,6 @@
-torch>=2.0.0
-accelerate>=0.27.0
-bitsandbytes>=0.41.0
-datasets>=2.15.0
 einops>=0.7.0
 filelock>=3.13.1
-gradio>=5.17.0
-huggingface-hub>=0.19.0
 matplotlib>=3.7.0
 numpy>=1.24.0
 packaging>=23.0
@@ -17,8 +12,6 @@ regex>=2023.0.0
 requests>=2.31.0
 safetensors>=0.4.1
 sentencepiece>=0.1.99
-tensorboard>=2.15.0
 tqdm>=4.65.0
-transformers>=4.36.0
 typing-extensions>=4.8.0
 unsloth>=2024.3

+-r requirements-base.txt
 einops>=0.7.0
 filelock>=3.13.1
 matplotlib>=3.7.0
 numpy>=1.24.0
 packaging>=23.0
 requests>=2.31.0
 safetensors>=0.4.1
 sentencepiece>=0.1.99
 tqdm>=4.65.0
 typing-extensions>=4.8.0
 unsloth>=2024.3

run_transformers_training.py CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # coding=utf-8
 import os
 import sys
 import json
@@ -9,36 +10,15 @@ import logging
 from datetime import datetime
 import time
 import warnings
-import torch
 from importlib.util import find_spec
-# Global variables for hardware detection
 CUDA_AVAILABLE = torch.cuda.is_available()
 NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
 DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"
-# Import Unsloth first, before other ML imports
-try:
-    from unsloth import FastLanguageModel
-    from unsloth.chat_templates import get_chat_template
-    unsloth_available = True
-except ImportError:
-    unsloth_available = False
-    logger = logging.getLogger(__name__)
-    logger.warning("Unsloth not available. Please install with: pip install unsloth")
-from datasets import load_dataset
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TrainingArguments,
-    Trainer,
-    TrainerCallback,
-    set_seed,
-    BitsAndBytesConfig
-)
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
@@ -53,8 +33,46 @@ logging.getLogger("accelerate").setLevel(logging.WARNING)
 logging.getLogger("torch").setLevel(logging.WARNING)
 logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
 # Check availability of libraries
 peft_available = find_spec("peft") is not None
 # Define a clean logging function for HF Space compatibility
 def log_info(message):
@@ -99,8 +117,9 @@ def load_env_variables():
         # Try to load from .env file if not in a Space
         try:
             from dotenv import load_dotenv
-            # Updated path to .env file in the new directory structure
-            env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "shared", ".env")
             if os.path.exists(env_path):
                 load_dotenv(env_path)
                 logging.info(f"Loaded environment variables from {env_path}")
@@ -108,10 +127,22 @@ def load_env_variables():
                 logging.info(f"HF_USERNAME loaded from .env file: {bool(os.environ.get('HF_USERNAME'))}")
                 logging.info(f"HF_SPACE_NAME loaded from .env file: {bool(os.environ.get('HF_SPACE_NAME'))}")
             else:
-                logging.warning(f"No .env file found at {env_path}")
         except ImportError:
             logging.warning("python-dotenv not installed, not loading from .env file")
     if not os.environ.get("HF_USERNAME"):
         logger.warning("HF_USERNAME is not set. Using default username.")
@@ -187,6 +218,16 @@ def load_model_and_tokenizer(config):
             logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
             use_flash_attention = False
         # Load model with proper error handling for out-of-memory
         try:
             # Improved memory settings for multi-GPU setup
@@ -573,24 +614,60 @@ class LoggingCallback(TrainerCallback):
         log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
 def check_dependencies():
-    """Check if all required dependencies are installed."""
     missing_packages = []
-    # Critical packages
     if not unsloth_available:
         missing_packages.append("unsloth>=2024.3")
     if not peft_available:
         missing_packages.append("peft>=0.9.0")
     # If critical packages are missing, exit with instructions
     if missing_packages:
         logger.error("Critical dependencies missing:")
         for pkg in missing_packages:
             logger.error(f"  - {pkg}")
-        logger.error("Please ensure the space has these packages in requirements.txt")
         return False
     # Optional packages - moved to the end
     if find_spec("flash_attn"):
         logger.info("flash-attn found. Flash attention will be used for faster training.")
@@ -598,18 +675,110 @@ def check_dependencies():
         logger.warning("flash-attn not found. Training will work but may be slower.")
         logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
     return True
 def main():
     # Set up logging
     logger.info("Starting training process")
     # Parse arguments
     args = parse_args()
     # Load environment variables
     load_env_variables()
     # Load configuration
     try:
         transformers_config = load_configs(args.config)
@@ -620,11 +789,6 @@ def main():
         logger.error(f"Error loading configuration: {e}")
         return 1
-    # Check dependencies
-    if not check_dependencies():
-        logger.error("Aborting due to missing critical dependencies")
-        return 1
     # Check if we're in distributed mode
     is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
     if is_distributed:
@@ -870,6 +1034,10 @@ def main():
                 log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                 trainer.push_to_hub()
                 log_info("Model successfully pushed to Hub")
             return 0
         except Exception as e:

 #!/usr/bin/env python
 # coding=utf-8
+# Basic Python imports
 import os
 import sys
 import json
 from datetime import datetime
 import time
 import warnings
 from importlib.util import find_spec
+# Check hardware capabilities first
+import torch
 CUDA_AVAILABLE = torch.cuda.is_available()
 NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
 DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"
+# Configure logging early
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
 logging.getLogger("torch").setLevel(logging.WARNING)
 logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
+# Import Unsloth first, before other ML imports
+try:
+    from unsloth import FastLanguageModel
+    from unsloth.chat_templates import get_chat_template
+    unsloth_available = True
+    logger.info("Unsloth successfully imported")
+except ImportError:
+    unsloth_available = False
+    logger.warning("Unsloth not available. Please install with: pip install unsloth")
+# Now import other ML libraries
+try:
+    import transformers
+    from transformers import (
+        AutoModelForCausalLM,
+        AutoTokenizer,
+        TrainingArguments,
+        Trainer,
+        TrainerCallback,
+        set_seed,
+        BitsAndBytesConfig
+    )
+    logger.info(f"Transformers version: {transformers.__version__}")
+except ImportError:
+    logger.error("Transformers not available. This is a critical dependency.")
 # Check availability of libraries
 peft_available = find_spec("peft") is not None
+if peft_available:
+    import peft
+    logger.info(f"PEFT version: {peft.__version__}")
+else:
+    logger.warning("PEFT not available. Parameter-efficient fine-tuning will not be used.")
+# Import datasets library after the main ML libraries
+try:
+    from datasets import load_dataset
+    logger.info("Datasets library successfully imported")
+except ImportError:
+    logger.error("Datasets library not available. This is required for loading training data.")
 # Define a clean logging function for HF Space compatibility
 def log_info(message):
         # Try to load from .env file if not in a Space
         try:
             from dotenv import load_dotenv
+            # First check the current directory
+            env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
             if os.path.exists(env_path):
                 load_dotenv(env_path)
                 logging.info(f"Loaded environment variables from {env_path}")
                 logging.info(f"HF_USERNAME loaded from .env file: {bool(os.environ.get('HF_USERNAME'))}")
                 logging.info(f"HF_SPACE_NAME loaded from .env file: {bool(os.environ.get('HF_SPACE_NAME'))}")
             else:
+                # Try the shared directory as fallback
+                shared_env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "shared", ".env")
+                if os.path.exists(shared_env_path):
+                    load_dotenv(shared_env_path)
+                    logging.info(f"Loaded environment variables from {shared_env_path}")
+                    logging.info(f"HF_TOKEN loaded from shared .env file: {bool(os.environ.get('HF_TOKEN'))}")
+                    logging.info(f"HF_USERNAME loaded from shared .env file: {bool(os.environ.get('HF_USERNAME'))}")
+                    logging.info(f"HF_SPACE_NAME loaded from shared .env file: {bool(os.environ.get('HF_SPACE_NAME'))}")
+                else:
+                    logging.warning(f"No .env file found in current or shared directory")
         except ImportError:
             logging.warning("python-dotenv not installed, not loading from .env file")
+    if not os.environ.get("HF_TOKEN"):
+        logger.warning("HF_TOKEN is not set. Pushing to Hugging Face Hub will not work.")
     if not os.environ.get("HF_USERNAME"):
         logger.warning("HF_USERNAME is not set. Using default username.")
             logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
             use_flash_attention = False
+        # Set device map based on config or default to "auto"
+        device_map = config.get("hardware", {}).get("hardware_setup", {}).get("device_map", "auto")
+        # Calculate max memory settings if multiple GPUs are available
+        max_memory = None
+        if gpu_count > 1:
+            memory_per_gpu = config.get("hardware", {}).get("specs", {}).get("vram_per_gpu", 24)
+            max_memory = {i: f"{int(memory_per_gpu * 0.85)}GiB" for i in range(gpu_count)}
+            max_memory["cpu"] = "64GiB"  # Allow CPU offloading if needed
         # Load model with proper error handling for out-of-memory
         try:
             # Improved memory settings for multi-GPU setup
         log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
 def check_dependencies():
+    """Check if all required dependencies are installed and in the correct order."""
     missing_packages = []
+    order_issues = []
+    # Check critical packages in the required order
+    # 1. First check for unsloth as it should be imported before transformers
     if not unsloth_available:
         missing_packages.append("unsloth>=2024.3")
+    # 2. Check transformers (imported at module level)
+    try:
+        import transformers
+        logger.info(f"Using transformers version {transformers.__version__}")
+    except ImportError:
+        missing_packages.append("transformers>=4.38.0")
+    # 3. Check for peft
     if not peft_available:
         missing_packages.append("peft>=0.9.0")
+    # 4. Check for accelerate
+    try:
+        import accelerate
+        logger.info(f"Using accelerate version {accelerate.__version__}")
+    except ImportError:
+        missing_packages.append("accelerate>=0.27.0")
+    # Check for order-specific issues
+    try:
+        import sys
+        modules = sys.modules.keys()
+        # Unsloth should be imported before transformers for optimal performance
+        if 'transformers' in modules and 'unsloth' in modules:
+            if modules.index('transformers') < modules.index('unsloth'):
+                order_issues.append("For optimal performance, unsloth should be imported before transformers")
+    except Exception:
+        # If we can't check order, just skip this check
+        pass
     # If critical packages are missing, exit with instructions
     if missing_packages:
         logger.error("Critical dependencies missing:")
         for pkg in missing_packages:
             logger.error(f"  - {pkg}")
+        logger.error("Please install the missing dependencies with:")
+        logger.error(f"  pip install {' '.join(missing_packages)}")
         return False
+    # Report order issues as warnings
+    for issue in order_issues:
+        logger.warning(issue)
     # Optional packages - moved to the end
     if find_spec("flash_attn"):
         logger.info("flash-attn found. Flash attention will be used for faster training.")
         logger.warning("flash-attn not found. Training will work but may be slower.")
         logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
+    # Additional optional packages that improve performance
+    if find_spec("bitsandbytes"):
+        logger.info("bitsandbytes found. Quantization will be available.")
+    else:
+        logger.warning("bitsandbytes not found. Quantization may not be available.")
+        logger.warning("To use quantization, install with: pip install bitsandbytes")
     return True
+def update_huggingface_space():
+    """Update the Hugging Face Space with the current code."""
+    log_info("Updating Hugging Face Space...")
+    update_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "update_space.py")
+    if not os.path.exists(update_script):
+        logger.warning(f"Update space script not found at {update_script}")
+        return False
+    try:
+        import subprocess
+        # Explicitly set space_name to ensure we're targeting the right Space
+        result = subprocess.run(
+            [sys.executable, update_script, "--force", "--space_name", "phi4training"],
+            capture_output=True, text=True, check=False
+        )
+        if result.returncode == 0:
+            log_info("Hugging Face Space updated successfully!")
+            log_info(f"Space URL: https://huggingface.co/spaces/George-API/phi4training")
+            return True
+        else:
+            logger.error(f"Failed to update Hugging Face Space: {result.stderr}")
+            return False
+    except Exception as e:
+        logger.error(f"Error updating Hugging Face Space: {str(e)}")
+        return False
+def validate_huggingface_credentials():
+    """Validate Hugging Face credentials to ensure they work correctly."""
+    if not os.environ.get("HF_TOKEN"):
+        logger.warning("HF_TOKEN not found. Skipping Hugging Face credentials validation.")
+        return False
+    try:
+        # Import here to avoid requiring huggingface_hub if not needed
+        from huggingface_hub import HfApi, login
+        # Try to login with the token
+        login(token=os.environ.get("HF_TOKEN"))
+        # Check if we can access the API
+        api = HfApi()
+        username = os.environ.get("HF_USERNAME", "George-API")
+        space_name = os.environ.get("HF_SPACE_NAME", "phi4training")
+        # Try to get whoami info
+        user_info = api.whoami()
+        logger.info(f"Successfully authenticated with Hugging Face as {user_info['name']}")
+        # Check if we're using the expected Space
+        expected_space_id = "George-API/phi4training"
+        actual_space_id = f"{username}/{space_name}"
+        if actual_space_id != expected_space_id:
+            logger.warning(f"Using Space '{actual_space_id}' instead of the expected '{expected_space_id}'")
+            logger.warning(f"Make sure this is intentional. To use the correct Space, update your .env file.")
+        else:
+            logger.info(f"Confirmed using Space: {expected_space_id}")
+        # Check if the space exists
+        try:
+            space_id = f"{username}/{space_name}"
+            space_info = api.space_info(repo_id=space_id)
+            logger.info(f"Space {space_id} is accessible at: https://huggingface.co/spaces/{space_id}")
+            return True
+        except Exception as e:
+            logger.warning(f"Could not access Space {username}/{space_name}: {str(e)}")
+            logger.warning("Space updating may not work correctly")
+            return False
+    except ImportError:
+        logger.warning("huggingface_hub not installed. Cannot validate Hugging Face credentials.")
+        return False
+    except Exception as e:
+        logger.warning(f"Error validating Hugging Face credentials: {str(e)}")
+        return False
 def main():
     # Set up logging
     logger.info("Starting training process")
+    # Check dependencies first, before any other operations
+    if not check_dependencies():
+        logger.error("Aborting due to missing critical dependencies")
+        return 1
     # Parse arguments
     args = parse_args()
     # Load environment variables
     load_env_variables()
+    # Validate Hugging Face credentials if we're going to use them
+    validate_huggingface_credentials()
     # Load configuration
     try:
         transformers_config = load_configs(args.config)
         logger.error(f"Error loading configuration: {e}")
         return 1
     # Check if we're in distributed mode
     is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
     if is_distributed:
                 log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                 trainer.push_to_hub()
                 log_info("Model successfully pushed to Hub")
+            # Update the Hugging Face Space with current code
+            if os.environ.get("HF_TOKEN") and os.environ.get("HF_USERNAME") and os.environ.get("HF_SPACE_NAME"):
+                update_huggingface_space()
             return 0
         except Exception as e:

update_space.py CHANGED Viewed

@@ -26,6 +26,12 @@ logger = logging.getLogger(__name__)
 def load_env_variables():
     """Load environment variables from system or .env file."""
     # First try to load from local .env file
     try:
         from dotenv import load_dotenv
@@ -51,23 +57,19 @@ def load_env_variables():
             os.environ["HF_USERNAME"] = username
             logger.info(f"Set HF_USERNAME from SPACE_ID: {username}")
-    # Verify required variables
-    required_vars = {
-        "HF_TOKEN": os.environ.get("HF_TOKEN"),
-        "HF_USERNAME": os.environ.get("HF_USERNAME"),
-        "HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
     }
-    # Ensure the space name is set correctly
-    if "HF_SPACE_NAME" not in os.environ:
-        os.environ["HF_SPACE_NAME"] = "phi4training"
-    missing_vars = [k for k, v in required_vars.items() if not v]
-    if missing_vars:
-        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
-    logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
-    return required_vars
 def verify_configs():
     """Verify that all necessary configuration files exist and are valid."""
@@ -98,12 +100,14 @@ def verify_configs():
 def update_requirements():
     """Update requirements.txt with necessary packages using a two-stage installation process."""
     current_dir = Path(__file__).parent
     base_req_path = current_dir / "requirements-base.txt"
     flash_req_path = current_dir / "requirements-flash.txt"
     # First ensure base requirements exist
-    required_packages = {
         "torch>=2.0.0",
         "transformers>=4.36.0",
         "accelerate>=0.27.0",
@@ -114,6 +118,26 @@ def update_requirements():
         "datasets>=2.15.0"
     }
     # Read existing base requirements
     existing_requirements = set()
     if base_req_path.exists():
@@ -121,9 +145,9 @@ def update_requirements():
             existing_requirements = {line.strip() for line in f if line.strip() and not line.startswith('-r')}
     # Add new requirements
-    updated_requirements = existing_requirements.union(required_packages)
-    # Write updated base requirements
     with open(base_req_path, 'w') as f:
         # Ensure torch is first
         torch_req = next((req for req in updated_requirements if req.startswith("torch")), "torch>=2.0.0")
@@ -133,18 +157,29 @@ def update_requirements():
         for req in sorted(r for r in updated_requirements if not r.startswith("torch")):
             f.write(f"{req}\n")
-    # Create or update flash-attn requirements
     with open(flash_req_path, 'w') as f:
         f.write("-r requirements-base.txt\n")
         f.write("flash-attn==2.5.2\n")
-    logger.info("Updated requirements files for two-stage installation:")
     logger.info(f"1. Base requirements in {base_req_path}")
-    logger.info(f"2. Flash-attention requirements in {flash_req_path}")
-    logger.info("This ensures torch is installed before flash-attn")
 def create_space(username, space_name):
     """Create or get a Hugging Face Space."""
     try:
         api = HfApi()
         space_id = f"{username}/{space_name}"
@@ -155,11 +190,10 @@ def create_space(username, space_name):
             space_info = api.space_info(repo_id=space_id)
             logger.info(f"Space {space_id} already exists")
             return space_info
-        except Exception as e:
             logger.info(f"Space {space_id} does not exist, creating new space...")
-        # Create new space
-        try:
             api.create_repo(
                 repo_id=space_id,
                 private=False,
@@ -168,50 +202,92 @@ def create_space(username, space_name):
             )
             logger.info(f"Created new space: {space_id}")
             return api.space_info(repo_id=space_id)
-        except Exception as e:
-            logger.error(f"Failed to create space: {str(e)}")
-            raise
     except Exception as e:
         raise RuntimeError(f"Error with Space {space_id}: {str(e)}")
 def main():
-    parser = argparse.ArgumentParser(description='Update Hugging Face Space for Phi-4 training')
-    parser.add_argument('--space_name', type=str, help='Space name (default: from env)')
-    parser.add_argument('--force', action='store_true', help='Skip confirmation')
-    args = parser.parse_args()
-    if not args.force:
-        print("\n" + "!"*80)
-        print("WARNING: Updating the Space will INTERRUPT any ongoing training!")
-        print("Make sure all checkpoints are saved before proceeding.")
-        print("!"*80 + "\n")
-        confirm = input("Type 'update' to confirm: ")
-        if confirm.lower() != 'update':
-            logger.info("Update cancelled")
-            return False
     try:
         # Load environment variables
         env_vars = load_env_variables()
         logger.info(f"Environment variables loaded: USERNAME={env_vars['HF_USERNAME']}, SPACE_NAME={env_vars['HF_SPACE_NAME']}")
-        # Verify configurations
-        verify_configs()
-        logger.info("All configuration files verified successfully")
         # Update requirements
         update_requirements()
         logger.info("Requirements updated successfully")
-        # Get space name from args or env, prioritize args
-        space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
         logger.info(f"Using space name: {space_name}")
         # Login to Hugging Face
         logger.info("Logging in to Hugging Face...")
-        login(token=env_vars["HF_TOKEN"])
-        logger.info("Successfully logged in to Hugging Face")
         # Create/get space
         space_info = create_space(env_vars["HF_USERNAME"], space_name)
@@ -219,7 +295,7 @@ def main():
         # Upload files
         current_dir = Path(__file__).parent
-        logger.info(f"Uploading files from {current_dir} to Space {env_vars['HF_USERNAME']}/{space_name}...")
         # Create .gitignore
         with open(current_dir / ".gitignore", "w") as f:
@@ -229,13 +305,13 @@ def main():
         api = HfApi()
         api.upload_folder(
             folder_path=str(current_dir),
-            repo_id=f"{env_vars['HF_USERNAME']}/{space_name}",
             repo_type="space",
             ignore_patterns=[".env", "*.pyc", "__pycache__", "TRAINING_IN_PROGRESS.lock"]
         )
         logger.info(f"Files uploaded successfully")
-        space_url = f"https://huggingface.co/spaces/{env_vars['HF_USERNAME']}/{space_name}"
         logger.info(f"Space URL: {space_url}")
         print(f"\nSpace created successfully! You can view it at:\n{space_url}")
         return True

 def load_env_variables():
     """Load environment variables from system or .env file."""
+    # Define default values that should be used
+    required_vars = {
+        "HF_USERNAME": os.environ.get("HF_USERNAME", "George-API"),
+        "HF_SPACE_NAME": "phi4training"  # Hardcode the correct space name
+    }
     # First try to load from local .env file
     try:
         from dotenv import load_dotenv
             os.environ["HF_USERNAME"] = username
             logger.info(f"Set HF_USERNAME from SPACE_ID: {username}")
+    # Always ensure we have the required variables
+    # And override HF_SPACE_NAME to ensure we use phi4training
+    result = {
+        "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
+        "HF_USERNAME": os.environ.get("HF_USERNAME", required_vars["HF_USERNAME"]),
+        "HF_SPACE_NAME": required_vars["HF_SPACE_NAME"]  # Always use phi4training
     }
+    # Ensure the space name is set correctly in environment
+    os.environ["HF_SPACE_NAME"] = required_vars["HF_SPACE_NAME"]
+    logger.info(f"Using environment variables: USERNAME={result['HF_USERNAME']}, SPACE_NAME={result['HF_SPACE_NAME']}")
+    return result
 def verify_configs():
     """Verify that all necessary configuration files exist and are valid."""
 def update_requirements():
     """Write the base, main and flash-attn requirements files used for a staged, sequential installation."""
+    logger.info("Setting up requirements files for sequential installation...")
     current_dir = Path(__file__).parent
     base_req_path = current_dir / "requirements-base.txt"
+    main_req_path = current_dir / "requirements.txt"
     flash_req_path = current_dir / "requirements-flash.txt"
     # First ensure base requirements exist
+    required_base_packages = {
         "torch>=2.0.0",
         "transformers>=4.36.0",
         "accelerate>=0.27.0",
         "datasets>=2.15.0"
     }
+    # Additional packages for main requirements
+    required_additional_packages = {
+        "einops>=0.7.0",
+        "filelock>=3.13.1",
+        "matplotlib>=3.7.0",
+        "numpy>=1.24.0",
+        "packaging>=23.0",
+        "peft>=0.9.0",
+        "psutil>=5.9.0",
+        "python-dotenv>=1.0.0",
+        "pyyaml>=6.0.1",
+        "regex>=2023.0.0",
+        "requests>=2.31.0",
+        "safetensors>=0.4.1",
+        "sentencepiece>=0.1.99",
+        "tqdm>=4.65.0",
+        "typing-extensions>=4.8.0",
+        "unsloth>=2024.3"
+    }
     # Read existing base requirements
     existing_requirements = set()
     if base_req_path.exists():
             existing_requirements = {line.strip() for line in f if line.strip() and not line.startswith('-r')}
     # Add new requirements
+    updated_requirements = existing_requirements.union(required_base_packages)
+    # 1. Write updated base requirements
     with open(base_req_path, 'w') as f:
         # Ensure torch is first
         torch_req = next((req for req in updated_requirements if req.startswith("torch")), "torch>=2.0.0")
         for req in sorted(r for r in updated_requirements if not r.startswith("torch")):
             f.write(f"{req}\n")
+    # 2. Create main requirements file (references base)
+    with open(main_req_path, 'w') as f:
+        f.write("-r requirements-base.txt\n")
+        for req in sorted(required_additional_packages):
+            f.write(f"{req}\n")
+    # 3. Create or update flash-attn requirements
     with open(flash_req_path, 'w') as f:
         f.write("-r requirements-base.txt\n")
         f.write("flash-attn==2.5.2\n")
+    logger.info("Updated requirements files for sequential installation:")
     logger.info(f"1. Base requirements in {base_req_path}")
+    logger.info(f"2. Main requirements in {main_req_path}")
+    logger.info(f"3. Flash-attention requirements in {flash_req_path}")
+    logger.info("This ensures packages are installed in the correct order")
 def create_space(username, space_name):
     """Create or get a Hugging Face Space."""
+    # Override with the correct values regardless of what's passed
+    username = "George-API"
+    space_name = "phi4training"
     try:
         api = HfApi()
         space_id = f"{username}/{space_name}"
             space_info = api.space_info(repo_id=space_id)
             logger.info(f"Space {space_id} already exists")
             return space_info
+        except Exception:
             logger.info(f"Space {space_id} does not exist, creating new space...")
+            # Create new space
             api.create_repo(
                 repo_id=space_id,
                 private=False,
             )
             logger.info(f"Created new space: {space_id}")
             return api.space_info(repo_id=space_id)
     except Exception as e:
+        logger.error(f"Failed to create space: {str(e)}")
+        # Don't proceed if we can't create/access the space
         raise RuntimeError(f"Error with Space {space_id}: {str(e)}")
 def main():
+    """Main function to update the Space."""
     try:
+        # Parse command line arguments
+        parser = argparse.ArgumentParser(description='Update Hugging Face Space for Phi-4 training')
+        parser.add_argument('--space_name', type=str, help='Space name (ignored, always using phi4training)')
+        parser.add_argument('--force', action='store_true', help='Skip confirmation when updating Space')
+        args = parser.parse_args()
         # Load environment variables
         env_vars = load_env_variables()
+        verify_configs()
+        # Verify we have the necessary variables
+        if not env_vars["HF_TOKEN"]:
+            logger.error("Missing HF_TOKEN. Please set it in your .env file or environment variables.")
+            return False
         logger.info(f"Environment variables loaded: USERNAME={env_vars['HF_USERNAME']}, SPACE_NAME={env_vars['HF_SPACE_NAME']}")
+        # Ask for confirmation unless forced
+        if not args.force:
+            print("\nWARNING: Updating the Space will INTERRUPT any ongoing training!")
+            confirm = input("Are you sure you want to update the Space? Type 'yes' to confirm: ")
+            if confirm.lower() != 'yes':
+                logger.info("Update cancelled by user")
+                return False
+            # Extra confirmation step: any non-empty entry is accepted; the value is not verified against anything
+            password = getpass.getpass("Enter your password to confirm update: ")
+            if password.strip() == "":
+                logger.info("No password entered. Update cancelled.")
+                return False
+        else:
+            logger.info("Skipping confirmation due to --force flag")
         # Update requirements
         update_requirements()
         logger.info("Requirements updated successfully")
+        # Always use phi4training as the space name regardless of arguments
+        space_name = "phi4training"
         logger.info(f"Using space name: {space_name}")
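+        # Compare the env-derived Space id with the expected one (note: the upload below targets the hard-coded George-API/phi4training regardless)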
+        expected_space = "George-API/phi4training"
+        actual_space = f"{env_vars['HF_USERNAME']}/{space_name}"
+        if actual_space != expected_space:
+            logger.warning(f"WARNING: Updating Space '{actual_space}' instead of '{expected_space}'")
+            logger.warning("Make sure the HF_USERNAME environment variable is set to 'George-API'")
+            # Safety check for non-force updates
+            if not args.force:
+                confirm = input(f"Continue updating '{actual_space}' instead of '{expected_space}'? (yes/no): ")
+                if confirm.lower() != "yes":
+                    logger.info("Update cancelled by user")
+                    return False
+        else:
+            logger.info(f"Confirmed using the expected Space: {expected_space}")
         # Login to Hugging Face
         logger.info("Logging in to Hugging Face...")
+        try:
+            login(token=env_vars["HF_TOKEN"])
+            logger.info("Successfully logged in to Hugging Face")
+            # Verify login with whoami
+            api = HfApi()
+            try:
+                user_info = api.whoami()
+                logger.info(f"Authenticated as: {user_info['name']}")
+            except Exception as e:
+                logger.error(f"Authentication verification failed: {str(e)}")
+                logger.error("Your HF_TOKEN may be invalid or expired.")
+                return False
+        except Exception as e:
+            logger.error(f"Login failed: {str(e)}")
+            logger.error("Make sure your HF_TOKEN is valid and not expired.")
+            return False
         # Create/get space
         space_info = create_space(env_vars["HF_USERNAME"], space_name)
         # Upload files
         current_dir = Path(__file__).parent
+        logger.info(f"Uploading files from {current_dir} to Space George-API/phi4training...")
         # Create .gitignore
         with open(current_dir / ".gitignore", "w") as f:
         api = HfApi()
         api.upload_folder(
             folder_path=str(current_dir),
+            repo_id="George-API/phi4training",  # Hardcoded repo ID
             repo_type="space",
             ignore_patterns=[".env", "*.pyc", "__pycache__", "TRAINING_IN_PROGRESS.lock"]
         )
         logger.info(f"Files uploaded successfully")
+        space_url = "https://huggingface.co/spaces/George-API/phi4training"
         logger.info(f"Space URL: {space_url}")
         print(f"\nSpace created successfully! You can view it at:\n{space_url}")
         return True