Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  (CHANGED, +129 -4)

@@ -17,11 +17,15 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig, get_peft_model
 from dotenv import load_dotenv
+from huggingface_hub import HfApi, upload_folder
 
 # Basic environment setup for L40S
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+# Default dataset with proper namespace
+DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
+
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -41,6 +45,84 @@ def remove_training_marker():
         os.remove("TRAINING_ACTIVE")
         logger.info("Removed training active marker")
 
+# Function to upload model to Hugging Face Hub
+def upload_to_huggingface(output_dir, repo_name=None, private=False):
+    """
+    Upload the trained model to Hugging Face Hub
+
+    Args:
+        output_dir: Directory containing the model files
+        repo_name: Name of the repository on HF Hub (default: derived from output_dir)
+        private: Whether the repository should be private (default: False)
+
+    Returns:
+        str: URL of the uploaded model on HF Hub
+    """
+    logger.info(f"Uploading model from {output_dir} to Hugging Face Hub")
+
+    # Get HF token from environment
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        logger.error("HF_TOKEN environment variable not set. Please set it to upload to Hugging Face Hub.")
+        logger.error("You can get a token from https://huggingface.co/settings/tokens")
+        raise ValueError("HF_TOKEN not set")
+
+    # Get or create repo name
+    if not repo_name:
+        # Use the output directory name as the repository name
+        repo_name = os.path.basename(os.path.normpath(output_dir))
+        logger.info(f"Using repository name: {repo_name}")
+
+    # Get HF username
+    api = HfApi(token=token)
+    user_info = api.whoami()
+    username = user_info["name"]
+
+    # Create full repository name
+    full_repo_name = f"{username}/{repo_name}"
+    logger.info(f"Creating repository: {full_repo_name}")
+
+    # Create repository if it doesn't exist
+    api.create_repo(
+        repo_id=full_repo_name,
+        exist_ok=True,
+        private=private
+    )
+
+    # Upload model files
+    logger.info(f"Uploading files from {output_dir} to {full_repo_name}")
+    api.upload_folder(
+        folder_path=output_dir,
+        repo_id=full_repo_name,
+        commit_message="Upload model files"
+    )
+
+    # Create model card
+    model_card = f"""
+# {repo_name}
+
+This model was fine-tuned using the script at https://github.com/George-API/phi4-cognitive-dataset.
+
+## Model details
+- Base model: DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
+- Dataset: {DEFAULT_DATASET}
+- Training: Research only
+"""
+
+    with open(os.path.join(output_dir, "README.md"), "w") as f:
+        f.write(model_card)
+
+    # Upload the model card
+    api.upload_file(
+        path_or_fileobj=os.path.join(output_dir, "README.md"),
+        path_in_repo="README.md",
+        repo_id=full_repo_name,
+        commit_message="Add model card"
+    )
+
+    logger.info(f"Model successfully uploaded to https://huggingface.co/{full_repo_name}")
+    return f"https://huggingface.co/{full_repo_name}"
+
 # Custom data collator for pre-tokenized data
 class PreTokenizedCollator(DataCollatorMixin):
     def __init__(self, pad_token_id=0, tokenizer=None):
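
For reference, the new helper can be exercised on its own. A minimal sketch, not part of the commit, assuming the script is importable as run_cloud_training, HF_TOKEN is exported, and fine_tuned_model/ (the script's default output directory) already contains saved model files; the repository name below is hypothetical:

    # Hypothetical standalone use of upload_to_huggingface (sketch only).
    # Assumes HF_TOKEN is set in the environment and fine_tuned_model/ exists.
    from run_cloud_training import upload_to_huggingface

    url = upload_to_huggingface(
        output_dir="fine_tuned_model",   # the script's default output directory
        repo_name="phi4-research-test",  # hypothetical; if omitted, falls back to basename(output_dir)
        private=True,                    # mirrors the new --private_repo flag
    )
    print(url)  # https://huggingface.co/<username>/phi4-research-test
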
@@ -134,11 +216,23 @@ class PreTokenizedCollator(DataCollatorMixin):
 # Load and prepare dataset with proper sorting
 def load_and_prepare_dataset(dataset_name, config):
     """Load and prepare the dataset for fine-tuning with proper sorting"""
+    # Use the default dataset if the provided one matches the default name without namespace
+    if dataset_name == "phi4-cognitive-dataset":
+        dataset_name = DEFAULT_DATASET
+        logger.info(f"Using full dataset path: {dataset_name}")
+
     logger.info(f"Loading dataset: {dataset_name}")
 
     try:
         # Load dataset
-        dataset = load_dataset(dataset_name)
+        try:
+            dataset = load_dataset(dataset_name)
+        except Exception as e:
+            if "doesn't exist on the Hub or cannot be accessed" in str(e):
+                logger.error(f"Dataset '{dataset_name}' not found. Make sure it exists and is accessible.")
+                logger.error(f"If using a private dataset, check your HF_TOKEN is set in your environment.")
+                logger.error(f"If missing namespace, try using the full path: 'George-API/phi4-cognitive-dataset'")
+            raise
 
         # Extract the split we want to use (usually 'train')
         if 'train' in dataset:
@@ -167,7 +261,7 @@ def load_and_prepare_dataset(dataset_name, config):
         raise
 
 # Main training function
-def train(config_path, dataset_name, output_dir):
+def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_name=None, private_repo=False):
     # Load environment variables
     load_dotenv()
 
@@ -186,6 +280,11 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
+    # Log dataset info before loading
+    logger.info(f"Will load dataset: {dataset_name}")
+    if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
+        logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")
+
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)
 
@@ -327,6 +426,16 @@ def train(config_path, dataset_name, output_dir):
             json.dump(config, f, indent=2)
 
         logger.info("Training complete - RESEARCH PHASE ONLY")
+
+        # Upload to Hugging Face Hub if requested
+        if upload_to_hub:
+            hub_url = upload_to_huggingface(
+                output_dir=output_dir,
+                repo_name=hub_repo_name,
+                private=private_repo
+            )
+            logger.info(f"Model uploaded to Hugging Face Hub: {hub_url}")
+
         return output_dir
 
     finally:
@@ -337,16 +446,32 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Fine-tune DeepSeek model (Research Only)")
     parser.add_argument("--config", type=str, default="transformers_config.json",
                         help="Path to the configuration file")
-    parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
+    parser.add_argument("--dataset", type=str, default=DEFAULT_DATASET,
                         help="Dataset name or path")
     parser.add_argument("--output_dir", type=str, default="fine_tuned_model",
                         help="Output directory for the fine-tuned model")
+    parser.add_argument("--upload_to_hub", action="store_true",
+                        help="Upload the model to Hugging Face Hub after training")
+    parser.add_argument("--hub_repo_name", type=str, default=None,
+                        help="Repository name for the model on Hugging Face Hub")
+    parser.add_argument("--private_repo", action="store_true",
+                        help="Make the Hugging Face Hub repository private")
 
     args = parser.parse_args()
 
     try:
-        output_path = train(args.config, args.dataset, args.output_dir)
+        output_path = train(
+            args.config,
+            args.dataset,
+            args.output_dir,
+            upload_to_hub=args.upload_to_hub,
+            hub_repo_name=args.hub_repo_name,
+            private_repo=args.private_repo
+        )
         print(f"Research training completed. Model saved to: {output_path}")
+
+        if args.upload_to_hub:
+            print("Model was also uploaded to Hugging Face Hub.")
     except Exception as e:
         logging.error(f"Training failed: {str(e)}")
         remove_training_marker()  # Clean up marker if training fails
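
Taken together, the new arguments flow from argparse through train() into upload_to_huggingface(). A minimal usage sketch of the programmatic equivalent of the new flags, assuming transformers_config.json sits next to the script and HF_TOKEN is exported; all defaults below are taken from the argparse setup in the diff:

    # Sketch: programmatic equivalent of
    #   python run_cloud_training.py --upload_to_hub --private_repo
    # Assumes transformers_config.json is present and HF_TOKEN is exported.
    from run_cloud_training import train, DEFAULT_DATASET

    output_path = train(
        "transformers_config.json",  # --config default
        DEFAULT_DATASET,             # --dataset default: "George-API/phi4-cognitive-dataset"
        "fine_tuned_model",          # --output_dir default
        upload_to_hub=True,          # --upload_to_hub
        hub_repo_name=None,          # --hub_repo_name; None falls back to basename(output_dir)
        private_repo=True,           # --private_repo
    )
    print(f"Saved to {output_path}")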