Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py CHANGED (+57 -9)
@@ -2,8 +2,7 @@
 
 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for L40S GPU
-- Works with pre-tokenized datasets
+- Optimized for L40S GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -13,6 +12,8 @@ import logging
 import json
 import torch
 import argparse
+import shutil
+from pathlib import Path
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AutoConfig, BitsAndBytesConfig
 from transformers.data.data_collator import DataCollatorMixin
@@ -27,6 +28,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
 
+# Create triton directory to avoid warning
+os.makedirs(os.path.expanduser("~/.triton/autotune"), exist_ok=True)
+
 # Default dataset with proper namespace
 DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 
@@ -294,8 +298,43 @@ class PreTokenizedCollator(DataCollatorMixin):
 
         return batch
 
+# Preprocess dataset to ensure all entries are pre-tokenized
+def preprocess_dataset(dataset, tokenizer):
+    """Ensure dataset is fully pre-tokenized to avoid tokenization during training"""
+    logger.info("Pre-processing dataset to ensure all entries are tokenized")
+
+    def process_example(example):
+        # If already has input_ids as list of integers, keep as is
+        if 'input_ids' in example and isinstance(example['input_ids'], list) and all(isinstance(x, int) for x in example['input_ids']):
+            return example
+
+        # If has conversations with content field
+        if 'conversations' in example:
+            conversations = example['conversations']
+            if isinstance(conversations, list) and len(conversations) > 0:
+                # If conversations has content field, tokenize it
+                if isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                    content = conversations[0]['content']
+                    if isinstance(content, str):
+                        example['input_ids'] = tokenizer.encode(content, add_special_tokens=False)
+                    return example
+
+        # For any other format, try to extract text and tokenize
+        text = None
+        if 'text' in example:
+            text = example['text']
+        elif 'content' in example:
+            text = example['content']
+
+        if text and isinstance(text, str):
+            example['input_ids'] = tokenizer.encode(text, add_special_tokens=False)
+
+        return example
+
+    return dataset.map(process_example)
+
 # Load and prepare dataset with proper sorting
-def load_and_prepare_dataset(dataset_name, config):
+def load_and_prepare_dataset(dataset_name, config, tokenizer=None):
     """Load and prepare the dataset for fine-tuning with proper sorting"""
     # Use the default dataset if the provided one matches the default name without namespace
     if dataset_name == "phi4-cognitive-dataset":
@@ -323,6 +362,10 @@ def load_and_prepare_dataset(dataset_name, config):
     dataset_config = config.get("dataset_config", {})
     sort_field = dataset_config.get("sort_by_field", "prompt_number")
 
+    # Preprocess dataset to ensure all entries are pre-tokenized
+    if tokenizer is not None:
+        dataset = preprocess_dataset(dataset, tokenizer)
+
     # Sort in ascending order by specified field
     logger.info(f"Sorting dataset by {sort_field} in ascending order")
     dataset = dataset.sort(sort_field)
@@ -377,9 +420,6 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
         logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")
 
-    # Load and prepare dataset with proper sorting
-    dataset = load_and_prepare_dataset(dataset_name, config)
-
     # Load model settings
     original_model_name = model_config.get("model_name_or_path")
 
@@ -408,6 +448,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     )
     tokenizer.pad_token = tokenizer.eos_token
 
+    # Load and prepare dataset with proper sorting
+    dataset = load_and_prepare_dataset(dataset_name, config, tokenizer)
+
     # Get quantization config
     quant_config = config.get("quantization_config", {})
 
@@ -525,7 +568,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
         gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
-        evaluation_strategy = training_config.get("evaluation_strategy", "steps")
+        eval_strategy = training_config.get("eval_strategy", "steps")  # Updated from evaluation_strategy
         load_best_model_at_end = training_config.get("load_best_model_at_end", True)
         logger.info("Using full training parameters for GPU mode")
     else:
@@ -536,7 +579,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = False
         gradient_checkpointing = False
         dataloader_workers = 0
-        evaluation_strategy = "no"
+        eval_strategy = "no"
         load_best_model_at_end = False
         logger.warning("Using minimal parameters for CPU training in Space")
 
@@ -561,7 +604,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy=evaluation_strategy,
+        eval_strategy=eval_strategy,  # Updated from evaluation_strategy
         load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
@@ -581,6 +624,11 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             pad_token_id=tokenizer.pad_token_id,
             tokenizer=tokenizer
         ),
+        # Add label_names to avoid warning
+        compute_metrics=None,
+        tokenizer=tokenizer,  # Provide tokenizer for proper padding
+        # Define label_names to fix warning
+        label_names=["labels"]
     )
 
     # Start training