Spaces:

George-API
/

qwen4bit

Sleeping

App Files Community

George-API committed on Mar 11

Commit

f1e4d0b

verified ·

1 Parent(s): c7c538f

Upload run_cloud_training.py with huggingface_hub

Browse files

Files changed (1) hide show

run_cloud_training.py +45 -28

run_cloud_training.py CHANGED Viewed

@@ -32,6 +32,9 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 def load_config(config_path):
     """Load the transformers config from JSON file"""
     logger.info(f"Loading config from {config_path}")
@@ -45,35 +48,49 @@ def load_and_prepare_dataset(dataset_name, config):
     Sort entries by prompt_number as required.
     NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
     """
     logger.info(f"Loading dataset: {dataset_name}")
-    # Load dataset
-    dataset = load_dataset(dataset_name)
-    # Extract the split we want to use (usually 'train')
-    if 'train' in dataset:
-        dataset = dataset['train']
-    # Get the dataset config
-    dataset_config = config.get("dataset_config", {})
-    sort_field = dataset_config.get("sort_by_field", "prompt_number")
-    sort_direction = dataset_config.get("sort_direction", "ascending")
-    # Sort the dataset by prompt_number
-    logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
-    if sort_direction == "ascending":
-        dataset = dataset.sort(sort_field)
-    else:
-        dataset = dataset.sort(sort_field, reverse=True)
-    # Add shuffle with fixed seed if specified
-    if "shuffle_seed" in dataset_config:
-        shuffle_seed = dataset_config.get("shuffle_seed")
-        logger.info(f"Shuffling dataset with seed {shuffle_seed}")
-        dataset = dataset.shuffle(seed=shuffle_seed)
-    logger.info(f"Dataset loaded with {len(dataset)} entries")
-    return dataset
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
@@ -138,7 +155,7 @@ def remove_training_marker():
 def train(config_path, dataset_name, output_dir):
     """Main training function - RESEARCH TRAINING PHASE ONLY"""
-    # Load environment variables and configuration
     load_dotenv()
     config = load_config(config_path)
@@ -170,7 +187,7 @@ def train(config_path, dataset_name, output_dir):
         logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
         logger.info("Configuration Summary:")
         logger.info(f"Model: {model_config.get('model_name_or_path')}")
-        logger.info(f"Dataset: {dataset_name}")
         logger.info(f"Output directory: {output_dir}")
         logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")

 )
 logger = logging.getLogger(__name__)
+# Default dataset path - use the correct path with username
+DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 def load_config(config_path):
     """Load the transformers config from JSON file"""
     logger.info(f"Loading config from {config_path}")
     Sort entries by prompt_number as required.
     NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
     """
+    # Expand the bare dataset name to the fully qualified Hub path (username/dataset)
+    if dataset_name == "phi4-cognitive-dataset":
+        dataset_name = DEFAULT_DATASET
     logger.info(f"Loading dataset: {dataset_name}")
+    try:
+        # Load dataset
+        dataset = load_dataset(dataset_name)
+        # Extract the split we want to use (usually 'train')
+        if 'train' in dataset:
+            dataset = dataset['train']
+        # Get the dataset config
+        dataset_config = config.get("dataset_config", {})
+        sort_field = dataset_config.get("sort_by_field", "prompt_number")
+        sort_direction = dataset_config.get("sort_direction", "ascending")
+        # Sort the dataset by prompt_number
+        logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
+        if sort_direction == "ascending":
+            dataset = dataset.sort(sort_field)
+        else:
+            dataset = dataset.sort(sort_field, reverse=True)
+        # Add shuffle with fixed seed if specified
+        if "shuffle_seed" in dataset_config:
+            shuffle_seed = dataset_config.get("shuffle_seed")
+            logger.info(f"Shuffling dataset with seed {shuffle_seed}")
+            dataset = dataset.shuffle(seed=shuffle_seed)
+        logger.info(f"Dataset loaded with {len(dataset)} entries")
+        return dataset
+    except Exception as e:
+        logger.error(f"Error loading dataset: {str(e)}")
+        logger.info("Available datasets in the Hub:")
+        # Print a more helpful error message
+        print(f"Failed to load dataset: {dataset_name}")
+        print(f"Make sure the dataset exists and is accessible.")
+        print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
+        raise
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
 def train(config_path, dataset_name, output_dir):
     """Main training function - RESEARCH TRAINING PHASE ONLY"""
+    # Load environment variables
     load_dotenv()
     config = load_config(config_path)
         logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
         logger.info("Configuration Summary:")
         logger.info(f"Model: {model_config.get('model_name_or_path')}")
+        logger.info(f"Dataset: {dataset_name if dataset_name != 'phi4-cognitive-dataset' else DEFAULT_DATASET}")
         logger.info(f"Output directory: {output_dir}")
         logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")