Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files - run_cloud_training.py (+45 -28)
run_cloud_training.py
CHANGED
|
@@ -32,6 +32,9 @@ logging.basicConfig(
|
|
| 32 |
)
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
def load_config(config_path):
|
| 36 |
"""Load the transformers config from JSON file"""
|
| 37 |
logger.info(f"Loading config from {config_path}")
|
|
@@ -45,35 +48,49 @@ def load_and_prepare_dataset(dataset_name, config):
|
|
| 45 |
Sort entries by prompt_number as required.
|
| 46 |
NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
|
| 47 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
logger.info(f"Loading dataset: {dataset_name}")
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
shuffle_seed
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Data collator for pre-tokenized dataset
|
| 79 |
class PreTokenizedCollator(DataCollatorMixin):
|
|
@@ -138,7 +155,7 @@ def remove_training_marker():
|
|
| 138 |
|
| 139 |
def train(config_path, dataset_name, output_dir):
|
| 140 |
"""Main training function - RESEARCH TRAINING PHASE ONLY"""
|
| 141 |
-
# Load environment variables
|
| 142 |
load_dotenv()
|
| 143 |
config = load_config(config_path)
|
| 144 |
|
|
@@ -170,7 +187,7 @@ def train(config_path, dataset_name, output_dir):
|
|
| 170 |
logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
|
| 171 |
logger.info("Configuration Summary:")
|
| 172 |
logger.info(f"Model: {model_config.get('model_name_or_path')}")
|
| 173 |
-
logger.info(f"Dataset: {dataset_name}")
|
| 174 |
logger.info(f"Output directory: {output_dir}")
|
| 175 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
| 176 |
|
|
|
|
| 32 |
)
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
+
# Default dataset path - use the correct path with username
|
| 36 |
+
DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
|
| 37 |
+
|
| 38 |
def load_config(config_path):
|
| 39 |
"""Load the transformers config from JSON file"""
|
| 40 |
logger.info(f"Loading config from {config_path}")
|
|
|
|
| 48 |
Sort entries by prompt_number as required.
|
| 49 |
NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
|
| 50 |
"""
|
| 51 |
+
# Use the default dataset path if no specific path is provided
|
| 52 |
+
if dataset_name == "phi4-cognitive-dataset":
|
| 53 |
+
dataset_name = DEFAULT_DATASET
|
| 54 |
+
|
| 55 |
logger.info(f"Loading dataset: {dataset_name}")
|
| 56 |
|
| 57 |
+
try:
|
| 58 |
+
# Load dataset
|
| 59 |
+
dataset = load_dataset(dataset_name)
|
| 60 |
+
|
| 61 |
+
# Extract the split we want to use (usually 'train')
|
| 62 |
+
if 'train' in dataset:
|
| 63 |
+
dataset = dataset['train']
|
| 64 |
+
|
| 65 |
+
# Get the dataset config
|
| 66 |
+
dataset_config = config.get("dataset_config", {})
|
| 67 |
+
sort_field = dataset_config.get("sort_by_field", "prompt_number")
|
| 68 |
+
sort_direction = dataset_config.get("sort_direction", "ascending")
|
| 69 |
+
|
| 70 |
+
# Sort the dataset by prompt_number
|
| 71 |
+
logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
|
| 72 |
+
if sort_direction == "ascending":
|
| 73 |
+
dataset = dataset.sort(sort_field)
|
| 74 |
+
else:
|
| 75 |
+
dataset = dataset.sort(sort_field, reverse=True)
|
| 76 |
+
|
| 77 |
+
# Add shuffle with fixed seed if specified
|
| 78 |
+
if "shuffle_seed" in dataset_config:
|
| 79 |
+
shuffle_seed = dataset_config.get("shuffle_seed")
|
| 80 |
+
logger.info(f"Shuffling dataset with seed {shuffle_seed}")
|
| 81 |
+
dataset = dataset.shuffle(seed=shuffle_seed)
|
| 82 |
+
|
| 83 |
+
logger.info(f"Dataset loaded with {len(dataset)} entries")
|
| 84 |
+
return dataset
|
| 85 |
|
| 86 |
+
except Exception as e:
|
| 87 |
+
logger.error(f"Error loading dataset: {str(e)}")
|
| 88 |
+
logger.info("Available datasets in the Hub:")
|
| 89 |
+
# Print a more helpful error message
|
| 90 |
+
print(f"Failed to load dataset: {dataset_name}")
|
| 91 |
+
print(f"Make sure the dataset exists and is accessible.")
|
| 92 |
+
print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
|
| 93 |
+
raise
|
| 94 |
|
| 95 |
# Data collator for pre-tokenized dataset
|
| 96 |
class PreTokenizedCollator(DataCollatorMixin):
|
|
|
|
| 155 |
|
| 156 |
def train(config_path, dataset_name, output_dir):
|
| 157 |
"""Main training function - RESEARCH TRAINING PHASE ONLY"""
|
| 158 |
+
# Load environment variables
|
| 159 |
load_dotenv()
|
| 160 |
config = load_config(config_path)
|
| 161 |
|
|
|
|
| 187 |
logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
|
| 188 |
logger.info("Configuration Summary:")
|
| 189 |
logger.info(f"Model: {model_config.get('model_name_or_path')}")
|
| 190 |
+
logger.info(f"Dataset: {dataset_name if dataset_name != 'phi4-cognitive-dataset' else DEFAULT_DATASET}")
|
| 191 |
logger.info(f"Output directory: {output_dir}")
|
| 192 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
| 193 |
|