Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  +50 -16
@@ -135,7 +135,7 @@ def load_and_prepare_dataset(dataset_name, config):
     """
     Load and prepare the dataset for fine-tuning.
     Sort entries by prompt_number as required.
-
+    Handles both pre-tokenized and string content.
     """
     # Use the default dataset path if no specific path is provided
     if dataset_name == "phi4-cognitive-dataset":
@@ -182,9 +182,35 @@ def load_and_prepare_dataset(dataset_name, config):
         if len(dataset) > 0:
             sample = dataset[0]
             logger.info(f"Sample entry structure: {list(sample.keys())}")
-
+
+            # Check if dataset is pre-tokenized or contains string content
+            is_pre_tokenized = False
+
+            if 'input_ids' in sample and isinstance(sample['input_ids'], list) and all(isinstance(x, int) for x in sample['input_ids']):
+                logger.info("Dataset appears to be pre-tokenized with input_ids field")
+                is_pre_tokenized = True
+            elif 'conversations' in sample:
                 logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")

+                # Check if conversations contain pre-tokenized data
+                if isinstance(sample['conversations'], list) and len(sample['conversations']) > 0:
+                    conv = sample['conversations'][0]
+                    if isinstance(conv, dict) and 'input_ids' in conv and isinstance(conv['input_ids'], list):
+                        logger.info("Dataset appears to be pre-tokenized in conversations.input_ids")
+                        is_pre_tokenized = True
+                    elif isinstance(conv, dict) and 'content' in conv:
+                        content = conv['content']
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            logger.info("Dataset appears to be pre-tokenized in conversations.content")
+                            is_pre_tokenized = True
+                        else:
+                            logger.info("Dataset appears to contain string content that will need tokenization")
+
+            if is_pre_tokenized:
+                logger.info("Using pre-tokenized dataset - tokenizer will only be used as fallback")
+            else:
+                logger.info("Dataset contains string content - tokenizer will be used")
+
         return dataset

     except Exception as e:
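The detection added in this hunk can be exercised on its own. Below is a minimal sketch that mirrors the same checks as a standalone function; the name looks_pre_tokenized and the toy samples are illustrative, not part of the commit:

def looks_pre_tokenized(sample: dict) -> bool:
    # Top-level input_ids as a list of ints means the entry is already tokenized
    ids = sample.get('input_ids')
    if isinstance(ids, list) and all(isinstance(x, int) for x in ids):
        return True
    # Otherwise inspect the first conversation turn, as the hunk does
    convs = sample.get('conversations')
    if isinstance(convs, list) and convs and isinstance(convs[0], dict):
        conv = convs[0]
        if isinstance(conv.get('input_ids'), list):
            return True
        content = conv.get('content')
        if isinstance(content, list) and all(isinstance(x, int) for x in content):
            return True
    return False

print(looks_pre_tokenized({'input_ids': [101, 2054, 102]}))                  # True
print(looks_pre_tokenized({'conversations': [{'content': 'plain text'}]}))   # False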
@@ -208,12 +234,12 @@ def tokenize_string(text, tokenizer):
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
     """
-    Data collator
-
+    Data collator that can handle both pre-tokenized datasets and string content.
+    Will tokenize strings if necessary, but logs warnings.
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for fallback tokenization

     def __call__(self, features):
         # Print a sample feature to understand structure
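This hunk touches only the docstring and constructor; the batching itself lives in __call__, which the diff does not show in full. As a sketch of what a collator like this typically does once every feature has integer input_ids — the padding logic below is an assumption for illustration, not code from this file:

import torch

def pad_features(features, pad_token_id=0):
    # Hypothetical padding step: right-pad each sequence to the batch max
    max_len = max(len(f['input_ids']) for f in features)
    input_ids, attention_mask = [], []
    for f in features:
        ids = list(f['input_ids'])
        pad_len = max_len - len(ids)
        input_ids.append(ids + [pad_token_id] * pad_len)
        attention_mask.append([1] * len(ids) + [0] * pad_len)
    batch = {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
    }
    # Causal LM fine-tuning usually trains with labels equal to input_ids
    batch['labels'] = batch['input_ids'].clone()
    return batch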
@@ -251,26 +277,33 @@ class PreTokenizedCollator(DataCollatorMixin):
                     feature['input_ids'] = conversations

                 # Case 3: If conversations is a list of dicts with 'content' field
-                # This should be avoided for pre-tokenized datasets
                 elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
                     content = conversations[0]['content']

                     # If content is already a list of integers, use it directly
                     if isinstance(content, list) and all(isinstance(x, int) for x in content):
                         feature['input_ids'] = content
-                    #
-                    elif isinstance(content, str):
-                        logger.warning("Found string content in
-
+                    # If content is a string, tokenize it with a warning
+                    elif isinstance(content, str) and self.tokenizer:
+                        logger.warning("Found string content in dataset. Tokenizing as fallback.")
+                        feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
+                    else:
+                        logger.warning(f"Unexpected content format: {type(content)}")
                     continue
+
+                # Case 4: If conversations is a list of strings
+                elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
+                    # Join all strings and tokenize
+                    logger.warning("Found string conversations in dataset. Tokenizing as fallback.")
+                    full_text = " ".join(conversations)
+                    feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)

             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
-                #
-                if isinstance(feature['input_ids'], str):
-                    logger.warning("Found string input_ids in
-
-                continue
+                # If input_ids is a string, tokenize it
+                if isinstance(feature['input_ids'], str) and self.tokenizer:
+                    logger.warning("Found string input_ids in dataset. Tokenizing as fallback.")
+                    feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
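All of the fallback branches rely on tokenizer.encode(..., add_special_tokens=False), which returns a plain Python list of token ids — the same shape the pre-tokenized path produces. A quick check (gpt2 is just a small public tokenizer used for illustration):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

conversations = ["What is attention?", "Attention weighs token interactions."]
full_text = " ".join(conversations)

# Same call the collator makes in its fallback branches
ids = tokenizer.encode(full_text, add_special_tokens=False)
print(type(ids), all(isinstance(x, int) for x in ids))  # <class 'list'> True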
@@ -569,7 +602,8 @@ def train(config_path, dataset_name, output_dir):
         logger.info("Successfully applied LoRA with standard PEFT")

         # No need to format the dataset - it's already pre-tokenized
-        logger.info("Using
+        logger.info("Using dataset with flexible tokenization handling")
+        logger.info("Will use pre-tokenized data if available, or tokenize strings as fallback")
         training_dataset = dataset

         # Configure reporting backends with fallbacks
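The hunk stops at training_dataset = dataset; the commit does not show where the dataset and the collator meet the trainer. A hypothetical wiring, assuming a standard transformers Trainer (this helper is not part of the file; argument names only echo train()):

from transformers import Trainer, TrainingArguments

def build_trainer(model, tokenizer, training_dataset, output_dir):
    # Hypothetical helper; the actual call site is outside this diff
    collator = PreTokenizedCollator(
        pad_token_id=tokenizer.pad_token_id or 0,
        tokenizer=tokenizer,  # used only for the string-content fallback
    )
    args = TrainingArguments(output_dir=output_dir, per_device_train_batch_size=2)
    return Trainer(model=model, args=args,
                   train_dataset=training_dataset, data_collator=collator)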
|