Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files
- run_cloud_training.py (+60 -7)
run_cloud_training.py
CHANGED
|
@@ -110,14 +110,24 @@ def load_and_prepare_dataset(dataset_name, config):
|
|
| 110 |
print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
|
| 111 |
raise
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Data collator for pre-tokenized dataset
|
| 114 |
class PreTokenizedCollator(DataCollatorMixin):
|
| 115 |
"""
|
| 116 |
Data collator for pre-tokenized datasets.
|
| 117 |
Expects input_ids and labels already tokenized.
|
| 118 |
"""
|
| 119 |
-
def __init__(self, pad_token_id=0):
|
| 120 |
self.pad_token_id = pad_token_id
|
|
|
|
| 121 |
|
| 122 |
def __call__(self, features):
|
| 123 |
# Print a sample feature to understand structure
|
|
@@ -130,15 +140,58 @@ class PreTokenizedCollator(DataCollatorMixin):
|
|
| 130 |
# If input_ids is not directly available, try to extract from conversations
|
| 131 |
if 'input_ids' not in feature and 'conversations' in feature:
|
| 132 |
# Extract from conversations based on your dataset structure
|
| 133 |
-
# This is a placeholder - adjust based on actual structure
|
| 134 |
conversations = feature['conversations']
|
|
|
|
|
|
|
|
|
|
| 135 |
if isinstance(conversations, list) and len(conversations) > 0:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
|
| 141 |
feature['input_ids'] = conversations[0]['input_ids']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
processed_features.append(feature)
|
| 144 |
|
|
@@ -380,7 +433,7 @@ def train(config_path, dataset_name, output_dir):
|
|
| 380 |
model=model,
|
| 381 |
args=training_args,
|
| 382 |
train_dataset=training_dataset,
|
| 383 |
-
data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id),
|
| 384 |
)
|
| 385 |
|
| 386 |
# Start training
|
|
|
|
| 110 |
print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
|
| 111 |
raise
|
| 112 |
|
| 113 |
+
def tokenize_string(text, tokenizer):
    """Tokenize a string using the provided tokenizer.

    Args:
        text: Text to encode. Any falsy value (e.g. ``""`` or ``None``) is
            treated as empty input and never reaches the tokenizer.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        An empty list when ``text`` is falsy; otherwise whatever
        ``tokenizer.encode`` returns for ``text`` with
        ``add_special_tokens=False``.
    """
    # Guard first so empty/None input short-circuits without a tokenizer call.
    if not text:
        return []

    # Special tokens are explicitly disabled for this encoding call.
    return tokenizer.encode(text, add_special_tokens=False)
|
| 121 |
+
|
| 122 |
# Data collator for pre-tokenized dataset
|
| 123 |
class PreTokenizedCollator(DataCollatorMixin):
|
| 124 |
"""
|
| 125 |
Data collator for pre-tokenized datasets.
|
| 126 |
Expects input_ids and labels already tokenized.
|
| 127 |
"""
|
| 128 |
+
def __init__(self, pad_token_id=0, tokenizer=None):
|
| 129 |
self.pad_token_id = pad_token_id
|
| 130 |
+
self.tokenizer = tokenizer # Keep a reference to the tokenizer for string conversion
|
| 131 |
|
| 132 |
def __call__(self, features):
|
| 133 |
# Print a sample feature to understand structure
|
|
|
|
| 140 |
# If input_ids is not directly available, try to extract from conversations
|
| 141 |
if 'input_ids' not in feature and 'conversations' in feature:
|
| 142 |
# Extract from conversations based on your dataset structure
|
|
|
|
| 143 |
conversations = feature['conversations']
|
| 144 |
+
|
| 145 |
+
# Debug the conversations structure
|
| 146 |
+
logger.info(f"Conversations type: {type(conversations)}")
|
| 147 |
if isinstance(conversations, list) and len(conversations) > 0:
|
| 148 |
+
logger.info(f"First conversation type: {type(conversations[0])}")
|
| 149 |
+
logger.info(f"First conversation: {conversations[0]}")
|
| 150 |
+
|
| 151 |
+
# Try different approaches to extract input_ids
|
| 152 |
+
if isinstance(conversations, list) and len(conversations) > 0:
|
| 153 |
+
# Case 1: If conversations is a list of dicts with 'content' field
|
| 154 |
+
if isinstance(conversations[0], dict) and 'content' in conversations[0]:
|
| 155 |
+
content = conversations[0]['content']
|
| 156 |
+
logger.info(f"Found content field: {type(content)}")
|
| 157 |
+
|
| 158 |
+
# If content is a string, tokenize it
|
| 159 |
+
if isinstance(content, str) and self.tokenizer:
|
| 160 |
+
logger.info(f"Tokenizing string content: {content[:50]}...")
|
| 161 |
+
feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
|
| 162 |
+
# If content is already a list of integers, use it directly
|
| 163 |
+
elif isinstance(content, list) and all(isinstance(x, int) for x in content):
|
| 164 |
+
feature['input_ids'] = content
|
| 165 |
+
# If content is already tokenized in some other format
|
| 166 |
+
else:
|
| 167 |
+
logger.warning(f"Unexpected content format: {type(content)}")
|
| 168 |
+
|
| 169 |
+
# Case 2: If conversations is a list of dicts with 'input_ids' field
|
| 170 |
elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
|
| 171 |
feature['input_ids'] = conversations[0]['input_ids']
|
| 172 |
+
|
| 173 |
+
# Case 3: If conversations itself contains the input_ids
|
| 174 |
+
elif all(isinstance(x, int) for x in conversations):
|
| 175 |
+
feature['input_ids'] = conversations
|
| 176 |
+
|
| 177 |
+
# Case 4: If conversations is a list of strings
|
| 178 |
+
elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
|
| 179 |
+
# Join all strings and tokenize
|
| 180 |
+
full_text = " ".join(conversations)
|
| 181 |
+
feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)
|
| 182 |
+
|
| 183 |
+
# Ensure input_ids is a list of integers
|
| 184 |
+
if 'input_ids' in feature:
|
| 185 |
+
# If input_ids is a string, tokenize it
|
| 186 |
+
if isinstance(feature['input_ids'], str) and self.tokenizer:
|
| 187 |
+
logger.info(f"Converting string input_ids to tokens: {feature['input_ids'][:50]}...")
|
| 188 |
+
feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
|
| 189 |
+
# If input_ids is not a list, convert it
|
| 190 |
+
elif not isinstance(feature['input_ids'], list):
|
| 191 |
+
try:
|
| 192 |
+
feature['input_ids'] = list(feature['input_ids'])
|
| 193 |
+
except:
|
| 194 |
+
logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
|
| 195 |
|
| 196 |
processed_features.append(feature)
|
| 197 |
|
|
|
|
| 433 |
model=model,
|
| 434 |
args=training_args,
|
| 435 |
train_dataset=training_dataset,
|
| 436 |
+
data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id, tokenizer=tokenizer),
|
| 437 |
)
|
| 438 |
|
| 439 |
# Start training
|