Spaces:
Sleeping
Sleeping
Update train_model.py
Browse files - train_model.py +17 -4
train_model.py
CHANGED
|
@@ -68,16 +68,29 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
|
|
| 68 |
|
| 69 |
# Log some examples to check dataset structure
|
| 70 |
logging.info(f"Example data from the dataset: {dataset[:5]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def tokenize_function(examples):
|
| 73 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
# Tokenize with truncation and padding
|
| 75 |
tokens = tokenizer(
|
| 76 |
examples['text'],
|
| 77 |
truncation=True,
|
| 78 |
max_length=sequence_length,
|
| 79 |
-
padding=
|
| 80 |
-
return_tensors=None # Let the collator handle tensor
|
| 81 |
)
|
| 82 |
# Log the tokens for debugging
|
| 83 |
logging.info(f"Tokenized example: {tokens}")
|
|
@@ -87,7 +100,7 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
|
|
| 87 |
logging.error(f"Problematic example: {examples}")
|
| 88 |
raise e
|
| 89 |
|
| 90 |
-
# Tokenize the dataset
|
| 91 |
tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
|
| 92 |
logging.info("Dataset tokenization complete.")
|
| 93 |
return tokenized_datasets
|
|
@@ -215,7 +228,7 @@ def main():
|
|
| 215 |
if args.task == "generation":
|
| 216 |
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
| 217 |
elif args.task == "classification":
|
| 218 |
-
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Handle padding dynamically during batching
|
| 219 |
else:
|
| 220 |
logging.error("Unsupported task type for data collator.")
|
| 221 |
raise ValueError("Unsupported task type for data collator.")
|
|
|
|
| 68 |
|
| 69 |
# Log some examples to check dataset structure
|
| 70 |
logging.info(f"Example data from the dataset: {dataset[:5]}")
|
| 71 |
+
|
| 72 |
+
def clean_text(text):
|
| 73 |
+
# Ensure each text is a string
|
| 74 |
+
if isinstance(text, list):
|
| 75 |
+
return " ".join([str(t) for t in text])
|
| 76 |
+
return str(text)
|
| 77 |
|
| 78 |
def tokenize_function(examples):
|
| 79 |
try:
|
| 80 |
+
# Clean text to ensure correct format
|
| 81 |
+
examples['text'] = [clean_text(text) for text in examples['text']]
|
| 82 |
+
|
| 83 |
+
# Log the type and structure of text to debug
|
| 84 |
+
logging.info(f"Type of examples['text']: {type(examples['text'])}")
|
| 85 |
+
logging.info(f"First example type: {type(examples['text'][0])}")
|
| 86 |
+
|
| 87 |
# Tokenize with truncation only; padding is deferred to the data collator
|
| 88 |
tokens = tokenizer(
|
| 89 |
examples['text'],
|
| 90 |
truncation=True,
|
| 91 |
max_length=sequence_length,
|
| 92 |
+
padding=False, # Defer padding to data collator
|
| 93 |
+
return_tensors=None # Let the data collator handle tensor creation
|
| 94 |
)
|
| 95 |
# Log the tokens for debugging
|
| 96 |
logging.info(f"Tokenized example: {tokens}")
|
|
|
|
| 100 |
logging.error(f"Problematic example: {examples}")
|
| 101 |
raise e
|
| 102 |
|
| 103 |
+
# Tokenize the dataset using the modified tokenize_function
|
| 104 |
tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
|
| 105 |
logging.info("Dataset tokenization complete.")
|
| 106 |
return tokenized_datasets
|
|
|
|
| 228 |
if args.task == "generation":
|
| 229 |
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
| 230 |
elif args.task == "classification":
|
| 231 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest') # Handle padding dynamically during batching
|
| 232 |
else:
|
| 233 |
logging.error("Unsupported task type for data collator.")
|
| 234 |
raise ValueError("Unsupported task type for data collator.")
|