Update train_model.py
train_model.py   CHANGED   (+30, -67)
@@ -16,7 +16,6 @@ import torch
 import os
 from huggingface_hub import login, HfApi
 import logging
-
 from torch.optim import AdamW # Import PyTorch's AdamW
 
 def setup_logging(log_file_path):
@@ -64,18 +63,14 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
     """
     logging.info(f"Loading dataset '{dataset_name}' for task '{task}'...")
     try:
-
-        dataset, config = dataset_name.split('/', 1)
-        dataset = load_dataset("stanfordnlp/imdb",split='train')
-        else:
-        dataset = load_dataset("stanfordnlp/imdb",split='train')
-
+        dataset = load_dataset(dataset_name, split='train')
         logging.info("Dataset loaded successfully.")
 
     def tokenize_function(examples):
-
-
-
+        # Truncate and set max_length, but let DataCollator handle padding
+        return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
+
+    # Tokenize the dataset using the modified tokenize_function
     tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
     logging.info("Dataset tokenization complete.")
     return tokenized_datasets
@@ -100,7 +95,6 @@ def initialize_model(task, model_name, vocab_size, sequence_length, hidden_size,
         intermediate_size=4 * hidden_size,
         hidden_act='gelu',
         use_cache=True,
-        truncation=False
     )
     model = GPT2LMHeadModel(config)
     logging.info("GPT2LMHeadModel initialized successfully.")
@@ -172,31 +166,18 @@ def main():
         if tokenizer.pad_token is None:
             logging.info("Setting pad_token to eos_token.")
             tokenizer.pad_token = tokenizer.eos_token
-
-
-
-
-
-
-
-
-
-
-
-
-            logging.info("Resized token embeddings to accommodate pad_token.")
-        else:
-            logging.info(f"Tokenizer already has pad_token set to: {tokenizer.pad_token}")
-        # Initialize model normally
-        model = initialize_model(
-            task=args.task,
-            model_name=args.model_name,
-            vocab_size=args.vocab_size,
-            sequence_length=args.sequence_length,
-            hidden_size=args.hidden_size,
-            num_layers=args.num_layers,
-            attention_heads=args.attention_heads
-        )
+
+        # Initialize model
+        model = initialize_model(
+            task=args.task,
+            model_name=args.model_name,
+            vocab_size=args.vocab_size,
+            sequence_length=args.sequence_length,
+            hidden_size=args.hidden_size,
+            num_layers=args.num_layers,
+            attention_heads=args.attention_heads
+        )
+        model.resize_token_embeddings(len(tokenizer))
     except Exception as e:
         logging.error(f"Error initializing tokenizer or model: {str(e)}")
         raise e
@@ -223,36 +204,17 @@ def main():
         raise ValueError("Unsupported task type for data collator.")
 
     # Define training arguments
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-    elif args.task == "classification":
-        training_args = TrainingArguments(
-            output_dir=f"./models/{args.model_name}",
-            num_train_epochs=3,
-            per_device_train_batch_size=16,
-            evaluation_strategy="epoch",
-            save_steps=5000,
-            save_total_limit=2,
-            logging_steps=500,
-            learning_rate=5e-5,
-            remove_unused_columns=False,
-            push_to_hub=False # We'll handle pushing manually
-
-        )
-    else:
-        logging.error("Unsupported task type for training arguments.")
-        raise ValueError("Unsupported task type for training arguments.")
+    training_args = TrainingArguments(
+        output_dir=f"./models/{args.model_name}",
+        num_train_epochs=3,
+        per_device_train_batch_size=8 if args.task == "generation" else 16,
+        save_steps=5000,
+        save_total_limit=2,
+        logging_steps=500,
+        learning_rate=5e-4 if args.task == "generation" else 5e-5,
+        remove_unused_columns=False,
+        push_to_hub=False
+    )
 
     # Initialize Trainer with PyTorch's AdamW optimizer
     trainer = Trainer(
@@ -260,7 +222,7 @@ def main():
         args=training_args,
         train_dataset=tokenized_datasets,
        data_collator=data_collator,
-        optimizers=(get_optimizer(model, training_args.learning_rate), None)
+        optimizers=(get_optimizer(model, training_args.learning_rate), None)
     )
 
     # Start training
@@ -303,3 +265,4 @@ def main():
 if __name__ == "__main__":
     main()
 
+
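
For context, here is a minimal, self-contained sketch of the pattern this commit moves to: tokenize with truncation only and let DataCollatorForLanguageModeling pad each batch dynamically, then hand PyTorch's AdamW to the Trainer through the optimizers tuple. The model configuration, sequence length, sample count, and output directory below are illustrative assumptions, not values read from train_model.py.

# Sketch only: illustrates the commit's tokenization + optimizer pattern with assumed defaults.
from datasets import load_dataset
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)

sequence_length = 128  # assumed value, not taken from the script

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

# Tokenize with truncation only; padding happens per batch in the data collator.
dataset = load_dataset("stanfordnlp/imdb", split="train").shuffle(seed=42).select(range(500))
tokenized = dataset.map(
    lambda examples: tokenizer(examples["text"], truncation=True, max_length=sequence_length),
    batched=True,
    remove_columns=dataset.column_names,  # keep only input_ids / attention_mask
)

config = GPT2Config(n_positions=sequence_length)
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./models/demo",  # illustrative path
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=500,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
    optimizers=(AdamW(model.parameters(), lr=5e-4), None),  # (optimizer, lr_scheduler)
)
trainer.train()

Passing (optimizer, None) leaves learning-rate scheduler creation to the Trainer, which mirrors what the script does with optimizers=(get_optimizer(model, training_args.learning_rate), None).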