Aditi132's picture
Upload 2 files
3f9ad47 verified
import os
import torch
from datasets import load_dataset
from transformers import (
T5ForConditionalGeneration,
T5Tokenizer,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
DataCollatorForSeq2Seq
)
# --- Configuration ---
MODEL_NAME = "t5-small"
OUTPUT_DIR = "./model_output"
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128
# We can increase batch size slightly if using GPU, but monitoring RAM is crucial
BATCH_SIZE = 8
EPOCHS = 3
def main():
# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
print(f"GPU Name: {torch.cuda.get_device_name(0)}")
print(f"Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
else:
print("WARNING: No GPU detected. Training will be slow on CPU.")
print(f"Loading model: {MODEL_NAME}...")
try:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device) # Move model to GPU immediately
except Exception as e:
print(f"Error loading model: {e}")
return
# --- Load Dataset ---
print("Loading 'billsum' dataset...")
# Using 'ca_test' for a quick cycle
dataset = load_dataset("billsum", split="ca_test")
# Let's train on slightly more data now that we have a GPU
# Splitting the 1200 ca_test examples
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"] # Uses ~1000 examples
eval_dataset = dataset["test"] # Uses ~100 examples
print(f"Training on {len(train_dataset)} examples...")
# --- Preprocessing ---
prefix = "summarize: "
def preprocess_function(examples):
inputs = [prefix + doc for doc in examples["text"]]
model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
print("Tokenizing data...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# --- Training Args ---
training_args = Seq2SeqTrainingArguments(
output_dir=OUTPUT_DIR,
eval_strategy="epoch", # ✅ Correct for transformers >= 4.40
learning_rate=2e-5,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=EPOCHS,
predict_with_generate=True,
fp16=(device == "cuda"), # Mixed precision on GPU
dataloader_num_workers=0, # Safe for Windows
logging_steps=10,
)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=tokenized_train,
eval_dataset=tokenized_eval,
tokenizer=tokenizer,
data_collator=data_collator,
)
print("Starting training...")
trainer.train()
print("Saving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")
if __name__ == "__main__":
main()