Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import ( | |
| T5ForConditionalGeneration, | |
| T5Tokenizer, | |
| Seq2SeqTrainingArguments, | |
| Seq2SeqTrainer, | |
| DataCollatorForSeq2Seq | |
| ) | |
# --- Configuration ---
MODEL_NAME = "t5-small"        # HF Hub model id to fine-tune
OUTPUT_DIR = "./model_output"  # where checkpoints and the final model are written
MAX_INPUT_LENGTH = 1024        # max tokens kept from each bill's text (longer inputs truncated)
MAX_TARGET_LENGTH = 128        # max tokens kept from each reference summary
# We can increase batch size slightly if using GPU, but monitoring RAM is crucial
BATCH_SIZE = 8
EPOCHS = 3
def main():
    """Fine-tune t5-small for summarization on the billsum 'ca_test' split.

    Side effects: downloads the model and dataset from the HF Hub, runs
    training (GPU if available), and writes the trained model + tokenizer
    to OUTPUT_DIR. Returns early (after logging) if the model fails to load.
    """
    # Check for GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    if device == "cuda":
        print(f"GPU Name: {torch.cuda.get_device_name(0)}")
        print(f"Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    else:
        print("WARNING: No GPU detected. Training will be slow on CPU.")

    print(f"Loading model: {MODEL_NAME}...")
    try:
        tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
        model.to(device)  # Move model to GPU immediately
    except Exception as e:
        # Boundary handler: a failed download/load is unrecoverable here,
        # so log the cause and bail instead of crashing with a traceback.
        print(f"Error loading model: {e}")
        return

    # --- Load Dataset ---
    print("Loading 'billsum' dataset...")
    # Using 'ca_test' (~1.2k examples) for a quick train/eval cycle.
    dataset = load_dataset("billsum", split="ca_test")
    # FIX: pin the split with a fixed seed so runs are reproducible.
    # Without it, every run trained/evaluated on a different random
    # partition, making eval numbers incomparable across runs.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = dataset["train"]  # ~90% of the split
    eval_dataset = dataset["test"]    # ~10% of the split
    print(f"Training on {len(train_dataset)} examples...")

    # --- Preprocessing ---
    prefix = "summarize: "  # T5 task prefix selecting the summarization head behavior

    def preprocess_function(examples):
        """Tokenize prefixed inputs and summaries; summaries become labels."""
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
        labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing data...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

    # Dynamic per-batch padding; pads labels with -100 so padding is
    # ignored by the cross-entropy loss.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # --- Training Args ---
    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        eval_strategy="epoch",  # renamed from 'evaluation_strategy' in transformers >= 4.40
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        save_total_limit=1,  # keep only the most recent checkpoint on disk
        num_train_epochs=EPOCHS,
        predict_with_generate=True,  # use generate() during eval, not teacher forcing
        fp16=(device == "cuda"),  # mixed precision only on GPU
        dataloader_num_workers=0,  # 0 avoids multiprocessing pickling issues on Windows
        logging_steps=10,
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        # NOTE(review): 'tokenizer=' is deprecated in favor of
        # 'processing_class=' in recent transformers releases; kept here for
        # compatibility with the >=4.40 range this script targets.
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()