# TextGeneration / untitled9.py
# MLCraftsman's picture
# Upload 2 files
# 7c2f77f verified
# -*- coding: utf-8 -*-
"""Untitled9.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG
"""
import torch
import subprocess
import sys

# Report whether a CUDA GPU is visible and which device will be used.
gpu_available = torch.cuda.is_available()
print("GPU Available:", gpu_available)
print("Device:", torch.device("cuda" if gpu_available else "cpu"))

# The notebook used the IPython shell escape `!pip install ...`, which is a
# SyntaxError in a plain .py file. Run the same pip command through the
# interpreter that is executing this script so the packages land in the
# active environment.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "transformers", "datasets", "nltk", "-q"]
)
from datasets import load_dataset

# Download the corpus from the Hugging Face Hub.
ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")

# The original cell wrote `text` to disk without ever defining it (NameError
# on a fresh run) and never used `ds`, so the raw text is assembled from the
# downloaded dataset here.
# NOTE(review): the dataset schema is not visible in this file. This assumes
# the corpus lives in a string column of the first split ("text" if present,
# otherwise the first string-typed column) -- confirm against the dataset card.
first_split = next(iter(ds.values()))
string_columns = [
    name
    for name, feature in first_split.features.items()
    if getattr(feature, "dtype", None) in ("string", "large_string")
]
if not string_columns:
    raise ValueError("No string column found in the downloaded dataset.")
text_column = "text" if "text" in string_columns else string_columns[0]
# Skip missing/empty rows so str.join never sees None.
text = "\n".join(row for row in first_split[text_column] if row)

# Persist the raw corpus to dataset.txt.
with open("dataset.txt", "w", encoding="utf-8") as f:
    f.write(text)
import re

# Read the raw corpus back from disk.
with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Gutenberg header/footer.
# str.find returns -1 when a marker is absent; used directly as a slice bound
# that would keep only the final character (start == -1) or silently drop the
# final character (end == -1). Fall back to the text boundaries instead.
start = text.find("CHAPTER I")
if start == -1:
    start = 0
# Search for the footer only after the chosen start so the slice is never
# inverted (which would yield an empty string).
end = text.find("End of the Project Gutenberg", start)
if end == -1:
    end = len(text)
text = text[start:end]

# Basic cleaning: collapse runs of newlines into one and lowercase everything.
text = re.sub(r'\n+', '\n', text)
text = text.lower()

with open("clean_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Cleaned text length:", len(text))
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the cleaned corpus with the "text" loader as a single "train" split.
text_files = {"train": "clean_text.txt"}
dataset = load_dataset("text", data_files=text_files)
print(dataset)

# Load the GPT-2 tokenizer and pad with its end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of *examples* with the module-level tokenizer.

    Sequences are truncated to, and padded up to, ``max_length=128``.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **encoding_options)
from transformers import AutoModelForCausalLM

# Tokenize in batches and drop the raw "text" column.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Split the dataset into training and evaluation sets (10% held out).
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

# Load the pretrained GPT-2 causal language model.
model = AutoModelForCausalLM.from_pretrained("gpt2")
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Training configuration; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # increase to 3 for better results
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # fp16 mixed precision requires a CUDA device; an unconditional
    # fp16=True makes TrainingArguments fail on CPU-only machines, so
    # enable it only when a GPU is actually available.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects causal (not masked) language-modeling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Expose only the model inputs, as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
# Sanity check: every training example's input_ids should have the expected length.
expected_length = 128
indexed_lengths = (
    (position, len(sample["input_ids"]))
    for position, sample in enumerate(tokenized_dataset["train"])
)
inconsistent_lengths = [
    (position, size)
    for position, size in indexed_lengths
    if size != expected_length
]

if not inconsistent_lengths:
    print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
else:
    print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
    # Only the first 10 offenders are listed to keep the output short.
    for idx, length in inconsistent_lengths[:10]:
        print(f" Example index {idx}: length {length}")

# Also show the remaining columns so unexpected ones are easy to spot.
print("\nFeatures in tokenized_dataset['train']:")
print(tokenized_dataset["train"].features)
# Fine-tune the model.
trainer.train()

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training leaves the model in training mode (dropout active); switch to
# evaluation mode so dropout does not perturb the sampled text.
model.eval()

prompt = "alice was feeling"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sample a continuation of the prompt (top-k / nucleus sampling).
output = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    # Pass the padding id explicitly instead of relying on generate()'s
    # implicit fallback to the end-of-sequence token.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
import math

# Run the trainer's evaluation and report perplexity, computed as the
# exponential of the reported evaluation loss.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)