# -*- coding: utf-8 -*-
"""Untitled9.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG

Fine-tunes GPT-2 as a causal language model on Project Gutenberg text,
samples a continuation for a fixed prompt and reports the perplexity on a
held-out split.

Notebook setup step (shell magic, not valid Python in a script):
    !pip install transformers datasets nltk -q
"""

import math
import re
from typing import Any, Iterable, List, Mapping, Optional, Tuple

HUB_DATASET = "Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction"
RAW_TEXT_PATH = "dataset.txt"
CLEAN_TEXT_PATH = "clean_text.txt"
MODEL_NAME = "gpt2"
MAX_LENGTH = 128
SPLIT_SEED = 42


def _pick_text_column(row: Mapping[str, Any]) -> Optional[str]:
    """Return the key of *row* to use as text, or None if it has no string value."""
    if isinstance(row.get("text"), str):
        return "text"
    return next((key for key, value in row.items() if isinstance(value, str)), None)


def join_text_rows(
    splits: Mapping[str, Iterable[Mapping[str, Any]]],
    column: Optional[str] = None,
) -> str:
    """Concatenate one string column from every row of every split.

    Args:
        splits: Mapping of split name to an iterable of row dictionaries.
        column: Column to read. When omitted, a column named ``"text"`` is
            preferred, otherwise the first string-valued column is used.

    Returns:
        The selected values joined with newlines.

    Raises:
        ValueError: If no string value could be collected.
    """
    # NOTE(review): the hub dataset's column layout is not visible from this
    # script, hence the auto-detection -- TODO confirm the chosen column.
    pieces: List[str] = []
    for rows in splits.values():
        for row in rows:
            if column is None:
                column = _pick_text_column(row)
            value = row.get(column) if column is not None else None
            if isinstance(value, str):
                pieces.append(value)
    if not pieces:
        raise ValueError("no string column found in the dataset to build the corpus from")
    return "\n".join(pieces)


def clean_gutenberg_text(
    text: str,
    start_marker: str = "CHAPTER I",
    end_marker: str = "End of the Project Gutenberg",
) -> str:
    """Strip the Gutenberg header/footer, collapse blank lines and lowercase.

    A marker that is absent leaves the corresponding end of the text
    untouched. ``str.find`` returns -1 in that case, which must not be used
    as a slice bound: it would empty the text (missing start) or drop the
    last character (missing end).
    """
    start = text.find(start_marker)
    if start == -1:
        start = 0
    end = text.find(end_marker, start)
    if end == -1:
        end = len(text)
    body = text[start:end]
    body = re.sub(r'\n+', '\n', body)
    return body.lower()


def find_length_mismatches(
    examples: Iterable[Mapping[str, Any]],
    expected_length: int,
) -> List[Tuple[int, int]]:
    """Return ``(index, length)`` for each example whose ``input_ids`` length differs."""
    return [
        (index, len(example["input_ids"]))
        for index, example in enumerate(examples)
        if len(example["input_ids"]) != expected_length
    ]


def main() -> None:
    """Run the pipeline: build corpus, tokenize, train, generate, evaluate."""
    # Heavy ML imports are local so the pure helpers above stay importable
    # without the ML stack installed.
    import torch
    from datasets import load_dataset
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        DataCollatorForLanguageModeling,
        Trainer,
        TrainingArguments,
    )

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("GPU Available:", use_cuda)
    print("Device:", device)

    # --- Build the raw corpus -------------------------------------------
    ds = load_dataset(HUB_DATASET)
    text = join_text_rows(ds)
    with open(RAW_TEXT_PATH, "w", encoding="utf-8") as f:
        f.write(text)

    # Reading the file back is deliberate: text-mode reading converts
    # "\r\n" to "\n", so the blank-line collapsing below also works on
    # CRLF-terminated source text.
    with open(RAW_TEXT_PATH, "r", encoding="utf-8") as f:
        text = f.read()

    # Remove Gutenberg header/footer + basic cleaning
    text = clean_gutenberg_text(text)
    with open(CLEAN_TEXT_PATH, "w", encoding="utf-8") as f:
        f.write(text)
    print("Cleaned text length:", len(text))

    # --- Tokenize ---------------------------------------------------------
    dataset = load_dataset("text", data_files={"train": CLEAN_TEXT_PATH})
    print(dataset)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",
        )

    # Whitespace-only lines would become all-padding examples; the collator
    # masks padding out of the labels, so a batch made only of such rows has
    # no target tokens and an undefined loss.
    dataset = dataset.filter(lambda example: bool(example["text"].strip()))
    tokenized_dataset = dataset.map(
        tokenize_function, batched=True, remove_columns=["text"]
    )

    # Split the dataset into training and evaluation sets (seeded so the
    # reported perplexity is reproducible between runs).
    tokenized_dataset = tokenized_dataset["train"].train_test_split(
        test_size=0.1, seed=SPLIT_SEED
    )
    train_dataset = tokenized_dataset["train"]
    eval_dataset = tokenized_dataset["test"]

    # --- Model and trainer ------------------------------------------------
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,  # increase to 3 for better results
        per_device_train_batch_size=2,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        fp16=use_cuda,  # mixed precision only when a GPU is actually present
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
    eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    # Verify the lengths of input_ids in the tokenized_dataset
    expected_length = MAX_LENGTH
    inconsistent_lengths = find_length_mismatches(
        tokenized_dataset["train"], expected_length
    )
    if inconsistent_lengths:
        print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
        for idx, length in inconsistent_lengths[:10]:  # Print first 10 inconsistent examples
            print(f" Example index {idx}: length {length}")
    else:
        print(f"All input_ids in the training dataset have the expected length of {expected_length}.")

    # Also check for unexpected columns
    print("\nFeatures in tokenized_dataset['train']:")
    print(tokenized_dataset["train"].features)

    trainer.train()

    # --- Sample from the fine-tuned model --------------------------------
    model.to(device)
    model.eval()  # turn dropout off before sampling
    prompt = "alice was feeling"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **inputs,
        max_length=100,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # explicit, since pad == eos here
    )
    print(tokenizer.decode(output[0], skip_special_tokens=True))

    # --- Evaluate -----------------------------------------------------------
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results["eval_loss"])
    print("Perplexity:", perplexity)


if __name__ == "__main__":
    main()