Spaces:
Runtime error
Runtime error
File size: 3,861 Bytes
# -*- coding: utf-8 -*-
"""Untitled9.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG
"""
import torch
# Report whether a CUDA device is visible to PyTorch.
print("GPU Available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# NOTE: the notebook cell `!pip install transformers datasets nltk -q` is
# IPython shell magic and is a SyntaxError in a plain .py file, so it is not
# executed here. Install these packages before running the script instead,
# e.g. `pip install transformers datasets nltk` or via requirements.txt.
from datasets import load_dataset

# Download the corpus from the Hugging Face Hub.
ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")

# The original script wrote `text` to disk without ever defining it
# (NameError). Build it from the downloaded dataset instead.
# NOTE(review): the dataset schema is not visible here. This assumes the
# corpus lives in a string column (preferably named "text") of the "train"
# split, falling back to the first split - confirm against the dataset card.
split_name = "train" if "train" in ds else next(iter(ds))
raw_split = ds[split_name]
string_columns = [
    name
    for name, feature in raw_split.features.items()
    if getattr(feature, "dtype", None) in ("string", "large_string")
]
if not string_columns:
    raise ValueError(
        f"No string column found in split {split_name!r}: {raw_split.column_names}"
    )
text_column = "text" if "text" in string_columns else string_columns[0]
text = "\n".join(row for row in raw_split[text_column] if row is not None)

# Persist the raw text so the cleaning step can read it back from disk.
with open("dataset.txt", "w", encoding="utf-8") as f:
    f.write(text)
import re

# Read the raw corpus dumped to disk earlier in the script.
with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Gutenberg header/footer.
# str.find returns -1 when a marker is missing. Used directly as slice
# bounds, that would silently keep only the tail of the text (start == -1)
# or drop its last character (end == -1), so fall back to the full extent.
start = text.find("CHAPTER I")
if start == -1:
    start = 0
# Search for the footer only after the chosen start so the slice is never
# inverted by an earlier occurrence of the marker.
end = text.find("End of the Project Gutenberg", start)
if end == -1:
    end = len(text)
text = text[start:end]

# Basic cleaning: collapse runs of newlines into one and lowercase everything.
text = re.sub(r"\n+", "\n", text)
text = text.lower()

with open("clean_text.txt", "w", encoding="utf-8") as f:
    f.write(text)
print("Cleaned text length:", len(text))
from datasets import load_dataset

# Load clean_text.txt as the "train" split using the "text" loader.
dataset = load_dataset(
    "text",
    data_files=dict(train="clean_text.txt"),
)
print(dataset)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token so the
# padding="max_length" request in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of rows into fixed-length sequences.

    Args:
        examples: Batch mapping with a "text" entry, as supplied by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated and padded
        to 128 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)


# Tokenize every row and drop the raw "text" column from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
# Hold out 10% of the tokenized rows for evaluation; the rest is used for training.
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]
from transformers import AutoModelForCausalLM

# Start from the pretrained GPT-2 checkpoint.
model = AutoModelForCausalLM.from_pretrained("gpt2")

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # increase to 3 for better results
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Mixed-precision training needs a GPU. Hard-coding True makes
    # TrainingArguments fail on CPU-only hosts, which this script otherwise
    # tolerates (it falls back to "cpu" elsewhere), so enable it only when
    # CUDA is actually available.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects causal (next-token) language-modeling batches rather
# than masked-language-modeling ones.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Return only the model inputs, as torch tensors, from both splits.
for split in (train_dataset, eval_dataset):
    split.set_format("torch", columns=["input_ids", "attention_mask"])

# Wire the model, arguments, data splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Sanity check before training: every training example should carry exactly
# `expected_length` input ids (the max_length used during tokenization).
expected_length = 128
inconsistent_lengths = [
    (i, len(example["input_ids"]))
    for i, example in enumerate(tokenized_dataset["train"])
    if len(example["input_ids"]) != expected_length
]

if not inconsistent_lengths:
    print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
else:
    print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
    # Only the first 10 offenders are listed to keep the output short.
    for idx, length in inconsistent_lengths[:10]:
        print(f" Example index {idx}: length {length}")

# Also check for unexpected columns
print("\nFeatures in tokenized_dataset['train']:")
print(tokenized_dataset["train"].features)

# Run the fine-tuning loop.
trainer.train()
import torch

# Move the fine-tuned model to the GPU when one is available, else stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# generate() does not change the module's train/eval mode. Switch to eval
# explicitly so dropout is disabled while sampling after training.
model.eval()

prompt = "alice was feeling"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    # Pass the padding id explicitly instead of relying on generate()'s
    # implicit fallback to eos.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
import math

# Evaluate on the held-out split and report exp(eval_loss) as perplexity.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)
|