File size: 3,861 Bytes
7c2f77f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
"""Untitled9.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG
"""

import subprocess
import sys

import torch

# Report whether a CUDA device is visible before doing any heavy work.
print("GPU Available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# The notebook used the IPython shell magic `!pip install ...`, which is a
# SyntaxError in a plain .py file. Calling pip through the current interpreter
# works both inside a notebook and when this file is run as a script.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "transformers", "datasets", "nltk", "-q"]
)

from datasets import load_dataset

# Download the corpus from the Hugging Face Hub (returns one entry per split).
ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")

# The original cell wrote `text` before it was ever assigned, which raises
# NameError on a fresh top-to-bottom run. Build the raw text from the dataset.
# NOTE(review): assumes the corpus lives in a single string column of the
# first split ("text" if present, otherwise the first column) - confirm
# against the dataset card.
first_split = next(iter(ds.values()))
text_column = (
    "text" if "text" in first_split.column_names else first_split.column_names[0]
)
text = "\n".join(str(value) for value in first_split[text_column])

# Persist the raw corpus so the cleaning step can start from a file on disk.
with open("dataset.txt", "w", encoding="utf-8") as f:
    f.write(text)

import re

# Load the raw corpus written by the previous step.
with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Gutenberg header/footer.
# str.find returns -1 when a marker is absent; using -1 directly as a slice
# bound would silently drop almost all of the text (or the last character),
# so fall back to the corresponding end of the document instead.
start = max(text.find("CHAPTER I"), 0)
# Look for the footer only after the start marker so the slice can never be
# inverted by an earlier occurrence of the footer phrase.
end = text.find("End of the Project Gutenberg", start)
if end == -1:
    end = len(text)
text = text[start:end]

# Basic cleaning: collapse runs of newlines into one and lowercase everything.
text = re.sub(r'\n+', '\n', text)
text = text.lower()

# Save the cleaned corpus; it is reloaded later as a line-based text dataset.
with open("clean_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Cleaned text length:", len(text))

from datasets import load_dataset
from transformers import AutoTokenizer

# Reload the cleaned corpus through the generic "text" loader, exposing the
# file as the "train" split.
dataset = load_dataset(
    "text",
    data_files=dict(train="clean_text.txt"),
)
print(dataset)

# GPT-2 tokenizer; the end-of-sequence token is reused as the padding token
# so that padded batches can be produced.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens and padded up to 128, so
    every returned sequence has the same length.
    """
    fixed_length_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **fixed_length_options)

# Tokenize the corpus in batches (dropping the raw "text" column), then carve
# 10% of the "train" split off as an evaluation set.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True, remove_columns=["text"])["train"]
    .train_test_split(test_size=0.1)
)

train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

import torch
from transformers import AutoModelForCausalLM

# Start from the pretrained GPT-2 weights.
model = AutoModelForCausalLM.from_pretrained("gpt2")

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,          # increase to 3 for better results
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision needs a GPU. The rest of this script falls back to CPU
    # when CUDA is missing, so only request fp16 when CUDA is actually present
    # instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Expose only the model inputs, as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Sanity check before training: every tokenized training example should carry
# exactly `expected_length` input ids.
expected_length = 128

inconsistent_lengths = [
    (i, len(example["input_ids"]))
    for i, example in enumerate(tokenized_dataset["train"])
    if len(example["input_ids"]) != expected_length
]

if not inconsistent_lengths:
    print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
else:
    print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
    # Only show the first 10 offenders to keep the output readable.
    for idx, length in inconsistent_lengths[:10]:
        print(f"  Example index {idx}: length {length}")

# Show the remaining columns so unexpected ones are easy to spot.
print("\nFeatures in tokenized_dataset['train']:")
print(tokenized_dataset["train"].features)

# Fine-tune GPT-2 on the cleaned corpus.
trainer.train()

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The model has just been trained; put it in evaluation mode explicitly so
# dropout cannot perturb the sampled continuation.
model.eval()

# The corpus was lowercased during cleaning, so the prompt is lowercase too.
prompt = "alice was feeling"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    # GPT-2 ships without a pad token; pass the EOS id explicitly (the same
    # token the tokenizer was configured to pad with) rather than relying on
    # generate()'s implicit fallback.
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

import math

# Evaluate on the held-out split and report perplexity, computed as the
# exponential of the evaluation loss returned by the trainer.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)