Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """Untitled9.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG | |
| """ | |
import subprocess
import sys

import torch

# Report whether CUDA is usable and which device string the availability check selects.
print("GPU Available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# `!pip install ...` is IPython shell syntax and a SyntaxError in a plain .py
# file, so run pip through the current interpreter instead. check=False keeps
# the notebook's best-effort behaviour: a failed install does not raise here.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "transformers", "datasets", "nltk", "-q"],
    check=False,
)
from datasets import load_dataset

ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")

# The original wrote an undefined name `text` to disk (NameError) and never
# used `ds`. Build the corpus from the loaded dataset instead.
# NOTE(review): assumes every split keeps its prose in a string column,
# preferring one named "text" -- confirm against the dataset card.
pieces = []
for split_name, split in ds.items():
    if "text" in split.column_names:
        column = "text"
    else:
        # Fall back to the first column whose declared feature type is a string.
        column = next(
            (
                name
                for name, feature in split.features.items()
                if getattr(feature, "dtype", None) in ("string", "large_string")
            ),
            None,
        )
    if column is None:
        raise ValueError(
            f"No string column found in split {split_name!r}: {split.column_names}"
        )
    pieces.extend(str(value) for value in split[column])
text = "\n".join(pieces)

# Persist the raw corpus so the cleaning step can re-read it from disk.
with open("dataset.txt", "w", encoding="utf-8") as f:
    f.write(text)
import re

# Re-read the raw corpus written by the previous step.
with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Gutenberg header/footer.
# str.find returns -1 when a marker is absent; slicing with -1 would silently
# drop almost all of the text, so fall back to the start/end of the corpus.
start = text.find("CHAPTER I")
if start == -1:
    start = 0
# Search for the footer only after the chosen start so the slice is never inverted.
end = text.find("End of the Project Gutenberg", start)
if end == -1:
    end = len(text)
text = text[start:end]

# Basic cleaning: collapse runs of newlines into one and lowercase everything.
text = re.sub(r'\n+', '\n', text)
text = text.lower()

with open("clean_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Cleaned text length:", len(text))
from datasets import load_dataset

# Load the cleaned corpus through the generic "text" builder, exposing the
# file as the "train" split.
text_files = {"train": "clean_text.txt"}
dataset = load_dataset("text", data_files=text_files)
print(dataset)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding so padding="max_length" below
# has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of examples' "text" field to exactly 128 tokens.

    Longer entries are truncated and shorter ones padded to ``max_length``.
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )


# Tokenize every split in batches, drop the raw "text" column, then hold out
# 10% of the "train" split for evaluation.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)["train"].train_test_split(test_size=0.1)
train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # increase to 3 for better results
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision only when CUDA is present: the script already falls back
    # to CPU elsewhere, and an unconditional fp16=True is rejected there.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects the causal (next-token) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Expose only the model inputs as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
# Sanity check: every training example should carry exactly `expected_length`
# input ids. Collect (index, actual_length) for any that do not.
expected_length = 128
inconsistent_lengths = [
    (position, len(record["input_ids"]))
    for position, record in enumerate(tokenized_dataset["train"])
    if len(record["input_ids"]) != expected_length
]

if not inconsistent_lengths:
    print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
else:
    print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
    # Only the first 10 offenders are listed to keep the output short.
    for idx, length in inconsistent_lengths[:10]:
        print(f"  Example index {idx}: length {length}")

# Also show the column schema so unexpected columns are easy to spot.
print("\nFeatures in tokenized_dataset['train']:")
print(tokenized_dataset["train"].features)
# Run fine-tuning with the Trainer built earlier in the script.
trainer.train()

import torch

# Put the model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Encode the prompt and move its tensors to the same device as the model.
prompt = "alice was feeling"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sampling settings passed to generate(); max_length caps the returned sequence at 100.
sampling_options = {
    "max_length": 100,
    "temperature": 0.8,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
}
output = model.generate(**inputs, **sampling_options)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
import math

# Perplexity is the exponential of the evaluation loss reported by the Trainer.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)