# NOTE(review): removed notebook-export artifacts ("Spaces:" / "Configuration error") — converter residue, not code.
# -*- coding: utf-8 -*-
"""LLM.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AbtqagXl-cWKhXqd5z_uxZ2_JiCI4e6q
"""
# Step 1: Setup
# The pip installs below were run once in the Colab session and are kept
# commented out for reference.
# pip install transformers
# pip install datasets
# !pip install transformers[torch]
# !pip install transformers --upgrade
# !pip install accelerate --upgrade
import transformers
import accelerate

# Report the installed library versions so runs are reproducible.
for _label, _version in (
    ("Transformers version:", transformers.__version__),
    ("Accelerate version:", accelerate.__version__),
):
    print(_label, _version)
# Step 2: Mount Google Drive to access your data
# Colab-only helper: opens an interactive OAuth prompt and mounts the user's
# Drive at /content/drive so the training text file can be read below.
from google.colab import drive
drive.mount('/content/drive')
# Step 2: Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import set_caching_enabled

# Disable on-disk caching so re-runs always re-read the source file.
# NOTE(review): set_caching_enabled is deprecated in newer `datasets`
# releases in favour of datasets.disable_caching(); kept here because it
# still works and avoids breaking older installed versions.
set_caching_enabled(False)

# Load the raw text file as a single 'train' split (one example per line).
# Replace the path below with the actual path of your text file in Google Drive.
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/Ethics.txt'})

# Deterministic 90/10 train/validation split — no shuffling: the first 90%
# of lines become the training set, the remainder the validation set.
# (select accepts any iterable of indices; no need to materialize a list.)
train_size = int(len(dataset['train']) * 0.9)
train_dataset = dataset['train'].select(range(train_size))
validation_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(validation_dataset))
# Step 4: Tokenization
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse EOS so batched padding and
# collation downstream can work at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of raw text lines.

    Args:
        examples: batch dict with a "text" key holding a list of strings.

    Returns:
        Dict with "input_ids"/"attention_mask" lists. BUG FIX: truncation
        is enabled so lines longer than the model's maximum context
        (1024 tokens for GPT-2) cannot crash training.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize the whole DatasetDict; num_proc=1 avoids multiprocessing issues
# in the Colab environment.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
)
# !pip install gpt-2-simple
# NOTE(review): gpt_2_simple is imported but never used anywhere below — the
# fine-tuning is done entirely with transformers' GPT2LMHeadModel. This import
# is a candidate for removal.
import gpt_2_simple as gpt2
# !pip install accelerate>=0.20.1
# !pip install accelerate -U
# Step 5: Model Preparation
# Load the pretrained GPT-2 (124M) checkpoint with its language-modelling head.
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Step 6: Training
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# GPT-2 has no pad token; the collator needs one to pad variable-length batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collator (mlm=False): pads each batch and sets labels = input_ids
# so the Trainer can compute the language-modelling loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# BUG FIX: the original passed the *raw text* splits (train_dataset /
# validation_dataset) to Trainer — which cannot consume untokenized strings —
# and never called trainer.train(), so the "fine_tuned_model" saved later was
# just the unmodified pretrained checkpoint. Use the tokenized dataset, split
# at the same deterministic 90/10 boundary, and actually run training.
tokenized_train = tokenized_dataset['train'].select(range(train_size))
tokenized_validation = tokenized_dataset['train'].select(
    range(train_size, len(tokenized_dataset['train']))
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
)
trainer.train()
# Step 9: Save the Model
# BUG FIX: persist the tokenizer alongside the weights — a checkpoint saved
# without its tokenizer cannot be reloaded for generation via from_pretrained().
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
# !pip install xformers
# # Step 7: Testing — earlier pipeline attempt, kept for reference.
# from transformers import pipeline
# generator = pipeline('text-generation', model=model)
# result = generator('My custom model says,')[0]
# print(result['generated_text'])

# Step 7: Testing
from transformers import pipeline, GPT2Tokenizer

# Rebuild the GPT-2 tokenizer so the pipeline can encode the prompt and
# decode the sampled token ids back to text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Wire model + tokenizer into a text-generation pipeline, then sample one
# continuation of the prompt and print it.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
outputs = generator('My custom model says,')
result = outputs[0]
print(result['generated_text'])