# -*- coding: utf-8 -*-
"""LLM.ipynb — fine-tune GPT-2 on a custom text corpus (Colab export).

Original file:
https://colab.research.google.com/drive/1AbtqagXl-cWKhXqd5z_uxZ2_JiCI4e6q
"""

# Step 1: Setup. Run once in a fresh Colab runtime:
# !pip install "transformers[torch]" datasets accelerate --upgrade
# (Removed unused imports of BertTokenizer/BertForSequenceClassification and
#  gpt_2_simple: none of them were referenced, and gpt_2_simple's install line
#  was commented out, so the bare import would crash a fresh runtime.)
import transformers
import accelerate

print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)

# Step 2: Mount Google Drive so the training text file is readable.
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Load the raw text corpus from Drive.
from datasets import load_dataset, disable_caching

# NOTE: set_caching_enabled(False) is deprecated/removed in recent `datasets`
# releases; disable_caching() is the supported replacement.
disable_caching()

# Replace the path below with the actual path of your text file in Google Drive.
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/Ethics.txt'})

# Split the single 'train' split into 90% train / 10% validation.
train_size = int(len(dataset['train']) * 0.9)
train_dataset = dataset['train'].select(range(train_size))
validation_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))

print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(validation_dataset))

# Step 4: Tokenization with the GPT-2 byte-pair tokenizer.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a pad token; reuse EOS so batched padding works downstream.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of {'text': [...]} rows.

    truncation=True caps every example at GPT-2's 1024-token context window,
    preventing over-long lines from crashing the model during training.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize the whole DatasetDict (single worker keeps Colab memory use low).
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
)

# Step 5: Model preparation — import the causal-LM head for GPT-2.
from transformers import GPT2LMHeadModel
# Step 5 (cont.): Load the pretrained GPT-2 causal language model.
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline,
)

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenizer serves both the data collator (padding) and generation below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no native pad token


def _tokenize(examples):
    """Tokenize a batch, truncating to GPT-2's 1024-token context window."""
    return tokenizer(examples["text"], truncation=True)


# BUG FIX: the original handed the *raw text* datasets straight to Trainer,
# which cannot train on an un-tokenized "text" column. Tokenize the splits
# (and drop the text column so only model inputs remain).
tokenized_train = train_dataset.map(_tokenize, batched=True, remove_columns=["text"])
tokenized_validation = validation_dataset.map(_tokenize, batched=True, remove_columns=["text"])

# Step 6: Training.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# mlm=False selects the causal-LM objective: the collator pads each batch and
# copies input_ids into labels. Without labels, Trainer raises at the first
# step because the model returns no loss — the original omitted this.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
)

# BUG FIX: the original never invoked train(), so the "fine_tuned_model"
# directory it saved contained the *unmodified* pretrained checkpoint.
trainer.train()

# Step 7: Save the fine-tuned weights. Saving the tokenizer alongside lets
# the directory be reloaded standalone with from_pretrained().
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

# Step 8: Testing — quick generation sanity check on the fine-tuned model.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = generator('My custom model says,')[0]
print(result['generated_text'])