# demo-app / app.py — scraped Hugging Face Space header (author: SoniR, commit 0b0afda)
# -*- coding: utf-8 -*-
"""LLM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1AbtqagXl-cWKhXqd5z_uxZ2_JiCI4e6q
"""
# Step 1: Setup
# Notebook-style dependency installs, kept for reference (run them manually in Colab):
# pip install transformers
# pip install datasets
# !pip install transformers[torch]
# !pip install transformers --upgrade
# !pip install accelerate --upgrade
import transformers
import accelerate
# Print library versions so a Colab run is reproducible/debuggable after the fact.
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)
# Step 2: Mount Google Drive to access your data
# NOTE(review): google.colab only exists inside a Colab runtime — this import
# (and the interactive mount prompt) will fail anywhere else.
from google.colab import drive
drive.mount('/content/drive')
# Step 2: Import necessary libraries
# NOTE(review): torch and the Bert* imports are unused by this script; they are
# kept only because other parts of the project may rely on them being imported.
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import set_caching_enabled

# Disable dataset caching so edits to the source text file are always re-read.
# NOTE(review): set_caching_enabled is deprecated in newer `datasets` releases;
# switch to datasets.disable_caching() once the pinned version supports it.
set_caching_enabled(False)

# Load the raw text file as a single 'train' split (one example per line).
# Replace 'path_to_your_text_file.txt' with the actual path of your text file in Google Drive
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/Ethics.txt'})

# Deterministic 90/10 train/validation split of the 'train' split.
# select() accepts a range directly — no need to materialize list(range(...)).
train_size = int(len(dataset['train']) * 0.9)
train_dataset = dataset['train'].select(range(train_size))
validation_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(validation_dataset))
# Step 4: Tokenization
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# FIX: GPT-2 ships without a padding token; reuse EOS so batched padding
# (needed later by the Trainer/collator) does not raise.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of raw-text examples.

    FIX: truncation=True caps sequences at the model's 1024-token context
    window — untruncated long lines would crash GPT-2 at training time.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize the dataset with a reduced number of workers
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1  # single worker: avoids multiprocessing flakiness in Colab
)
# !pip install gpt-2-simple
# NOTE(review): gpt_2_simple is imported but never used anywhere in this
# script — it (and its install line above) can likely be removed.
import gpt_2_simple as gpt2
# !pip install accelerate>=0.20.1
# !pip install accelerate -U
# Step 5: Model Preparation
# Load the pretrained GPT-2 causal-language-model head to fine-tune.
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Step 6: Training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# BUG FIX: the original passed the *raw text* splits to Trainer and never
# called trainer.train(), so the "fine_tuned_model" saved below was actually
# the unmodified pretrained GPT-2. Build the splits from tokenized_dataset
# (same deterministic 90/10 split) and actually run training.
train_size = int(len(tokenized_dataset['train']) * 0.9)
tokenized_train = tokenized_dataset['train'].select(range(train_size))
tokenized_validation = tokenized_dataset['train'].select(
    range(train_size, len(tokenized_dataset['train']))
)

# GPT-2 has no pad token by default; the collator needs one to pad batches.
# (Idempotent — harmless if the tokenization step already set it.)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collator: pads each batch and copies input_ids into labels
# (mlm=False) so the Trainer can compute the language-modeling loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
)
trainer.train()

# Step 9: Save the Model (and the tokenizer, so generation can reload both)
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
# Step 7: Testing — sample a continuation from the in-memory model.
from transformers import pipeline, GPT2Tokenizer

# Pair the model with a freshly loaded GPT-2 tokenizer for the pipeline.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # You might need to adjust the model name
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Run generation on a fixed prompt and print the first candidate.
outputs = generator('My custom model says,')
result = outputs[0]
print(result['generated_text'])