# -*- coding: utf-8 -*-
"""TP2_NLP_Lennon_Chaves.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1ggDnqgrV0zUdbiI1exZQEjDT6ihRGlLY
"""
# Environment setup
# Google Colaboratory configuration and installation of the required libraries
#!pip install torch transformers requests
#!pip install accelerate -U
#!pip install datasets
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, PreTrainedModel, PretrainedConfig
from torch.utils.data import Dataset, DataLoader
import requests
from datasets import load_dataset
# Data collection and preprocessing
# Using the TinyShakespeare dataset
dataset = load_dataset('tiny_shakespeare')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# Tokenization and data cleaning
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids'])
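# Optional sanity check: inspect one tokenized example's shape and decode its
# first tokens to confirm the preprocessing produced what we expect.
sample = tokenized_datasets['train'][0]['input_ids']
print(sample.shape, tokenizer.decode(sample[:20]))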
"""**Configuração da Arquitetura do Modelo LLaMA 1**
Vamos implementar os componentes principais da arquitetura LLaMA 1: RMSNorm, SwiGLU e Rotary Embeddings. Em seguida, definiremos a rede neural completa usando PyTorch.
"""
# Configuration and model definition
class LLaMAConfig(PretrainedConfig):
    model_type = "llama"

    def __init__(self, vocab_size=50257, d_model=128, num_heads=4, num_layers=2, **kwargs):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        super().__init__(**kwargs)
class LLaMAModel(PreTrainedModel):
    config_class = LLaMAConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        # batch_first=True so the layers consume (batch, seq_len, d_model) tensors
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(config.d_model, config.num_heads, batch_first=True)
            for _ in range(config.num_layers)
        ])
        self.norm = RMSNorm(config.d_model)
        self.swiglu = SwiGLU(config.d_model)
        self.rotary_emb = RotaryEmbeddings(config.d_model)
        self.fc = nn.Linear(config.d_model, config.vocab_size)
        self.init_weights()

    def forward(self, x):
        x = self.embedding(x)
        # Causal mask so each position attends only to earlier positions,
        # as required for autoregressive language modeling
        mask = nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)
        for layer in self.layers:
            x = layer(x, src_mask=mask)
        x = self.norm(x)
        x = self.swiglu(x)
        x = self.rotary_emb(x)
        x = self.fc(x)
        return x
class RMSNorm(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(d))

    def forward(self, x):
        # Normalize by the root mean square over the feature dimension
        rms = x.pow(2).mean(dim=-1, keepdim=True).sqrt()
        return self.scale * x / (rms + 1e-6)
class SwiGLU(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.linear1 = nn.Linear(d, d)
        self.linear2 = nn.Linear(d, d)
        self.silu = nn.SiLU()

    def forward(self, x):
        # Gated activation: a value branch modulated by a SiLU gate
        return self.linear1(x) * self.silu(self.linear2(x))
class RotaryEmbeddings(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.d = d

    def forward(self, x):
        # Simplified rotary encoding: rotate feature pairs by a position-
        # dependent angle (LLaMA applies this to Q/K inside attention)
        half = self.d // 2
        pos = torch.arange(x.size(1), device=x.device, dtype=x.dtype)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, half, device=x.device, dtype=x.dtype) / half))
        angles = pos[:, None] * inv_freq[None, :]
        cos, sin = angles.cos(), angles.sin()
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
config = LLaMAConfig(vocab_size=tokenizer.vocab_size, d_model=128, num_heads=4, num_layers=2)
model = LLaMAModel(config)
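# Optional sanity check: report the model's total parameter count before training.
num_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {num_params:,}")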
# Model training
# Hyperparameter settings
learning_rate = 5e-5
batch_size = 32
num_epochs = 100
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
# Training function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Next-token prediction: logits at position t are scored against token t+1
        logits = outputs[:, :-1, :].contiguous()
        targets = inputs[:, 1:].contiguous()
        loss = criterion(logits.view(-1, model.config.vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
# DataLoader for the training set
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True)
# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(num_epochs):
    loss = train(model, train_dataloader, optimizer, criterion, device)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}")
# Model evaluation
# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            outputs = model(inputs)
            # Same shifted next-token objective as in training
            logits = outputs[:, :-1, :].contiguous()
            targets = inputs[:, 1:].contiguous()
            loss = criterion(logits.view(-1, model.config.vocab_size), targets.view(-1))
            total_loss += loss.item()
    return total_loss / len(dataloader)
# DataLoader for evaluation
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size)
# Evaluation
eval_loss = evaluate(model, eval_dataloader, criterion, device)
perplexity = torch.exp(torch.tensor(eval_loss))
print(f"Validation Loss: {eval_loss:.4f}, Perplexity: {perplexity:.2f}")