"""TP2_NLP_Lennon_Chaves.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ggDnqgrV0zUdbiI1exZQEjDT6ihRGlLY
"""
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, PreTrainedModel, PretrainedConfig
from datasets import load_dataset

# may require trust_remote_code=True on newer versions of the datasets library
dataset = load_dataset('tiny_shakespeare')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse EOS

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids'])
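# Note: tiny_shakespeare ships the whole corpus as a single example per split,
# so the truncation above keeps only the first 512 tokens. A minimal sketch of
# an alternative (a hypothetical helper, not used below): split the full token
# stream into fixed-length blocks so all of the text contributes to training.
def chunk_function(examples, block_size=512):
    ids = tokenizer(examples['text'][0])['input_ids']  # tokenize the full text once
    return {'input_ids': [ids[i:i + block_size]
                          for i in range(0, len(ids) - block_size + 1, block_size)]}

# chunked_datasets = dataset.map(chunk_function, batched=True, remove_columns=['text'])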
| """**Configuração da Arquitetura do Modelo LLaMA 1** |
| |
| Vamos implementar os componentes principais da arquitetura LLaMA 1: RMSNorm, SwiGLU e Rotary Embeddings. Em seguida, definiremos a rede neural completa usando PyTorch. |
| """ |
class LLaMAConfig(PretrainedConfig):
    model_type = "llama"

    def __init__(self, vocab_size=50257, d_model=128, num_heads=4, num_layers=2, **kwargs):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        super().__init__(**kwargs)
class LLaMAModel(PreTrainedModel):
    config_class = LLaMAConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(config.d_model, config.num_heads, batch_first=True)
            for _ in range(config.num_layers)
        ])
        self.norm = RMSNorm(config.d_model)
        self.swiglu = SwiGLU(config.d_model)
        self.rotary_emb = RotaryEmbeddings(config.d_model)
        self.fc = nn.Linear(config.d_model, config.vocab_size)
        self.init_weights()

    def forward(self, x):
        x = self.embedding(x)      # (batch, seq, d_model)
        x = self.rotary_emb(x)     # inject position information before attention
        # causal mask so each position can only attend to earlier tokens
        mask = nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)
        for layer in self.layers:
            x = layer(x, src_mask=mask)
        x = self.norm(x)
        x = self.swiglu(x)
        x = self.fc(x)             # (batch, seq, vocab_size)
        return x
class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(d))

    def forward(self, x):
        # normalize by the root mean square over the feature dimension
        rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.scale * x / rms
class SwiGLU(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.linear1 = nn.Linear(d, d)   # value projection
        self.linear2 = nn.Linear(d, d)   # gate projection
        self.silu = nn.SiLU()

    def forward(self, x):
        # gated linear unit with a SiLU (swish) gate
        return self.linear1(x) * self.silu(self.linear2(x))
class RotaryEmbeddings(nn.Module):
    def __init__(self, d, base=10000):
        super().__init__()
        # fixed per-pair rotation frequencies, as in the RoPE formulation
        inv_freq = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, x):
        # x: (batch, seq, d); rotate feature pairs by position-dependent angles.
        # Applied to the embeddings here as a simplification; the LLaMA paper
        # applies RoPE to the queries and keys inside each attention layer.
        positions = torch.arange(x.size(1), device=x.device).float()
        freqs = torch.outer(positions, self.inv_freq)  # (seq, d/2)
        cos, sin = freqs.cos(), freqs.sin()
        x1, x2 = x[..., ::2], x[..., 1::2]
        return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
config = LLaMAConfig(vocab_size=tokenizer.vocab_size, d_model=128, num_heads=4, num_layers=2)
model = LLaMAModel(config)
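# Quick sanity check of model size (counts all trainable parameters):
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model has {num_params:,} trainable parameters")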
learning_rate = 5e-5
batch_size = 32
num_epochs = 100

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        # shift so that each position predicts the *next* token
        loss = criterion(logits[:, :-1, :].reshape(-1, logits.size(-1)),
                         inputs[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(num_epochs):
    loss = train(model, train_dataloader, optimizer, criterion, device)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}")
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            logits = model(inputs)
            # same shifted next-token loss as in training
            loss = criterion(logits[:, :-1, :].reshape(-1, logits.size(-1)),
                             inputs[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(dataloader)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size)
eval_loss = evaluate(model, eval_dataloader, criterion, device)
perplexity = torch.exp(torch.tensor(eval_loss))
print(f"Validation Loss: {eval_loss:.4f}, Perplexity: {perplexity:.2f}")