import torch
import torch.nn as nn


class MiniTransformer(nn.Module):
    """Minimal transformer-style block: embedding -> self-attention -> 2-layer MLP head.

    Maps integer token ids to per-token vocabulary logits. Note that
    ``nn.MultiheadAttention`` is constructed with its default
    ``batch_first=False``, so inputs are expected sequence-first:
    a LongTensor of shape ``(seq_len, batch)``.
    """

    def __init__(self, vocab_size: int, embed_dim: int, num_heads: int,
                 hidden_dim: int) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first defaults to False: attention consumes (seq, batch, embed).
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape ``(seq_len, batch, vocab_size)``.

        Args:
            x: LongTensor of token ids, shape ``(seq_len, batch)``.
        """
        x = self.embedding(x)
        # Self-attention with query = key = value; attention weights discarded.
        attn_output, _ = self.attention(x, x, x)
        x = self.linear1(attn_output).relu()
        return self.linear2(x)