|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import random |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SimpleTokenizer:
    """Character-level tokenizer built from the unique characters of a corpus."""

    def __init__(self, texts):
        # Vocabulary = sorted set of every character appearing in the corpus.
        vocabulary = sorted(set("".join(texts)))
        self.stoi = dict((ch, idx) for idx, ch in enumerate(vocabulary))
        self.itos = dict(enumerate(vocabulary))
        self.vocab_size = len(vocabulary)

    def encode(self, text):
        """Map a string to its list of integer token ids."""
        lookup = self.stoi
        return [lookup[ch] for ch in text]

    def decode(self, ids):
        """Map a list of token ids back to the original string."""
        return "".join(self.itos[i] for i in ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MiniGPT(nn.Module):
    """Tiny character-level autoregressive transformer.

    Token and learned positional embeddings feed a stack of
    ``nn.TransformerEncoderLayer`` blocks followed by a final LayerNorm and a
    linear head projecting to vocabulary logits.

    Bug fix vs. the previous version: a causal attention mask is now applied
    in every layer, so position ``t`` can only attend to positions ``<= t``.
    Without it the model attended to future tokens, which defeats the
    next-token training objective.
    """

    def __init__(self, vocab_size, n_embd=64, n_layer=4, n_head=4, block_size=64):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=n_embd, nhead=n_head)
            for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx):
        """Return next-token logits.

        Args:
            idx: LongTensor of token ids, shape (B, T) with T <= block_size.

        Returns:
            FloatTensor of logits, shape (B, T, vocab_size).

        Raises:
            ValueError: if the sequence length exceeds ``block_size`` (the
                positional embedding table only covers that many positions).
        """
        B, T = idx.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )
        token_embeddings = self.token_emb(idx)
        positions = torch.arange(T, device=idx.device)
        pos_embeddings = self.pos_emb(positions)
        x = token_embeddings + pos_embeddings

        # Additive causal mask: -inf above the diagonal blocks attention to
        # future positions.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), device=idx.device), diagonal=1
        )

        # Layers use batch_first=False, hence the (B, T, C) -> (T, B, C) flip.
        x = x.transpose(0, 1)
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)
        x = x.transpose(0, 1)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Toy training corpus: four short French sentences.
texts = [
    "Bonjour je suis un mini agent IA. ",
    "L'espace est immense et mystérieux. ",
    "Les étoiles brillent dans le ciel nocturne. ",
    "Le futur de l'IA est fascinant. "
]

tokenizer = SimpleTokenizer(texts)

# Encode every sentence, then right-pad each id sequence with token 0 so all
# rows share length 64 (the model's default block_size).
# NOTE(review): id 0 is a real character of the vocabulary, so padding is
# indistinguishable from that character during training — confirm intended.
encoded = [tokenizer.encode(sentence) for sentence in texts]
padded = []
for ids in encoded:
    padded.append(ids + [0] * (64 - len(ids)))
data = torch.tensor(padded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Training setup -------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MiniGPT(vocab_size=tokenizer.vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# The whole toy corpus fits in one batch; move it to the device once.
inputs = data.to(device)

# ---- Training loop --------------------------------------------------------
for epoch in range(200):
    predictions = model(inputs)

    # Next-token objective: position t predicts token t+1, so drop the last
    # prediction and the first target before flattening.
    flat_logits = predictions[:, :-1, :].reshape(-1, tokenizer.vocab_size)
    flat_targets = inputs[:, 1:].reshape(-1)
    loss = loss_fn(flat_logits, flat_targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        print(f"Epoch {epoch} - Loss: {loss.item():.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate(model, tokenizer, start="L", length=100):
    """Autoregressively sample ``length`` characters from ``model``.

    Args:
        model: trained language model exposing a ``block_size`` attribute and
            returning (B, T, vocab) logits from a (B, T) id tensor.
        tokenizer: object with ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        start: prompt string seeding the generation.
        length: number of tokens to append after the prompt.

    Returns:
        The prompt plus the sampled continuation, as a string.
    """
    model.eval()
    # Derive the device from the model rather than relying on a module-level
    # global, so the function is self-contained.
    device = next(model.parameters()).device
    idx = torch.tensor([tokenizer.encode(start)], device=device)
    with torch.no_grad():  # sampling needs no gradient tracking
        for _ in range(length):
            # Bug fix: crop the context to the last block_size tokens. The
            # positional embedding table only covers block_size positions, so
            # longer contexts previously crashed with an index error once more
            # than ~block_size tokens had been generated.
            idx_cond = idx[:, -model.block_size:]
            logits = model(idx_cond)[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
    return tokenizer.decode(idx[0].tolist())
|
|
|
|
|
# Sample a continuation from the trained model, seeded with the letter "L".
sample = generate(model, tokenizer, start="L")
print("Texte généré :")
print(sample)
|
|
|