File size: 4,334 Bytes
d958fc1
 
 
c84502c
d958fc1
c84502c
 
d958fc1
c84502c
 
 
 
 
d958fc1
 
c84502c
d958fc1
c84502c
 
d958fc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
import gradio as gr

# ===== DATASET =====
# Load the training corpus and lowercase it to shrink the character set.
with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

# Character-level vocabulary derived from the corpus itself.
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch:i for i,ch in enumerate(chars)}  # char -> integer id
itos = {i:ch for i,ch in enumerate(chars)}  # integer id -> char

# NOTE(review): encode() silently maps unknown characters to id 0, i.e.
# whichever character sorts first in the corpus — confirm this is intended.
def encode(s): return [stoi.get(c, 0) for c in s]
def decode(l): return "".join([itos[i] for i in l])

# ===== GPT-Style Transformer Decoder =====
class GPTBlock(nn.Module):
    """One post-LayerNorm transformer decoder block: masked self-attention
    followed by a position-wise MLP, each wrapped in a residual + LayerNorm."""

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()
        # Self-attention operating on the (seq, batch, d_model) layout.
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Position-wise feed-forward network with GELU activation.
        self.ff = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.GELU(),
            nn.Linear(dim_feedforward, d_model),
            nn.Dropout(dropout),
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run attention then the MLP, normalizing after each residual add."""
        attended, _ = self.attn(x, x, x, attn_mask=mask)
        x = self.ln1(x + attended)
        return self.ln2(x + self.ff(x))

class GPTModel(nn.Module):
    """Minimal GPT-style decoder-only transformer for character modelling.

    Args:
        vocab_size: number of distinct tokens.
        d_model: embedding / hidden width.
        nhead: attention heads per block (must divide d_model).
        num_layers: number of stacked GPTBlocks.
        dim_feedforward: hidden width of each block's MLP.
        max_len: maximum supported sequence length (size of the learned
            positional-embedding table).
        dropout: dropout probability inside each block.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=4, dim_feedforward=512, max_len=5000, dropout=0.1):
        super().__init__()
        self.max_len = max_len
        self.token_emb = nn.Embedding(vocab_size, d_model)
        # Learned positional embeddings, zero-initialized and trained jointly.
        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        self.blocks = nn.ModuleList([GPTBlock(d_model, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits of shape (batch, seq_len, vocab_size).

        Raises:
            ValueError: if the input is longer than max_len — the original
                code silently sliced a too-short positional table, producing
                a shape error deeper in the stack.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(f"sequence length {seq_len} exceeds max_len {self.max_len}")
        token_embeddings = self.token_emb(x)  # (batch, seq_len, d_model)
        pos_embeddings = self.pos_emb[:, :seq_len, :]  # (1, seq_len, d_model)
        x = token_embeddings + pos_embeddings
        x = x.transpose(0, 1)  # nn.MultiheadAttention default layout: (seq_len, batch, d_model)

        # Causal mask: -inf above the diagonal blocks attention to future
        # positions. Built directly on the input's device — the original
        # allocated it on CPU and copied it to the device every forward pass.
        mask = torch.full((seq_len, seq_len), float('-inf'), device=x.device).triu(diagonal=1)

        for block in self.blocks:
            x = block(x, mask)

        x = x.transpose(0, 1)  # back to (batch, seq_len, d_model)
        x = self.ln_f(x)
        return self.head(x)  # (batch, seq_len, vocab_size)

# ===== TRAINING =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTModel(vocab_size).to(device)
# NOTE(review): lr=0.005 is high for Adam on a transformer; 1e-3 or lower is
# typical — confirm this converges on the target dataset.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()

seq_len = 25
batch_size = 1  # NOTE(review): unused — the loop below always builds a single-sample batch.
epochs = 300

# Whole corpus as one long tensor of character ids.
data_tensor = torch.tensor(encode(text), dtype=torch.long)

# Each "epoch" here is really ONE optimisation step on a single random
# window: sample seq_len+1 consecutive chars and train next-char prediction.
for epoch in range(epochs):
    model.train()
    idx = np.random.randint(0, len(data_tensor) - seq_len - 1)
    chunk = data_tensor[idx:idx+seq_len+1].unsqueeze(0).to(device)  # (1, seq_len+1)
    input_seq = chunk[:, :-1]   # chars 0..seq_len-1
    target_seq = chunk[:, 1:]   # chars 1..seq_len (shifted by one)

    optimizer.zero_grad()
    logits = model(input_seq)
    # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for cross-entropy.
    loss = criterion(logits.view(-1, vocab_size), target_seq.view(-1))
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# ===== TEXT GENERATION =====
def generate_text(model, seed, max_len=100, temperature=1.0):
    """Autoregressively extend *seed* by up to max_len sampled characters.

    Args:
        model: trained GPTModel (any module returning (batch, seq, vocab) logits).
        seed: prompt string; included in the returned text.
        max_len: number of characters to sample.
        temperature: softmax temperature; 1.0 reproduces the original
            behaviour, lower values sample more greedily.

    Returns:
        seed followed by the sampled continuation.
    """
    model.eval()
    input_ids = torch.tensor(encode(seed), dtype=torch.long).unsqueeze(0).to(device)
    generated = seed

    with torch.no_grad():
        for _ in range(max_len):
            logits = model(input_ids)
            # Sample the next character with torch.multinomial, which
            # normalizes internally. The original passed float32 softmax
            # output to np.random.choice(p=...), which intermittently raises
            # "probabilities do not sum to 1" due to rounding.
            probs = F.softmax(logits[0, -1] / temperature, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()
            generated += itos[next_id]
            # Append the sampled token and feed the grown context back in.
            next_token = torch.tensor([[next_id]], device=device)
            input_ids = torch.cat([input_ids, next_token], dim=1)

    return generated

# ===== GRADIO CHAT =====
def chat_with_ai(inp):
    """Gradio callback: generate a continuation of *inp*, stripping the prompt
    so only the model's reply is shown."""
    return generate_text(model, inp, max_len=100)[len(inp):]

# `gr` is already imported at the top of the file; the original re-imported
# gradio redundantly here.
iface = gr.Interface(fn=chat_with_ai,
                     inputs=gr.Textbox(lines=1, placeholder="Ketik chat kamu..."),
                     outputs="text",
                     title="Chat AI Transformer GPT Style",
                     description="Chat AI pake model Transformer GPT-style sederhana")

iface.launch()