File size: 6,987 Bytes
12cbb16
 
ae652a3
b348ae4
dd9995d
12cbb16
0d5a057
12cbb16
b348ae4
 
 
 
 
0d5a057
12cbb16
b348ae4
15250ee
12cbb16
 
f3a2ef3
 
12cbb16
ec6d244
 
 
12cbb16
7a4f90a
 
 
 
 
12cbb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa7fb50
12cbb16
b348ae4
12cbb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b348ae4
12cbb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4926a0c
3639f22
12cbb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d5a057
12cbb16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import gradio as gr

import gdown
import torch
import torch.nn as nn
from torch.nn import functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Architecture hyperparameters read by the model classes defined below.
n_embd = 384      # embedding width
n_head = 4        # attention heads per transformer block
n_layer = 4       # number of transformer blocks
block_size = 128  # size of the causal mask / position table; generation crops to it
dropout = 0.2     # dropout probability

# Remote artefacts fetched at import time, in this order: (source URL, local file).
# NOTE(review): the second URL carries `uuid` and `at` query values that look
# session-specific -- confirm the link keeps working over time.
_DOWNLOADS = (
    ('https://drive.usercontent.google.com/download?id=14k2xUrvJ32trhLCzV2_O7klreBBA3dUu&authuser=0&confirm=t', 'model.pth'),
    ('https://drive.usercontent.google.com/download?id=1-JSvTzTxyI5zJwO39o0wuxJpvY-NqzGE&export=download&authuser=0&confirm=t&uuid=9eff48e6-67f8-4728-aa7f-552c497fb02c&at=AN_67v0xah9SgNOs5FDNKIuxVWL9%3A1727637766874', 'data.txt.gz'),
)
for _url, _dest in _DOWNLOADS:
    gdown.download(_url, _dest, quiet=False)


import gzip

# Read the whole gzip-compressed corpus into memory as UTF-8 text.
with gzip.open('data.txt.gz', 'rt', encoding='utf-8') as f:
    dataset = f.read()

# Fixed character vocabulary. It was originally derived as sorted(set(dataset))
# and is frozen here; a character's position in the list is its token id.
# The entry between '」' and '\ufeff' is U+FB01 (the "fi" ligature), written as
# an escape so it stays a single character: a two-letter 'fi' entry can never
# match in the per-character lookup done by `encode`.
chars = ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x81', '\x8d', '\x8f', '\x90', '\x92', '\x93', '\x94', '\x9d', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'Â', 'Ã', 'Æ', 'Ç', 'É', 'Ê', 'Ë', 'Ð', 'Ò', '×', 'Ø', 'Ù', 'à', 'á', 'â', 'ã', 'ä', 'å', 'é', 'í', 'ï', 'ð', 'ñ', 'ó', 'ö', 'ā', 'Œ', 'œ', 'Š', 'š', 'Ÿ', 'Ž', 'ž', 'ƒ', 'ˆ', '˜', 'і', '\u2005', '\u2009', '\u200a', '\u200b', '\u200e', '–', '—', '―', '‘', '’', '‚', '“', '”', '„', '†', '‡', '•', '…', '\u2028', '\u2029', '\u202a', '‰', '′', '‹', '›', '€', '™', '−', '─', '」', '\ufb01', '\ufeff', '�', '𝑐', '🌴', '🌹', '🍌', '🙂']
# Derived from the list so the two cannot drift apart (the list has 212 entries).
vocab_size = len(chars)

# Lookup tables between characters and their integer token ids.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string *s*.

    Raises:
        KeyError: if *s* contains a character that is not in ``chars``.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of token ids *l*.

    Raises:
        KeyError: if an id is not a valid index into ``chars``.
    """
    return ''.join([int_to_string[i] for i in l])


# The full corpus as a 1-D tensor of token ids.
data = torch.tensor(encode(dataset), dtype=torch.long)

class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Reads the module-level ``n_embd``, ``block_size`` and ``dropout`` settings.
    """

    def __init__(self, head_size):
        """Create bias-free key/query/value projections of width *head_size*."""
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones, registered as a buffer rather than a
        # parameter; positions where it is zero are masked out in forward().
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over *x*, a 3-D tensor, and return the weighted values.

        The result keeps the first two dimensions of *x*; its last dimension is
        the head size.
        """
        _, seq_len, channels = x.shape
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # NOTE: the scores are scaled by the width of the *input* (its last
        # dimension), not by the head size. Kept as-is on purpose: changing the
        # factor would change the outputs produced with existing weights.
        scores = queries @ keys.transpose(-2, -1) * channels**-0.5

        # Block attention to later positions before normalising.
        hidden_positions = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden_positions, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ values


class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side, then mixed by a linear layer."""

    def __init__(self, num_heads, head_size):
        """Build *num_heads* heads of width *head_size* plus the output projection.

        The projection takes ``n_embd`` input features, so the concatenated
        heads (``num_heads * head_size``) must add up to ``n_embd``.
        """
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to *x*, join the results on the last axis, project."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


class FeedFoward(nn.Module):
    """Two-layer perceptron: expand to 4x the width, ReLU, shrink back, dropout."""

    def __init__(self, n_embd):
        """Build the layer stack for inputs whose last dimension is *n_embd*."""
        super().__init__()
        hidden_width = 4 * n_embd
        # Kept as one Sequential named ``net`` so parameter names
        # (``net.0.*``, ``net.2.*``) stay the same in the state dict.
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run *x* through the layer stack and return the result."""
        transformed = self.net(x)
        return transformed


class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual.

    Layer normalisation is applied to the input of each sub-layer, before it
    is added back onto the running activations.
    """

    def __init__(self, n_embd, n_head):
        """Build the sub-layers; each head gets ``n_embd // n_head`` features."""
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return *x* after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Reads the module-level ``n_embd``, ``n_head``, ``n_layer`` and
    ``block_size`` settings.
    """

    def __init__(self, vocab_size):
        """Build embeddings, the block stack, final norm and the output head.

        Args:
            vocab_size: number of distinct token ids; sizes the token
                embedding table and the output layer.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: 2-D tensor of token ids (it is unpacked as ``B, T``). ``T``
                must not exceed ``block_size``, the size of the position table.
            targets: optional tensor of token ids with ``B * T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is ``(B, T, vocab)``
            and ``loss`` is ``None``. With targets, ``logits`` is flattened to
            ``(B * T, vocab)`` and ``loss`` is the cross-entropy against them.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        # Build the position indices on the input's own device instead of the
        # module-level ``device`` so the model works wherever it has been moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Sample *max_new_tokens* further tokens after the context *idx*.

        Args:
            idx: 2-D tensor of token ids used as the starting context.
            max_new_tokens: how many tokens to sample.

        Returns:
            The input ids with the sampled ids appended along dimension 1.
        """
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions, so feed the
            # model at most the last block_size tokens.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # Only the prediction at the final position is used for sampling.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` batches for each data split.

    The result maps ``"training"`` and ``"validation"`` to a scalar tensor.
    The global ``model`` is put in eval mode for the measurement and is
    switched to train mode before returning.

    NOTE(review): ``eval_iters`` and ``get_batch`` are not defined in the
    visible part of this file -- presumably they come from the training
    script; confirm they exist before calling this.
    """
    model.eval()
    averages = {}
    for split in ("training", "validation"):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages


# Build the network and move it to the selected device. ``m`` is kept as an
# alias of ``model`` (``Module.to`` returns the module itself).
model = GPTLanguageModel(vocab_size)
m = model.to(device)
# print(sum(p.numel() for p in m.parameters()) / 1e3, "K parameters")

# Load the downloaded weights from model.pth.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. model.pth is fetched from a remote URL at start-up; consider passing
# weights_only=True if the installed torch version supports it.
_state_dict = torch.load("model.pth", map_location=torch.device('cpu'))

# strict=False tolerates key mismatches, but they must not go unnoticed:
# a parameter missing from the checkpoint keeps its random initialisation.
_load_report = model.load_state_dict(_state_dict, strict=False)
if _load_report.missing_keys or _load_report.unexpected_keys:
    print(
        "Warning: model.pth does not match the model exactly; "
        f"missing keys: {list(_load_report.missing_keys)}, "
        f"unexpected keys: {list(_load_report.unexpected_keys)}"
    )

# Inference only: switch dropout off.
model.eval()

def respond(
    message,
    max_tokens=512,
):
    """Continue *message* with text sampled from the model.

    Args:
        message: prompt text. Characters that are not in the model's
            vocabulary are dropped instead of raising ``KeyError``. If nothing
            usable remains (for example an empty prompt), generation starts
            from a single newline so the model always has a context token.
        max_tokens: number of tokens to sample; coerced to ``int`` because UI
            widgets may deliver the value as a float.

    Returns:
        The decoded prompt tokens followed by the generated continuation.
    """
    prompt = message or ""
    token_ids = [string_to_int[ch] for ch in prompt if ch in string_to_int]
    if not token_ids:
        # A zero-length context cannot be sampled from (there is no last
        # position to read logits from), so seed with a newline.
        token_ids = [string_to_int["\n"]]

    context = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: skip autograd bookkeeping while sampling.
    with torch.no_grad():
        generated = model.generate(context, max_new_tokens=int(max_tokens))

    response = decode(generated[0].tolist())
    return response


# Input widgets, in the order their values are passed to respond().
_message_box = gr.Textbox(lines=5, label="Message", value="Hi Harry Potter")
_max_tokens_slider = gr.Slider(
    minimum=100,
    maximum=2048,
    value=256,
    label="Max Tokens",
)

# Web UI: prompt box and token-count slider in, generated text out.
iface = gr.Interface(
    fn=respond,
    inputs=[_message_box, _max_tokens_slider],
    outputs="text",
    title="PotterLLM",
    description="A language model trained on Harry Potter Series.",
    theme="huggingface",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()