malarsaravanan committed on
Commit
a431752
·
verified ·
1 Parent(s): bf45d63

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +332 -0
  2. model_quantized.pt +3 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shakespeare Text Generator - Hugging Face Gradio App
3
+ Trained GPT-2 model (124M params) with loss 0.094349
4
+ """
5
+
6
+ import gradio as gr
7
+ import torch
8
+ import tiktoken
9
+ import os
10
+ from dataclasses import dataclass
11
+
12
+
13
# GPT Model Architecture
@dataclass
class GPTConfig:
    """Hyperparameters for the GPT-2 (124M) architecture defined below."""
    block_size: int = 1024   # maximum sequence length (context window)
    vocab_size: int = 50257  # GPT-2 BPE vocabulary size
    n_layer: int = 12        # number of transformer blocks
    n_head: int = 12         # attention heads per block
    n_embd: int = 768        # embedding / hidden dimension
    dropout: float = 0.0     # dropout probability used throughout the model
    bias: bool = True        # include bias terms in the Linear projections
23
+
24
+
25
+ import torch.nn as nn
26
+ from torch.nn import functional as F
27
+ import math
28
+
29
+
30
class CausalSelfAttention(nn.Module):
    """Multi-head masked self-attention with an explicit causal mask."""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One fused projection produces query, key and value in a single matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # Lower-triangular mask, registered as a buffer so it follows .to(device)
        # and is saved in the state dict under the (pre-existing) key "bias".
        causal_mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer(
            "bias", causal_mask.view(1, 1, config.block_size, config.block_size)
        )
        # Marker read by GPT._init_weights to shrink the residual projection init.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head
        # Split the fused projection, then fold heads into the batch-like dim.
        query, key, value = self.c_attn(x).split(self.n_embd, dim=2)
        query = query.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
        key = key.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
        value = value.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
        # Scaled dot-product scores; mask out future positions before softmax.
        scores = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        scores = scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))
        # Weighted sum over values, then merge the heads back into one vector.
        out = (weights @ value).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(out))
60
+
61
+
62
class MLP(nn.Module):
    """Position-wise feed-forward block: expand 4x, GELU, project back, drop."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        # GPT-2 historically uses the tanh approximation of GELU.
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
        # Marker read by GPT._init_weights to shrink the residual projection init.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        # Single expression: fc -> gelu -> proj -> dropout.
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
77
+
78
+
79
class Block(nn.Module):
    """One transformer layer: pre-norm attention and MLP, each with a residual.

    Attribute names (ln_1, attn, ln_2, mlp) are state-dict keys and must not change.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-norm residual wiring: x + attn(ln(x)), then x + mlp(ln(x)).
        x = x + self.attn(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))
91
+
92
+
93
class GPT(nn.Module):
    """GPT-2 style decoder-only transformer language model."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe=nn.Embedding(config.block_size, config.n_embd),  # learned positional embeddings
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),  # final layer norm before the LM head
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: input embedding and output projection share one matrix.
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # GPT-2 style init. Modules tagged with NANOGPT_SCALE_INIT (the residual
        # projections) get std scaled by 1/sqrt(2*n_layer) so activations do not
        # grow with depth.
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Forward pass.

        idx: (B, T) token ids; targets: optional (B, T) ids for the training loss.
        Returns (logits, loss); loss is None when targets is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size
        pos = torch.arange(0, t, dtype=torch.long, device=device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        if targets is not None:
            logits = self.lm_head(x)
            # -1 targets are ignored in the loss (padding convention).
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference: only the last position's logits are needed for sampling.
            logits = self.lm_head(x[:, [-1], :])
            loss = None
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively sample max_new_tokens tokens after the prompt idx.

        Crops the context to block_size, applies temperature scaling and
        optional top-k filtering, then samples from the softmax distribution.
        """
        for _ in range(max_new_tokens):
            # Keep at most block_size tokens of context.
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                # Zero out (set to -inf) everything below the k-th largest logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
151
+
152
+
153
# Load model: build the architecture on CPU/GPU, then restore trained weights.
print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = GPTConfig()
model = GPT(config)

# Load checkpoint
# NOTE(review): torch.load unpickles arbitrary objects by default — only load
# checkpoints from a trusted source (consider weights_only=True if the
# checkpoint contains only tensors/primitives).
checkpoint_path = "model_quantized.pt"
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"✓ Loaded quantized model from {checkpoint_path}")
    print(f" Training loss: {checkpoint.get('loss', 'N/A')}")
    print(f" Model size: 330MB (FP16 quantized)")
else:
    # App still starts without weights so the Space UI stays inspectable.
    print("⚠️ Checkpoint not found. Please upload 'model_quantized.pt'")

model.to(device)
model.eval()
print(f"✓ Model ready on {device}")

# Tokenizer: the stock GPT-2 BPE encoding (matches vocab_size=50257).
enc = tiktoken.get_encoding('gpt2')
176
+
177
+
178
def generate_text(prompt, max_tokens=100, temperature=0.8, top_k=50):
    """Generate Shakespeare-style text continuing *prompt*.

    Args:
        prompt: seed text; blank or whitespace-only input is rejected.
        max_tokens: number of new tokens to sample.
        temperature: softmax temperature (higher = more random).
        top_k: restrict sampling to the k most likely tokens.

    Returns:
        The prompt plus its generated continuation, or a warning/error string
        suitable for direct display in the Gradio output box.
    """
    # Fix: the original `if not prompt` let whitespace-only prompts ("   ")
    # through to a wasted model call; strip() rejects those too.
    if not prompt or not prompt.strip():
        return "⚠️ Please enter a prompt!"

    try:
        # Tokenize with the GPT-2 BPE vocabulary and add a batch dimension.
        tokens = enc.encode(prompt)
        tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

        # Sample autoregressively; no_grad keeps memory flat during inference.
        with torch.no_grad():
            generated = model.generate(
                tokens,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k
            )

        # Decode the whole sequence (prompt included) back to text.
        generated_text = enc.decode(generated[0].tolist())

        return generated_text

    except Exception as e:
        # Surface failures in the UI instead of crashing the Gradio app.
        return f"❌ Error: {str(e)}"
205
+
206
+
207
# Example prompts, one row per Gradio example:
# [prompt, max_tokens, temperature, top_k]
examples = [
    ["First Citizen:", 150, 0.8, 50],
    ["ROMEO:", 150, 0.8, 50],
    ["To be, or not to be,", 200, 0.7, 40],
    ["What light through yonder window breaks?", 150, 0.8, 50],
    ["Friends, Romans, countrymen,", 150, 0.8, 50],
]
215
+
216
+
217
# Gradio Interface with Teal Theme
with gr.Blocks(
    title="Shakespeare Text Generator",
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="cyan",
        neutral_hue="slate"
    ),
    # Custom CSS: teal gradient on the primary button with a hover lift effect.
    css="""
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-button-primary {
    background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
}
.gr-button-primary:hover {
    background: linear-gradient(135deg, #0d9488 0%, #0f766e 100%) !important;
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(20, 184, 166, 0.3) !important;
}
h1 {
    color: #0f766e !important;
    text-align: center;
}
"""
) as demo:
    # Header / model summary.
    gr.Markdown("""
# 🎭 Shakespeare Text Generator

**GPT-2 Model (124M parameters)** trained on Shakespeare's complete works.

- **Final Loss**: 0.094349 (Target: < 0.099999) ✅
- **Architecture**: Decoder-only Transformer (12 layers, 12 heads, 768 dim)

Enter a Shakespearean prompt and watch the AI continue the text!
""")

    with gr.Row():
        # Left column: prompt box and sampling controls.
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Enter a Shakespearean prompt (e.g., 'First Citizen:', 'ROMEO:', 'To be, or not to be,')",
                lines=3
            )

            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=50,
                    maximum=500,
                    value=150,
                    step=10,
                    label="Max Tokens"
                )
                temperature = gr.Slider(
                    minimum=0.5,
                    maximum=1.5,
                    value=0.8,
                    step=0.1,
                    label="Temperature (creativity)"
                )
                top_k = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Top-K (diversity)"
                )

            generate_btn = gr.Button("✨ Generate Shakespeare", variant="primary", size="lg")

        # Right column: generated text output.
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Generated Text",
                lines=15,
                show_copy_button=True
            )

    # Usage tips shown below the controls.
    gr.Markdown("""
### 💡 Tips:
- **Temperature**: Lower (0.5-0.7) = more focused, Higher (0.9-1.2) = more creative
- **Top-K**: Controls vocabulary diversity (40-60 recommended)
- **Prompts**: Try character names (ROMEO:, JULIET:) or famous phrases
""")

    # Clickable example rows that pre-fill all four inputs.
    gr.Examples(
        examples=examples,
        inputs=[prompt_input, max_tokens, temperature, top_k],
        label="Example Prompts"
    )

    gr.Markdown("""
---
### 📊 Model Details:
- **Parameters**: 123,653,632 (124M)
- **Architecture**: GPT-2 (Decoder-only Transformer)
- **Training Data**: Shakespeare's complete works
- **Final Loss**: 0.094349
- **Techniques**: Gradient Accumulation, LR Scheduling, AdamW, Parameter-specific Weight Decay

**GitHub**: [View Source Code](https://github.com/yourusername/gpt-shakespeare)
""")

    # Connect button
    generate_btn.click(
        fn=generate_text,
        inputs=[prompt_input, max_tokens, temperature, top_k],
        outputs=output_text
    )


if __name__ == "__main__":
    demo.launch()
332
+
model_quantized.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:326f6579f24398f528f725bfe10eae7d28830c033c272570d0837d861cb0e60e
3
+ size 351291573
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ torch>=2.0.0
3
+ tiktoken>=0.5.0
4
+ numpy>=1.24.0
5
+