shwethd committed
Commit 3216812 · verified · 1 Parent(s): 95db43d

Upload 2 files

Files changed (2)
  1. app.py +246 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,246 @@
+ """
+ HuggingFace Spaces App for GPT-2 124M Shakespeare Model
+ """
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import tiktoken
+ import gradio as gr
+ import math
+ from dataclasses import dataclass
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+     def forward(self, x):
+         B, T, C = x.size()
+         qkv = self.c_attn(x)
+         q, k, v = qkv.split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+         att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+         att = F.softmax(att, dim=-1)
+         y = att @ v
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.c_proj(y)
+         return y
+
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+         self.gelu = nn.GELU(approximate='tanh')
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         return x
+
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50257
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte=nn.Embedding(config.vocab_size, config.n_embd),
+             wpe=nn.Embedding(config.block_size, config.n_embd),
+             h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f=nn.LayerNorm(config.n_embd),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         self.transformer.wte.weight = self.lm_head.weight
+
+     def forward(self, idx, targets=None):
+         B, T = idx.size()
+         assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
+         pos_emb = self.transformer.wpe(pos)
+         tok_emb = self.transformer.wte(idx)
+         x = tok_emb + pos_emb
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+         return logits, loss
+
+
+ # Load model
+ print("Loading model...")
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ config = GPTConfig()
+ model = GPT(config)
+
+ # Try to load model (works both locally and on HuggingFace)
+ try:
+     checkpoint = torch.load('model_checkpoint_final.pt', map_location=device)
+     model.load_state_dict(checkpoint['model_state_dict'])
+     print("Model loaded from checkpoint")
+ except FileNotFoundError:
+     print("Warning: Model checkpoint not found. Using untrained model.")
+     # Model will be randomly initialized - not ideal but won't crash
+
+ model.to(device)
+ model.eval()
+ print(f"Model ready on {device}")
+
+ enc = tiktoken.get_encoding('gpt2')
+
+
+ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
+     """Generate text from prompt"""
+     try:
+         # Encode prompt
+         tokens = enc.encode(prompt)
+         tokens = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
+
+         # Generate
+         with torch.no_grad():
+             for _ in range(max_new_tokens):
+                 # Forward pass
+                 logits, _ = model(tokens)
+                 logits = logits[:, -1, :] / temperature
+
+                 # Top-k sampling
+                 topk_probs, topk_indices = torch.topk(F.softmax(logits, dim=-1), top_k, dim=-1)
+                 ix = torch.multinomial(topk_probs, 1)
+                 next_token = torch.gather(topk_indices, -1, ix)
+
+                 # Append to sequence
+                 tokens = torch.cat([tokens, next_token], dim=1)
+
+                 # Stop if we hit max length
+                 if tokens.size(1) >= config.block_size:
+                     break
+
+         # Decode
+         generated_text = enc.decode(tokens[0].tolist())
+         return generated_text
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="GPT-2 124M Shakespeare Model") as demo:
+     gr.Markdown("""
+     # 🎭 GPT-2 124M Shakespeare Language Model
+
+     This is a 124M parameter decoder-only transformer model trained on Shakespeare's complete works.
+
+     **Training Results:**
+     - Final Loss: 0.095127 (Target: < 0.099999) ✅
+     - Model Parameters: 124.44M
+     - Training Steps: 1,637
+
+     Enter a prompt below to generate Shakespeare-style text!
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             prompt_input = gr.Textbox(
+                 label="Prompt",
+                 placeholder="Enter your prompt here (e.g., 'First Citizen:', 'ROMEO:', 'To be or not')",
+                 value="First Citizen:",
+                 lines=3
+             )
+             max_tokens = gr.Slider(
+                 label="Max Tokens",
+                 minimum=50,
+                 maximum=200,
+                 value=100,
+                 step=10
+             )
+             temperature = gr.Slider(
+                 label="Temperature",
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=0.8,
+                 step=0.1
+             )
+             top_k = gr.Slider(
+                 label="Top-K",
+                 minimum=10,
+                 maximum=100,
+                 value=50,
+                 step=10
+             )
+             generate_btn = gr.Button("Generate", variant="primary")
+
+         with gr.Column():
+             output = gr.Textbox(
+                 label="Generated Text",
+                 lines=10,
+                 interactive=False
+             )
+
+     # Example prompts
+     gr.Markdown("### Example Prompts:")
+     examples = gr.Examples(
+         examples=[
+             ["First Citizen:"],
+             ["ROMEO:"],
+             ["To be or not"],
+             ["HAMLET:"],
+             ["MACBETH:"],
+         ],
+         inputs=prompt_input
+     )
+
+     generate_btn.click(
+         fn=generate_text,
+         inputs=[prompt_input, max_tokens, temperature, top_k],
+         outputs=output
+     )
+
+     gr.Markdown("""
+     ---
+     **Note:** The model was trained on Shakespeare text and generates text in that style.
+     Generated text may not always be coherent but should follow Shakespearean patterns.
+     """)
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
+
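For a quick check outside the web UI, the sampling loop can be exercised directly from Python. A minimal sketch, assuming app.py and model_checkpoint_final.pt sit in the working directory (importing app runs the module-level model-loading code first):

# Smoke test for generate_text without launching Gradio.
from app import generate_text

sample = generate_text("ROMEO:", max_new_tokens=80, temperature=0.8, top_k=50)
print(sample)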
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch>=2.0.0
+ tiktoken>=0.5.0
+ gradio>=5.4.1
+
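The 124.44M parameter figure quoted in the app's description can be reproduced from GPTConfig alone. A minimal sketch, assuming the packages above are installed (importing app will also try to load the checkpoint and build the Gradio interface):

# Count parameters for the GPT-2 small config; the tied wte / lm_head weight
# is a single shared tensor, so model.parameters() counts it once.
from app import GPT, GPTConfig

m = GPT(GPTConfig())
n_params = sum(p.numel() for p in m.parameters())
print(f"{n_params / 1e6:.2f}M parameters")  # ~124.44M for the config above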