Rajendro committed on
Commit
657dabc
·
verified ·
1 Parent(s): 37c8622

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +75 -0
  2. model.py +134 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer
4
+ from huggingface_hub import hf_hub_download
5
+ from model import LlamaForCausalLM # Import your custom model class
6
+
7
# Load the tokenizer and make sure a padding token is defined: fall back to
# the EOS token when there is one, otherwise to the literal "[PAD]".
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"

# Everything in this demo runs on the CPU.
device = "cpu"

# Build the network with the reduced (135M) configuration.
model = LlamaForCausalLM(
    vocab_size=tokenizer.vocab_size,
    dim=576,
    num_layers=30,
    hidden_dim=1536,
    num_heads=9,
)

# Fetch the trained checkpoint from the Hugging Face Hub.
model_id = "Rajendro/smallmv2135"
checkpoint_path = hf_hub_download(repo_id=model_id, filename="model-dict-step-5500.pt")

# SECURITY: torch.load unpickles the downloaded file.  weights_only=True
# restricts unpickling to tensors and plain containers so that a tampered
# checkpoint cannot execute arbitrary code, regardless of which default the
# installed torch version uses.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()  # inference mode
36
+
37
def generate_text(prompt, max_length=100, temperature=0.7, top_k=50):
    """Continue ``prompt`` by sampling tokens one at a time with top-k sampling.

    Args:
        prompt: Text to continue.  An empty prompt yields an empty string.
        max_length: Maximum number of new tokens to sample.  Coerced to a
            non-negative int because UI sliders may deliver floats.
        temperature: Divisor applied to the logits before sampling.  Clamped
            to a small positive value to avoid division by zero.
        top_k: Number of highest-scoring tokens to sample from.  Coerced to
            an int and clamped to ``[1, vocabulary size]``.

    Returns:
        The decoded prompt plus generated continuation, with special tokens
        removed.  Generation stops early when the EOS token is sampled.
    """
    if not prompt:
        return ""

    # range() and torch.topk() both reject floats, so normalise up front.
    max_new_tokens = max(int(max_length), 0)
    temperature = max(float(temperature), 1e-5)
    requested_k = max(int(top_k), 1)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    if input_ids.shape[-1] == 0:
        # Nothing to condition on: indexing the last position would fail.
        return ""

    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_ids)
            next_token_logits = logits[:, -1, :] / temperature

            # Restrict sampling to the k most likely tokens; k can never
            # exceed the number of logits available.
            k = min(requested_k, next_token_logits.size(-1))
            top_k_logits, top_k_indices = torch.topk(next_token_logits, k, dim=-1)
            probs = torch.softmax(top_k_logits, dim=-1)

            # Sample a slot within the top-k set, then map it back to a
            # vocabulary id.  The result has shape (1, 1).
            choice = torch.multinomial(probs, num_samples=1)
            next_token = top_k_indices.gather(-1, choice)

            if next_token.item() == tokenizer.eos_token_id:
                break

            input_ids = torch.cat([input_ids, next_token], dim=1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
59
+
60
# Gradio interface: one prompt box plus three sampling controls, wired to
# generate_text in the same positional order as its parameters.
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Prompt", lines=3),
        # Token counts must be whole numbers, so pin the step explicitly
        # instead of relying on the step Gradio derives from the range.
        gr.Slider(50, 200, value=100, step=1, label="Max Length"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
        gr.Slider(10, 100, value=50, step=1, label="Top-k"),
    ],
    outputs=gr.Textbox(label="Generated Text", lines=5),
    title="🦙 Sample SmolLLM Demo",
    description="A 135M parameter language model trained on smollm-corpus",
)

if __name__ == "__main__":
    demo.launch()
model.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
# RMSNorm rescales each feature vector by its root mean square (no mean
# subtraction) and then applies a learnable per-feature gain.
class LlamaRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis.

    Note that ``eps`` is added to the RMS itself, i.e. *outside* the square
    root, before dividing.
    """

    def __init__(self, hidden_size, eps=1e-5):
        """Create the gain vector.

        Args:
            hidden_size: Length of the last (feature) axis of the input.
            eps: Small constant added to the RMS to prevent division by zero.
        """
        super().__init__()
        self.eps = eps
        # Learnable per-feature scale, initialised to one.
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x):
        """Return ``x / (rms(x) + eps) * weight`` computed along the last axis."""
        mean_square = torch.mean(x * x, dim=-1, keepdim=True)
        rms = torch.sqrt(mean_square) + self.eps
        return (x / rms) * self.weight
15
+
16
+
17
# Rotary position embedding helper: produces the fixed (not learned) rotation
# angle for every (position, frequency) pair.
class LlamaRotaryEmbedding(nn.Module):
    """Table of rotary-embedding angles.

    The only state is the ``inv_freq`` buffer, ``1 / base ** (2i / dim)`` for
    ``i`` in ``range(dim // 2)``.
    """

    def __init__(self, dim, base=10000, device=None):
        """Precompute the inverse frequencies.

        Args:
            dim: Size of the embedding the angles are meant for.
            base: Base of the geometric frequency progression.
            device: Device on which to create the buffer.
        """
        super().__init__()
        exponents = torch.arange(0, dim, 2, device=device).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base ** exponents))

    def forward(self, x, seq_len):
        """Return the angle table for positions ``0 .. seq_len - 1``.

        Args:
            x: Any tensor; only its device is used.
            seq_len: Number of positions, given as an ``int`` or as a
                one-element tensor.

        Returns:
            Tensor of shape ``(seq_len, 2 * len(inv_freq))`` whose two halves
            along the last axis are identical.
        """
        # int() accepts plain integers and one-element tensors alike; the
        # previous ``seq_len.to(...)`` call failed for plain integers.
        num_positions = int(seq_len)
        inv_freq = self.inv_freq.to(x.device)
        positions = torch.arange(num_positions, device=x.device, dtype=inv_freq.dtype)
        # Outer product: angle[p, i] = p * inv_freq[i].
        freqs = torch.outer(positions, inv_freq)
        return torch.cat((freqs, freqs), dim=-1)
30
+
31
class LlamaMLP(nn.Module):
    """Gated feed-forward block built from three bias-free linear layers.

    Computes ``down_proj(SiLU(gate_proj(x) * up_proj(x)))``.  The activation
    is applied to the *product* of the two projections, not to the gate
    branch alone.
    """

    def __init__(self, dim, hidden_dim):
        """Create the projections.

        Args:
            dim: Input and output feature size.
            hidden_dim: Width of the intermediate representation.
        """
        super().__init__()
        # Two parallel expansions from dim to hidden_dim ...
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
        # ... and one contraction back to dim.
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        """Apply the gated feed-forward transform to ``x``."""
        product = self.gate_proj(x) * self.up_proj(x)
        activated = self.act_fn(product)
        return self.down_proj(activated)
43
+
44
class LlamaAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with bias-free projections.

    By default no mask is applied, so every position attends to every other
    position, including later ones.  Set ``causal=True`` (or assign
    ``module.causal = True``) to restrict each position to itself and earlier
    positions.  The flag adds no parameters, so state dicts are unaffected.
    No positional (rotary) transform is applied to the queries or keys.
    """

    def __init__(self, dim, num_heads=8, causal=False):
        """Create the projections.

        Args:
            dim: Model width; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.
            causal: When true, mask out attention to later positions.

        Raises:
            ValueError: If ``dim`` cannot be split evenly across the heads.
        """
        super().__init__()
        if num_heads <= 0 or dim % num_heads != 0:
            # Fail here rather than with an opaque shape error in forward().
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.causal = causal

        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.o_proj = nn.Linear(dim, dim, bias=False)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, dim) to (batch, heads, seq, head_dim)."""
        return projected.view(
            batch_size, seq_len, self.num_heads, self.head_dim
        ).transpose(1, 2)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, seq, dim); the shape is preserved."""
        batch_size, seq_len, dim = x.size()

        q = self._split_heads(self.q_proj(x), batch_size, seq_len)
        k = self._split_heads(self.k_proj(x), batch_size, seq_len)
        v = self._split_heads(self.v_proj(x), batch_size, seq_len)

        # Scaled dot-product scores: (batch, heads, seq, seq).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if self.causal:
            # True strictly above the diagonal marks "future" key positions.
            future = torch.ones(
                seq_len, seq_len, dtype=torch.bool, device=x.device
            ).triu(diagonal=1)
            scores = scores.masked_fill(future, float("-inf"))
        attention = torch.softmax(scores, dim=-1)
        context = torch.matmul(attention, v)

        # Merge the heads back into a single feature axis.
        context = context.transpose(1, 2).reshape(batch_size, seq_len, dim)
        return self.o_proj(context)
75
+
76
class LlamaDecoderLayer(nn.Module):
    """One pre-norm transformer block: attention then MLP, each with a skip.

    Every sub-layer sees an RMS-normalised copy of its input, and its output
    is added back onto the un-normalised input.
    """

    def __init__(self, dim, hidden_dim, num_heads):
        """Create the sub-layers.

        Args:
            dim: Model width.
            hidden_dim: Inner width of the feed-forward block.
            num_heads: Number of attention heads.
        """
        super().__init__()
        self.self_attn = LlamaAttention(dim, num_heads)
        self.mlp = LlamaMLP(dim, hidden_dim)
        self.input_layernorm = LlamaRMSNorm(dim)
        self.post_attention_layernorm = LlamaRMSNorm(dim)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Attention sub-layer with residual connection.
        attn_out = self.self_attn(self.input_layernorm(x))
        x = attn_out + x

        # Feed-forward sub-layer with residual connection.
        mlp_out = self.mlp(self.post_attention_layernorm(x))
        return mlp_out + x
95
+
96
+
97
class LlamaModel(nn.Module):
    """Token embedding, a stack of decoder layers, and a final RMS norm.

    A ``LlamaRotaryEmbedding`` is also constructed as ``rotary_emb``, but
    ``forward`` never calls it.  It is kept so that the set of registered
    sub-modules and buffers stays the same.
    """

    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
        """Create the backbone.

        Args:
            vocab_size: Number of rows in the embedding table.
            dim: Model width.
            num_layers: Number of decoder layers to stack.
            hidden_dim: Inner width of each layer's feed-forward block.
            num_heads: Number of attention heads per layer.
        """
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, dim)
        decoder_layers = [
            LlamaDecoderLayer(dim, hidden_dim, num_heads)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(decoder_layers)
        self.norm = LlamaRMSNorm(dim)
        self.rotary_emb = LlamaRotaryEmbedding(dim)

    def forward(self, x):
        """Map token ids ``x`` to normalised hidden states."""
        hidden = self.embed_tokens(x)
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)
112
+
113
class LlamaForCausalLM(nn.Module):
    """``LlamaModel`` backbone followed by a bias-free vocabulary projection."""

    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
        """Create the backbone and the output head.

        Args:
            vocab_size: Vocabulary size; also the width of the output logits.
            dim: Model width.
            num_layers: Number of decoder layers.
            hidden_dim: Inner width of each feed-forward block.
            num_heads: Number of attention heads per layer.
        """
        super().__init__()
        self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads)
        self.lm_head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        """Return vocabulary logits for every position of token ids ``x``."""
        hidden_states = self.model(x)
        logits = self.lm_head(hidden_states)
        return logits
122
+
123
def get_model(tokenizer, *, dim=576, num_layers=30, hidden_dim=1536, num_heads=8):
    """Build a ``LlamaForCausalLM`` sized to the tokenizer's vocabulary.

    The keyword-only arguments default to the values that were previously
    hard-coded here, so ``get_model(tokenizer)`` behaves as before.

    NOTE(review): the demo app in this commit constructs the model directly
    with ``num_heads=9`` while this factory defaults to 8.  The projection
    shapes do not depend on the head count, so a checkpoint loads under either
    value but computes different outputs.  Confirm which value the checkpoint
    was trained with.

    Args:
        tokenizer: Object exposing a ``vocab_size`` attribute.
        dim: Model width.
        num_layers: Number of decoder layers.
        hidden_dim: Inner width of each feed-forward block.
        num_heads: Number of attention heads per layer.

    Returns:
        A freshly initialised ``LlamaForCausalLM``.
    """
    return LlamaForCausalLM(
        vocab_size=tokenizer.vocab_size,  # use the actual tokenizer vocab size
        dim=dim,
        num_layers=num_layers,
        hidden_dim=hidden_dim,
        num_heads=num_heads,
    )
132
+
133
+ # model = get_model()
134
+ # print(model)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ gradio
3
+ transformers
4
+ huggingface_hub