HSinghHuggingFace commited on
Commit
f98cc3f
·
1 Parent(s): 333126d

Huggingface app

Browse files
Files changed (8) hide show
  1. .gitignore +7 -0
  2. README.md +32 -2
  3. app.py +193 -0
  4. model.pt +3 -0
  5. requirements.txt +5 -0
  6. src/config/model_config.py +10 -0
  7. src/models/gpt.py +177 -0
  8. src/utils/device_utils.py +10 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ .venv
5
+ venv/
6
+ ENV/
7
+ .DS_Store
README.md CHANGED
@@ -8,7 +8,37 @@ sdk_version: 1.41.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: SmolLM2-135-Text-Generator
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Text generation using SmolLM2-135 model
12
  ---
13
 
14
+ # SmolLM2-135 Text Generator
15
+
16
+ This is a Streamlit-based text generation application using a fine-tuned SmolLM2-135 model. The application allows users to:
17
+
18
+ - Input custom prompts
19
+ - Control the length of generated text
20
+ - Generate multiple text sequences
21
+ - View token information
22
+
23
+ ## Features
24
+
25
+ - Interactive text input
26
+ - Adjustable text generation length
27
+ - Multiple sequence generation
28
+ - Real-time text generation
29
+ - Token information display
30
+
31
+ ## Usage
32
+
33
+ 1. Enter your prompt in the text area
34
+ 2. Adjust the length of text to be generated
35
+ 3. Select the number of sequences to generate
36
+ 4. Click "Generate" to create text
37
+
38
+ ## Technical Details
39
+
40
+ The application uses:
41
+ - SmolLM2-135 model architecture
42
+ - Tiktoken tokenizer
43
+ - PyTorch for model inference
44
+ - Streamlit for the user interface
app.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import tiktoken
4
+ import sys
5
+ import os
6
+ import logging
7
+ import warnings
8
+
9
+ # Configure logging and warnings
10
+ logging.getLogger('streamlit').setLevel(logging.ERROR)
11
+ warnings.filterwarnings('ignore', message='.*torch.classes.*')
12
+ warnings.filterwarnings('ignore', category=FutureWarning)
13
+
14
+ # Add the project root to Python path
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from src.config.model_config import GPTConfig
18
+ from src.models.gpt import LlamaForCausalLM
19
+ from src.utils.device_utils import get_device
20
+
21
@st.cache_resource
def load_model():
    """
    Load and prepare the model for inference.

    Reads the checkpoint from 'model.pt' in the working directory, builds a
    LlamaForCausalLM from the default GPTConfig, and restores the weights.
    Wrapped in @st.cache_resource so the checkpoint is only loaded once per
    Streamlit server process.

    Returns:
        (model, device): the model in eval mode, moved to `device`, and the
        torch.device selected by get_device().

    Raises:
        Re-raises any exception from checkpoint loading after reporting it
        in the Streamlit UI.
    """
    device = get_device()

    try:
        # Load the checkpoint dictionary
        checkpoint = torch.load('model.pt', map_location=device)

        # Initialize model with config
        config = GPTConfig()
        model = LlamaForCausalLM(config)

        # Load state dict - extract model_state_dict from checkpoint
        # (accepts both a full training checkpoint and a bare state dict)
        if "model_state_dict" in checkpoint:
            state_dict = checkpoint["model_state_dict"]
        else:
            state_dict = checkpoint

        # Remove cached rotary embedding buffers — presumably saved by an
        # older version of the model code; dropping them avoids mismatches
        # with strict=True loading (they are recomputed lazily at inference).
        state_dict.pop("model.rotary_emb.cos_cached", None)
        state_dict.pop("model.rotary_emb.sin_cached", None)

        model.load_state_dict(state_dict, strict=True)

        # Prepare model for inference: force float32, move to device, eval mode
        model = model.float()
        model.to(device)
        model.eval()

        return model, device

    except Exception as e:
        # Surface the failure in the UI before re-raising so the top-level
        # script can catch it and call st.stop().
        st.error(f"Detailed error during model loading: {str(e)}")
        raise e
59
+
60
def generate_text(model, prompt, max_length=100, num_return_sequences=1, device='cpu',
                  temperature=0.8):
    """
    Generate text based on the input prompt via autoregressive sampling.

    Args:
        model: The loaded GPT model; model(x) must return (logits, _).
        prompt: Input text prompt (the UI guarantees it is non-empty).
        max_length: Number of additional tokens to generate beyond the prompt.
        num_return_sequences: Number of independent sequences to sample.
        device: Device to run inference on.
        temperature: Softmax temperature for sampling; lower values make the
            distribution more peaked. Defaults to 0.8 — the value that was
            previously hard-coded — so existing callers are unaffected.

    Returns:
        List of `num_return_sequences` generated strings, each containing the
        original prompt followed by its sampled continuation.
    """
    # NOTE(review): the gpt2 tiktoken vocabulary (50257 tokens) is larger than
    # the model's vocab_size (32000); a prompt that encodes to high token ids
    # would index past the embedding table — confirm the tokenizer matches the
    # one used during training.
    tokenizer = tiktoken.get_encoding('gpt2')
    input_tokens = tokenizer.encode(prompt)
    # One row per requested sequence, all starting from the same prompt.
    x = torch.tensor(input_tokens).unsqueeze(0).repeat(num_return_sequences, 1)
    x = x.to(device)

    # Calculate final length (input length + requested additional tokens)
    input_length = x.size(1)
    target_length = input_length + max_length

    # Autoregressive loop: append exactly one sampled token per iteration.
    with torch.no_grad():
        while x.size(1) < target_length:
            logits, _ = model(x)
            # Only the logits at the final position predict the next token.
            next_token_logits = logits[:, -1, :]

            # Temperature-scaled softmax: lower temperature -> more focused.
            probs = torch.softmax(next_token_logits / temperature, dim=-1)

            # Sample one token per sequence from the distribution.
            next_token = torch.multinomial(probs, num_samples=1)

            # Append to every sequence in the batch.
            x = torch.cat((x, next_token), dim=1)

    # Print token information
    st.text(f"Size of Input tokens: {input_length}, Additional tokens to be predicted: {max_length}, Total tokens to be generated: {x.size(1)}")

    # Decode every sampled row back to text (prompt + continuation).
    generated_texts = [tokenizer.decode(x[i].tolist()) for i in range(num_return_sequences)]

    return generated_texts
110
+
111
# ---------------------------------------------------------------------------
# Streamlit page script. Statement order matters: Streamlit re-runs this file
# top-to-bottom on every interaction, so widgets must be declared before the
# "Generate" handler that reads their values.
# ---------------------------------------------------------------------------

# Set page config (must be the first Streamlit call on the page)
st.set_page_config(
    page_title="SmolLM2-135 Text Generator",
    page_icon="🐢",
    layout="wide"
)

# Streamlit UI: title and short description
st.title("🐢 SmolLM2-135 Text Generator")
st.markdown("""
This application uses a fine-tuned SmolLM2-135 model to generate text based on your prompts.
Enter your prompt below and adjust the generation parameters to create unique text sequences.
""")

# Create two columns for the interface: prompt (wide) | parameters (narrow)
col1, col2 = st.columns([2, 1])

with col1:
    # Input form: free-text prompt with a default starter value
    prompt = st.text_area(
        "Enter your prompt:",
        "Once upon a time",
        height=100,
        help="Enter the text you want the model to continue from"
    )

with col2:
    # Generation parameters
    # Number of additional tokens to append after the prompt (1-50)
    max_length = st.slider(
        "Predict additional text of length:",
        min_value=1,
        max_value=50,
        value=20,
        help="Number of additional tokens to generate"
    )

    # How many independent samples to draw from the same prompt (1-5)
    num_sequences = st.slider(
        "Number of sequences to generate:",
        min_value=1,
        max_value=5,
        value=1,
        help="Generate multiple different sequences from the same prompt"
    )

# Load model (cached via @st.cache_resource, so only the first run is slow).
# On failure the error is shown and the script halts here.
try:
    model, device = load_model()
    model_status = st.success("Model loaded successfully! Ready to generate text.")
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()

# Generate button: runs sampling and renders one expander per sequence
if st.button("Generate", type="primary"):
    if not prompt:
        st.warning("Please enter a prompt first!")
    else:
        with st.spinner("Generating text..."):
            try:
                generated_texts = generate_text(
                    model=model,
                    prompt=prompt,
                    max_length=max_length,
                    num_return_sequences=num_sequences,
                    device=device
                )

                # Display results, numbering sequences from 1
                st.subheader("Generated Text:")
                for i, text in enumerate(generated_texts, 1):
                    with st.expander(f"Sequence {i}", expanded=True):
                        st.write(text)

            except Exception as e:
                st.error(f"Error during text generation: {str(e)}")

# Add footer (raw HTML, hence unsafe_allow_html)
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
    <p>Built with Streamlit and PyTorch | SmolLM2-135 Model</p>
</div>
""", unsafe_allow_html=True)
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf16a04318e75ccea0fc7e37ac501f7a56016ee500352ce2a20ee78e004e610b
3
+ size 277571739
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit==1.41.1
2
+ torch>=2.0.0
3
+ tiktoken>=0.5.0
4
+ numpy>=1.24.0
5
+ tqdm>=4.65.0
src/config/model_config.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
class GPTConfig:
    """Configuration for the SmolLM2-style Llama model in src/models/gpt.py.

    The defaults match the shipped model.pt checkpoint. Any field can be
    overridden by keyword, e.g. ``GPTConfig(hidden_size=128)``; calling with
    no arguments reproduces the original behavior exactly.
    """

    # Checkpoint-compatible defaults; hidden_size / num_attention_heads = 64,
    # the head dimension hard-coded in LlamaAttention.
    _DEFAULTS = {
        "vocab_size": 32000,
        "hidden_size": 256,
        "num_hidden_layers": 12,
        "num_attention_heads": 4,  # Chosen to match head_dim=64
        "intermediate_size": 512,
        "hidden_act": "silu",
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
    }

    def __init__(self, **overrides):
        """Create a config, optionally overriding any default by keyword.

        Raises:
            ValueError: if an unknown option name is supplied (catches typos
                instead of silently ignoring them).
        """
        unknown = set(overrides) - set(self._DEFAULTS)
        if unknown:
            raise ValueError(f"Unknown config option(s): {sorted(unknown)}")
        for key, default in self._DEFAULTS.items():
            setattr(self, key, overrides.get(key, default))
src/models/gpt.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embeddings to query and key tensors.

    NOTE(review): this helper is not referenced anywhere else in this file —
    LlamaAttention.forward performs its own inline rotation instead. Also,
    `position_ids` is reshaped below but never used afterwards; the cos/sin
    tables are applied positionally as-is. Candidate for removal — confirm no
    external callers first.
    """
    # Reshape position_ids to match q's shape (result is unused; see NOTE above)
    position_ids = position_ids.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len]

    # Get the rotary embeddings for this position
    cos = cos.squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(0)  # [seq_len, dim]

    # Apply the standard RoPE formula using the rotate_half helper below
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)

    return q_embed, k_embed
19
+
20
def rotate_half(x):
    """Rotate the last dimension by swapping its two halves and negating.

    Maps (..., a, b) -> (..., -b, a) where a and b are the first and second
    halves of the final dimension — the standard RoPE rotation helper.
    """
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
25
+
26
class LlamaRMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias)."""

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        # Learnable per-channel scale, initialized to the identity.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        """Normalize by the RMS of the last dimension, then rescale."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return self.weight * normalized
36
+
37
class LlamaRotaryEmbedding(nn.Module):
    """Computes and lazily caches cos/sin tables for rotary embeddings.

    NOTE(review): inv_freq uses dim//4 frequencies (16 for dim=64) rather than
    the conventional dim//2 — presumably deliberate to match the shipped
    checkpoint; confirm before reusing this class elsewhere.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        frequencies = 1.0 / (base ** (torch.arange(0, dim // 4, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", frequencies)
        # Lazily-filled caches; non-persistent so they never enter state_dict.
        self.register_buffer("cos_cached", None, persistent=False)
        self.register_buffer("sin_cached", None, persistent=False)
        self.max_position_embeddings = max_position_embeddings

    def forward(self, x, seq_len):
        """Return (cos, sin) tables of shape [1, seq_len, 2 * len(inv_freq)]."""
        cached = self.cos_cached
        if cached is not None and cached.size(1) >= seq_len:
            # Cache hit: serve a prefix of the previously computed tables.
            return cached[:, :seq_len, :], self.sin_cached[:, :seq_len, :]

        # Outer product of positions and inverse frequencies, duplicated so
        # the table covers both rotated halves.
        positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
        angles = torch.outer(positions, self.inv_freq)
        table = torch.cat((angles, angles), dim=-1)

        cos_table = torch.cos(table).unsqueeze(0)
        sin_table = torch.sin(table).unsqueeze(0)

        # Refresh the caches for subsequent (shorter or equal) requests.
        self.cos_cached = cos_table
        self.sin_cached = sin_table

        return cos_table, sin_table
61
+
62
class LlamaAttention(nn.Module):
    """Attention block with a multi-head query and a single shared key/value.

    NOTE(review): the scaled-dot-product below contracts over the trailing
    (head, dim) axes, producing weights of shape [bsz, q_len, heads, heads] —
    i.e. it mixes the heads at each position rather than attending across
    sequence positions, and no causal mask is applied anywhere. This matches
    whatever the shipped model.pt checkpoint was trained with, so it is
    documented here rather than "fixed"; changing it would change model output.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = 64  # Fixed head dimension to match saved model

        # q produces num_heads * 64 features; k and v produce a single
        # 64-dim head that is later expanded across all query heads.
        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, 64, bias=False)  # Single head dimension
        self.v_proj = nn.Linear(config.hidden_size, 64, bias=False)  # Single head dimension
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

    def forward(self, hidden_states, rotary_emb=None):
        """Run attention over hidden_states of shape [bsz, q_len, hidden_size]."""
        bsz, q_len, _ = hidden_states.size()

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Split q into heads before applying rotary embeddings
        q = q.view(bsz, q_len, self.num_heads, -1)  # -1 will be 64
        k = k.view(bsz, q_len, 1, -1)  # Keep k as single head
        v = v.view(bsz, q_len, 1, -1)  # Keep v as single head

        # Apply rotary embeddings if provided
        if rotary_emb is not None:
            # NOTE(review): position_ids is computed but never used below.
            position_ids = torch.arange(q_len, device=q.device)
            # cos/sin have shape [1, q_len, 32]; unsqueeze(2) broadcasts them
            # over the head axis.
            cos, sin = rotary_emb(v, q_len)
            # Split q and k in half (32 + 32) for the rotation
            q1, q2 = q[..., :32], q[..., 32:]
            k1, k2 = k[..., :32], k[..., 32:]
            # Complex-style rotation: (q1 + i*q2) * (cos + i*sin)
            q_embed = torch.cat([
                q1 * cos.unsqueeze(2) - q2 * sin.unsqueeze(2),
                q2 * cos.unsqueeze(2) + q1 * sin.unsqueeze(2)
            ], dim=-1)
            k_embed = torch.cat([
                k1 * cos.unsqueeze(2) - k2 * sin.unsqueeze(2),
                k2 * cos.unsqueeze(2) + k1 * sin.unsqueeze(2)
            ], dim=-1)
            q, k = q_embed, k_embed

        # Expand the single k/v head to match the number of query heads
        # (expand creates broadcast views, no copy).
        k = k.expand(-1, -1, self.num_heads, -1)
        v = v.expand(-1, -1, self.num_heads, -1)

        # Scaled dot-product: [b,s,h,64] @ [b,s,64,h] -> [b,s,h,h]
        # (see class-level NOTE about what this actually computes).
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn_weights = F.softmax(attn_weights, dim=-1)

        # [b,s,h,h] @ [b,s,h,64] -> [b,s,h,64], flattened to hidden_size=h*64
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        return attn_output
117
+
118
class LlamaMLP(nn.Module):
    """SwiGLU feed-forward block: down_proj(SiLU(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        width, inner = config.hidden_size, config.intermediate_size
        self.gate_proj = nn.Linear(width, inner, bias=False)
        self.up_proj = nn.Linear(width, inner, bias=False)
        self.down_proj = nn.Linear(inner, width, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        # Gated activation: the SiLU-activated gate modulates the up branch.
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
128
+
129
class LlamaDecoderLayer(nn.Module):
    """One pre-norm transformer block: self-attention, then MLP, each with a
    residual connection around its normalized sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states, rotary_emb=None):
        """Apply attention and MLP sub-blocks; rotary_emb is forwarded to attention."""
        # Attention sub-block with residual connection.
        attn_out = self.self_attn(self.input_layernorm(hidden_states), rotary_emb=rotary_emb)
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block with residual connection.
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states + mlp_out
149
+
150
class LlamaModel(nn.Module):
    """Token embedding -> stack of decoder layers -> final RMS norm."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        # Single rotary-embedding module shared by every layer; dim=64 matches
        # the per-head dimension in LlamaAttention (inv_freq buffer of size 16).
        self.rotary_emb = LlamaRotaryEmbedding(dim=64)
        self.layers = nn.ModuleList(
            LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, input_ids):
        """Map token ids [bsz, seq] to hidden states [bsz, seq, hidden_size]."""
        states = self.embed_tokens(input_ids)
        for block in self.layers:
            states = block(states, rotary_emb=self.rotary_emb)
        return self.norm(states)
167
+
168
class LlamaForCausalLM(nn.Module):
    """Causal language model: LlamaModel backbone plus a linear LM head.

    forward returns (logits, None); the second element is a placeholder so
    call sites can unpack ``logits, _ = model(x)``.
    """

    def __init__(self, config):
        super().__init__()
        self.model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids):
        """Return per-position vocabulary logits for the given token ids."""
        return self.lm_head(self.model(input_ids)), None
src/utils/device_utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def get_device():
    """Pick the compute device for inference.

    Returns:
        torch.device: 'cuda' when a CUDA GPU is available, otherwise 'cpu'.
    """
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')