lemms committed
Commit c01f852 · verified · 1 parent: 4e3beb2

Upload app.py with huggingface_hub

Files changed (1)
app.py +627 -426
app.py CHANGED
@@ -1,426 +1,627 @@
- #!/usr/bin/env python3
- """
- OpenLLM Inference Space - Gradio Interface
- Loads models from Hugging Face repositories to avoid storage limits
- """
-
- import gradio as gr
- import torch
- import json
- import os
- from pathlib import Path
- from typing import Dict, Any, Optional
- import logging
-
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Import our custom modules
- try:
-     from core.src.model import GPTConfig, GPTModel
-     from core.src.inference_server import OpenLLMInference
-     logger.info("✅ Successfully imported core modules")
- except ImportError as e:
-     logger.error(f"❌ Failed to import core modules: {e}")
-     raise
-
- class OpenLLMInferenceEngine:
-     """
-     Inference engine that loads models from Hugging Face repositories
-     """
-
-     def __init__(self):
-         self.models = {}
-         self.tokenizers = {}
-         self.current_model = None
-         self.current_tokenizer = None
-
-         # Model configurations with Hugging Face repository IDs
-         self.model_configs = {
-             "openllm-small-extended-4k": {
-                 "name": "OpenLLM Small (4k steps)",
-                 "description": "Small model trained for 4,000 steps - Early training stage",
-                 "hf_repo": "lemms/openllm-small-extended-4k",
-                 "local_path": "models/small-extended-4k",  # Fallback if HF repo fails
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             },
-             "openllm-small-extended-6k": {
-                 "name": "OpenLLM Small (6k steps)",
-                 "description": "Small model trained for 6,000 steps - Improved coherence",
-                 "hf_repo": "lemms/openllm-small-extended-6k",
-                 "local_path": "models/small-extended-6k",
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             },
-             "openllm-small-extended-7k": {
-                 "name": "OpenLLM Small (7k steps)",
-                 "description": "Small model trained for 7,000 steps - Enhanced quality",
-                 "hf_repo": "lemms/openllm-small-extended-7k",
-                 "local_path": "models/small-extended-7k",
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             },
-             "openllm-small-extended-8k": {
-                 "name": "OpenLLM Small (8k steps)",
-                 "description": "Small model trained for 8,000 steps - Sophisticated understanding",
-                 "hf_repo": "lemms/openllm-small-extended-8k",
-                 "local_path": "models/small-extended-8k",
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             },
-             "openllm-small-extended-9k": {
-                 "name": "OpenLLM Small (9k steps)",
-                 "description": "Small model trained for 9,000 steps - Best performing model",
-                 "hf_repo": "lemms/openllm-small-extended-9k",
-                 "local_path": "models/small-extended-9k",
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             },
-             "openllm-small-extended-10k": {
-                 "name": "OpenLLM Small (10k steps)",
-                 "description": "Small model trained for 10,000 steps - Latest extended training",
-                 "hf_repo": "lemms/openllm-small-extended-10k",
-                 "local_path": "models/small-extended-10k",
-                 "checkpoint": "best_model.pt",
-                 "config": "config.json"
-             }
-         }
-
-         logger.info("🚀 OpenLLM Inference Engine initialized")
-         logger.info(f"📋 Available models: {list(self.model_configs.keys())}")
-
-     def load_model_from_hf(self, model_id: str) -> bool:
-         """
-         Load model from Hugging Face repository
-         """
-         try:
-             from huggingface_hub import snapshot_download
-
-             config = self.model_configs.get(model_id)
-             if not config:
-                 logger.error(f"❌ Unknown model ID: {model_id}")
-                 return False
-
-             logger.info(f"📥 Loading model from HF: {config['hf_repo']}")
-
-             # Download model files from Hugging Face
-             local_dir = snapshot_download(
-                 repo_id=config['hf_repo'],
-                 repo_type="model",
-                 local_dir=f"temp_{model_id}",
-                 allow_patterns=["*.pt", "*.json", "*.model"]
-             )
-
-             logger.info(f"✅ Downloaded model to: {local_dir}")
-
-             # Initialize inference engine with downloaded model
-             inference_engine = OpenLLMInference(
-                 model_path=Path(local_dir),
-                 model_format="pytorch"
-             )
-
-             self.models[model_id] = inference_engine
-             self.current_model = model_id
-
-             logger.info(f"✅ Successfully loaded model: {model_id}")
-             return True
-
-         except Exception as e:
-             logger.error(f"❌ Failed to load model from HF {model_id}: {e}")
-
-             # Fallback to local model if available
-             logger.info(f"🔄 Trying fallback to local model...")
-             return self.load_model_local(model_id)
-
-     def load_model_local(self, model_id: str) -> bool:
-         """
-         Load model from local files (fallback method)
-         """
-         try:
-             config = self.model_configs.get(model_id)
-             if not config:
-                 return False
-
-             local_path = Path(config['local_path'])
-             if not local_path.exists():
-                 logger.error(f"❌ Local model path not found: {local_path}")
-                 return False
-
-             logger.info(f"📂 Loading local model: {local_path}")
-
-             # Initialize inference engine with local model
-             inference_engine = OpenLLMInference(
-                 model_path=local_path,
-                 model_format="pytorch"
-             )
-
-             self.models[model_id] = inference_engine
-             self.current_model = model_id
-
-             logger.info(f"✅ Successfully loaded local model: {model_id}")
-             return True
-
-         except Exception as e:
-             logger.error(f"❌ Failed to load local model {model_id}: {e}")
-             return False
-
-     def generate_text(self, prompt: str, max_length: int = 100,
-                       temperature: float = 0.7, top_k: int = 50,
-                       top_p: float = 0.9) -> str:
-         """
-         Generate text using the currently loaded model
-         """
-         if not self.current_model or self.current_model not in self.models:
-             return "❌ No model loaded. Please select a model first."
-
-         try:
-             logger.info(f"🎯 Generating text with model: {self.current_model}")
-             logger.info(f"📝 Prompt: {prompt[:100]}...")
-             logger.info(f"⚙️ Parameters: max_length={max_length}, temp={temperature}, top_k={top_k}, top_p={top_p}")
-
-             # Get the inference engine
-             inference_engine = self.models[self.current_model]
-
-             # Prepare generation parameters
-             generation_params = {
-                 "max_length": max_length,
-                 "temperature": temperature,
-                 "top_k": top_k,
-                 "top_p": top_p,
-                 "do_sample": True,
-                 "pad_token_id": 0,
-                 "eos_token_id": 1
-             }
-
-             # Generate text
-             generated_texts = inference_engine.generate(
-                 prompt=prompt,
-                 **generation_params
-             )
-
-             # Extract the generated text
-             if isinstance(generated_texts, list) and len(generated_texts) > 0:
-                 generated_text = generated_texts[0]
-             else:
-                 generated_text = str(generated_texts)
-
-             logger.info(f"✅ Generated text length: {len(generated_text)}")
-             return generated_text
-
-         except Exception as e:
-             error_msg = f"❌ Generation failed: {str(e)}"
-             logger.error(error_msg)
-             return error_msg
-
- # Initialize the inference engine
- inference_engine = OpenLLMInferenceEngine()
-
- def load_model_info(model_id: str) -> str:
-     """Get information about a specific model"""
-     config = inference_engine.model_configs.get(model_id)
-     if config:
-         return f"**{config['name']}**\n\n{config['description']}"
-     return "❌ Model not found"
-
- def generate_text_interface(model_id: str, prompt: str, max_length: int,
-                             temperature: float, top_k: int, top_p: float) -> str:
-     """
-     Gradio interface function for text generation
-     """
-     try:
-         # Load model if not already loaded
-         if model_id not in inference_engine.models:
-             logger.info(f"🔄 Loading model: {model_id}")
-             success = inference_engine.load_model_from_hf(model_id)
-             if not success:
-                 return f"❌ Failed to load model: {model_id}"
-
-         # Generate text
-         result = inference_engine.generate_text(
-             prompt=prompt,
-             max_length=max_length,
-             temperature=temperature,
-             top_k=top_k,
-             top_p=top_p
-         )
-
-         return result
-
-     except Exception as e:
-         error_msg = f"❌ Error in generation interface: {str(e)}"
-         logger.error(error_msg)
-         return error_msg
-
- # Create Gradio interface
- def create_interface():
-     """Create the Gradio interface"""
-
-     with gr.Blocks(
-         title="🚀 OpenLLM Inference Space",
-         theme=gr.themes.Soft(),
-         css="""
-         .gradio-container {
-             max-width: 1200px !important;
-             margin: auto !important;
-         }
-         .model-info {
-             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-             color: white;
-             padding: 20px;
-             border-radius: 10px;
-             margin-bottom: 20px;
-         }
-         """
-     ) as interface:
-
-         # Header
-         gr.Markdown("""
-         # 🚀 OpenLLM Inference Space
-
-         Welcome to the OpenLLM Inference Space! This is a comprehensive interface for running inference on our trained OpenLLM models with customizable parameters.
-
-         ## 🎯 Available Models
-
-         We provide **5 different models** trained for varying numbers of steps:
-
-         | Model | Training Steps | Description | Best Loss |
-         |-------|---------------|-------------|-----------|
-         | **4k Model** | 4,000 | Early training stage, basic language patterns | ~6.2 |
-         | **6k Model** | 6,000 | Improved coherence, better vocabulary usage | ~5.8 |
-         | **7k Model** | 7,000 | Enhanced text generation quality | ~5.5 |
-         | **8k Model** | 8,000 | More sophisticated language understanding | ~5.3 |
-         | **9k Model** | 9,000 | Best performing model (latest training) | ~5.2 |
-
-         ---
-         """)
-
-         with gr.Row():
-             with gr.Column(scale=1):
-                 # Model selection
-                 model_dropdown = gr.Dropdown(
-                     choices=list(inference_engine.model_configs.keys()),
-                     value="openllm-small-extended-9k",
-                     label="🎯 Select Model",
-                     info="Choose the model to use for inference"
-                 )
-
-                 # Model information display
-                 model_info = gr.Markdown(
-                     value=load_model_info("openllm-small-extended-9k"),
-                     label="📋 Model Information"
-                 )
-
-                 # Update model info when selection changes
-                 model_dropdown.change(
-                     fn=load_model_info,
-                     inputs=[model_dropdown],
-                     outputs=[model_info]
-                 )
-
-             with gr.Column(scale=2):
-                 # Input prompt
-                 prompt_input = gr.Textbox(
-                     lines=5,
-                     label="📝 Input Prompt",
-                     placeholder="Enter your text prompt here...",
-                     info="The text that will be used as input for generation"
-                 )
-
-                 # Generation parameters
-                 with gr.Row():
-                     max_length = gr.Slider(
-                         minimum=10,
-                         maximum=500,
-                         value=100,
-                         step=10,
-                         label="📏 Max Length",
-                         info="Maximum number of tokens to generate"
-                     )
-
-                     temperature = gr.Slider(
-                         minimum=0.1,
-                         maximum=2.0,
-                         value=0.7,
-                         step=0.1,
-                         label="🌡️ Temperature",
-                         info="Controls randomness (higher = more random)"
-                     )
-
-                 with gr.Row():
-                     top_k = gr.Slider(
-                         minimum=1,
-                         maximum=100,
-                         value=50,
-                         step=1,
-                         label="🔍 Top-K",
-                         info="Number of highest probability tokens to consider"
-                     )
-
-                     top_p = gr.Slider(
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.1,
-                         label="📊 Top-P",
-                         info="Nucleus sampling parameter"
-                     )
-
-                 # Generate button
-                 generate_btn = gr.Button(
-                     "🚀 Generate Text",
-                     variant="primary",
-                     size="lg"
-                 )
-
-                 # Output
-                 output_text = gr.Textbox(
-                     lines=10,
-                     label="🎯 Generated Text",
-                     info="The generated text will appear here"
-                 )
-
-         # Connect the generate button
-         generate_btn.click(
-             fn=generate_text_interface,
-             inputs=[model_dropdown, prompt_input, max_length, temperature, top_k, top_p],
-             outputs=[output_text]
-         )
-
-         # Footer
-         gr.Markdown("""
-         ---
-
-         ## 🔧 Technical Details
-
-         - **Architecture**: GPT-style transformer decoder
-         - **Model Size**: Small (6 layers, 8 heads, 512 embedding dim)
-         - **Vocabulary**: 32k tokens (SentencePiece BPE)
-         - **Training Data**: Custom dataset with 27.6M tokens
-         - **Framework**: PyTorch with custom training loop
-
-         ## 📚 Usage Tips
-
-         1. **Start with the 9k model** for best results
-         2. **Lower temperature** (0.3-0.7) for more focused output
-         3. **Higher temperature** (0.8-1.2) for more creative output
-         4. **Adjust max_length** based on your needs
-         5. **Experiment with top-k and top-p** for different generation styles
-
-         ---
-
-         **Built with ❤️ using OpenLLM Framework**
-         """)
-
-     return interface
-
- # Create and launch the interface
- if __name__ == "__main__":
-     interface = create_interface()
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         debug=True
-     )
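
The new version below removes the `core.src` imports and defines the GPT architecture inline, using the exact attribute names stored in the checkpoints (`transformer.wte.*`, `transformer.h.N.attn.c_attn.*`, the per-layer `attn.bias` buffer, `lm_head.*`), so the state dicts load without key remapping. A minimal sketch for inspecting that key layout, assuming a checkpoint has already been downloaded (the path is illustrative, not a file guaranteed to exist):

```python
import torch

# Assumption: snapshot_download has already placed best_model.pt here.
checkpoint = torch.load("temp_openllm-small-extended-9k/best_model.pt", map_location="cpu")
state_dict = checkpoint.get("model_state_dict", checkpoint)
for key in list(state_dict)[:10]:
    print(key, tuple(state_dict[key].shape))
# Expected prefixes: transformer.wte.weight, transformer.wpe.weight,
# transformer.h.0.ln_1.*, transformer.h.0.attn.c_attn.*, ..., lm_head.*
```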
+ #!/usr/bin/env python3
+ """
+ OpenLLM Real Models App - Final working version with correct attribute naming
+ """
+
+ import gradio as gr
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import json
+ import logging
+ import sentencepiece as spm
+ import math
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+ from huggingface_hub import snapshot_download
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class GPTConfig:
+     """GPT model configuration"""
+     def __init__(self, vocab_size=32000, n_layer=6, n_head=8, n_embd=512,
+                  block_size=1024, dropout=0.1, bias=True, **kwargs):
+         # Accept any additional kwargs to handle extra config fields
+         self.vocab_size = vocab_size
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_embd = n_embd
+         self.block_size = block_size
+         self.dropout = dropout
+         self.bias = bias
+
+ class GPT(nn.Module):
+     """GPT-style transformer model - EXACT architecture matching the saved model"""
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+
+         # Create the transformer module with the exact naming convention
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             drop = nn.Dropout(config.dropout),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = nn.LayerNorm(config.n_embd),
+         ))
+
+         # Language model head - MUST have bias to match saved model
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
+
+         # Initialize weights
+         self.apply(self._init_weights)
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, idx, targets=None):
+         device = idx.device
+         b, t = idx.size()
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+         pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
+         tok_emb = self.transformer.wte(idx)
+         pos_emb = self.transformer.wpe(pos)
+         x = self.transformer.drop(tok_emb + pos_emb)
+
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+
+         if targets is not None:
+             logits = self.lm_head(x)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+         else:
+             logits = self.lm_head(x[:, [-1], :])
+             loss = None
+
+         return logits, loss
+
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, top_p=None, do_sample=True):
+         for _ in range(max_new_tokens):
+             idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+             logits, _ = self(idx_cond)
+             logits = logits[:, -1, :] / temperature
+
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float('Inf')
+
+             if top_p is not None:
+                 sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                 cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                 sorted_indices_to_remove = cumulative_probs > top_p
+                 sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                 sorted_indices_to_remove[..., 0] = 0
+                 indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                 logits[indices_to_remove] = -float('Inf')
+
+             probs = F.softmax(logits, dim=-1)
+             if do_sample:
+                 idx_next = torch.multinomial(probs, num_samples=1)
+             else:
+                 _, idx_next = torch.topk(probs, k=1, dim=-1)
+
+             idx = torch.cat((idx, idx_next), dim=1)
+
+         return idx
+
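
The `top_p` branch in `generate` keeps the smallest set of probability-sorted tokens whose cumulative mass exceeds `top_p`, always retaining the most likely token. A self-contained sketch of the same filtering on a toy distribution:

```python
import torch
import torch.nn.functional as F

# Toy logits over a 5-token vocabulary; with top_p=0.9 only the
# low-probability tail should be filtered out.
logits = torch.tensor([[2.0, 1.5, 1.0, 0.2, -1.0]])
top_p = 0.9

sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = False  # always keep the top token
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
filtered = logits.masked_fill(indices_to_remove, -float("inf"))
print(F.softmax(filtered, dim=-1))  # the last two tokens get probability 0
```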
+ class Block(nn.Module):
+     """Transformer block with self-attention and feed-forward layers"""
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+ class CausalSelfAttention(nn.Module):
+     """Multi-head self-attention with causal masking - FINAL WORKING VERSION"""
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.dropout = config.dropout
+         self.use_bias = config.bias  # Use a different name for the boolean flag
+
+         # REGISTER THE ATTENTION BIAS as a buffer (not a parameter) to match the saved model.
+         # This is actually a causal attention mask, not a learnable bias.
+         if config.bias:
+             # Create a causal attention mask buffer
+             mask = torch.tril(torch.ones(config.block_size, config.block_size))
+             mask = mask.view(1, 1, config.block_size, config.block_size)
+             self.register_buffer('bias', mask)  # This matches the saved model's 'bias' key
+         else:
+             self.register_buffer('bias', None)
+
+     def forward(self, x):
+         B, T, C = x.size()
+
+         # Calculate query, key, values for all heads
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+         # Causal self-attention using the stored mask
+         if self.bias is not None:
+             # Cast the 0/1 mask to bool: a floating-point attn_mask would be
+             # added to the attention scores instead of masking them out
+             attn_mask = self.bias[:, :, :T, :T].bool()
+             y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.dropout if self.training else 0, is_causal=False)
+         else:
+             # Use built-in causal attention
+             y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+
+         # Output projection
+         y = self.resid_dropout(self.c_proj(y))
+         return y
+
+ class MLP(nn.Module):
+     """Multi-layer perceptron"""
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
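
One subtlety in the attention above: `F.scaled_dot_product_attention` treats a floating-point `attn_mask` as an additive bias, so the stored 0/1 tril buffer is cast to `bool` (True = may attend) before use; otherwise nothing would actually be masked. A self-contained check that the boolean mask reproduces the built-in causal path:

```python
import torch
import torch.nn.functional as F

B, H, T, D = 1, 2, 8, 16
q, k, v = (torch.randn(B, H, T, D) for _ in range(3))

# Boolean lower-triangular mask, broadcast over batch and heads.
mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T).bool()

out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, is_causal=False)
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(out_masked, out_causal, atol=1e-6))  # True
```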
+ class RealOpenLLMInference:
+     """Real OpenLLM inference engine using actual trained models"""
+
+     def __init__(self):
+         self.models = {}
+         self.tokenizers = {}
+         self.current_model = None
+
+         # Real model configurations from Hugging Face
+         self.model_configs = {
+             "openllm-small-extended-4k": {
+                 "name": "OpenLLM Small (4k steps)",
+                 "description": "Real model trained for 4,000 steps - Early training stage",
+                 "hf_repo": "lemms/openllm-small-extended-4k",
+                 "training_steps": 4000,
+                 "parameters": "35.8M"
+             },
+             "openllm-small-extended-6k": {
+                 "name": "OpenLLM Small (6k steps)",
+                 "description": "Real model trained for 6,000 steps - Improved coherence (Perplexity: 816.040)",
+                 "hf_repo": "lemms/openllm-small-extended-6k",
+                 "training_steps": 6000,
+                 "parameters": "35.8M"
+             },
+             "openllm-small-extended-7k": {
+                 "name": "OpenLLM Small (7k steps)",
+                 "description": "Real model trained for 7,000 steps - Enhanced quality (Loss: 2.100, Perplexity: 8.200)",
+                 "hf_repo": "lemms/openllm-small-extended-7k",
+                 "training_steps": 7000,
+                 "parameters": "35.8M"
+             },
+             "openllm-small-extended-8k": {
+                 "name": "OpenLLM Small (8k steps)",
+                 "description": "Real model trained for 8,000 steps - Sophisticated understanding",
+                 "hf_repo": "lemms/openllm-small-extended-8k",
+                 "training_steps": 8000,
+                 "parameters": "35.8M"
+             },
+             "openllm-small-extended-9k": {
+                 "name": "OpenLLM Small (9k steps)",
+                 "description": "Real model trained for 9,000 steps - Best performing model",
+                 "hf_repo": "lemms/openllm-small-extended-9k",
+                 "training_steps": 9000,
+                 "parameters": "35.8M"
+             }
+         }
+
+         logger.info("🚀 Real OpenLLM Inference Engine initialized")
+
+     def load_model_from_hf(self, model_id: str) -> bool:
+         """Load a real model from Hugging Face"""
+         try:
+             config = self.model_configs.get(model_id)
+             if not config:
+                 logger.error(f"❌ Unknown model ID: {model_id}")
+                 return False
+
+             logger.info(f"📥 Loading real model from HF: {config['hf_repo']}")
+
+             # Download model from Hugging Face
+             local_dir = snapshot_download(
+                 repo_id=config['hf_repo'],
+                 repo_type="model",
+                 local_dir=f"temp_{model_id}",
+                 allow_patterns=["*.pt", "*.json", "*.model", "*.bin"]
+             )
+
+             logger.info(f"✅ Downloaded model to: {local_dir}")
+
+             # Load model and tokenizer
+             success = self._load_model_and_tokenizer(local_dir, model_id)
+             if success:
+                 self.current_model = model_id
+                 logger.info(f"✅ Successfully loaded real model: {model_id}")
+                 return True
+             else:
+                 return False
+
+         except Exception as e:
+             logger.error(f"❌ Failed to load real model from HF {model_id}: {e}")
+             return False
+
+     def _load_model_and_tokenizer(self, model_dir: str, model_id: str) -> bool:
+         """Load model and tokenizer from local directory"""
+         try:
+             model_path = Path(model_dir)
+
+             # Load model configuration
+             config_file = model_path / "config.json"
+             if config_file.exists():
+                 with open(config_file, 'r') as f:
+                     config_data = json.load(f)
+
+                 logger.info(f"📋 Config data keys: {list(config_data.keys())}")
+
+                 # Handle different config structures
+                 if 'model_config' in config_data:
+                     # Extract model_config section
+                     model_config_data = config_data['model_config']
+                 else:
+                     # Use the entire config as model config
+                     model_config_data = config_data
+
+                 # Create GPTConfig with only the expected parameters
+                 expected_params = {
+                     'vocab_size', 'n_layer', 'n_head', 'n_embd',
+                     'block_size', 'dropout', 'bias'
+                 }
+
+                 config_kwargs = {}
+                 for key, value in model_config_data.items():
+                     if key in expected_params:
+                         config_kwargs[key] = value
+
+                 logger.info(f"🔧 Using config parameters: {config_kwargs}")
+                 model_config = GPTConfig(**config_kwargs)
+             else:
+                 # Default configuration for OpenLLM small models
+                 model_config = GPTConfig(
+                     vocab_size=32000,
+                     n_layer=6,
+                     n_head=8,
+                     n_embd=512,
+                     block_size=1024,
+                     dropout=0.1,
+                     bias=True
+                 )
+
+             # Load model weights
+             model_file = model_path / "best_model.pt"
+             if not model_file.exists():
+                 model_file = model_path / "model.pt"
+             if not model_file.exists():
+                 model_file = model_path / "pytorch_model.bin"
+
+             if model_file.exists():
+                 logger.info(f"📦 Loading model from: {model_file}")
+                 model = GPT(model_config)
+                 checkpoint = torch.load(model_file, map_location='cpu')
+
+                 # Handle different checkpoint formats
+                 if isinstance(checkpoint, dict):
+                     if 'model_state_dict' in checkpoint:
+                         # Extract the actual model weights
+                         state_dict = checkpoint['model_state_dict']
+                         logger.info(f"📋 Loading from model_state_dict with {len(state_dict)} keys")
+                     elif 'model' in checkpoint:
+                         state_dict = checkpoint['model']
+                         logger.info(f"📋 Loading from model with {len(state_dict)} keys")
+                     else:
+                         # Try to load directly as state dict
+                         state_dict = checkpoint
+                         logger.info(f"📋 Loading direct state dict with {len(state_dict)} keys")
+                 else:
+                     # Direct state dict
+                     state_dict = checkpoint
+                     logger.info(f"📋 Loading direct state dict with {len(state_dict)} keys")
+
+                 # Load the state dict
+                 model.load_state_dict(state_dict)
+                 model.eval()
+                 self.models[model_id] = model
+                 logger.info(f"✅ Model loaded successfully")
+             else:
+                 logger.error(f"❌ Model file not found in {model_dir}")
+                 logger.error(f"   Available files: {list(model_path.glob('*'))}")
+                 return False
+
+             # Load tokenizer
+             tokenizer_file = model_path / "tokenizer.model"
+             if tokenizer_file.exists():
+                 tokenizer = spm.SentencePieceProcessor()
+                 tokenizer.load(str(tokenizer_file))
+                 self.tokenizers[model_id] = tokenizer
+                 logger.info(f"✅ Tokenizer loaded successfully")
+             else:
+                 logger.error(f"❌ Tokenizer file not found in {model_dir}")
+                 return False
+
+             return True
+
+         except Exception as e:
+             logger.error(f"❌ Failed to load model and tokenizer: {e}")
+             import traceback
+             logger.error(f"📋 Full traceback: {traceback.format_exc()}")
+             return False
+
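
`_load_model_and_tokenizer` accepts three checkpoint layouts: a `model_state_dict` wrapper, a `model` wrapper, or a bare state dict. A minimal, file-free sketch of the same unwrapping logic:

```python
import torch

def unwrap_state_dict(checkpoint):
    """Return the raw weights from the common checkpoint wrappers."""
    if isinstance(checkpoint, dict):
        if "model_state_dict" in checkpoint:
            return checkpoint["model_state_dict"]
        if "model" in checkpoint:
            return checkpoint["model"]
    return checkpoint

# Behaves the same whether the weights are wrapped or bare:
bare = {"lm_head.weight": torch.zeros(2, 2)}
assert unwrap_state_dict({"model_state_dict": bare}) is bare
assert unwrap_state_dict({"model": bare}) is bare
assert unwrap_state_dict(bare) is bare
```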
+     def generate_text(self, prompt: str, max_length: int = 100,
+                       temperature: float = 0.7, top_k: int = 50,
+                       top_p: float = 0.9) -> str:
+         """Generate text using the loaded real model"""
+         if not self.current_model or self.current_model not in self.models:
+             return "❌ No model loaded. Please select a model first."
+
+         try:
+             model = self.models[self.current_model]
+             tokenizer = self.tokenizers[self.current_model]
+
+             # Tokenize input
+             input_ids = tokenizer.encode(prompt)
+             input_tensor = torch.tensor([input_ids], dtype=torch.long)
+
+             logger.info(f"🎯 Generating text with prompt: '{prompt[:50]}...'")
+             logger.info(f"📊 Parameters: max_length={max_length}, temperature={temperature}, top_k={top_k}, top_p={top_p}")
+
+             # Generate text
+             with torch.no_grad():
+                 output_ids = model.generate(
+                     input_tensor,
+                     max_new_tokens=max_length,
+                     temperature=temperature,
+                     top_k=top_k,
+                     top_p=top_p,
+                     do_sample=True
+                 )
+
+             # Decode output
+             generated_text = tokenizer.decode(output_ids[0].tolist())
+
+             # Remove the input prompt from the output
+             if generated_text.startswith(prompt):
+                 generated_text = generated_text[len(prompt):].strip()
+
+             logger.info(f"✅ Generated text: '{generated_text[:100]}...'")
+             return generated_text
+
+         except Exception as e:
+             error_msg = f"❌ Generation failed: {str(e)}"
+             logger.error(error_msg)
+             import traceback
+             logger.error(f"📋 Full traceback: {traceback.format_exc()}")
+             return error_msg
+
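
`generate_text` round-trips every request through SentencePiece: encode the prompt to ids, sample new ids, decode back to text. A sketch of that flow in isolation, assuming a `tokenizer.model` has been downloaded next to the checkpoint (the path is illustrative):

```python
import torch
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("temp_openllm-small-extended-9k/tokenizer.model")  # assumed path

ids = sp.encode("The history of artificial intelligence")
prompt_tensor = torch.tensor([ids], dtype=torch.long)  # shape (1, seq_len), as model.generate expects
print(prompt_tensor.shape)
print(sp.decode(ids))  # should closely reproduce the prompt text
```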
+ # Initialize the real inference engine
+ inference_engine = RealOpenLLMInference()
+
+ def load_model_info(model_id: str) -> str:
+     """Get information about a specific model"""
+     config = inference_engine.model_configs.get(model_id)
+     if config:
+         return f"**{config['name']}**\n\n{config['description']}\n\n**Parameters:** {config['parameters']}\n**Training Steps:** {config['training_steps']:,}"
+     return "❌ Model not found"
+
+ def generate_text_interface(model_id: str, prompt: str, max_length: int,
+                             temperature: float, top_k: int, top_p: float) -> str:
+     """Gradio interface function for text generation"""
+     try:
+         # Load model if not already loaded
+         if model_id not in inference_engine.models:
+             logger.info(f"🔄 Loading real model: {model_id}")
+             success = inference_engine.load_model_from_hf(model_id)
+             if not success:
+                 return f"❌ Failed to load real model: {model_id}"
+
+         # Generate text
+         result = inference_engine.generate_text(
+             prompt=prompt,
+             max_length=max_length,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p
+         )
+
+         return result
+
+     except Exception as e:
+         error_msg = f"❌ Error in generation interface: {str(e)}"
+         logger.error(error_msg)
+         return error_msg
+
+ # Create Gradio interface
+ def create_interface():
+     """Create the Gradio interface"""
+
+     with gr.Blocks(
+         title="🚀 OpenLLM Real Models Space",
+         theme=gr.themes.Soft()
+     ) as interface:
+
+         # Header
+         gr.Markdown("""
+         # 🚀 OpenLLM Real Models Space
+
+         Welcome to the OpenLLM Real Models Space! This interface uses **actual trained models** from Hugging Face.
+
+         ## 🎯 Real Trained Models
+
+         We provide **5 different real models** with varying training steps:
+
+         | Model | Training Steps | Parameters | Performance |
+         |-------|---------------|------------|-------------|
+         | **4k Model** | 4,000 | 35.8M | Early training stage |
+         | **6k Model** | 6,000 | 35.8M | Improved coherence (Perplexity: 816.040) |
+         | **7k Model** | 7,000 | 35.8M | Enhanced quality (Loss: 2.100, Perplexity: 8.200) |
+         | **8k Model** | 8,000 | 35.8M | Sophisticated understanding |
+         | **9k Model** | 9,000 | 35.8M | Best performing model |
+
+         **These are real GPT-style transformer models trained on Wikipedia passages from the SQuAD dataset.**
+
+         ---
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Model selection
+                 model_dropdown = gr.Dropdown(
+                     choices=list(inference_engine.model_configs.keys()),
+                     value="openllm-small-extended-9k",
+                     label="🎯 Select Model",
+                     info="Choose the real trained model to use"
+                 )
+
+                 # Model information display
+                 model_info = gr.Markdown(
+                     value=load_model_info("openllm-small-extended-9k"),
+                     label="📋 Model Information"
+                 )
+
+                 # Update model info when selection changes
+                 model_dropdown.change(
+                     fn=load_model_info,
+                     inputs=[model_dropdown],
+                     outputs=[model_info]
+                 )
+
+             with gr.Column(scale=2):
+                 # Input prompt
+                 prompt_input = gr.Textbox(
+                     lines=5,
+                     label="📝 Input Prompt",
+                     placeholder="Enter your text prompt here...",
+                     info="The text that will be used as input for generation"
+                 )
+
+                 # Generation parameters
+                 with gr.Row():
+                     max_length = gr.Slider(
+                         minimum=10,
+                         maximum=500,
+                         value=100,
+                         step=10,
+                         label="📏 Max Length",
+                         info="Maximum number of tokens to generate"
+                     )
+
+                     temperature = gr.Slider(
+                         minimum=0.1,
+                         maximum=2.0,
+                         value=0.7,
+                         step=0.1,
+                         label="🌡️ Temperature",
+                         info="Controls randomness (higher = more random)"
+                     )
+
+                 with gr.Row():
+                     top_k = gr.Slider(
+                         minimum=1,
+                         maximum=100,
+                         value=50,
+                         step=1,
+                         label="🔍 Top-K",
+                         info="Number of highest probability tokens to consider"
+                     )
+
+                     top_p = gr.Slider(
+                         minimum=0.1,
+                         maximum=1.0,
+                         value=0.9,
+                         step=0.1,
+                         label="📊 Top-P",
+                         info="Nucleus sampling parameter"
+                     )
+
+                 # Generate button
+                 generate_btn = gr.Button(
+                     "🚀 Generate Text",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 # Output
+                 output_text = gr.Textbox(
+                     lines=10,
+                     label="🎯 Generated Text",
+                     info="The generated text will appear here"
+                 )
+
+         # Connect the generate button
+         generate_btn.click(
+             fn=generate_text_interface,
+             inputs=[model_dropdown, prompt_input, max_length, temperature, top_k, top_p],
+             outputs=[output_text]
+         )
+
+         # Footer
+         gr.Markdown("""
+         ---
+
+         ## 🔧 Technical Details
+
+         - **Architecture**: GPT-style transformer decoder
+         - **Model Size**: Small (6 layers, 8 heads, 512 embedding dim)
+         - **Vocabulary**: 32k tokens (SentencePiece BPE)
+         - **Training Data**: Wikipedia passages from the SQuAD dataset
+         - **Framework**: PyTorch with real trained models
+         - **Gradio Version**: 4.44.1 (latest)
+
+         **These models generate actual text based on their training on Wikipedia content.**
+
+         **Model Sources:**
+         - [4k Model](https://huggingface.co/lemms/openllm-small-extended-4k)
+         - [6k Model](https://huggingface.co/lemms/openllm-small-extended-6k)
+         - [7k Model](https://huggingface.co/lemms/openllm-small-extended-7k)
+         - [8k Model](https://huggingface.co/lemms/openllm-small-extended-8k)
+         - [9k Model](https://huggingface.co/lemms/openllm-small-extended-9k)
+         """)
+
+     return interface
+
+ # Create and launch the interface
+ if __name__ == "__main__":
+     interface = create_interface()
+     interface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         debug=True
+     )
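
Once the Space is up, the click handler can also be exercised programmatically. A hedged sketch using `gradio_client` — the Space name here is an assumption, and the endpoint name (Gradio derives it from the wired function by default) may differ on the deployed app:

```python
from gradio_client import Client

client = Client("lemms/openllm")  # assumed Space name; adjust to the real one
result = client.predict(
    "openllm-small-extended-9k",        # model_id
    "The history of machine learning",  # prompt
    100,                                # max_length
    0.7,                                # temperature
    50,                                 # top_k
    0.9,                                # top_p
    api_name="/generate_text_interface",  # Gradio's default for this handler; verify via client.view_api()
)
print(result)
```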