lemms committed on
Commit 85c2681 · verified · 1 Parent(s): a67c931

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +426 -1361
app.py CHANGED
@@ -1,1361 +1,426 @@
1
- #!/usr/bin/env python3
2
- """
3
- OpenLLM Real Models App - Ultimate Working Version with Correct lm_head Bias Handling
4
-
5
- This is the FINAL WORKING VERSION of the OpenLLM Real Models inference application that has been
6
- extensively debugged and optimized to correctly load and run the actual trained OpenLLM models
7
- from Hugging Face Hub.
8
-
9
- CRITICAL ARCHITECTURE MATCHING:
10
- - The GPT model architecture EXACTLY matches the saved state_dict from the trained models
11
- - All layer naming conventions use the 'transformer.' prefix (wte, wpe, h, ln_f)
12
- - Custom transformer blocks (Block, CausalSelfAttention, MLP) replace generic nn.TransformerEncoderLayer
13
- - Attention bias is correctly handled as causal attention masks (register_buffer) not learnable parameters
14
- - Language model head (lm_head) uses bias=False to match the saved model's architecture
15
- - All attribute naming conflicts have been resolved (use_bias vs bias)
16
-
17
- MODEL LOADING PROCESS:
18
- 1. Download model files from Hugging Face Hub using snapshot_download
19
- 2. Parse config.json to extract model configuration parameters
20
- 3. Create GPTConfig object with exact parameter matching
21
- 4. Initialize GPT model with custom architecture
22
- 5. Load state_dict from best_model.pt (handles model_state_dict wrapper)
23
- 6. Load SentencePiece tokenizer from tokenizer.model
24
- 7. Set model to evaluation mode for inference
25
-
26
- TEXT GENERATION FEATURES:
27
- - Real-time text generation using actual trained model weights
28
- - Configurable generation parameters (temperature, top_k, top_p, max_length)
29
- - Proper tokenization and detokenization using SentencePiece
30
- - Causal language modeling with attention masking
31
- - Support for all 5 model variants (4k, 6k, 7k, 8k, 9k training steps)
32
-
33
- TECHNICAL IMPLEMENTATION DETAILS:
34
- - PyTorch-based transformer architecture with custom attention implementation
35
- - Gradio web interface for user-friendly model interaction
36
- - Comprehensive error handling and logging throughout the pipeline
37
- - Memory-efficient model loading with CPU-only inference
38
- - Real-time model switching between different training checkpoints
39
-
40
- AUTHOR: Louis Chua Bean Chong
41
- PROJECT: OpenLLM - Open Source Large Language Model Framework
42
- LICENSE: GPLv3 - Open Source First Philosophy
43
- """
44
-
45
- import gradio as gr
46
- import torch
47
- import torch.nn as nn
48
- import torch.nn.functional as F
49
- import json
50
- import logging
51
- import sentencepiece as spm
52
- import math
53
- from pathlib import Path
54
- from typing import Dict, Any, Optional
55
- from huggingface_hub import snapshot_download
56
-
57
- # Set up comprehensive logging for debugging and monitoring
58
- logging.basicConfig(level=logging.INFO)
59
- logger = logging.getLogger(__name__)
60
-
61
- class GPTConfig:
62
- """
63
- GPT Model Configuration Class - Handles All Model Architecture Parameters
64
-
65
- This class defines the complete configuration for the GPT-style transformer model,
66
- including all architectural parameters that determine the model's size, capacity,
67
- and behavior. It accepts additional kwargs to handle any extra configuration
68
- fields that might be present in the saved model's config.json file.
69
-
70
- CRITICAL PARAMETERS:
71
- - vocab_size: Size of the vocabulary (32,000 for OpenLLM models)
72
- - n_layer: Number of transformer layers (6 for small models)
73
- - n_head: Number of attention heads (8 for small models)
74
- - n_embd: Embedding dimension (512 for small models)
75
- - block_size: Maximum sequence length (1024 tokens)
76
- - dropout: Dropout rate for regularization (0.1)
77
- - bias: Whether to use bias terms in linear layers (True)
78
-
79
- ARCHITECTURE NOTES:
80
- - Small model configuration: 6 layers, 8 heads, 512 dims = 35.8M parameters
81
- - This matches the exact architecture used during training
82
- - All parameters are carefully tuned for the SQuAD dataset training
83
- """
84
- def __init__(self, vocab_size=32000, n_layer=6, n_head=8, n_embd=512,
85
- block_size=1024, dropout=0.1, bias=True, **kwargs):
86
- # Accept any additional kwargs to handle extra config fields from saved models
87
- # This is crucial for loading models that may have additional metadata
88
- self.vocab_size = vocab_size
89
- self.n_layer = n_layer
90
- self.n_head = n_head
91
- self.n_embd = n_embd
92
- self.block_size = block_size
93
- self.dropout = dropout
94
- self.bias = bias
95
-
96
- class GPT(nn.Module):
97
- """
98
- GPT-Style Transformer Model - EXACT Architecture Matching the Saved Model
99
-
100
- This is the core transformer model that EXACTLY matches the architecture of the
101
- trained OpenLLM models. Every layer, every parameter, and every naming convention
102
- has been carefully designed to match the saved state_dict from the training process.
103
-
104
- ARCHITECTURE COMPONENTS:
105
- - transformer.wte: Word token embeddings (vocab_size -> n_embd)
106
- - transformer.wpe: Position embeddings (block_size -> n_embd)
107
- - transformer.drop: Dropout layer for regularization
108
- - transformer.h: List of transformer blocks (n_layer count)
109
- - transformer.ln_f: Final layer normalization
110
- - lm_head: Language model head (n_embd -> vocab_size, NO bias)
111
-
112
- CRITICAL DESIGN DECISIONS:
113
- - Uses nn.ModuleDict for transformer components to match 'transformer.' prefix
114
- - Custom Block, CausalSelfAttention, and MLP classes for exact architecture
115
- - lm_head.bias = False to match saved model (no bias term)
116
- - Proper weight initialization following GPT-style conventions
117
- - Causal attention masking for autoregressive generation
118
-
119
- FORWARD PASS:
120
- - Combines token and position embeddings
121
- - Processes through transformer blocks with residual connections
122
- - Applies final layer normalization
123
- - Projects to vocabulary space for next-token prediction
124
-
125
- GENERATION:
126
- - Autoregressive text generation with temperature, top-k, and top-p sampling
127
- - Causal attention ensures tokens only attend to previous tokens
128
- - Configurable generation parameters for different text styles
129
- """
130
- def __init__(self, config):
131
- super().__init__()
132
- # Validate critical configuration parameters
133
- assert config.vocab_size is not None, "vocab_size must be specified"
134
- assert config.block_size is not None, "block_size must be specified"
135
- self.config = config
136
-
137
- # Create the transformer module with the EXACT naming convention from saved model
138
- # This nn.ModuleDict structure is crucial for matching the 'transformer.' prefix
139
- # in the saved state_dict keys (transformer.wte.weight, transformer.wpe.weight, etc.)
140
- self.transformer = nn.ModuleDict(dict(
141
- wte = nn.Embedding(config.vocab_size, config.n_embd), # Word token embeddings
142
- wpe = nn.Embedding(config.block_size, config.n_embd), # Position embeddings
143
- drop = nn.Dropout(config.dropout), # Dropout for regularization
144
- h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # Transformer blocks
145
- ln_f = nn.LayerNorm(config.n_embd), # Final layer normalization
146
- ))
147
-
148
- # Language model head - CRITICAL: NO bias to match saved model architecture
149
- # The saved models were trained without bias in the language model head
150
- # This is a common practice in transformer language models for efficiency
151
- self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
152
-
153
- # Initialize weights using GPT-style initialization
154
- # This ensures proper weight scaling and prevents gradient issues
155
- self.apply(self._init_weights)
156
- for pn, p in self.named_parameters():
157
- if pn.endswith('c_proj.weight'):
158
- # Special initialization for projection layers in transformer blocks
159
- torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
160
-
161
- def _init_weights(self, module):
162
- """
163
- GPT-Style Weight Initialization for All Model Components
164
-
165
- This function applies the standard GPT weight initialization strategy:
166
- - Linear layers: Normal distribution with mean=0, std=0.02
167
- - Embeddings: Normal distribution with mean=0, std=0.02
168
- - Bias terms: Zero initialization (when present)
169
-
170
- This initialization scheme has been proven effective for transformer models
171
- and helps with training stability and convergence.
172
- """
173
- if isinstance(module, nn.Linear):
174
- torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
175
- if module.bias is not None:
176
- torch.nn.init.zeros_(module.bias)
177
- elif isinstance(module, nn.Embedding):
178
- torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
179
-
180
- def forward(self, idx, targets=None):
181
- """
182
- Forward Pass Through the Complete Transformer Model
183
-
184
- This is the main inference function that processes input tokens through
185
- the entire transformer architecture to produce logits for next-token prediction.
186
-
187
- ARGUMENTS:
188
- - idx: Input token indices (batch_size, sequence_length)
189
- - targets: Target token indices for training (optional, for loss computation)
190
-
191
- PROCESSING STEPS:
192
- 1. Extract sequence length and validate against block_size
193
- 2. Create position indices for positional encoding
194
- 3. Look up token and position embeddings
195
- 4. Combine embeddings and apply dropout
196
- 5. Process through all transformer blocks
197
- 6. Apply final layer normalization
198
- 7. Project to vocabulary space via language model head
199
-
200
- RETURNS:
201
- - logits: Predicted token probabilities (batch_size, seq_len, vocab_size)
202
- - loss: Cross-entropy loss (only if targets provided)
203
-
204
- NOTE: During inference (targets=None), only the last token's logits are returned
205
- for efficient autoregressive generation.
206
- """
207
- device = idx.device
208
- b, t = idx.size()
209
- # Validate sequence length against model's maximum block size
210
- assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
211
-
212
- # Create position indices for positional encoding
213
- # This enables the model to understand token positions in the sequence
214
- pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
215
-
216
- # Look up embeddings for tokens and positions
217
- tok_emb = self.transformer.wte(idx) # Token embeddings
218
- pos_emb = self.transformer.wpe(pos) # Position embeddings
219
-
220
- # Combine embeddings and apply dropout for regularization
221
- x = self.transformer.drop(tok_emb + pos_emb)
222
-
223
- # Process through all transformer blocks with residual connections
224
- for block in self.transformer.h:
225
- x = block(x)
226
-
227
- # Apply final layer normalization
228
- x = self.transformer.ln_f(x)
229
-
230
- # Project to vocabulary space for next-token prediction
231
- if targets is not None:
232
- # Training mode: compute loss for all positions
233
- logits = self.lm_head(x)
234
- loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
235
- else:
236
- # Inference mode: only compute logits for the last token (efficient generation)
237
- logits = self.lm_head(x[:, [-1], :])
238
- loss = None
239
-
240
- return logits, loss
241
-
242
- def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, top_p=None, do_sample=True):
243
- """
244
- Autoregressive Text Generation with Advanced Sampling Strategies
245
-
246
- This function generates text by repeatedly predicting the next token
247
- using the trained model, with configurable sampling parameters for
248
- controlling the creativity and coherence of the generated text.
249
-
250
- GENERATION PROCESS:
251
- 1. For each new token to generate:
252
- a. Forward pass through model to get logits for next token
253
- b. Apply temperature scaling to control randomness
254
- c. Apply top-k filtering to limit vocabulary choices
255
- d. Apply top-p (nucleus) sampling for dynamic vocabulary selection
256
- e. Sample next token from filtered probability distribution
257
- f. Append to sequence and repeat
258
-
259
- SAMPLING PARAMETERS:
260
- - temperature: Controls randomness (higher = more random, lower = more focused)
261
- - top_k: Limits vocabulary to k highest probability tokens
262
- - top_p: Nucleus sampling - limits to tokens with cumulative probability <= p
263
- - do_sample: Whether to sample (True) or use greedy decoding (False)
264
-
265
- ATTENTION HANDLING:
266
- - Uses causal attention masking to ensure tokens only attend to previous tokens
267
- - Automatically handles sequence length limits via block_size
268
- - Efficient autoregressive generation with minimal memory usage
269
-
270
- RETURNS:
271
- - Complete token sequence including input and generated tokens
272
- """
273
- for _ in range(max_new_tokens):
274
- # Ensure sequence doesn't exceed model's block size
275
- idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
276
-
277
- # Forward pass to get logits for next token
278
- logits, _ = self(idx_cond)
279
- logits = logits[:, -1, :] / temperature # Apply temperature scaling
280
-
281
- # Top-k filtering: keep only the k highest probability tokens
282
- if top_k is not None:
283
- v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
284
- logits[logits < v[:, [-1]]] = -float('Inf')
285
-
286
- # Top-p (nucleus) sampling: keep tokens with cumulative probability <= top_p
287
- if top_p is not None:
288
- sorted_logits, sorted_indices = torch.sort(logits, descending=True)
289
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
290
- sorted_indices_to_remove = cumulative_probs > top_p
291
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
292
- sorted_indices_to_remove[..., 0] = 0
293
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
294
- logits[indices_to_remove] = -float('Inf')
295
-
296
- # Convert logits to probabilities and sample next token
297
- probs = F.softmax(logits, dim=-1)
298
- if do_sample:
299
- # Stochastic sampling for creative text generation
300
- idx_next = torch.multinomial(probs, num_samples=1)
301
- else:
302
- # Greedy decoding for deterministic generation
303
- _, idx_next = torch.topk(probs, k=1, dim=-1)
304
-
305
- # Append new token to sequence
306
- idx = torch.cat((idx, idx_next), dim=1)
307
-
308
- return idx
309
-
310
- class Block(nn.Module):
311
- """
312
- Transformer Block - Core Building Block of the GPT Architecture
313
-
314
- Each transformer block implements the standard transformer architecture with:
315
- - Multi-head self-attention mechanism for capturing token relationships
316
- - Feed-forward neural network for processing attention outputs
317
- - Layer normalization for training stability
318
- - Residual connections for gradient flow
319
-
320
- ARCHITECTURE:
321
- - ln_1: Pre-attention layer normalization
322
- - attn: Multi-head causal self-attention
323
- - ln_2: Pre-feedforward layer normalization
324
- - mlp: Multi-layer perceptron (feed-forward network)
325
-
326
- RESIDUAL CONNECTIONS:
327
- - x = x + attn(ln_1(x)) # Residual connection around attention
328
- - x = x + mlp(ln_2(x)) # Residual connection around feed-forward
329
-
330
- DESIGN RATIONALE:
331
- - Layer normalization is applied BEFORE each sublayer (pre-norm)
332
- - This improves training stability and allows deeper networks
333
- - Residual connections help with gradient flow during backpropagation
334
- - The combination enables effective training of very deep transformer models
335
- """
336
- def __init__(self, config):
337
- super().__init__()
338
- self.ln_1 = nn.LayerNorm(config.n_embd) # Pre-attention normalization
339
- self.attn = CausalSelfAttention(config) # Multi-head causal attention
340
- self.ln_2 = nn.LayerNorm(config.n_embd) # Pre-feedforward normalization
341
- self.mlp = MLP(config) # Feed-forward network
342
-
343
- def forward(self, x):
344
- """
345
- Forward Pass Through a Single Transformer Block
346
-
347
- This implements the standard transformer block computation with
348
- pre-norm layer normalization and residual connections.
349
-
350
- PROCESSING STEPS:
351
- 1. Apply layer normalization to input
352
- 2. Process through multi-head self-attention
353
- 3. Add residual connection (x + attention_output)
354
- 4. Apply layer normalization to result
355
- 5. Process through feed-forward network
356
- 6. Add residual connection (x + feedforward_output)
357
-
358
- ARGUMENTS:
359
- - x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
360
-
361
- RETURNS:
362
- - Output tensor of same shape as input
363
- """
364
- # First sublayer: self-attention with residual connection
365
- x = x + self.attn(self.ln_1(x))
366
- # Second sublayer: feed-forward with residual connection
367
- x = x + self.mlp(self.ln_2(x))
368
- return x
369
-
370
- class CausalSelfAttention(nn.Module):
371
- """
372
- Multi-Head Causal Self-Attention - ULTIMATE WORKING VERSION
373
-
374
- This is the FINAL WORKING VERSION of the attention mechanism that correctly
375
- handles the causal attention bias as a buffer (not a learnable parameter).
376
- This was a critical fix that resolved the state_dict loading issues.
377
-
378
- ATTENTION MECHANISM:
379
- - Multi-head attention allows the model to attend to different parts of the sequence
380
- - Causal masking ensures tokens can only attend to previous tokens (autoregressive)
381
- - Query, Key, Value projections from the same input sequence
382
- - Scaled dot-product attention with optional dropout
383
-
384
- CRITICAL FIXES IMPLEMENTED:
385
- - Attention bias is correctly handled as a causal mask buffer (register_buffer)
386
- - Attribute naming conflict resolved (use_bias vs bias)
387
- - Proper attention mask application in forward pass
388
- - Exact matching with saved model's attention architecture
389
-
390
- ARCHITECTURE COMPONENTS:
391
- - c_attn: Combined QKV projection (n_embd -> 3*n_embd)
392
- - c_proj: Output projection (n_embd -> n_embd)
393
- - attn_dropout: Dropout for attention weights
394
- - resid_dropout: Dropout for output projection
395
- - bias: Causal attention mask (registered as buffer, not parameter)
396
-
397
- ATTENTION COMPUTATION:
398
- 1. Project input to Q, K, V vectors
399
- 2. Reshape for multi-head attention
400
- 3. Apply scaled dot-product attention with causal masking
401
- 4. Reshape back to original dimensions
402
- 5. Apply output projection with dropout
403
- """
404
- def __init__(self, config):
405
- super().__init__()
406
- # Validate that embedding dimension is divisible by number of heads
407
- assert config.n_embd % config.n_head == 0, "Embedding dimension must be divisible by number of heads"
408
-
409
- # Attention projections
410
- self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias) # QKV projection
411
- self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) # Output projection
412
-
413
- # Dropout layers for regularization
414
- self.attn_dropout = nn.Dropout(config.dropout) # Attention weight dropout
415
- self.resid_dropout = nn.Dropout(config.dropout) # Output dropout
416
-
417
- # Store configuration parameters
418
- self.n_head = config.n_head # Number of attention heads
419
- self.n_embd = config.n_embd # Embedding dimension
420
- self.dropout = config.dropout # Dropout rate
421
- self.use_bias = config.bias # Use different name for the boolean flag to avoid conflicts
422
-
423
- # CRITICAL FIX: REGISTER THE ATTENTION BIAS as a buffer (not parameter)
424
- # This is actually an attention mask, not a learnable bias
425
- # The saved model stores this as 'bias' in the state_dict
426
- if config.bias:
427
- # Create a causal attention mask buffer
428
- # This is a lower triangular matrix that prevents tokens from attending to future tokens
429
- mask = torch.tril(torch.ones(config.block_size, config.block_size))
430
- mask = mask.view(1, 1, config.block_size, config.block_size)
431
- self.register_buffer('bias', mask) # This matches the saved model's 'bias' key
432
- else:
433
- self.register_buffer('bias', None)
434
-
435
- def forward(self, x):
436
- """
437
- Forward Pass Through Multi-Head Causal Self-Attention
438
-
439
- This function implements the complete attention mechanism including:
440
- - Query, Key, Value computation from input
441
- - Multi-head attention with causal masking
442
- - Output projection and dropout
443
-
444
- ATTENTION STEPS:
445
- 1. Project input to Q, K, V vectors (combined projection for efficiency)
446
- 2. Reshape for multi-head attention (separate heads)
447
- 3. Apply scaled dot-product attention with causal masking
448
- 4. Reshape back to original dimensions
449
- 5. Apply output projection with dropout
450
-
451
- ARGUMENTS:
452
- - x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
453
-
454
- RETURNS:
455
- - Output tensor of same shape as input
456
- """
457
- B, T, C = x.size() # Batch size, sequence length, embedding dimension
458
-
459
- # Calculate query, key, values for all heads
460
- # This is an efficient combined projection that creates Q, K, V in one operation
461
- q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
462
-
463
- # Reshape for multi-head attention
464
- # Each head gets a subset of the embedding dimension
465
- k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
466
- q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
467
- v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
468
-
469
- # Causal self-attention using the bias mask
470
- if self.bias is not None:
471
- # Use the causal mask - this prevents tokens from attending to future tokens
472
- # The mask is a lower triangular matrix where mask[i,j] = 1 if i >= j, 0 otherwise
473
- attn_mask = self.bias[:, :, :T, :T].bool() # Extract mask for current sequence length; cast to bool so scaled_dot_product_attention treats it as a mask rather than adding it to the scores
474
- y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask,
475
- dropout_p=self.dropout if self.training else 0,
476
- is_causal=False) # We provide our own mask
477
- else:
478
- # Use built-in causal attention (alternative approach)
479
- y = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
480
- dropout_p=self.dropout if self.training else 0,
481
- is_causal=True)
482
-
483
- # Reshape back to original dimensions
484
- y = y.transpose(1, 2).contiguous().view(B, T, C)
485
-
486
- # Output projection with dropout
487
- y = self.resid_dropout(self.c_proj(y))
488
- return y
489
-
490
- class MLP(nn.Module):
491
- """
492
- Multi-Layer Perceptron - Feed-Forward Network in Transformer Blocks
493
-
494
- The MLP is the feed-forward component of each transformer block, consisting of:
495
- - Two linear transformations with a GELU activation in between
496
- - Dropout for regularization
497
- - Optional bias terms (controlled by config.bias)
498
-
499
- ARCHITECTURE:
500
- - c_fc: First linear layer (n_embd -> 4*n_embd) - expansion
501
- - gelu: GELU activation function
502
- - c_proj: Second linear layer (4*n_embd -> n_embd) - projection
503
- - dropout: Dropout layer for regularization
504
-
505
- DESIGN RATIONALE:
506
- - The 4x expansion factor is standard in transformer architectures
507
- - GELU activation provides smooth gradients and good performance
508
- - Dropout prevents overfitting during training
509
- - The combination allows the model to learn complex non-linear transformations
510
-
511
- MATHEMATICAL OPERATION:
512
- - x = dropout(linear2(gelu(linear1(x))))
513
- - This creates a powerful non-linear transformation for each token
514
- """
515
- def __init__(self, config):
516
- super().__init__()
517
- # First linear layer: expand embedding dimension by 4x
518
- self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
519
- # GELU activation function (commonly used in transformers)
520
- self.gelu = nn.GELU()
521
- # Second linear layer: project back to original embedding dimension
522
- self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
523
- # Dropout for regularization
524
- self.dropout = nn.Dropout(config.dropout)
525
-
526
- def forward(self, x):
527
- """
528
- Forward Pass Through the Multi-Layer Perceptron
529
-
530
- This implements the standard feed-forward computation in transformer blocks:
531
- 1. Expand dimension with first linear layer
532
- 2. Apply GELU activation
533
- 3. Project back to original dimension
534
- 4. Apply dropout for regularization
535
-
536
- ARGUMENTS:
537
- - x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
538
-
539
- RETURNS:
540
- - Output tensor of same shape as input
541
- """
542
- x = self.c_fc(x) # Expand: n_embd -> 4*n_embd
543
- x = self.gelu(x) # Apply GELU activation
544
- x = self.c_proj(x) # Project: 4*n_embd -> n_embd
545
- x = self.dropout(x) # Apply dropout for regularization
546
- return x
547
-
548
- class RealOpenLLMInference:
549
- """
550
- Real OpenLLM Inference Engine - Loads and Runs Actual Trained Models
551
-
552
- This is the core inference engine that handles the complete pipeline for loading
553
- and running the actual trained OpenLLM models from Hugging Face Hub. It provides
554
- a unified interface for model management, text generation, and parameter control.
555
-
556
- KEY FEATURES:
557
- - Dynamic model loading from Hugging Face Hub repositories
558
- - Support for all 5 model variants (4k, 6k, 7k, 8k, 9k training steps)
559
- - Comprehensive error handling and logging
560
- - Memory-efficient model management
561
- - Real-time model switching capabilities
562
-
563
- MODEL CONFIGURATIONS:
564
- - Each model has specific training characteristics and performance metrics
565
- - Models are trained on Wikipedia passages from the SQuAD dataset
566
- - Architecture: 6 layers, 8 heads, 512 embedding dim, 35.8M parameters
567
- - Vocabulary: 32k tokens using SentencePiece BPE tokenization
568
-
569
- TECHNICAL IMPLEMENTATION:
570
- - Uses huggingface_hub.snapshot_download for efficient model downloading
571
- - Handles various checkpoint formats (model_state_dict, direct state_dict)
572
- - Supports multiple model file formats (best_model.pt, model.pt, pytorch_model.bin)
573
- - Implements robust config parsing with fallback defaults
574
- - Provides detailed logging for debugging and monitoring
575
-
576
- MEMORY MANAGEMENT:
577
- - Models are loaded on-demand to conserve memory
578
- - Supports multiple models in memory simultaneously
579
- - Automatic cleanup of temporary download directories
580
- - CPU-only inference for compatibility and stability
581
- """
582
-
583
- def __init__(self):
584
- """
585
- Initialize the Real OpenLLM Inference Engine
586
-
587
- Sets up the inference engine with model configurations, storage containers,
588
- and logging infrastructure. This is the entry point for all model operations.
589
-
590
- INITIALIZATION COMPONENTS:
591
- - models: Dictionary to store loaded model instances
592
- - tokenizers: Dictionary to store loaded tokenizer instances
593
- - current_model: Tracks the currently active model
594
- - model_configs: Complete configuration for all available models
595
-
596
- MODEL CONFIGURATIONS INCLUDED:
597
- - 4k model: Early training stage, basic language understanding
598
- - 6k model: Improved coherence, better text generation
599
- - 7k model: Enhanced quality with lower perplexity
600
- - 8k model: Sophisticated understanding and reasoning
601
- - 9k model: Best performing model with highest quality output
602
- """
603
- # Storage containers for loaded models and tokenizers
604
- self.models = {} # Dictionary: model_id -> GPT model instance
605
- self.tokenizers = {} # Dictionary: model_id -> SentencePiece tokenizer
606
- self.current_model = None # Currently active model ID
607
-
608
- # Complete configuration for all available real models from Hugging Face
609
- # Each model has specific training characteristics and performance metrics
610
- self.model_configs = {
611
- "openllm-small-extended-4k": {
612
- "name": "OpenLLM Small (4k steps)",
613
- "description": "Real model trained for 4,000 steps - Early training stage with basic language understanding and simple text generation capabilities. This model represents the initial learning phase where the model begins to understand basic language patterns.",
614
- "hf_repo": "lemms/openllm-small-extended-4k",
615
- "training_steps": 4000,
616
- "parameters": "35.8M"
617
- },
618
- "openllm-small-extended-6k": {
619
- "name": "OpenLLM Small (6k steps)",
620
- "description": "Real model trained for 6,000 steps - Improved coherence and better text generation quality. This model shows significant improvement in understanding context and generating more coherent text sequences. Perplexity: 816.040 indicates substantial learning progress.",
621
- "hf_repo": "lemms/openllm-small-extended-6k",
622
- "training_steps": 6000,
623
- "parameters": "35.8M"
624
- },
625
- "openllm-small-extended-7k": {
626
- "name": "OpenLLM Small (7k steps)",
627
- "description": "Real model trained for 7,000 steps - Enhanced quality with significantly improved text generation. This model demonstrates much better language understanding with Loss: 2.100 and Perplexity: 8.200, showing excellent training convergence.",
628
- "hf_repo": "lemms/openllm-small-extended-7k",
629
- "training_steps": 7000,
630
- "parameters": "35.8M"
631
- },
632
- "openllm-small-extended-8k": {
633
- "name": "OpenLLM Small (8k steps)",
634
- "description": "Real model trained for 8,000 steps - Sophisticated understanding and advanced reasoning capabilities. This model shows deep comprehension of complex language patterns and can generate high-quality, contextually appropriate text.",
635
- "hf_repo": "lemms/openllm-small-extended-8k",
636
- "training_steps": 8000,
637
- "parameters": "35.8M"
638
- },
639
- "openllm-small-extended-9k": {
640
- "name": "OpenLLM Small (9k steps)",
641
- "description": "Real model trained for 9,000 steps - Best performing model with highest quality output. This represents the pinnacle of training for the small model architecture, offering the most sophisticated language understanding and generation capabilities.",
642
- "hf_repo": "lemms/openllm-small-extended-9k",
643
- "training_steps": 9000,
644
- "parameters": "35.8M"
645
- }
646
- }
647
-
648
- # Initialize logging to track engine startup
649
- logger.info("🚀 Real OpenLLM Inference Engine initialized with comprehensive model support")
650
-
651
- def load_model_from_hf(self, model_id: str) -> bool:
652
- """
653
- Load a Real Model from Hugging Face Hub
654
-
655
- This is the main entry point for loading models from Hugging Face Hub.
656
- It handles the complete pipeline from repository identification to model
657
- initialization, including downloading, configuration parsing, and setup.
658
-
659
- LOADING PROCESS:
660
- 1. Validate model_id against available configurations
661
- 2. Download model files from Hugging Face Hub
662
- 3. Parse model configuration and architecture
663
- 4. Initialize GPT model with exact architecture matching
664
- 5. Load trained weights from checkpoint file
665
- 6. Initialize SentencePiece tokenizer
666
- 7. Set model to evaluation mode for inference
667
-
668
- ERROR HANDLING:
669
- - Validates model_id existence before processing
670
- - Handles network errors during download
671
- - Manages file format variations and parsing issues
672
- - Provides detailed error messages for debugging
673
-
674
- ARGUMENTS:
675
- - model_id: String identifier for the model (e.g., "openllm-small-extended-9k")
676
-
677
- RETURNS:
678
- - bool: True if model loaded successfully, False otherwise
679
-
680
- SIDE EFFECTS:
681
- - Downloads model files to temporary directory
682
- - Stores model and tokenizer in internal dictionaries
683
- - Sets current_model to loaded model_id
684
- - Logs detailed progress information
685
- """
686
- try:
687
- # Validate that the requested model exists in our configuration
688
- config = self.model_configs.get(model_id)
689
- if not config:
690
- logger.error(f"❌ Unknown model ID: {model_id} - not found in available configurations")
691
- return False
692
-
693
- logger.info(f"📥 Loading real model from HF: {config['hf_repo']}")
694
-
695
- # Download model files from Hugging Face Hub
696
- # This uses the efficient snapshot_download function that handles caching
697
- # and only downloads files that don't already exist locally
698
- local_dir = snapshot_download(
699
- repo_id=config['hf_repo'],
700
- repo_type="model",
701
- local_dir=f"temp_{model_id}",
702
- allow_patterns=["*.pt", "*.json", "*.model", "*.bin"] # Only download necessary files
703
- )
704
-
705
- logger.info(f"✅ Downloaded model to: {local_dir}")
706
-
707
- # Load model and tokenizer from the downloaded directory
708
- # This is the core loading function that handles all the technical details
709
- success = self._load_model_and_tokenizer(local_dir, model_id)
710
- if success:
711
- # Update current model tracking
712
- self.current_model = model_id
713
- logger.info(f"✅ Successfully loaded real model: {model_id}")
714
- return True
715
- else:
716
- logger.error(f"❌ Failed to load model and tokenizer for: {model_id}")
717
- return False
718
-
719
- except Exception as e:
720
- # Comprehensive error handling for all potential issues
721
- logger.error(f"❌ Failed to load real model from HF {model_id}: {e}")
722
- return False
723
-
724
- def _load_model_and_tokenizer(self, model_dir: str, model_id: str) -> bool:
725
- """
726
- Load Model and Tokenizer from Local Directory - Core Loading Function
727
-
728
- This is the core function that handles the technical details of loading
729
- the model architecture, weights, and tokenizer from the downloaded files.
730
- It implements robust error handling and supports multiple file formats.
731
-
732
- LOADING STEPS:
733
- 1. Parse config.json to extract model architecture parameters
734
- 2. Create GPTConfig object with exact parameter matching
735
- 3. Initialize GPT model with custom architecture
736
- 4. Load state_dict from checkpoint file (handles multiple formats)
737
- 5. Load SentencePiece tokenizer from tokenizer.model
738
- 6. Set model to evaluation mode for inference
739
-
740
- CONFIGURATION HANDLING:
741
- - Supports both direct config and nested model_config structures
742
- - Filters parameters to only include expected GPTConfig fields
743
- - Provides fallback defaults for missing configuration files
744
- - Handles extra configuration fields gracefully
745
-
746
- CHECKPOINT FORMATS SUPPORTED:
747
- - model_state_dict: Standard PyTorch training checkpoint format
748
- - model: Alternative checkpoint key for model weights
749
- - Direct state_dict: Raw model weights without wrapper
750
- - Multiple file formats: best_model.pt, model.pt, pytorch_model.bin
751
-
752
- ERROR HANDLING:
753
- - Validates file existence before processing
754
- - Handles missing configuration files with defaults
755
- - Manages state_dict key mismatches and format variations
756
- - Provides detailed error messages and file listings
757
-
758
- ARGUMENTS:
759
- - model_dir: Path to directory containing model files
760
- - model_id: String identifier for the model being loaded
761
-
762
- RETURNS:
763
- - bool: True if loading successful, False otherwise
764
-
765
- SIDE EFFECTS:
766
- - Stores loaded model in self.models[model_id]
767
- - Stores loaded tokenizer in self.tokenizers[model_id]
768
- - Logs detailed progress and error information
769
- """
770
- try:
771
- model_path = Path(model_dir)
772
-
773
- # STEP 1: Load and parse model configuration
774
- # The config.json file contains all the architectural parameters
775
- config_file = model_path / "config.json"
776
- if config_file.exists():
777
- # Load configuration data from JSON file
778
- with open(config_file, 'r') as f:
779
- config_data = json.load(f)
780
-
781
- logger.info(f"📋 Config data keys: {list(config_data.keys())}")
782
-
783
- # Handle different config structures that might be present
784
- # Some models store config in a nested 'model_config' section
785
- if 'model_config' in config_data:
786
- # Extract model_config section for the actual model parameters
787
- model_config_data = config_data['model_config']
788
- logger.info("🔧 Using nested model_config structure")
789
- else:
790
- # Use the entire config as model config (direct structure)
791
- model_config_data = config_data
792
- logger.info("🔧 Using direct config structure")
793
-
794
- # Create GPTConfig with only the expected parameters
795
- # This filters out any extra fields that might cause issues
796
- expected_params = {
797
- 'vocab_size', 'n_layer', 'n_head', 'n_embd',
798
- 'block_size', 'dropout', 'bias'
799
- }
800
-
801
- config_kwargs = {}
802
- for key, value in model_config_data.items():
803
- if key in expected_params:
804
- config_kwargs[key] = value
805
-
806
- logger.info(f"🔧 Using config parameters: {config_kwargs}")
807
- model_config = GPTConfig(**config_kwargs)
808
- else:
809
- # Fallback to default configuration if config file is missing
810
- # This ensures the system can still work with incomplete model files
811
- logger.warning(f"⚠️ Config file not found, using default configuration")
812
- model_config = GPTConfig(
813
- vocab_size=32000,
814
- n_layer=6,
815
- n_head=8,
816
- n_embd=512,
817
- block_size=1024,
818
- dropout=0.1,
819
- bias=True
820
- )
821
-
822
- # STEP 2: Load model weights from checkpoint file
823
- # Try multiple possible file names and formats
824
- model_file = model_path / "best_model.pt"
825
- if not model_file.exists():
826
- model_file = model_path / "model.pt"
827
- if not model_file.exists():
828
- model_file = model_path / "pytorch_model.bin"
829
-
830
- if model_file.exists():
831
- logger.info(f"📦 Loading model from: {model_file}")
832
-
833
- # Initialize GPT model with the parsed configuration
834
- model = GPT(model_config)
835
-
836
- # Load checkpoint data from file
837
- checkpoint = torch.load(model_file, map_location='cpu')
838
-
839
- # Handle different checkpoint formats that might be present
840
- if isinstance(checkpoint, dict):
841
- if 'model_state_dict' in checkpoint:
842
- # Standard PyTorch training checkpoint format
843
- state_dict = checkpoint['model_state_dict']
844
- logger.info(f"📋 Loading from model_state_dict with {len(state_dict)} keys")
845
- elif 'model' in checkpoint:
846
- # Alternative checkpoint key for model weights
847
- state_dict = checkpoint['model']
848
- logger.info(f"📋 Loading from model with {len(state_dict)} keys")
849
- else:
850
- # Try to load directly as state dict
851
- state_dict = checkpoint
852
- logger.info(f"📋 Loading direct state dict with {len(state_dict)} keys")
853
- else:
854
- # Direct state dict (no wrapper dictionary)
855
- state_dict = checkpoint
856
- logger.info(f"📋 Loading direct state dict with {len(state_dict)} keys")
857
-
858
- # Load the state dict into the model
859
- # This is where the architecture matching is critical
860
- model.load_state_dict(state_dict)
861
-
862
- # Set model to evaluation mode for inference
863
- model.eval()
864
-
865
- # Store the loaded model in our dictionary
866
- self.models[model_id] = model
867
- logger.info(f"✅ Model loaded successfully")
868
- else:
869
- # Handle missing model file
870
- logger.error(f"❌ Model file not found in {model_dir}")
871
- logger.error(f" Available files: {list(model_path.glob('*'))}")
872
- return False
873
-
874
- # STEP 3: Load SentencePiece tokenizer
875
- # The tokenizer is essential for text tokenization and detokenization
876
- tokenizer_file = model_path / "tokenizer.model"
877
- if tokenizer_file.exists():
878
- # Initialize SentencePiece processor
879
- tokenizer = spm.SentencePieceProcessor()
880
-
881
- # Load the trained tokenizer model
882
- tokenizer.load(str(tokenizer_file))
883
-
884
- # Store the loaded tokenizer in our dictionary
885
- self.tokenizers[model_id] = tokenizer
886
- logger.info(f"✅ Tokenizer loaded successfully")
887
- else:
888
- # Handle missing tokenizer file
889
- logger.error(f"❌ Tokenizer file not found in {model_dir}")
890
- return False
891
-
892
- # All components loaded successfully
893
- return True
894
-
895
- except Exception as e:
896
- # Comprehensive error handling with full traceback
897
- logger.error(f"❌ Failed to load model and tokenizer: {e}")
898
- import traceback
899
- logger.error(f"📋 Full traceback: {traceback.format_exc()}")
900
- return False
901
-
902
- def generate_text(self, prompt: str, max_length: int = 100,
903
- temperature: float = 0.7, top_k: int = 50,
904
- top_p: float = 0.9) -> str:
905
- """
906
- Generate Text Using the Loaded Real Model
907
-
908
- This is the main text generation function that uses the loaded model
909
- to generate coherent text based on the input prompt. It implements
910
- the complete generation pipeline from tokenization to text output.
911
-
912
- GENERATION PROCESS:
913
- 1. Validate that a model is currently loaded
914
- 2. Tokenize the input prompt using SentencePiece
915
- 3. Convert tokens to PyTorch tensor format
916
- 4. Generate new tokens using the model's autoregressive generation
917
- 5. Decode the generated tokens back to text
918
- 6. Remove the input prompt from the output for clean results
919
-
920
- GENERATION PARAMETERS:
921
- - temperature: Controls randomness (0.1-2.0, higher = more random)
922
- - top_k: Limits vocabulary to k highest probability tokens (1-100)
923
- - top_p: Nucleus sampling threshold (0.1-1.0, controls diversity)
924
- - max_length: Maximum number of new tokens to generate (10-500)
925
-
926
- SAMPLING STRATEGIES:
927
- - Temperature scaling: Adjusts probability distribution sharpness
928
- - Top-k filtering: Restricts vocabulary to most likely tokens
929
- - Top-p (nucleus) sampling: Dynamic vocabulary selection based on cumulative probability
930
- - Combined sampling: All parameters work together for optimal text quality
931
-
932
- ERROR HANDLING:
933
- - Validates model availability before generation
934
- - Handles tokenization errors gracefully
935
- - Manages generation failures with detailed error messages
936
- - Provides fallback responses for error conditions
937
-
938
- ARGUMENTS:
939
- - prompt: Input text that will be used as the generation seed
940
- - max_length: Maximum number of new tokens to generate
941
- - temperature: Controls randomness in token selection
942
- - top_k: Number of highest probability tokens to consider
943
- - top_p: Nucleus sampling parameter for dynamic vocabulary selection
944
-
945
- RETURNS:
946
- - str: Generated text continuation (prompt removed for clean output)
947
-
948
- SIDE EFFECTS:
949
- - Logs generation parameters and progress
950
- - May trigger model loading if no model is currently active
951
- - Provides detailed error information for debugging
952
- """
953
- # Validate that a model is currently loaded and available
954
- if not self.current_model or self.current_model not in self.models:
955
- return "❌ No model loaded. Please select a model first."
956
-
957
- try:
958
- # Get the currently loaded model and tokenizer
959
- model = self.models[self.current_model]
960
- tokenizer = self.tokenizers[self.current_model]
961
-
962
- # STEP 1: Tokenize the input prompt
963
- # Convert text to token IDs using the SentencePiece tokenizer
964
- input_ids = tokenizer.encode(prompt)
965
-
966
- # Convert to PyTorch tensor format for model processing
967
- input_tensor = torch.tensor([input_ids], dtype=torch.long)
968
-
969
- # Log generation parameters for debugging and monitoring
970
- logger.info(f"🎯 Generating text with prompt: '{prompt[:50]}...'")
971
- logger.info(f"📊 Parameters: max_length={max_length}, temperature={temperature}, top_k={top_k}, top_p={top_p}")
972
-
973
- # STEP 2: Generate text using the model
974
- # Use torch.no_grad() for memory efficiency during inference
975
- with torch.no_grad():
976
- # Call the model's generate method with all parameters
977
- output_ids = model.generate(
978
- input_tensor,
979
- max_new_tokens=max_length,
980
- temperature=temperature,
981
- top_k=top_k,
982
- top_p=top_p,
983
- do_sample=True # Enable stochastic sampling for creative generation
984
- )
985
-
986
- # STEP 3: Decode the generated tokens back to text
987
- # Convert the complete token sequence (input + generated) to text
988
- generated_text = tokenizer.decode(output_ids[0].tolist())
989
-
990
- # STEP 4: Clean up the output by removing the input prompt
991
- # This provides a cleaner user experience by showing only the generated continuation
992
- if generated_text.startswith(prompt):
993
- generated_text = generated_text[len(prompt):].strip()
994
-
995
- # Log successful generation for monitoring
996
- logger.info(f"✅ Generated text: '{generated_text[:100]}...'")
997
- return generated_text
998
-
999
- except Exception as e:
1000
- # Comprehensive error handling with detailed error messages
1001
- error_msg = f"❌ Generation failed: {str(e)}"
1002
- logger.error(error_msg)
1003
- import traceback
1004
- logger.error(f"📋 Full traceback: {traceback.format_exc()}")
1005
- return error_msg
1006
-
1007
- # Initialize the real inference engine
1008
- # This creates the main inference engine instance that will handle all model operations
1009
- inference_engine = RealOpenLLMInference()
1010
-
1011
- def load_model_info(model_id: str) -> str:
1012
- """
1013
- Get Detailed Information About a Specific Model
1014
-
1015
- This function retrieves comprehensive information about a specific model
1016
- from the inference engine's configuration. It provides detailed descriptions
1017
- of the model's training characteristics, performance metrics, and capabilities.
1018
-
1019
- INFORMATION PROVIDED:
1020
- - Model name and training step count
1021
- - Detailed description of model capabilities and characteristics
1022
- - Parameter count and architecture details
1023
- - Training progress indicators and performance metrics
1024
-
1025
- USAGE:
1026
- - Called by the Gradio interface to display model information
1027
- - Updates dynamically when user selects different models
1028
- - Provides educational content about model differences
1029
-
1030
- ARGUMENTS:
1031
- - model_id: String identifier for the model (e.g., "openllm-small-extended-9k")
1032
-
1033
- RETURNS:
1034
- - str: Formatted markdown string with model information
1035
- """
1036
- config = inference_engine.model_configs.get(model_id)
1037
- if config:
1038
- # Format comprehensive model information in markdown
1039
- return f"**{config['name']}**\n\n{config['description']}\n\n**Parameters:** {config['parameters']}\n**Training Steps:** {config['training_steps']:,}"
1040
- return "❌ Model not found"
1041
-
1042
- def generate_text_interface(model_id: str, prompt: str, max_length: int,
1043
- temperature: float, top_k: int, top_p: float) -> str:
1044
- """
1045
- Gradio Interface Function for Text Generation - Main User Interface
1046
-
1047
- This is the primary interface function that connects the Gradio web interface
1048
- to the underlying inference engine. It handles user requests for text generation
1049
- and manages the complete workflow from model loading to text output.
1050
-
1051
- INTERFACE WORKFLOW:
1052
- 1. Receive generation request from Gradio interface
1053
- 2. Check if requested model is already loaded
1054
- 3. Load model if necessary (with progress logging)
1055
- 4. Call the inference engine's text generation function
1056
- 5. Return generated text to the user interface
1057
- 6. Handle any errors and provide user-friendly messages
1058
-
1059
- MODEL LOADING STRATEGY:
1060
- - Models are loaded on-demand to conserve memory
1061
- - Once loaded, models remain in memory for faster subsequent requests
1062
- - Automatic model switching when user selects different models
1063
- - Comprehensive error handling for loading failures
1064
-
1065
- GENERATION PARAMETERS:
1066
- - All parameters are passed through from the Gradio interface
1067
- - Parameters are validated and logged for debugging
1068
- - Default values ensure reasonable generation quality
1069
-
1070
- ERROR HANDLING:
1071
- - Graceful handling of model loading failures
1072
- - User-friendly error messages for interface display
1073
- - Detailed logging for technical debugging
1074
- - Fallback responses for various error conditions
1075
-
1076
- ARGUMENTS:
1077
- - model_id: String identifier for the model to use
1078
- - prompt: Input text prompt for generation
1079
- - max_length: Maximum number of tokens to generate
1080
- - temperature: Controls randomness in generation (0.1-2.0)
1081
- - top_k: Number of highest probability tokens to consider (1-100)
1082
- - top_p: Nucleus sampling parameter (0.1-1.0)
1083
-
1084
- RETURNS:
1085
- - str: Generated text or error message for display
1086
-
1087
- SIDE EFFECTS:
1088
- - May trigger model loading if model not already in memory
1089
- - Logs all generation requests and parameters
1090
- - Updates internal model tracking
1091
- """
1092
- try:
1093
- # Check if the requested model is already loaded in memory
1094
- if model_id not in inference_engine.models:
1095
- logger.info(f"🔄 Loading real model: {model_id}")
1096
- # Load the model from Hugging Face Hub
1097
- success = inference_engine.load_model_from_hf(model_id)
1098
- if not success:
1099
- # Return user-friendly error message if loading fails
1100
- return f"❌ Failed to load real model: {model_id}"
1101
-
1102
- # Generate text using the loaded model with all specified parameters
1103
- result = inference_engine.generate_text(
1104
- prompt=prompt,
1105
- max_length=max_length,
1106
- temperature=temperature,
1107
- top_k=top_k,
1108
- top_p=top_p
1109
- )
1110
-
1111
- # Return the generated text to the Gradio interface
1112
- return result
1113
-
1114
- except Exception as e:
1115
- # Comprehensive error handling for any unexpected issues
1116
- error_msg = f"❌ Error in generation interface: {str(e)}"
1117
- logger.error(error_msg)
1118
- return error_msg
1119
-
1120
- # Create Gradio interface
1121
- def create_interface():
1122
- """
1123
- Create the Complete Gradio Web Interface
1124
-
1125
- This function builds the entire Gradio web interface that provides users
1126
- with an intuitive way to interact with the OpenLLM models. The interface
1127
- includes model selection, parameter controls, and text generation capabilities.
1128
-
1129
- INTERFACE COMPONENTS:
1130
- - Header section with project information and model descriptions
1131
- - Model selection dropdown with detailed information display
1132
- - Text input area for user prompts
1133
- - Generation parameter controls (temperature, top-k, top-p, max length)
1134
- - Generate button for triggering text generation
1135
- - Output area for displaying generated text
1136
- - Footer with technical details and model sources
1137
-
1138
- LAYOUT DESIGN:
1139
- - Two-column layout for efficient space utilization
1140
- - Left column: Model selection and information
1141
- - Right column: Input controls and generation parameters
1142
- - Responsive design that works on different screen sizes
1143
- - Professional styling with Soft theme for modern appearance
1144
-
1145
- USER EXPERIENCE FEATURES:
1146
- - Real-time model information updates
1147
- - Intuitive parameter controls with helpful descriptions
1148
- - Clear visual feedback for all user actions
1149
- - Comprehensive error handling and user guidance
1150
- - Educational content about model differences and capabilities
1151
-
1152
- TECHNICAL INTEGRATION:
1153
- - Seamless connection to the inference engine
1154
- - Automatic model loading and switching
1155
- - Real-time parameter validation and feedback
1156
- - Comprehensive logging and error reporting
1157
- - Memory-efficient model management
1158
-
1159
- RETURNS:
1160
- - gr.Blocks: Complete Gradio interface ready for deployment
1161
- """
1162
-
1163
- # Create the main Gradio interface with professional styling
1164
- with gr.Blocks(
1165
- title="🚀 OpenLLM Real Models Space",
1166
- theme=gr.themes.Soft() # Modern, professional theme
1167
- ) as interface:
1168
-
1169
- # Header section with comprehensive project information
1170
- gr.Markdown("""
1171
- # 🚀 OpenLLM Real Models Space
1172
-
1173
- Welcome to the OpenLLM Real Models Space! This interface uses **actual trained models** from Hugging Face.
1174
-
1175
- ## 🎯 Real Trained Models
1176
-
1177
- We provide **5 different real models** with varying training steps:
1178
-
1179
- | Model | Training Steps | Parameters | Performance |
1180
- |-------|---------------|------------|-------------|
1181
- | **4k Model** | 4,000 | 35.8M | Early training stage |
1182
- | **6k Model** | 6,000 | 35.8M | Improved coherence (Perplexity: 816.040) |
1183
- | **7k Model** | 7,000 | 35.8M | Enhanced quality (Loss: 2.100, Perplexity: 8.200) |
1184
- | **8k Model** | 8,000 | 35.8M | Sophisticated understanding |
1185
- | **9k Model** | 9,000 | 35.8M | Best performing model |
1186
-
1187
- **These are real GPT-style transformer models trained on Wikipedia passages from the SQuAD dataset.**
1188
-
1189
- ---
1190
- """)
1191
-
-         # Main interface layout with two columns
-         with gr.Row():
-             # Left column: Model selection and information
-             with gr.Column(scale=1):
-                 # Model selection dropdown
-                 # This allows users to choose between different model variants
-                 model_dropdown = gr.Dropdown(
-                     choices=list(inference_engine.model_configs.keys()),  # All available models
-                     value="openllm-small-extended-9k",  # Default to best performing model
-                     label="🎯 Select Model",
-                     info="Choose the real trained model to use"
-                 )
-
-                 # Model information display
-                 # Shows detailed information about the selected model
-                 model_info = gr.Markdown(
-                     value=load_model_info("openllm-small-extended-9k"),  # Default model info
-                     label="📋 Model Information"
-                 )
-
-                 # Update model info when selection changes
-                 # This provides real-time updates as users switch between models
-                 model_dropdown.change(
-                     fn=load_model_info,
-                     inputs=[model_dropdown],
-                     outputs=[model_info]
-                 )
-
-             # Right column: Input controls and generation parameters
-             with gr.Column(scale=2):
-                 # Text input area for user prompts
-                 # This is where users enter their text for generation
-                 prompt_input = gr.Textbox(
-                     lines=5,  # Multi-line input for longer prompts
-                     label="📝 Input Prompt",
-                     placeholder="Enter your text prompt here...",
-                     info="The text that will be used as input for generation"
-                 )
-
-                 # Generation parameters in organized rows
-                 # First row: Max length and temperature controls
-                 with gr.Row():
-                     # Maximum length control
-                     max_length = gr.Slider(
-                         minimum=10,
-                         maximum=500,
-                         value=100,  # Default to reasonable length
-                         step=10,
-                         label="📏 Max Length",
-                         info="Maximum number of tokens to generate"
-                     )
-
-                     # Temperature control for randomness
-                     temperature = gr.Slider(
-                         minimum=0.1,
-                         maximum=2.0,
-                         value=0.7,  # Default to balanced creativity
-                         step=0.1,
-                         label="🌡️ Temperature",
-                         info="Controls randomness (higher = more random)"
-                     )
-
-                 # Second row: Top-k and top-p controls
-                 with gr.Row():
-                     # Top-k filtering control
-                     top_k = gr.Slider(
-                         minimum=1,
-                         maximum=100,
-                         value=50,  # Default to reasonable filtering
-                         step=1,
-                         label="🔝 Top-K",
-                         info="Number of highest probability tokens to consider"
-                     )
-
-                     # Top-p (nucleus) sampling control
-                     top_p = gr.Slider(
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,  # Default to high diversity
-                         step=0.1,
-                         label="📊 Top-P",
-                         info="Nucleus sampling parameter"
-                     )
-
-                 # Generate button
-                 # This triggers the text generation process
-                 generate_btn = gr.Button(
-                     "🚀 Generate Text",
-                     variant="primary",  # Prominent styling
-                     size="lg"  # Large button for easy interaction
-                 )
-
-                 # Output area for displaying generated text
-                 # This shows the results of the generation process
-                 output_text = gr.Textbox(
-                     lines=10,  # Large output area for generated text
-                     label="🎯 Generated Text",
-                     info="The generated text will appear here"
-                 )
-
-         # Connect the generate button to the generation function
-         # This creates the workflow from user input to text output
-         generate_btn.click(
-             fn=generate_text_interface,
-             inputs=[model_dropdown, prompt_input, max_length, temperature, top_k, top_p],
-             outputs=[output_text]
-         )
-
-         # Footer section with technical details and model sources
-         gr.Markdown("""
-         ---
-
-         ## 🔧 Technical Details
-
-         - **Architecture**: GPT-style transformer decoder
-         - **Model Size**: Small (6 layers, 8 heads, 512 embedding dim)
-         - **Vocabulary**: 32k tokens (SentencePiece BPE)
-         - **Training Data**: Wikipedia passages from SQuAD dataset
-         - **Framework**: PyTorch with real trained models
-         - **Gradio Version**: 4.44.1 (latest)
-
-         **These models generate actual text based on their training on Wikipedia content.**
-
-         **Model Sources:**
-         - [4k Model](https://huggingface.co/lemms/openllm-small-extended-4k)
-         - [6k Model](https://huggingface.co/lemms/openllm-small-extended-6k)
-         - [7k Model](https://huggingface.co/lemms/openllm-small-extended-7k)
-         - [8k Model](https://huggingface.co/lemms/openllm-small-extended-8k)
-         - [9k Model](https://huggingface.co/lemms/openllm-small-extended-9k)
-         """)
-
-     return interface
-
- # Create and launch the interface
- if __name__ == "__main__":
-     """
-     Main Application Entry Point
-
-     This is the entry point for the Gradio application. It creates the interface
-     and launches the web server for user interaction.
-
-     LAUNCH CONFIGURATION:
-     - server_name: "0.0.0.0" allows external connections
-     - server_port: 7860 is the standard Gradio port
-     - share: False for local deployment (set to True for public sharing)
-     - debug: True for development logging and error details
-
-     DEPLOYMENT CONSIDERATIONS:
-     - The application is designed for Hugging Face Spaces deployment
-     - All dependencies are specified in requirements.txt
-     - The interface is optimized for web-based interaction
-     - Error handling is comprehensive for production use
-
-     TECHNICAL FEATURES:
-     - Automatic model loading and management
-     - Real-time text generation capabilities
-     - Comprehensive parameter controls
-     - Professional user interface design
-     - Robust error handling and logging
-     """
-     # Create the complete Gradio interface
-     interface = create_interface()
-
-     # Launch the web server with production-ready configuration
-     interface.launch(
-         server_name="0.0.0.0",  # Allow external connections
-         server_port=7860,  # Standard Gradio port
-         share=False,  # Local deployment (set to True for public sharing)
-         debug=True  # Enable debug logging for development
-     )
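The removed entry point above notes that `share=False` keeps the launch local and that setting it to `True` enables public sharing. As a minimal illustrative sketch (not part of either version of app.py), a publicly shared launch would differ only in that one flag:

# Illustrative only: same configuration as the launch call above,
# but with Gradio's temporary public share link enabled.
interface = create_interface()
interface.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,   # Gradio generates a temporary public *.gradio.live URL
    debug=False,  # debug output is usually disabled outside development
)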
 
+ #!/usr/bin/env python3
+ """
+ OpenLLM Inference Space - Gradio Interface
+ Loads models from Hugging Face repositories to avoid storage limits
+ """
+
+ import gradio as gr
+ import torch
+ import json
+ import os
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Import our custom modules
+ try:
+     from core.src.model import GPTConfig, GPT
+     from core.src.inference_server import OpenLLMInference
+     logger.info("✅ Successfully imported core modules")
+ except ImportError as e:
+     logger.error(f"❌ Failed to import core modules: {e}")
+     raise
+
+ class OpenLLMInferenceEngine:
+     """
+     Inference engine that loads models from Hugging Face repositories
+     """
+
+     def __init__(self):
+         self.models = {}
+         self.tokenizers = {}
+         self.current_model = None
+         self.current_tokenizer = None
+
+         # Model configurations with Hugging Face repository IDs
+         self.model_configs = {
+             "openllm-small-extended-4k": {
+                 "name": "OpenLLM Small (4k steps)",
+                 "description": "Small model trained for 4,000 steps - Early training stage",
+                 "hf_repo": "lemms/openllm-small-extended-4k",
+                 "local_path": "models/small-extended-4k",  # Fallback if HF repo fails
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             },
+             "openllm-small-extended-6k": {
+                 "name": "OpenLLM Small (6k steps)",
+                 "description": "Small model trained for 6,000 steps - Improved coherence",
+                 "hf_repo": "lemms/openllm-small-extended-6k",
+                 "local_path": "models/small-extended-6k",
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             },
+             "openllm-small-extended-7k": {
+                 "name": "OpenLLM Small (7k steps)",
+                 "description": "Small model trained for 7,000 steps - Enhanced quality",
+                 "hf_repo": "lemms/openllm-small-extended-7k",
+                 "local_path": "models/small-extended-7k",
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             },
+             "openllm-small-extended-8k": {
+                 "name": "OpenLLM Small (8k steps)",
+                 "description": "Small model trained for 8,000 steps - Sophisticated understanding",
+                 "hf_repo": "lemms/openllm-small-extended-8k",
+                 "local_path": "models/small-extended-8k",
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             },
+             "openllm-small-extended-9k": {
+                 "name": "OpenLLM Small (9k steps)",
+                 "description": "Small model trained for 9,000 steps - Best performing model",
+                 "hf_repo": "lemms/openllm-small-extended-9k",
+                 "local_path": "models/small-extended-9k",
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             },
+             "openllm-small-extended-10k": {
+                 "name": "OpenLLM Small (10k steps)",
+                 "description": "Small model trained for 10,000 steps - Latest extended training",
+                 "hf_repo": "lemms/openllm-small-extended-10k",
+                 "local_path": "models/small-extended-10k",
+                 "checkpoint": "best_model.pt",
+                 "config": "config.json"
+             }
+         }
+
+         logger.info("🚀 OpenLLM Inference Engine initialized")
+         logger.info(f"📋 Available models: {list(self.model_configs.keys())}")
+
+     def load_model_from_hf(self, model_id: str) -> bool:
+         """
+         Load model from Hugging Face repository
+         """
+         try:
+             from huggingface_hub import snapshot_download
+
+             config = self.model_configs.get(model_id)
+             if not config:
+                 logger.error(f"❌ Unknown model ID: {model_id}")
+                 return False
+
+             logger.info(f"📥 Loading model from HF: {config['hf_repo']}")
+
+             # Download model files from Hugging Face
+             local_dir = snapshot_download(
+                 repo_id=config['hf_repo'],
+                 repo_type="model",
+                 local_dir=f"temp_{model_id}",
+                 allow_patterns=["*.pt", "*.json", "*.model"]
+             )
+
+             logger.info(f"✅ Downloaded model to: {local_dir}")
+
+             # Initialize inference engine with downloaded model
+             inference_engine = OpenLLMInference(
+                 model_path=Path(local_dir),
+                 model_format="pytorch"
+             )
+
+             self.models[model_id] = inference_engine
+             self.current_model = model_id
+
+             logger.info(f"✅ Successfully loaded model: {model_id}")
+             return True
+
+         except Exception as e:
+             logger.error(f"❌ Failed to load model from HF {model_id}: {e}")
+
+             # Fall back to the local model if available
+             logger.info("🔄 Trying fallback to local model...")
+             return self.load_model_local(model_id)
+
+     def load_model_local(self, model_id: str) -> bool:
+         """
+         Load model from local files (fallback method)
+         """
+         try:
+             config = self.model_configs.get(model_id)
+             if not config:
+                 return False
+
+             local_path = Path(config['local_path'])
+             if not local_path.exists():
+                 logger.error(f"❌ Local model path not found: {local_path}")
+                 return False
+
+             logger.info(f"📂 Loading local model: {local_path}")
+
+             # Initialize inference engine with local model
+             inference_engine = OpenLLMInference(
+                 model_path=local_path,
+                 model_format="pytorch"
+             )
+
+             self.models[model_id] = inference_engine
+             self.current_model = model_id
+
+             logger.info(f"✅ Successfully loaded local model: {model_id}")
+             return True
+
+         except Exception as e:
+             logger.error(f"❌ Failed to load local model {model_id}: {e}")
+             return False
+
+     def generate_text(self, prompt: str, max_length: int = 100,
+                       temperature: float = 0.7, top_k: int = 50,
+                       top_p: float = 0.9) -> str:
+         """
+         Generate text using the currently loaded model
+         """
+         if not self.current_model or self.current_model not in self.models:
+             return "❌ No model loaded. Please select a model first."
+
+         try:
+             logger.info(f"🎯 Generating text with model: {self.current_model}")
+             logger.info(f"📝 Prompt: {prompt[:100]}...")
+             logger.info(f"⚙️ Parameters: max_length={max_length}, temp={temperature}, top_k={top_k}, top_p={top_p}")
+
+             # Get the inference engine
+             inference_engine = self.models[self.current_model]
+
+             # Prepare generation parameters
+             generation_params = {
+                 "max_length": max_length,
+                 "temperature": temperature,
+                 "top_k": top_k,
+                 "top_p": top_p,
+                 "do_sample": True,
+                 "pad_token_id": 0,
+                 "eos_token_id": 1
+             }
+
+             # Generate text
+             generated_texts = inference_engine.generate(
+                 prompt=prompt,
+                 **generation_params
+             )
+
+             # Extract the generated text
+             if isinstance(generated_texts, list) and len(generated_texts) > 0:
+                 generated_text = generated_texts[0]
+             else:
+                 generated_text = str(generated_texts)
+
+             logger.info(f"✅ Generated text length: {len(generated_text)}")
+             return generated_text
+
+         except Exception as e:
+             error_msg = f"❌ Generation failed: {str(e)}"
+             logger.error(error_msg)
+             return error_msg
+
+ # Initialize the inference engine
+ inference_engine = OpenLLMInferenceEngine()
+
+ def load_model_info(model_id: str) -> str:
+     """Get information about a specific model"""
+     config = inference_engine.model_configs.get(model_id)
+     if config:
+         return f"**{config['name']}**\n\n{config['description']}"
+     return "❌ Model not found"
+
+ def generate_text_interface(model_id: str, prompt: str, max_length: int,
+                             temperature: float, top_k: int, top_p: float) -> str:
+     """
+     Gradio interface function for text generation
+     """
+     try:
+         # Load model if not already loaded
+         if model_id not in inference_engine.models:
+             logger.info(f"🔄 Loading model: {model_id}")
+             success = inference_engine.load_model_from_hf(model_id)
+             if not success:
+                 return f"❌ Failed to load model: {model_id}"
+
+         # Generate text
+         result = inference_engine.generate_text(
+             prompt=prompt,
+             max_length=max_length,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p
+         )
+
+         return result
+
+     except Exception as e:
+         error_msg = f"❌ Error in generation interface: {str(e)}"
+         logger.error(error_msg)
+         return error_msg
+
+ # Create Gradio interface
+ def create_interface():
+     """Create the Gradio interface"""
+
+     with gr.Blocks(
+         title="🚀 OpenLLM Inference Space",
+         theme=gr.themes.Soft(),
+         css="""
+         .gradio-container {
+             max-width: 1200px !important;
+             margin: auto !important;
+         }
+         .model-info {
+             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+             color: white;
+             padding: 20px;
+             border-radius: 10px;
+             margin-bottom: 20px;
+         }
+         """
+     ) as interface:
+
+         # Header
+         gr.Markdown("""
+         # 🚀 OpenLLM Inference Space
+
+         Welcome to the OpenLLM Inference Space, an interface for running inference on the trained OpenLLM models with customizable parameters.
+
+         ## 🎯 Available Models
+
+         We provide **6 different models** trained for varying numbers of steps:
+
+         | Model | Training Steps | Description | Best Loss |
+         |-------|---------------|-------------|-----------|
+         | **4k Model** | 4,000 | Early training stage, basic language patterns | ~6.2 |
+         | **6k Model** | 6,000 | Improved coherence, better vocabulary usage | ~5.8 |
+         | **7k Model** | 7,000 | Enhanced text generation quality | ~5.5 |
+         | **8k Model** | 8,000 | More sophisticated language understanding | ~5.3 |
+         | **9k Model** | 9,000 | Best performing model (latest training) | ~5.2 |
+         | **10k Model** | 10,000 | Latest extended training run | n/a |
+
+         ---
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Model selection
+                 model_dropdown = gr.Dropdown(
+                     choices=list(inference_engine.model_configs.keys()),
+                     value="openllm-small-extended-9k",
+                     label="🎯 Select Model",
+                     info="Choose the model to use for inference"
+                 )
+
+                 # Model information display
+                 model_info = gr.Markdown(
+                     value=load_model_info("openllm-small-extended-9k"),
+                     label="📋 Model Information"
+                 )
+
+                 # Update model info when selection changes
+                 model_dropdown.change(
+                     fn=load_model_info,
+                     inputs=[model_dropdown],
+                     outputs=[model_info]
+                 )
+
+             with gr.Column(scale=2):
+                 # Input prompt
+                 prompt_input = gr.Textbox(
+                     lines=5,
+                     label="📝 Input Prompt",
+                     placeholder="Enter your text prompt here...",
+                     info="The text that will be used as input for generation"
+                 )
+
+                 # Generation parameters
+                 with gr.Row():
+                     max_length = gr.Slider(
+                         minimum=10,
+                         maximum=500,
+                         value=100,
+                         step=10,
+                         label="📏 Max Length",
+                         info="Maximum number of tokens to generate"
+                     )
+
+                     temperature = gr.Slider(
+                         minimum=0.1,
+                         maximum=2.0,
+                         value=0.7,
+                         step=0.1,
+                         label="🌡️ Temperature",
+                         info="Controls randomness (higher = more random)"
+                     )
+
+                 with gr.Row():
+                     top_k = gr.Slider(
+                         minimum=1,
+                         maximum=100,
+                         value=50,
+                         step=1,
+                         label="🔝 Top-K",
+                         info="Number of highest probability tokens to consider"
+                     )
+
+                     top_p = gr.Slider(
+                         minimum=0.1,
+                         maximum=1.0,
+                         value=0.9,
+                         step=0.1,
+                         label="📊 Top-P",
+                         info="Nucleus sampling parameter"
+                     )
+
+                 # Generate button
+                 generate_btn = gr.Button(
+                     "🚀 Generate Text",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 # Output
+                 output_text = gr.Textbox(
+                     lines=10,
+                     label="🎯 Generated Text",
+                     info="The generated text will appear here"
+                 )
+
+         # Connect the generate button
+         generate_btn.click(
+             fn=generate_text_interface,
+             inputs=[model_dropdown, prompt_input, max_length, temperature, top_k, top_p],
+             outputs=[output_text]
+         )
+
+         # Footer
+         gr.Markdown("""
+         ---
+
+         ## 🔧 Technical Details
+
+         - **Architecture**: GPT-style transformer decoder
+         - **Model Size**: Small (6 layers, 8 heads, 512 embedding dim)
+         - **Vocabulary**: 32k tokens (SentencePiece BPE)
+         - **Training Data**: Custom dataset with 27.6M tokens
+         - **Framework**: PyTorch with custom training loop
+
+         ## 📚 Usage Tips
+
+         1. **Start with the 9k model** for best results
+         2. **Lower temperature** (0.3-0.7) for more focused output
+         3. **Higher temperature** (0.8-1.2) for more creative output
+         4. **Adjust max_length** based on your needs
+         5. **Experiment with top-k and top-p** for different generation styles
+
+         ---
+
+         **Built with ❤️ using OpenLLM Framework**
+         """)
+
+     return interface
+
+ # Create and launch the interface
+ if __name__ == "__main__":
+     interface = create_interface()
+     interface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         debug=True
+     )
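The added file exposes a small programmatic API (`load_model_from_hf`, `load_model_local`, `generate_text`) that the Gradio callbacks wrap. Below is a minimal sketch of driving that API directly, assuming the Space's `core.src` package is importable and the `lemms/*` repositories are reachable; the import path, prompt, and parameter choices are illustrative, not part of the committed file:

# Sketch: exercise OpenLLMInferenceEngine outside the Gradio UI.
from app import OpenLLMInferenceEngine  # assumes the file above is importable as app.py

engine = OpenLLMInferenceEngine()

# Downloads lemms/openllm-small-extended-9k via snapshot_download,
# falling back to models/small-extended-9k if the Hub is unreachable.
if engine.load_model_from_hf("openllm-small-extended-9k"):
    text = engine.generate_text(
        prompt="The history of artificial intelligence",
        max_length=100,   # default slider value in the interface
        temperature=0.7,  # "focused" end of the range suggested in the Usage Tips
        top_k=50,
        top_p=0.9,
    )
    print(text)
else:
    print("Model could not be loaded from the Hub or the local fallback path.")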