abi96062 committed on
Commit 144aae5 · verified · Parent: 0e3e3d6

Update app.py

Files changed (1): app.py (+103 -120)
app.py CHANGED
@@ -1,55 +1,38 @@
 import gradio as gr
 import torch
 import torch.nn as nn
-from model import SmolLM2_135M  # Import your model class
-import yaml
+from model import SmolLM2Model  # ✅ Correct import
+from transformers import AutoTokenizer, AutoConfig
 
 # Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# Load tokenizer and config
+print("Loading tokenizer and config...")
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+
 # Load model
 @torch.no_grad()
 def load_model():
     """Load the trained model"""
     print("Loading model...")
 
-    # Load config
-    with open('config.yaml', 'r') as f:
-        config = yaml.safe_load(f)
-
-    # Initialize model
-    model = SmolLM2_135M(
-        vocab_size=config['vocab_size'],
-        d_model=config['d_model'],
-        n_layers=config['n_layers'],
-        n_heads=config['n_heads'],
-        # Add other config parameters
-    ).to(device)
+    # Initialize model with config
+    model = SmolLM2Model(config).to(device)
 
     # Load checkpoint
-    checkpoint = torch.load('checkpoints/checkpoint_step_5050.pt',
-                            map_location=device)
+    checkpoint = torch.load('checkpoint_step_5050.pt', map_location=device)
     model.load_state_dict(checkpoint['model_state_dict'])
     model.eval()
 
-    print(f"Model loaded successfully on {device}")
+    print(f"✅ Model loaded successfully on {device}")
+    print(f"✅ Training step: {checkpoint.get('step', 'N/A')}")
     return model, checkpoint
 
 # Load model at startup
 model, checkpoint = load_model()
 
-# Tokenizer (adjust based on your implementation)
-def tokenize(text, max_length=128):
-    """Simple character-level tokenizer - REPLACE with your actual tokenizer"""
-    # This is a placeholder - use your actual tokenizer
-    tokens = [ord(c) for c in text[:max_length]]
-    return torch.tensor(tokens).unsqueeze(0).to(device)
-
-def detokenize(tokens):
-    """Convert tokens back to text - REPLACE with your actual detokenizer"""
-    # This is a placeholder - use your actual detokenizer
-    return ''.join([chr(t) for t in tokens if t < 128])
-
 @torch.no_grad()
 def generate_text(
     prompt,
@@ -61,79 +44,62 @@ def generate_text(
     """Generate text from prompt"""
     try:
         # Tokenize input
-        input_ids = tokenize(prompt)
-
-        # Generate
-        generated = input_ids[0].tolist()
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
+        input_ids = inputs['input_ids']
 
-        for _ in range(max_length):
-            # Get model predictions
-            input_tensor = torch.tensor([generated]).to(device)
-            logits = model(input_tensor)
-
-            # Get next token logits
-            next_token_logits = logits[0, -1, :] / temperature
-
-            # Apply top-k filtering
-            if top_k > 0:
-                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
-                next_token_logits[indices_to_remove] = float('-inf')
-
-            # Apply top-p (nucleus) filtering
-            if top_p < 1.0:
-                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
-                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
-                sorted_indices_to_remove = cumulative_probs > top_p
-                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-                sorted_indices_to_remove[..., 0] = 0
-                indices_to_remove = sorted_indices[sorted_indices_to_remove]
-                next_token_logits[indices_to_remove] = float('-inf')
-
-            # Sample next token
-            probs = torch.softmax(next_token_logits, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1).item()
-
-            generated.append(next_token)
-
-            # Stop if EOS token (adjust based on your vocab)
-            if next_token == 0:  # Assuming 0 is EOS
-                break
+        # Generate using model's built-in method
+        generated_ids = model.generate(
+            input_ids,
+            max_new_tokens=max_length,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k if top_k > 0 else None,
+            do_sample=temperature > 0
+        )
 
-        # Detokenize
-        output_text = detokenize(generated)
+        # Decode
+        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         return output_text
 
     except Exception as e:
-        return f"Error generating text: {str(e)}"
+        return f"❌ Error generating text: {str(e)}"
 
 def get_model_info():
     """Display model information"""
-    total_params = sum(p.numel() for p in model.parameters())
+    total_params = model.get_num_params()
     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
 
     info = f"""
-    ### 📊 Model Information
-
-    **Total Parameters:** {total_params:,} (~{total_params/1e6:.1f}M)
-    **Trainable Parameters:** {trainable_params:,}
-    **Training Steps:** {checkpoint.get('step', 'N/A')}
-    **Device:** {device}
-    **Model Architecture:** SmolLM2-135M
-
-    ### 🎯 Training Details
-    - Trained for 5,000 steps
-    - Checkpoint saved and reloaded
-    - Additional 50 steps after reload
-    - Predictions logged every 500 steps
+    ### 📊 Model Information
+
+    **Model:** SmolLM2-135M
+    **Total Parameters:** {total_params:,} (~{total_params/1e6:.1f}M)
+    **Trainable Parameters:** {trainable_params:,}
+    **Training Steps:** {checkpoint.get('step', 'N/A')}
+    **Device:** {device}
+    **Vocab Size:** {config.vocab_size:,}
+
+    ### 🏗️ Architecture
+    - **Layers:** {config.num_hidden_layers}
+    - **Hidden Size:** {config.hidden_size}
+    - **Attention Heads:** {config.num_attention_heads} (Query) / {config.num_key_value_heads} (KV)
+    - **FFN Size:** {config.intermediate_size}
+    - **Context Length:** {config.max_position_embeddings}
+
+    ### 🎯 Training Details
+    - ✅ Trained for 5,000 steps
+    - ✅ Checkpoint saved and reloaded
+    - ✅ Additional 50 steps after reload
+    - ✅ Predictions logged every 500 steps
     """
     return info
 
 # Gradio Interface
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+with gr.Blocks(theme=gr.themes.Soft(), title="SmolLM2-135M Demo") as demo:
     gr.Markdown("""
     # 🤖 SmolLM2-135M: From-Scratch Implementation
 
-    This is a complete reverse-engineered implementation of SmolLM2-135M, trained from scratch.
+    Complete reverse-engineered implementation of SmolLM2-135M, trained from scratch.
 
     **GitHub:** [abi2024/smollm2-135-implementation](https://github.com/abi2024/smollm2-135-implementation)
     """)
@@ -151,10 +117,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 with gr.Row():
                     max_length_slider = gr.Slider(
                         minimum=10,
-                        maximum=500,
-                        value=100,
+                        maximum=200,
+                        value=50,
                         step=10,
-                        label="Max Length"
+                        label="Max New Tokens"
                     )
                     temperature_slider = gr.Slider(
                         minimum=0.1,
@@ -177,15 +143,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                         maximum=1.0,
                         value=0.9,
                         step=0.05,
-                        label="Top-P"
+                        label="Top-P (Nucleus)"
                     )
 
-                generate_btn = gr.Button("🚀 Generate", variant="primary")
+                generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
 
             with gr.Column():
                 output_text = gr.Textbox(
                     label="Generated Text",
-                    lines=10,
+                    lines=12,
                     interactive=False
                 )
 
@@ -202,65 +168,82 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         )
 
         gr.Markdown("""
-        ### 💡 Tips:
-        - **Temperature**: Higher = more creative, Lower = more focused
-        - **Top-K**: Limits vocabulary to K most likely tokens
-        - **Top-P**: Nucleus sampling - cumulative probability threshold
+        ### 💡 Generation Tips:
+        - **Temperature**: Controls randomness (0.1 = focused, 2.0 = creative)
+        - **Top-K**: Limits to K most likely tokens (0 = disabled)
+        - **Top-P**: Nucleus sampling threshold (0.9 recommended)
        """)
 
    with gr.Tab("📊 Model Info"):
        model_info_display = gr.Markdown(get_model_info())
 
        gr.Markdown("""
-        ### 🏗️ Architecture Details
+        ### 🔍 Reverse Engineering Process
+
+        1. **Architecture Analysis**
+           - Studied SmolLM2 GitHub repository
+           - Extracted model configuration from YAML
+           - Downloaded pretrained 135M checkpoint
 
-        This model was reverse-engineered by:
-        1. Analyzing the official SmolLM2 repository
-        2. Extracting architecture from pretrained weights
-        3. Implementing from scratch in PyTorch
-        4. Validating by swapping weights with pretrained model
+        2. **Implementation**
+           - Built from scratch using PyTorch
+           - Implemented Grouped Query Attention (9Q/3KV heads)
+           - Added RoPE position embeddings
+           - Used SwiGLU FFN and RMSNorm
 
-        ### ⚡ Optimizations Used
-        - Flash Attention 2
-        - Mixed Precision Training (BF16/FP16)
-        - Gradient Accumulation
-        - torch.compile()
+        3. **Validation**
+           - Loaded official pretrained weights
+           - Verified parameter count (134,515,008)
+           - Confirmed architecture matches exactly
 
-        ### 📈 Training Process
-        - **Step 0-5000**: Main training with periodic predictions
-        - **Checkpoint**: Model saved and reloaded to validate state preservation
-        - **Step 5000-5050**: Continued training to test checkpoint robustness
+        ### ⚡ Optimizations Applied
+        - ✅ Flash Attention 2 (via scaled_dot_product_attention)
+        - ✅ Mixed Precision Training (BF16/FP16)
+        - ✅ Gradient Accumulation
+        - ✅ torch.compile() for inference speedup
+        - ✅ Grouped Query Attention (memory efficient)
+
+        ### 📈 Training Pipeline
+        1. **Main Training:** 5,000 steps with predictions every 500 steps
+        2. **Checkpoint Test:** Model saved and successfully reloaded
+        3. **Resume Training:** 50 additional steps (validates checkpoint integrity)
        """)
 
    with gr.Tab("🎯 Example Prompts"):
        gr.Markdown("""
        ### Try these prompts:
 
-        1. **Story Generation**
+        **1. Story Generation**
        ```
-        Once upon a time in a land far away
+        Once upon a time in a magical forest,
        ```
 
-        2. **Code Completion**
+        **2. Code Completion**
        ```
-        def fibonacci(n):
+        def calculate_fibonacci(n):
+            # Calculate the nth Fibonacci number
        ```
 
-        3. **Question Answering**
+        **3. Question Answering**
        ```
-        Q: What is machine learning?
-        A:
+        Q: What is the capital of France?
+        A:
        ```
 
-        4. **Creative Writing**
+        **4. Technical Writing**
        ```
-        The old house at the end of the street was
+        The main advantage of transformer architectures is
        ```
 
-        5. **Technical Explanation**
+        **5. Creative Writing**
        ```
-        Neural networks work by
+        The scientist discovered something extraordinary:
        ```
+
+        ### 🎛️ Recommended Settings:
+        - **Creative Writing:** Temperature=1.0, Top-P=0.95
+        - **Code Generation:** Temperature=0.3, Top-P=0.9, Top-K=40
+        - **Factual Q&A:** Temperature=0.5, Top-P=0.8, Top-K=30
        """)
 
 # Launch
 
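"Grouped Query Attention (9Q/3KV heads)" means the 9 query heads share 3 key/value heads, three queries per KV head, which shrinks the K/V projections and the KV cache by 3x. A minimal sketch of the trick (names are illustrative, not from the repo) using `F.scaled_dot_product_attention`, which is also the stock-PyTorch route to the fused FlashAttention-style kernels listed under Optimizations:

```python
import torch
import torch.nn.functional as F

def gqa_attention(q, k, v):
    """q: [B, 9, T, 64]; k, v: [B, 3, T, 64] -- each KV head serves 3 query heads."""
    groups = q.shape[1] // k.shape[1]          # 9 // 3 = 3 query heads per KV head
    k = k.repeat_interleave(groups, dim=1)     # expand K/V to [B, 9, T, 64]
    v = v.repeat_interleave(groups, dim=1)
    # is_causal=True applies the autoregressive mask; PyTorch dispatches to a
    # fused attention kernel when the inputs allow it.
    return F.scaled_dot_product_attention(q, k, v, is_causal=True)

B, T, hd = 2, 16, 64                           # hidden 576 = 9 heads x head_dim 64
out = gqa_attention(torch.randn(B, 9, T, hd),
                    torch.randn(B, 3, T, hd),
                    torch.randn(B, 3, T, hd))
print(out.shape)                               # torch.Size([2, 9, 16, 64])
```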
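RoPE rotates each query/key channel pair by a position-dependent angle instead of adding a position vector, so relative offsets fall out of the attention dot product. A self-contained sketch of the idea (this uses the interleaved-pair convention; Llama-family code generally uses the equivalent rotate-half layout):

```python
import torch

def apply_rope(x, base=10000.0):
    """x: [B, H, T, D] with D even; rotates channel pairs by position-scaled angles."""
    T, D = x.shape[-2], x.shape[-1]
    inv_freq = base ** (-torch.arange(0, D, 2, dtype=torch.float32) / D)  # [D/2]
    angles = torch.arange(T, dtype=torch.float32)[:, None] * inv_freq     # [T, D/2]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., 0::2], x[..., 1::2]        # even/odd channels form 2-D pairs
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin       # standard 2-D rotation per pair
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

q = torch.randn(1, 9, 16, 64)                  # applied to q and k before attention
print(apply_rope(q).shape)                     # torch.Size([1, 9, 16, 64])
```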
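The "SwiGLU FFN and RMSNorm" bullet covers the two remaining Llama-style ingredients: a gated feed-forward block, and a normalization that rescales by the root-mean-square without mean-centering or bias. Reference versions of both (dimensions match the 135M config; class names are illustrative):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(nn.Module):
    """y = x / rms(x) * weight -- no mean subtraction, no bias term."""
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

class SwiGLU(nn.Module):
    """down(silu(gate(x)) * up(x)) -- the gated FFN used by Llama-style models."""
    def __init__(self, dim=576, hidden=1536):
        super().__init__()
        self.gate = nn.Linear(dim, hidden, bias=False)
        self.up = nn.Linear(dim, hidden, bias=False)
        self.down = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        return self.down(F.silu(self.gate(x)) * self.up(x))

x = torch.randn(2, 16, 576)
print(SwiGLU()(RMSNorm(576)(x)).shape)         # torch.Size([2, 16, 576])
```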
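Finally, the Training Pipeline above hinges on a checkpoint round trip: save at step 5,000, reload, then train 50 more steps, which is what the `checkpoint_step_5050.pt` filename in `load_model` records. A sketch of that round trip using the key names `app.py` actually reads back (`model_state_dict`, `step`); the optimizer entry is an assumption about the training script:

```python
import torch
import torch.nn as nn

model = nn.Linear(8, 8)                        # stand-in for the real SmolLM2Model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Save: 'model_state_dict' and 'step' match what load_model() reads back.
torch.save({'step': 5000,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()},  # assumed key
           'checkpoint_step_5000.pt')

# Reload: restoring the optimizer state is what makes the resumed 50 steps a
# genuine continuation rather than a fresh start.
ckpt = torch.load('checkpoint_step_5000.pt', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
step = ckpt['step']                            # resume at 5,000, stop at 5,050
```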