Keeby-smilyai committed on
Commit 853a0d4 · verified · 1 Parent(s): c783df6

Create app.py

Files changed (1)
  1. app.py +618 -0
app.py ADDED
@@ -0,0 +1,618 @@
+ import gradio as gr
+ import tensorflow as tf
+ from huggingface_hub import hf_hub_download
+ import json
+ import os
+ from tokenizers import Tokenizer
+ import numpy as np
+ import time
+
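+ # Runtime dependencies implied by these imports: gradio, tensorflow,
+ # huggingface_hub, tokenizers, numpy. In a Hugging Face Space these would
+ # normally be pinned in requirements.txt; no versions are specified here.
+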
+ # ============================================================================
+ # 🎊 FESTIVE MODE TOGGLE 🎊
+ # ============================================================================
+ FESTIVE = True  # Set to False for production-only mode
+
+ # ============================================================================
+ # Configuration & Model Loading
+ # ============================================================================
+
+ print("🚀 Loading SAM-Z-1 Model...")
+
+ MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
+ CACHE_DIR = "./model_cache"
+
+ # Download model files
+ config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
+ model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
+ tokenizer_path = hf_hub_download(MODEL_REPO, "tokenizer.json", cache_dir=CACHE_DIR)
+
+ # Load config
+ with open(config_path, 'r') as f:
+     config = json.load(f)
+
+ # Load tokenizer
+ tokenizer = Tokenizer.from_file(tokenizer_path)
+ eos_token_id = config.get('eos_token_id', 50256)
+
+ # Load the Keras model; compile=False skips optimizer state not needed for inference
+ model = tf.keras.models.load_model(model_path, compile=False)
+
+ # Create optimized inference function
+ @tf.function(reduce_retracing=True)
+ def fast_forward(input_tensor):
+     """TF-optimized forward pass for faster generation"""
+     return model(input_tensor, training=False)
+
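+ # Why reduce_retracing: the generation loop below grows the input by one token
+ # per step, so fast_forward sees a new shape on each call; without shape
+ # relaxation, tf.function would recompile the graph repeatedly. An explicit
+ # input_signature with a dynamic sequence axis would achieve the same effect,
+ # assuming the model accepts variable-length [batch, seq] int32 inputs, as the
+ # generation loop implies.
+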
+ print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
+ print("✅ TF function optimization enabled for faster inference")
+
+ # Global stop flag
+ stop_generation = False
+
+ # ============================================================================
+ # Generation Function with Streaming & Stop Button
+ # ============================================================================
+
+ def generate_stream(
+     prompt: str,
+     max_tokens: int = 512,
+     temperature: float = 0.8,
+     top_k: int = 40,
+     top_p: float = 0.9,
+     repetition_penalty: float = 1.1
+ ):
+     """Generate text with streaming output and stop support"""
+     global stop_generation
+     stop_generation = False
+
+     # Tokenize prompt
+     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
+
+     if len(input_ids) == 0:
+         yield "⚠️ Empty prompt after tokenization"
+         return
+
+     # Leave room in the context window for the requested new tokens
+     if len(input_ids) > config['max_position_embeddings'] - max_tokens:
+         input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
+
+     input_tensor = tf.constant([input_ids], dtype=tf.int32)
+     generated_text = ""
+     token_count = 0
+
+     # Track token frequencies for repetition penalty
+     token_freq = {}
+
+     start_time = time.time()
+
+     for step in range(max_tokens):
+         # Check stop flag
+         if stop_generation:
+             generated_text += "\n\n*[Generation stopped by user]*"
+             yield generated_text
+             break
+
+         # Get logits using optimized TF function
+         logits = fast_forward(input_tensor)
+         next_token_logits = logits[0, -1, :].numpy()
+
+         # Apply temperature
+         next_token_logits = next_token_logits / temperature
+
+         # Apply repetition penalty (compounds with each repeat of a token)
+         if repetition_penalty != 1.0:
+             for token_id, freq in token_freq.items():
+                 if token_id < len(next_token_logits):
+                     next_token_logits[token_id] /= (repetition_penalty ** freq)
+
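+         # Caveat: dividing logits unconditionally differs from the usual
+         # CTRL-style penalty, which divides positive logits and multiplies
+         # negative ones; for a token whose logit is negative, this division
+         # pushes it toward zero and can slightly raise its probability.
+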
+         # Top-k filtering
+         if top_k > 0:
+             top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
+             top_k_logits = next_token_logits[top_k_indices]
+             top_k_probs = tf.nn.softmax(top_k_logits).numpy()
+
+             # Top-p (nucleus) sampling
+             if top_p < 1.0:
+                 sorted_indices = np.argsort(top_k_probs)[::-1]
+                 cumsum = np.cumsum(top_k_probs[sorted_indices])
+                 cutoff_idx = np.searchsorted(cumsum, top_p)
+                 nucleus_indices = sorted_indices[:cutoff_idx + 1]
+
+                 nucleus_logits = top_k_logits[nucleus_indices]
+                 nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
+
+                 sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
+                 next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
+             else:
+                 sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
+                 next_token_id = int(top_k_indices[sampled_idx])
+         else:
+             probs = tf.nn.softmax(next_token_logits).numpy()
+             next_token_id = np.random.choice(len(probs), p=probs)
+
+         # Stop on EOS
+         if next_token_id == eos_token_id:
+             break
+
+         # Update token frequency
+         token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
+
+         # Decode and yield
+         token_text = tokenizer.decode([next_token_id])
+         generated_text += token_text
+         token_count += 1
+
+         # Yield progressive output
+         yield generated_text
+
+         # Update input
+         input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
+
+         # Truncate if too long
+         if input_tensor.shape[1] > config['max_position_embeddings']:
+             input_tensor = input_tensor[:, -config['max_position_embeddings']:]
+
+     # Calculate stats
+     elapsed = time.time() - start_time
+     tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
+
+     # Add generation stats
+     if token_count > 0 and not stop_generation:
+         generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({tokens_per_sec:.1f} tok/s)]*"
+
+     yield generated_text
+
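+ # Illustrative usage (not called by the app): each yield is the full text so
+ # far, so a consumer only needs the latest value, e.g.
+ #     for text in generate_stream("Once upon a time", max_tokens=32):
+ #         print(text)
+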
+ # ============================================================================
+ # Chat Interface Logic
+ # ============================================================================
+
+ def format_chat_prompt(message: str, history: list) -> str:
+     """Format message history into chat prompt"""
+     prompt = ""
+
+     # Add history
+     for user_msg, assistant_msg in history:
+         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
+         if assistant_msg:
+             prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
+
+     # Add current message
+     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+
+     return prompt
+
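+ # For example, format_chat_prompt("Hi", [("Hello", "Hey!")]) returns the
+ # ChatML-style string
+ #     <|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHey!<|im_end|>\n
+ #     <|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n
+ # (wrapped here for readability; the actual value is one continuous string).
+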
+ def chat_stream(
+     message: str,
+     history: list,
+     max_tokens: int,
+     temperature: float,
+     top_k: int,
+     top_p: float,
+     repetition_penalty: float
+ ):
+     """Streaming chat response"""
+     if not message.strip():
+         yield history
+         return
+
+     # Format prompt
+     prompt = format_chat_prompt(message, history)
+
+     # Generate with streaming
+     partial_response = ""
+     for generated in generate_stream(
+         prompt,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty
+     ):
+         partial_response = generated
+
+         # Truncate at the end-of-turn tag if the model emits one
+         if "<|im_end|>" in partial_response:
+             partial_response = partial_response.split("<|im_end|>")[0]
+
+         # Update history
+         yield history + [[message, partial_response.strip()]]
+
+ def stop_gen():
+     """Stop generation callback"""
+     global stop_generation
+     stop_generation = True
+     return None
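+
+ # Caveat: stop_generation is module-level state shared by every session, so
+ # with concurrent users one person's Stop can halt another's generation. A
+ # per-session flag (e.g. held in gr.State) would avoid this; the cancels=
+ # wiring on the Stop button below already covers the common single-user case.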
+
+ # ============================================================================
+ # Gradio UI
+ # ============================================================================
+
+ # Festive CSS
+ festive_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header {
+     text-align: center;
+     padding: 2rem;
+     background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+     color: white;
+     border-radius: 12px;
+     margin-bottom: 2rem;
+     box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
+     animation: pulse 2s ease-in-out infinite;
+ }
+
+ @keyframes pulse {
+     0%, 100% { transform: scale(1); }
+     50% { transform: scale(1.02); }
+ }
+
+ .header h1 {
+     font-size: 2.8rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
+     text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
+ }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
+ }
+
+ .celebration {
+     font-size: 2rem;
+     margin: 0.5rem;
+     animation: bounce 1s ease infinite;
+ }
+
+ @keyframes bounce {
+     0%, 100% { transform: translateY(0); }
+     50% { transform: translateY(-10px); }
+ }
+
+ .stats-card {
+     background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+     padding: 1.5rem;
+     border-radius: 12px;
+     border-left: 4px solid #f5576c;
+     margin: 1rem 0;
+     box-shadow: 0 4px 16px rgba(252, 182, 159, 0.3);
+ }
+
+ .twin-badge {
+     display: inline-block;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     padding: 0.5rem 1rem;
+     border-radius: 20px;
+     font-weight: bold;
+     margin: 0.5rem;
+     box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
+ }
+
+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
+
+ .confetti {
+     position: fixed;
+     width: 10px;
+     height: 10px;
+     background: #f5576c;
+     animation: confetti-fall 3s linear infinite;
+ }
+
+ @keyframes confetti-fall {
+     to { transform: translateY(100vh) rotate(360deg); }
+ }
+ """
+
+ # Production CSS
+ production_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header {
+     text-align: center;
+     padding: 2rem;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     border-radius: 12px;
+     margin-bottom: 2rem;
+ }
+
+ .header h1 {
+     font-size: 2.5rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
+ }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
+ }
+
+ .stats-card {
+     background: #f8f9fa;
+     padding: 1rem;
+     border-radius: 8px;
+     border-left: 4px solid #667eea;
+     margin: 1rem 0;
+ }
+
+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
+ """
+
+ # Select CSS based on mode
+ custom_css = festive_css if FESTIVE else production_css
+
+ # Build interface
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+     # Header
+     if FESTIVE:
+         gr.HTML("""
+         <div class="header">
+             <div class="celebration">🎉 🎊 ✨ 🎈 🎆</div>
+             <h1>🤖 SAM-Z-1 Chat 🤖</h1>
+             <p><strong>LATEST RELEASE!</strong> Our fastest non-reasoning model</p>
+             <div class="twin-badge">Twin of SAM-X-1 (Reasoning Model)</div>
+             <p style="font-size: 0.9rem; margin-top: 1rem;">
+                 768D • 16 Layers • 12 Heads • ~140M Parameters • Trained on TPU v5e-8
+             </p>
+             <div class="celebration">🚀 💫 🎯 ⚡ 🔥</div>
+         </div>
+         """)
+     else:
+         gr.HTML("""
+         <div class="header">
+             <h1>🤖 SAM-Z-1 Chat</h1>
+             <p>Fast, direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; margin-top: 0.5rem;">
+                 768D • 16 Layers • 12 Heads • Trained on TPU v5e-8
+             </p>
+         </div>
+         """)
+
+     with gr.Row():
+         with gr.Column(scale=4):
+             # Chat interface
+             chatbot = gr.Chatbot(
+                 height=600,
+                 show_label=False,
+                 avatar_images=(None, "🤖" if not FESTIVE else "🎉"),
+                 bubble_full_width=False
+             )
+
+             with gr.Row():
+                 msg = gr.Textbox(
+                     placeholder="Type your message here..." if not FESTIVE else "Ask me anything! I'm the fast twin! ⚡",
+                     show_label=False,
+                     scale=8,
+                     container=False
+                 )
+                 submit_btn = gr.Button("Send 🚀" if FESTIVE else "Send", variant="primary", scale=1)
+                 stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
+
+             with gr.Row():
+                 clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
+                 retry_btn = gr.Button("🔄 Retry", size="sm")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### ⚙️ Generation Settings")
+
+             max_tokens = gr.Slider(
+                 minimum=50,
+                 maximum=1024,
+                 value=512,
+                 step=50,
+                 label="Max Tokens",
+                 info="Maximum length of response"
+             )
+
+             temperature = gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=0.8,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher = more creative"
+             )
+
+             top_k = gr.Slider(
+                 minimum=1,
+                 maximum=100,
+                 value=40,
+                 step=1,
+                 label="Top-K",
+                 info="Sample from top K tokens"
+             )
+
+             top_p = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top-P",
+                 info="Nucleus sampling threshold"
+             )
+
+             repetition_penalty = gr.Slider(
+                 minimum=1.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.1,
+                 label="Repetition Penalty",
+                 info="Penalize repeated tokens"
+             )
+
+             gr.Markdown("---")
+
+             # Model info
+             if FESTIVE:
+                 gr.Markdown(f"""
+                 ### 🎊 SAM-Z-1 Model Info
+
+                 **🎯 The Fast Twin!**
+
+                 **Type:** Direct Response Model
+                 **Parameters:** ~140M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+                 **Speed:** ⚡ Optimized with TF Functions
+
+                 **Twin Models:**
+                 - **SAM-X-1**: Reasoning model (with thinking)
+                 - **SAM-Z-1**: Fast model (YOU ARE HERE! 🎉)
+
+                 **Architecture:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - No bias terms (efficient!)
+
+                 **Training:**
+                 - Trained from scratch
+                 - TPU v5e-8 (8 cores)
+                 - Mixed precision (bfloat16)
+                 - Cosine decay schedule
+                 """)
+             else:
+                 gr.Markdown(f"""
+                 ### 📊 Model Info
+
+                 **Architecture:** SAM-Z-1 (Direct Response)
+                 **Parameters:** ~140M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+
+                 **Twin Models:**
+                 - SAM-X-1: Reasoning model
+                 - SAM-Z-1: Direct response model
+
+                 **Features:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - TF-optimized inference
+                 """)
+
+     # Example prompts
+     gr.Examples(
+         examples=[
+             "Hi! What can you do?",
+             "Explain quantum computing in simple terms",
+             "Write a short poem about AI",
+             "What's the capital of France?",
+             "How do I learn programming?",
+             "Tell me an interesting fact about space",
+             "What's the difference between you and SAM-X-1?",
+             "Why are you called the fast twin?",
+         ],
+         inputs=msg,
+         label="💡 Try these examples" if not FESTIVE else "🎯 Try these examples!"
+     )
+
+     # Footer
+     if FESTIVE:
+         gr.HTML("""
+         <footer>
+             <p style="font-size: 1.2rem;"><strong>🎉 SAM-Z-1 - LATEST RELEASE! 🎉</strong></p>
+             <p><strong>The Fast Twin</strong> - Direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; color: #999; margin-top: 0.5rem;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model) • Same architecture, different training objective
+             </p>
+             <div style="margin-top: 1rem; font-size: 1.5rem;">
+                 ⚡ 🚀 💫 ✨ 🎯
+             </div>
+         </footer>
+         """)
+     else:
+         gr.HTML("""
+         <footer>
+             <p><strong>SAM-Z-1</strong> - Direct response language model</p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model)
+             </p>
+         </footer>
+         """)
+
+     # Event handlers (after generating, clear the textbox; the original
+     # passed None as an output component, which Gradio rejects)
+     submit_event = msg.submit(
+         chat_stream,
+         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     ).then(
+         lambda: "",
+         outputs=[msg]
+     )
+
+     click_event = submit_btn.click(
+         chat_stream,
+         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     ).then(
+         lambda: "",
+         outputs=[msg]
+     )
+
+     # Stop button
+     stop_btn.click(
+         fn=stop_gen,
+         inputs=None,
+         outputs=None,
+         cancels=[submit_event, click_event]
+     )
+
+     clear_btn.click(lambda: (None, ""), outputs=[chatbot, msg])
+
+     def retry_last(history, max_tok, temp, topk, topp, rep_pen):
+         # Generator: yield (not return) the unchanged history when empty
+         if not history:
+             yield history
+             return
+         last_user_msg = history[-1][0]
+         history = history[:-1]
+         for update in chat_stream(last_user_msg, history, max_tok, temp, topk, topp, rep_pen):
+             yield update
+
+     retry_event = retry_btn.click(
+         retry_last,
+         inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     )
+
+     stop_btn.click(
+         fn=stop_gen,
+         inputs=None,
+         outputs=None,
+         cancels=[retry_event]
+     )
+
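+ # Note: queuing is what lets generator (streaming) handlers run and lets the
+ # cancels= wiring above interrupt in-flight jobs; max_size=20 only caps how
+ # many requests may wait (a default here, not a tuned value).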
+ # Launch
+ if __name__ == "__main__":
+     demo.queue(max_size=20)
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )