Keeby-smilyai committed on
Commit acf0e5f · verified · 1 Parent(s): 765bb8c

Update app.py: replace the multi-model "SmilyAI Studio" task-queue app (SQLite accounts, background worker, SAM-X-1 + SAM-Z-1) with a single-model SAM-Z-1 streaming chat interface.

Files changed (1):
  app.py (+837, -381)

app.py CHANGED
@@ -1,61 +1,31 @@
- import os
- os.environ['KERAS_BACKEND'] = 'tensorflow'
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
  import gradio as gr
  import tensorflow as tf
  import keras
  from huggingface_hub import hf_hub_download
  import json
- import numpy as np
  from tokenizers import Tokenizer
- import threading
  import time
- import queue
- import hashlib
- import sqlite3
- from datetime import datetime
- import uuid

- # ==============================================================================
- # 1. Hardware Optimization & Setup
- # ==============================================================================
- tf.config.threading.set_inter_op_parallelism_threads(2)
- tf.config.threading.set_intra_op_parallelism_threads(4)
- tf.config.optimizer.set_jit(True)
-
- print(f"🚀 SmilyAI Pro System Initializing...")
-
- # ==============================================================================
- # 2. Database
- # ==============================================================================
- def init_db():
-     conn = sqlite3.connect('sam_tasks.db', check_same_thread=False)
-     c = conn.cursor()
-     c.execute('''CREATE TABLE IF NOT EXISTS users
-                  (id INTEGER PRIMARY KEY AUTOINCREMENT,
-                   username TEXT UNIQUE NOT NULL,
-                   password_hash TEXT NOT NULL)''')
-     c.execute('''CREATE TABLE IF NOT EXISTS tasks
-                  (id TEXT PRIMARY KEY,
-                   user_id INTEGER,
-                   model_name TEXT,
-                   prompt TEXT,
-                   status TEXT,
-                   progress INTEGER DEFAULT 0,
-                   result TEXT,
-                   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                   tokens_per_sec REAL DEFAULT 0,
-                   FOREIGN KEY (user_id) REFERENCES users(id))''')
-     conn.commit()
-     return conn
-
- db_conn = init_db()
- db_lock = threading.Lock()

- # ==============================================================================
- # 3. Model (Fixed with tf.cond)
- # ==============================================================================
  @keras.saving.register_keras_serializable()
  class RotaryEmbedding(keras.layers.Layer):
      def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
@@ -64,403 +34,889 @@ class RotaryEmbedding(keras.layers.Layer):
          self.max_len = max_len
          self.theta = theta
          self.built_cache = False
-
      def _build_cache(self):
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True
-
      def call(self, q, k):
          self._build_cache()
          seq_len = tf.shape(q)[2]
-         cos = self.cos_cached[:seq_len, :][None, None, :, :]
-         sin = self.sin_cached[:seq_len, :][None, None, :, :]
-
-         def rotate_half(x):
-             x1, x2 = tf.split(x, 2, axis=-1)
-             return tf.concat([-x2, x1], axis=-1)
-
-         q_rot = (q * cos) + (rotate_half(q) * sin)
-         k_rot = (k * cos) + (rotate_half(k) * sin)
-         return q_rot, k_rot
  @keras.saving.register_keras_serializable()
  class TransformerBlock(keras.layers.Layer):
-     def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, **kwargs):
          super().__init__(**kwargs)
          self.n_heads = n_heads
          self.head_dim = d_model // n_heads
-         self.d_model = d_model
          self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
-         self.pre_attn_norm = keras.layers.LayerNormalization(epsilon=1e-5)
-         self.pre_ffn_norm = keras.layers.LayerNormalization(epsilon=1e-5)
-         self.q_proj = keras.layers.Dense(d_model, use_bias=False)
-         self.k_proj = keras.layers.Dense(d_model, use_bias=False)
-         self.v_proj = keras.layers.Dense(d_model, use_bias=False)
-         self.out_proj = keras.layers.Dense(d_model, use_bias=False)
-         self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False)
-         self.up_proj = keras.layers.Dense(ff_dim, use_bias=False)
-         self.down_proj = keras.layers.Dense(d_model, use_bias=False)
          self.dropout = keras.layers.Dropout(dropout)
-
-     def call(self, x, cache=None, training=None):
-         B = tf.shape(x)[0]
-         T = tf.shape(x)[1]

-         # 1. Attention
          res = x
          y = self.pre_attn_norm(x)

-         q = tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim])
-         k = tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim])
-         v = tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim])
-
-         # KV Cache
-         if cache is not None:
-             k_cache, v_cache = cache
-             k = tf.concat([k_cache, k], axis=1)
-             v = tf.concat([v_cache, v], axis=1)
-         new_cache = (k, v)
-
-         # RoPE
-         q = tf.transpose(q, [0, 2, 1, 3])
-         k_rot = tf.transpose(k, [0, 2, 1, 3])
-         v_t = tf.transpose(v, [0, 2, 1, 3])
-         q, k_rot = self.rope(q, k_rot)
-
-         # Attention Scores
-         scores = tf.matmul(q, k_rot, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, x.dtype))
-
-         # --- 🛠️ FIX: Graph-Safe Causal Mask ---
-         def apply_mask():
-             # Create triangular mask for prefill (T > 1)
-             mask = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
-             return (1.0 - mask) * -1e9
-
-         def no_mask():
-             # No mask needed for decoding step (T=1 attends to all past)
-             return tf.zeros((1, 1))  # Broadcastable 0
-
-         # Use tf.cond instead of python 'if'
-         mask_offset = tf.cond(tf.greater(T, 1), apply_mask, no_mask)
-         scores = scores + mask_offset
-         # -----------------------------------------
-
-         attn = tf.nn.softmax(scores, axis=-1)
-         out = tf.matmul(attn, v_t)
-         out = tf.reshape(tf.transpose(out, [0, 2, 1, 3]), [B, T, self.d_model])
-         x = res + self.out_proj(out)

-         # 2. FFN
          res = x
          y = self.pre_ffn_norm(x)
          ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))

-         return res + ffn, new_cache
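Note: the "🛠️ FIX" in the deleted block above is the interesting part of the old implementation. Inside a `@tf.function` graph, `T = tf.shape(x)[1]` is a symbolic tensor, so a Python-level `if T > 1:` cannot be evaluated at trace time; the branch has to be expressed with `tf.cond` so it is resolved at graph execution. A minimal standalone sketch of the same pattern (not part of the app):

```python
import tensorflow as tf

@tf.function
def causal_offset(x):
    # T is symbolic inside the traced graph; a Python `if T > 1` would
    # fail here, so the branch is expressed with tf.cond instead.
    T = tf.shape(x)[1]

    def apply_mask():
        # Lower-triangular ones: 0 offset where attention is allowed,
        # -1e9 where a query would attend to a future key.
        mask = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
        return (1.0 - mask) * -1e9

    def no_mask():
        # Single-step decode (T == 1) may attend to everything.
        return tf.zeros((1, 1))  # broadcastable no-op

    return tf.cond(tf.greater(T, 1), apply_mask, no_mask)

print(causal_offset(tf.zeros((1, 3, 8))).numpy())  # 3x3 triangular offsets
```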

  @keras.saving.register_keras_serializable()
  class SAM1Model(keras.Model):
-     def __init__(self, config, **kwargs):
-         super().__init__(**kwargs)
-         self.embed = keras.layers.Embedding(config['vocab_size'], config['d_model'])
-         ff_dim = int(config['d_model'] * config['ff_mult'])
-         self.blocks = [
-             TransformerBlock(
-                 config['d_model'], config['n_heads'], ff_dim, config['dropout'],
-                 config['max_len'], config['rope_theta']
-             ) for i in range(config['n_layers'])
-         ]
-         self.norm = keras.layers.LayerNormalization(epsilon=1e-5)
-         self.lm_head = keras.layers.Dense(config['vocab_size'], use_bias=False)
-
-     def call(self, input_ids, cache=None, training=None):
          x = self.embed(input_ids)
-         new_caches = []
-         for i, block in enumerate(self.blocks):
-             c_i = cache[i] if cache is not None else None
-             x, nc_i = block(x, cache=c_i, training=training)
-             new_caches.append(nc_i)
-         return self.lm_head(self.norm(x)), new_caches
- # ==============================================================================
- # 4. Load Models
- # ==============================================================================
- print("\n📦 Loading Resources...")

- dummy_in = tf.zeros((1, 1), dtype=tf.int32)

- # SAM-X (Reasoning)
- print("🔹 SAM-X-1 (Reasoning)")
  try:
-     samx_cfg = json.load(open(hf_hub_download("Smilyai-labs/Sam-1-large-it-0002", "config.json")))
-     samx_model = SAM1Model({
-         'vocab_size': samx_cfg['vocab_size'], 'd_model': samx_cfg['hidden_size'],
-         'n_layers': samx_cfg['num_hidden_layers'], 'n_heads': samx_cfg['num_attention_heads'],
-         'ff_mult': samx_cfg['intermediate_size']/samx_cfg['hidden_size'],
-         'max_len': samx_cfg['max_position_embeddings'], 'rope_theta': samx_cfg['rope_theta'], 'dropout': 0.0
-     })
-     _ = samx_model(dummy_in)
-     samx_model.load_weights(hf_hub_download("Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5"))
-     tokenizer_x = Tokenizer.from_file(hf_hub_download("Smilyai-labs/Sam-1-large-it-0002", "tokenizer.json"))
- except Exception as e: print(f"⚠️ Failed to load SAM-X: {e}")
-
- # SAM-Z (Speed)
- print("🔹 SAM-Z-1 (Fast)")
- try:
-     samz_cfg = json.load(open(hf_hub_download("Smilyai-labs/Sam-Z-1-tensorflow", "config.json")))
-     samz_model = SAM1Model({
-         'vocab_size': samz_cfg['vocab_size'], 'd_model': samz_cfg['hidden_size'],
-         'n_layers': samz_cfg['num_hidden_layers'], 'n_heads': samz_cfg['num_attention_heads'],
-         'ff_mult': samz_cfg['intermediate_size']/samz_cfg['hidden_size'],
-         'max_len': samz_cfg['max_position_embeddings'], 'rope_theta': samz_cfg['rope_theta'], 'dropout': 0.0
-     })
-     _ = samz_model(dummy_in)
-     samz_model.load_weights(hf_hub_download("Smilyai-labs/Sam-Z-1-tensorflow", "ckpt.weights.h5"))
-     tokenizer_z = Tokenizer.from_file(hf_hub_download("Smilyai-labs/Sam-Z-1-tensorflow", "tokenizer.json"))
- except Exception as e: print(f"⚠️ Failed to load SAM-Z: {e}")
-
- @tf.function(jit_compile=True)
- def predict_x(ids, cache): return samx_model(ids, cache=cache, training=False)
-
- @tf.function(jit_compile=True)
- def predict_z(ids, cache): return samz_model(ids, cache=cache, training=False)
  # ==============================================================================
- # 5. Backend Workers
  # ==============================================================================
- task_queue = queue.Queue()

- def worker():
-     while True:
          try:
-             tid, model, prompt = task_queue.get(timeout=1)
-
-             # Select Model
-             if "SAM-X" in model: pred_fn, tok = predict_x, tokenizer_x
-             else: pred_fn, tok = predict_z, tokenizer_z

-             # Inference
-             try:
-                 ids = [i for i in tok.encode(prompt).ids]
-                 gen = []

-                 # Prefill
-                 curr = tf.constant([ids], dtype=tf.int32)
-                 logits, cache = pred_fn(curr, cache=None)
-                 next_t = np.argmax(logits[0, -1, :])
-                 gen.append(next_t)

-                 # Decode
-                 start = time.time()
-                 for i in range(1024):
-                     curr = tf.constant([[gen[-1]]], dtype=tf.int32)
-                     logits, cache = pred_fn(curr, cache=cache)
-                     next_t = np.argmax(logits[0, -1, :])
-                     if next_t == 50256: break
-                     gen.append(next_t)
-
-                     if i % 5 == 0:
-                         txt = tok.decode(gen)
-                         with db_lock:
-                             db_conn.execute("UPDATE tasks SET status='processing', result=?, progress=? WHERE id=?",
-                                             (txt, int(i/10.24), tid))
-                             db_conn.commit()
-
-                 # Done
-                 txt = tok.decode(gen)
-                 with db_lock:
-                     db_conn.execute("UPDATE tasks SET status='completed', result=?, progress=100, completed_at=? WHERE id=?",
-                                     (txt, datetime.now().isoformat(), tid))
-                     db_conn.commit()
-
-             except Exception as e:
-                 print(f"Error {tid}: {e}")
-                 with db_lock:
-                     db_conn.execute("UPDATE tasks SET status='failed', result=? WHERE id=?", (str(e), tid))
-                     db_conn.commit()
-
-             task_queue.task_done()
-         except queue.Empty: continue

- threading.Thread(target=worker, daemon=True).start()
- # ==============================================================================
- # 6. "More Better" UI (Custom CSS + Chat Layout)
- # ==============================================================================
- css = """
- body { background-color: #0b0f19; color: #e5e7eb; }
- .sidebar { background-color: #111827; border-right: 1px solid #374151; height: 100vh; overflow-y: auto; padding: 20px; }
- .main-content { padding: 20px; max-width: 900px; margin: 0 auto; }
- .task-card {
-     background: #1f2937; border: 1px solid #374151; border-radius: 8px;
-     padding: 12px; margin-bottom: 8px; cursor: pointer; transition: all 0.2s;
  }
- .task-card:hover { background: #374151; border-color: #60a5fa; }
- .status-badge {
-     font-size: 10px; padding: 2px 6px; border-radius: 4px; text-transform: uppercase; font-weight: bold;
  }
- .status-queued { background: #f59e0b20; color: #f59e0b; }
- .status-processing { background: #3b82f620; color: #3b82f6; animation: pulse 2s infinite; }
- .status-completed { background: #10b98120; color: #10b981; }
- .status-failed { background: #ef444420; color: #ef4444; }
-
- /* Message Bubbles */
- .chat-container { display: flex; flex-direction: column; gap: 20px; margin-top: 20px; }
- .message { padding: 16px; border-radius: 12px; max-width: 85%; line-height: 1.6; }
- .user-msg { align-self: flex-end; background: #2563eb; color: white; }
- .bot-msg { align-self: flex-start; background: #1f2937; border: 1px solid #374151; color: #e5e7eb; width: 100%; }
-
- /* Thought Block */
- details.think {
-     background: #172554; border-left: 3px solid #3b82f6; border-radius: 4px;
-     padding: 8px; margin-bottom: 12px; font-size: 0.9em; color: #93c5fd;
  }
- details.think summary { cursor: pointer; font-weight: bold; opacity: 0.8; }
- details.think[open] summary { margin-bottom: 8px; border-bottom: 1px solid #3b82f640; padding-bottom: 4px; }

- @keyframes pulse { 0% { opacity: 1; } 50% { opacity: 0.6; } 100% { opacity: 1; } }
  """
- def format_chat(text):
-     if not text: return ""
-     # Beautiful formatted thought blocks
-     if "<think>" in text:
-         parts = text.split("<think>")
-         pre = parts[0]
-         rest = parts[1]
-         if "</think>" in rest:
-             thought, ans = rest.split("</think>")
-             return f"{pre}<details class='think'><summary>🧠 Thought Process</summary>{thought}</details>{ans}"
-         return f"{pre}<details class='think' open><summary>🧠 Thinking...</summary>{rest} <span class='status-processing'>●</span></details>"
-     return text.replace("\n", "<br>")
-
- with gr.Blocks(css=css, title="SmilyAI Studio", theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate")) as demo:
-     user_id = gr.State(value=None)
-     current_task = gr.State(value=None)
-
-     with gr.Row(elem_classes="container"):
-         # --- Left Sidebar (History) ---
-         with gr.Column(scale=1, elem_classes="sidebar"):
-             gr.Markdown("### 🗂️ History")
-             refresh_btn = gr.Button("🔄 Refresh", size="sm", variant="secondary")
-             history_list = gr.HTML("Log in to see tasks")

              gr.Markdown("---")
-             gr.Markdown("### 👤 Account")
-             u_in = gr.Textbox(placeholder="Username", show_label=False)
-             p_in = gr.Textbox(placeholder="Password", show_label=False, type="password")
-             login_btn = gr.Button("Login", size="sm")
-
-         # --- Main Content (Chat & Monitor) ---
-         with gr.Column(scale=3, elem_classes="main-content"):
-             gr.Markdown("# ✨ SmilyAI Studio")

-             with gr.Group():
-                 with gr.Row():
-                     model_sel = gr.Dropdown(
-                         ["SAM-X-1 (Reasoning)", "SAM-Z-1 (Fast)"],
-                         value="SAM-Z-1 (Fast)", label="Select Model", interactive=True
-                     )
-                 prompt_in = gr.Textbox(
-                     placeholder="Ask anything... (e.g. 'Explain quantum physics')",
-                     lines=3, show_label=False
-                 )
-                 with gr.Row():
-                     generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
-
-             # Live View
-             gr.Markdown("### 📡 Live Monitor")
-             with gr.Group():
-                 stream_display = gr.HTML(
-                     "<div style='padding:20px; text-align:center; color:#6b7280'>Select a task to watch</div>",
-                     elem_id="stream-box"
-                 )
-
-     # --- Logic Functions ---
-     def login(u, p):
-         h = hashlib.sha256(p.encode()).hexdigest()
-         with db_lock:
-             c = db_conn.cursor()
-             c.execute("SELECT id FROM users WHERE username=?", (u,))
-             row = c.fetchone()
-             if not row:  # Auto-register for demo
-                 c.execute("INSERT INTO users (username, password_hash) VALUES (?,?)", (u, h))
-                 db_conn.commit()
-                 row = (c.lastrowid,)
-         return row[0], load_history(row[0])
-
-     def create_task(uid, model, text):
-         if not uid: return None, "Please login first"
-         tid = str(uuid.uuid4())
-         with db_lock:
-             db_conn.execute("INSERT INTO tasks (id, user_id, model_name, prompt, status) VALUES (?,?,?,?,?)",
-                             (tid, uid, model, text, 'queued'))
-             db_conn.commit()
-         task_queue.put((tid, model, text))
-         return tid, tid  # Set current task
-
-     def load_history(uid):
-         if not uid: return "Please Login"
-         with db_lock:
-             rows = db_conn.execute("SELECT id, model_name, status, prompt FROM tasks WHERE user_id=? ORDER BY created_at DESC LIMIT 10", (uid,)).fetchall()
-
-         html = ""
-         for r in rows:
-             tid, mod, stat, p = r
-             short_mod = "Reasoning" if "SAM-X" in mod else "Fast"
-             html += f"""
-             <div class='task-card' onclick="setTask('{tid}')">
-                 <div style='display:flex; justify-content:space-between; margin-bottom:4px'>
-                     <span style='font-weight:bold; color:#e5e7eb'>{short_mod}</span>
-                     <span class='status-badge status-{stat}'>{stat}</span>
                  </div>
-                 <div style='font-size:12px; color:#9ca3af; white-space:nowrap; overflow:hidden; text-overflow:ellipsis'>{p}</div>
-                 <div style='font-size:10px; color:#4b5563; margin-top:4px'>ID: {tid[:8]}</div>
-             </div>
-             """
-         return html
-
-     def watch_stream(tid):
-         if not tid: return "Select a task..."
-         with db_lock:
-             row = db_conn.execute("SELECT result, status FROM tasks WHERE id=?", (tid,)).fetchone()
-         if not row: return "Task not found"
-
-         text, status = row
-         formatted = format_chat(text)
-
-         container = f"""
-         <div class='chat-container'>
-             <div class='message bot-msg'>
-                 {formatted}
-             </div>
-         </div>
-         """
-         return container
-
-     # --- Wiring ---
-     login_btn.click(login, [u_in, p_in], [user_id, history_list])

-     generate_btn.click(
-         create_task, [user_id, model_sel, prompt_in], [current_task, current_task]
      ).then(
-         load_history, [user_id], [history_list]
      )

-     refresh_btn.click(load_history, [user_id], [history_list])

-     # Helper to handle Javascript click on HTML cards
-     # Requires a hidden text input to bridge JS -> Python (omitted for brevity, polling works fine)

-     # Auto-refresh stream
-     timer = gr.Timer(0.5)
-     timer.tick(watch_stream, [current_task], [stream_display])
-     timer.tick(load_history, [user_id], [history_list])

  if __name__ == "__main__":
-     demo.queue().launch(server_name="0.0.0.0", server_port=7860)
  import gradio as gr
  import tensorflow as tf
  import keras
  from huggingface_hub import hf_hub_download
  import json
+ import os
  from tokenizers import Tokenizer
+ import numpy as np
  import time

+ # ============================================================================
+ # 🎊 FESTIVE MODE TOGGLE 🎊
+ # ============================================================================
+ FESTIVE = True  # Set to False for production-only mode

+ # ============================================================================
+ # Configuration & Model Loading
+ # ============================================================================

+ print("🚀 Loading SAM-Z-1 Model...")
+
+ MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
+ CACHE_DIR = "./model_cache"
+
+ # ============================================================================
+ # Model Architecture Definitions (FIXED for model loading)
+ # ============================================================================
  @keras.saving.register_keras_serializable()
  class RotaryEmbedding(keras.layers.Layer):
      def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
          self.max_len = max_len
          self.theta = theta
          self.built_cache = False
+
+     def build(self, input_shape):
+         # Use the ORIGINAL training code - compute cache on first call, not in build
+         super().build(input_shape)
+
      def _build_cache(self):
+         """Build RoPE cache on first forward pass"""
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
+
+             # Precompute with NumPy and store as constants to avoid graph issues
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True
+
+     def rotate_half(self, x):
+         x1, x2 = tf.split(x, 2, axis=-1)
+         return tf.concat([-x2, x1], axis=-1)
+
      def call(self, q, k):
+         # Build cache on first call (avoids build-time issues)
          self._build_cache()
+
          seq_len = tf.shape(q)[2]
+         dtype = q.dtype
+         cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
+         sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]

+         q_rotated = (q * cos) + (self.rotate_half(q) * sin)
+         k_rotated = (k * cos) + (self.rotate_half(k) * sin)
+
+         return q_rotated, k_rotated
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
+         return config
80
+ class RMSNorm(keras.layers.Layer):
81
+ def __init__(self, epsilon=1e-5, **kwargs):
82
+ super().__init__(**kwargs)
83
+ self.epsilon = epsilon
84
+
85
+ def build(self, input_shape):
86
+ self.scale = self.add_weight(name="scale", shape=(input_shape[-1],), initializer="ones")
87
+
88
+ def call(self, x):
89
+ variance = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
90
+ return x * tf.math.rsqrt(variance + self.epsilon) * self.scale
91
+
92
+ def get_config(self):
93
+ config = super().get_config()
94
+ config.update({"epsilon": self.epsilon})
95
+ return config
96
+
97
 
98
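Note: `RMSNorm` replaces the `LayerNormalization` used in the deleted version. It skips mean-centering and the bias term and normalizes by the root mean square alone, `y = x / sqrt(mean(x²) + ε) · scale`, the cheaper variant used by Llama-style stacks. The same computation in plain NumPy, for comparison:

```python
import numpy as np

def rms_norm(x, scale, eps=1e-5):
    # Unlike LayerNorm: no mean subtraction, no bias term.
    return x / np.sqrt(np.mean(x**2, axis=-1, keepdims=True) + eps) * scale

x = np.random.randn(2, 4).astype(np.float32)
print(rms_norm(x, scale=np.ones(4, dtype=np.float32)))
```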
  @keras.saving.register_keras_serializable()
99
  class TransformerBlock(keras.layers.Layer):
100
+ def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
101
  super().__init__(**kwargs)
102
+ self.d_model = d_model
103
  self.n_heads = n_heads
104
+ self.ff_dim = ff_dim
105
+ self.dropout_rate = dropout
106
+ self.max_len = max_len
107
+ self.rope_theta = rope_theta
108
  self.head_dim = d_model // n_heads
109
+ self.layer_idx = layer_idx
110
+
111
+ self.pre_attn_norm = RMSNorm()
112
+ self.pre_ffn_norm = RMSNorm()
113
+
114
+ self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
115
+ self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
116
+ self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
117
+ self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
118
+
119
  self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
120
+
121
+ self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
122
+ self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
123
+ self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
124
+
 
 
 
 
125
  self.dropout = keras.layers.Dropout(dropout)
126
+
127
+ def call(self, x, training=None):
128
+ B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
129
+ dtype = x.dtype
130
 
131
+ # Attention
132
  res = x
133
  y = self.pre_attn_norm(x)
134
 
135
+ q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
136
+ k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
137
+ v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
138
+
139
+ q, k = self.rope(q, k)
140
+
141
+ scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ mask = tf.where(
144
+ tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
145
+ tf.constant(-1e9, dtype=dtype),
146
+ tf.constant(0.0, dtype=dtype)
147
+ )
148
+ scores += mask
149
+ attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
150
+
151
+ attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
152
+ x = res + self.dropout(self.out_proj(attn), training=training)
153
+
154
+ # FFN (SwiGLU)
155
  res = x
156
  y = self.pre_ffn_norm(x)
157
  ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
158
 
159
+ return res + self.dropout(ffn, training=training)
160
+
161
+ def get_config(self):
162
+ config = super().get_config()
163
+ config.update({
164
+ "d_model": self.d_model,
165
+ "n_heads": self.n_heads,
166
+ "ff_dim": self.ff_dim,
167
+ "dropout": self.dropout_rate,
168
+ "max_len": self.max_len,
169
+ "rope_theta": self.rope_theta,
170
+ "layer_idx": self.layer_idx
171
+ })
172
+ return config
173
+
174
 
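Note: compared with the deleted block, this `call` drops the KV cache and the `tf.cond` branch: the causal mask is built unconditionally and every decoding step re-runs attention over the whole prefix, which is simpler and serializes cleanly but costs O(T²) per generated token. The `tf.where`/`band_part` mask is equivalent to the old `(1 - tril) * -1e9` trick, which a quick standalone check confirms:

```python
import tensorflow as tf

T = 4
tril = tf.linalg.band_part(tf.ones([T, T]), -1, 0)                   # lower-triangular ones
mask_new = tf.where(tril == 0, tf.constant(-1e9), tf.constant(0.0))  # this commit
mask_old = (1.0 - tril) * -1e9                                       # deleted version
tf.debugging.assert_near(mask_new, mask_old)                         # same additive mask
```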
  @keras.saving.register_keras_serializable()
  class SAM1Model(keras.Model):
+     def __init__(self, **kwargs):
+         super().__init__()
+         if 'config' in kwargs and isinstance(kwargs['config'], dict):
+             self.cfg = kwargs['config']
+         elif 'vocab_size' in kwargs:
+             self.cfg = kwargs
+         else:
+             self.cfg = kwargs.get('cfg', kwargs)
+
+         self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
+
+         ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
+         block_args = {
+             'd_model': self.cfg['d_model'],
+             'n_heads': self.cfg['n_heads'],
+             'ff_dim': ff_dim,
+             'dropout': self.cfg['dropout'],
+             'max_len': self.cfg['max_len'],
+             'rope_theta': self.cfg['rope_theta']
+         }
+
+         self.blocks = []
+         for i in range(self.cfg['n_layers']):
+             block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
+             self.blocks.append(block)
+
+         self.norm = RMSNorm(name="final_norm")
+         self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
+
+     def call(self, input_ids, training=None):
          x = self.embed(input_ids)
+
+         for block in self.blocks:
+             x = block(x, training=training)
+
+         return self.lm_head(self.norm(x))
+
+     def get_config(self):
+         base_config = super().get_config()
+         base_config['config'] = self.cfg
+         return base_config

+ print("✅ Model architecture registered")
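Note: every custom class above carries `@keras.saving.register_keras_serializable()` plus a matching `get_config`, which is what lets `keras.models.load_model("model.keras")` further down rebuild `SAM1Model`/`TransformerBlock`/`RMSNorm` without the training script on hand. The contract in miniature (standalone sketch with a hypothetical `Scale` layer):

```python
import keras

@keras.saving.register_keras_serializable()
class Scale(keras.layers.Layer):
    def __init__(self, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, x):
        return x * self.factor

    def get_config(self):
        # Everything __init__ needs must round-trip through the config.
        config = super().get_config()
        config.update({"factor": self.factor})
        return config

model = keras.Sequential([keras.Input(shape=(4,)), Scale(3.0)])
model.save("scale_demo.keras")
reloaded = keras.models.load_model("scale_demo.keras")  # no custom_objects needed
```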
+ # Download model files
+ config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)

+ # Try to download checkpoint weights first (more reliable)
  try:
+     weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
+     print("✅ Found checkpoint weights (ckpt.weights.h5)")
+     use_checkpoint = True
+ except Exception as e:
+     print(f"⚠️ Checkpoint not found, falling back to model.keras: {e}")
+     model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
+     use_checkpoint = False
+
+ # Load config
+ with open(config_path, 'r') as f:
+     config = json.load(f)
+
+ # Create tokenizer from scratch
+ print("📦 Creating tokenizer from GPT-2 base...")
+ from transformers import AutoTokenizer
+
+ hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+ # Add custom tokens to match model's vocab size
+ custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
+ hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
+
+ # Save and reload as tokenizers format
+ os.makedirs("./temp_tokenizer", exist_ok=True)
+ hf_tokenizer.save_pretrained("./temp_tokenizer")
+ tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
+
+ print(f"✅ Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
+ print(f"   Custom tokens added: {custom_tokens}")
+ print(f"   Model vocab size: {config.get('vocab_size', 'unknown')}")
+
+ # Verify vocab sizes match
+ if tokenizer.get_vocab_size() != config.get('vocab_size'):
+     print(f"⚠️ WARNING: Tokenizer vocab ({tokenizer.get_vocab_size()}) != Model vocab ({config.get('vocab_size')})")
+     print(f"   Model was trained with these tokens, but SAM-Z-1 doesn't use <think> tags in generation")
+
+ eos_token_id = config.get('eos_token_id', 50256)
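Note: the tokenizer is rebuilt at startup instead of downloaded: GPT-2's fast tokenizer plus four added special tokens, written out with `save_pretrained` (which, for fast tokenizers, emits a `tokenizer.json`) and re-read through the `tokenizers` library. A quick sanity check in the same spirit, assuming `transformers` is installed:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

# Each special token should map to a single id, not be split into pieces.
ids = tok.encode("<|im_start|>user", add_special_tokens=False)
print(ids, tok.convert_ids_to_tokens(ids))  # e.g. [50257, 7220] -> ['<|im_start|>', 'user']
```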
  # ==============================================================================
+ # Load Model - Priority: checkpoint weights > saved model
  # ==============================================================================
+ print("\n🔄 Loading model...")

+ if use_checkpoint:
+     print("📦 Building model from config and loading checkpoint weights...")
+
+     # Build model from scratch with config
+     model_config = {
+         'vocab_size': config['vocab_size'],
+         'd_model': config['hidden_size'],
+         'n_layers': config['num_hidden_layers'],
+         'n_heads': config['num_attention_heads'],
+         'ff_mult': config['intermediate_size'] / config['hidden_size'],
+         'max_len': config['max_position_embeddings'],
+         'dropout': 0.1,  # Default dropout
+         'rope_theta': config['rope_theta']
+     }
+
+     model = SAM1Model(config=model_config)
+
+     # Build model by running a dummy forward pass
+     dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
+     _ = model(dummy_input, training=False)
+
+     print(f"✅ Model architecture built: {model.count_params():,} parameters")
+
+     # Load checkpoint weights
+     print(f"📥 Loading checkpoint weights from: {weights_path}")
+     model.load_weights(weights_path)
+     print("✅ Checkpoint weights loaded successfully!")
+
+ else:
+     print("📦 Loading full saved model...")
+     try:
+         model = keras.models.load_model(model_path, compile=False)
+         print("✅ Model loaded successfully")
+     except Exception as e:
+         print(f"❌ Failed to load model: {e}")
+         print("\n🔄 Trying alternative: building from config + loading weights...")
+
+         # Fallback to building model
+         model_config = {
+             'vocab_size': config['vocab_size'],
+             'd_model': config['hidden_size'],
+             'n_layers': config['num_hidden_layers'],
+             'n_heads': config['num_attention_heads'],
+             'ff_mult': config['intermediate_size'] / config['hidden_size'],
+             'max_len': config['max_position_embeddings'],
+             'dropout': 0.1,
+             'rope_theta': config['rope_theta']
+         }
+
+         model = SAM1Model(config=model_config)
+         dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
+         _ = model(dummy_input, training=False)
+
+         # Try to load weights from model.keras
          try:
+             temp_model = keras.models.load_model(model_path, compile=False)
+             model.set_weights(temp_model.get_weights())
+             print("✅ Weights transferred successfully")
+         except:
+             print("❌ Could not load weights - model may not work correctly!")
+             raise
+
+ # Create optimized inference function
+ @tf.function(reduce_retracing=True)
+ def fast_forward(input_tensor):
+     """TF-optimized forward pass for faster generation"""
+     return model(input_tensor, training=False)
+
+ print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
+ print(f"✅ TF function optimization enabled for faster inference")
+
+ # Global stop flag
341
+ stop_generation = False
342
+
343
+ # ============================================================================
344
+ # Generation Function with Streaming & Stop Button
345
+ # ============================================================================
346
+
347
+ def generate_stream(
348
+ prompt: str,
349
+ max_tokens: int = 512,
350
+ temperature: float = 0.8,
351
+ top_k: int = 40,
352
+ top_p: float = 0.9,
353
+ repetition_penalty: float = 1.1
354
+ ):
355
+ """Generate text with streaming output and stop support"""
356
+ global stop_generation
357
+ stop_generation = False
358
+
359
+ # Tokenize prompt
360
+ input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
361
+
362
+ if len(input_ids) == 0:
363
+ yield "⚠️ Empty prompt after tokenization"
364
+ return
365
+
366
+ if len(input_ids) > config['max_position_embeddings'] - max_tokens:
367
+ input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
368
+
369
+ input_tensor = tf.constant([input_ids], dtype=tf.int32)
370
+ generated_text = ""
371
+ token_count = 0
372
+
373
+ # Track token frequencies for repetition penalty
374
+ token_freq = {}
375
+
376
+ start_time = time.time()
377
+
378
+ for step in range(max_tokens):
379
+ # Check stop flag
380
+ if stop_generation:
381
+ generated_text += "\n\n*[Generation stopped by user]*"
382
+ yield generated_text
383
+ break
384
+
385
+ # Get logits using optimized TF function
386
+ logits = fast_forward(input_tensor)
387
+ next_token_logits = logits[0, -1, :].numpy()
388
+
389
+ # Apply temperature
390
+ next_token_logits = next_token_logits / temperature
391
+
392
+ # Apply repetition penalty
393
+ if repetition_penalty != 1.0:
394
+ for token_id, freq in token_freq.items():
395
+ if token_id < len(next_token_logits):
396
+ next_token_logits[token_id] /= (repetition_penalty ** freq)
397
+
398
+ # Top-k filtering
399
+ if top_k > 0:
400
+ top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
401
+ top_k_logits = next_token_logits[top_k_indices]
402
+ top_k_probs = tf.nn.softmax(top_k_logits).numpy()
403
 
404
+ # Top-p (nucleus) sampling
405
+ if top_p < 1.0:
406
+ sorted_indices = np.argsort(top_k_probs)[::-1]
407
+ cumsum = np.cumsum(top_k_probs[sorted_indices])
408
+ cutoff_idx = np.searchsorted(cumsum, top_p)
409
+ nucleus_indices = sorted_indices[:cutoff_idx + 1]
410
 
411
+ nucleus_logits = top_k_logits[nucleus_indices]
412
+ nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
 
 
 
413
 
414
+ sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
415
+ next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
416
+ else:
417
+ sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
418
+ next_token_id = int(top_k_indices[sampled_idx])
419
+ else:
420
+ probs = tf.nn.softmax(next_token_logits).numpy()
421
+ next_token_id = np.random.choice(len(probs), p=probs)
422
+
423
+ # Stop on EOS
424
+ if next_token_id == eos_token_id:
425
+ break
426
+
427
+ # Update token frequency
428
+ token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
429
+
430
+ # Decode and yield
431
+ token_text = tokenizer.decode([next_token_id])
432
+ generated_text += token_text
433
+ token_count += 1
434
+
435
+ # Yield progressive output
436
+ yield generated_text
437
+
438
+ # Update input
439
+ input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
440
+
441
+ # Truncate if too long
442
+ if input_tensor.shape[1] > config['max_position_embeddings']:
443
+ input_tensor = input_tensor[:, -config['max_position_embeddings']:]
444
+
445
+ # Calculate stats
446
+ elapsed = time.time() - start_time
447
+ tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
448
+
449
+ # Add generation stats
450
+ if token_count > 0 and not stop_generation:
451
+ generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({tokens_per_sec:.1f} tok/s)]*"
452
+
453
+ yield generated_text
454
 
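Note: `generate_stream` is a plain Python generator that yields the accumulated completion after every sampled token; sampling applies temperature scaling, a frequency-scaled repetition penalty, top-k filtering, and optionally top-p nucleus re-weighting inside the top-k shortlist. Consumed outside Gradio it would look like this (sketch; assumes the model and tokenizer above are loaded):

```python
# Stream a completion to stdout, printing only each newly added suffix.
previous = ""
for text in generate_stream("The capital of France is", max_tokens=32,
                            temperature=0.7, top_k=40, top_p=0.9,
                            repetition_penalty=1.1):
    print(text[len(previous):], end="", flush=True)
    previous = text
print()
```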
+ # ============================================================================
+ # Chat Interface Logic
+ # ============================================================================
+
+ def format_chat_prompt(message: str, history: list) -> str:
+     """Format message history into chat prompt"""
+     prompt = ""
+
+     # Add history
+     for user_msg, assistant_msg in history:
+         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
+         if assistant_msg:
+             prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
+
+     # Add current message
+     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+
+     return prompt
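Note: `format_chat_prompt` serializes the history into ChatML-style turns using the `<|im_start|>`/`<|im_end|>` tokens added to the tokenizer earlier, ending with an open assistant turn for the model to complete. For one prior exchange the prompt handed to `generate_stream` looks like this:

```python
history = [["Hi!", "Hello! How can I help?"]]
print(format_chat_prompt("What's 2+2?", history))
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
# Hello! How can I help?<|im_end|>
# <|im_start|>user
# What's 2+2?<|im_end|>
# <|im_start|>assistant
```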
+ def chat_stream(
+     message: str,
+     history: list,
+     max_tokens: int,
+     temperature: float,
+     top_k: int,
+     top_p: float,
+     repetition_penalty: float
+ ):
+     """Streaming chat response"""
+     if not message.strip():
+         yield history
+         return
+
+     # Format prompt
+     prompt = format_chat_prompt(message, history)
+
+     # Generate with streaming
+     partial_response = ""
+     for generated in generate_stream(
+         prompt,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty
+     ):
+         partial_response = generated
+
+         # Stop at end tags
+         if "<|im_end|>" in partial_response:
+             partial_response = partial_response.split("<|im_end|>")[0]
+
+         # Update history
+         yield history + [[message, partial_response.strip()]]
+
+ def stop_gen():
+     """Stop generation callback"""
+     global stop_generation
+     stop_generation = True
+     return None
+
+ # ============================================================================
+ # Gradio UI
+ # ============================================================================
+
+ # Festive CSS
+ festive_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header {
+     text-align: center;
+     padding: 2rem;
+     background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+     color: white;
+     border-radius: 12px;
+     margin-bottom: 2rem;
+     box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
+     animation: pulse 2s ease-in-out infinite;
+ }
+
+ @keyframes pulse {
+     0%, 100% { transform: scale(1); }
+     50% { transform: scale(1.02); }
+ }
+
+ .header h1 {
+     font-size: 2.8rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
+     text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
+ }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
+ }
+
+ .celebration {
+     font-size: 2rem;
+     margin: 0.5rem;
+     animation: bounce 1s ease infinite;
+ }
+
+ @keyframes bounce {
+     0%, 100% { transform: translateY(0); }
+     50% { transform: translateY(-10px); }
+ }
+
+ .stats-card {
+     background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+     padding: 1.5rem;
+     border-radius: 12px;
+     border-left: 4px solid #f5576c;
+     margin: 1rem 0;
+     box-shadow: 0 4px 16px rgba(252, 182, 159, 0.3);
+ }
+
+ .twin-badge {
+     display: inline-block;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     padding: 0.5rem 1rem;
+     border-radius: 20px;
+     font-weight: bold;
+     margin: 0.5rem;
+     box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
+ }
+
+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
+
+ .confetti {
+     position: fixed;
+     width: 10px;
+     height: 10px;
+     background: #f5576c;
+     position: absolute;
+     animation: confetti-fall 3s linear infinite;
+ }
+
+ @keyframes confetti-fall {
+     to { transform: translateY(100vh) rotate(360deg); }
+ }
+ """
+
+ # Production CSS
+ production_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header {
+     text-align: center;
+     padding: 2rem;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     border-radius: 12px;
+     margin-bottom: 2rem;
+ }
+
+ .header h1 {
+     font-size: 2.5rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
  }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
  }
+
+ .stats-card {
+     background: #f8f9fa;
+     padding: 1rem;
+     border-radius: 8px;
+     border-left: 4px solid #667eea;
+     margin: 1rem 0;
  }

+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
  """
+ # Select CSS based on mode
+ custom_css = festive_css if FESTIVE else production_css
+
+ # Build interface
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+     # Header
+     if FESTIVE:
+         gr.HTML("""
+         <div class="header">
+             <div class="celebration">🎉 🎊 ✨ 🎈 🎆</div>
+             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
+                  alt="SAM-Z-1"
+                  style="max-width: 400px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 8px 24px rgba(0,0,0,0.2);">
+             <h1>🤖 SAM-Z-1 Chat 🤖</h1>
+             <p><strong>LATEST RELEASE!</strong> Our <strong>best</strong> non-reasoning model</p>
+             <div class="twin-badge">Twin of SAM-X-1 (Reasoning Model)</div>
+             <p style="font-size: 0.9rem; margin-top: 1rem;">
+                 768D • 16 Layers • 12 Heads • ~313M Parameters • Trained on TPU v5e-8
+             </p>
+             <div class="celebration">🚀 💫 🎯 ⚡ 🔥</div>
+         </div>
+         """)
+     else:
+         gr.HTML("""
+         <div class="header">
+             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
+                  alt="SAM-Z-1"
+                  style="max-width: 300px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 4px 16px rgba(0,0,0,0.15);">
+             <h1>🤖 SAM-Z-1 Chat</h1>
+             <p>Fast, direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; margin-top: 0.5rem;">
+                 768D • 16 Layers • 12 Heads • Trained on TPU v5e-8
+             </p>
+         </div>
+         """)
+
+     with gr.Row():
+         with gr.Column(scale=4):
+             # Chat interface with bot avatar
+             chatbot = gr.Chatbot(
+                 height=600,
+                 show_label=False,
+                 avatar_images=(
+                     None,
+                     "https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/KtiMi-aDUOOeN--YNT-Fu.jpeg"
+                 ),
+                 bubble_full_width=False
+             )
+
+             with gr.Row():
+                 msg = gr.Textbox(
+                     placeholder="Type your message here..." if not FESTIVE else "Ask me anything! I'm the fast twin! ⚡",
+                     show_label=False,
+                     scale=8,
+                     container=False
+                 )
+                 submit_btn = gr.Button("Send 🚀" if FESTIVE else "Send", variant="primary", scale=1)
+                 stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
+
+             with gr.Row():
+                 clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
+                 retry_btn = gr.Button("🔄 Retry", size="sm")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### ⚙️ Generation Settings")
+
+             max_tokens = gr.Slider(
+                 minimum=50,
+                 maximum=1024,
+                 value=512,
+                 step=50,
+                 label="Max Tokens",
+                 info="Maximum length of response"
+             )
+
+             temperature = gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=0.8,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher = more creative"
+             )
+
+             top_k = gr.Slider(
+                 minimum=1,
+                 maximum=100,
+                 value=40,
+                 step=1,
+                 label="Top-K",
+                 info="Sample from top K tokens"
+             )
+
+             top_p = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top-P",
+                 info="Nucleus sampling threshold"
+             )
+
+             repetition_penalty = gr.Slider(
+                 minimum=1.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.1,
+                 label="Repetition Penalty",
+                 info="Penalize repeated tokens"
+             )

              gr.Markdown("---")

+             # Model info
+             if FESTIVE:
+                 gr.Markdown(f"""
+                 ### 🎊 SAM-Z-1 Model Info
+
+                 **🎯 The Fast Twin!**
+
+                 **Type:** Direct Response Model
+                 **Parameters:** ~313M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+                 **Speed:** ⚡ Optimized with TF Functions
+
+                 **Twin Model:**
+                 - **SAM-X-1**: Reasoning model (uses `<think>` tags)
+                 - **SAM-Z-1**: Fast model (no thinking, direct answers! 🎉)
+
+                 **Note:** Model includes `<think>` tokens in its vocab but doesn't use them. Training used the same tokenizer as SAM-X-1.
+
+                 **Architecture:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - No bias terms (efficient!)
+
+                 **Training:**
+                 - Trained from scratch
+                 - TPU v5e-8 (8 cores)
+                 - Mixed precision (bfloat16)
+                 - Cosine decay schedule
+                 """)
+             else:
+                 gr.Markdown(f"""
+                 ### 📊 Model Info
+
+                 **Architecture:** SAM-Z-1 (Direct Response)
+                 **Parameters:** ~313M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+
+                 **Twin Models:**
+                 - SAM-X-1: Reasoning model (uses `<think>` tags)
+                 - SAM-Z-1: Direct response model (no thinking)
+
+                 **Note:** Vocab includes `<think>` tokens but the model doesn't use them in generation.
+
+                 **Features:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - TF-optimized inference
+                 """)
+
+     # Example prompts
+     gr.Examples(
+         examples=[
+             "Hi! What can you do?",
+             "Explain quantum computing in simple terms",
+             "Write a short poem about AI",
+             "What's the capital of France?",
+             "How do I learn programming?",
+             "Tell me an interesting fact about space",
+             "What's the difference between you and SAM-X-1?",
+             "Why are you called the fast twin?",
+         ],
+         inputs=msg,
+         label="💡 Try these examples" if not FESTIVE else "🎯 Try these examples!"
+     )
+
+     # Footer
+     if FESTIVE:
+         gr.HTML("""
+         <footer>
+             <p style="font-size: 1.2rem;"><strong>🎉 SAM-Z-1 - LATEST RELEASE! 🎉</strong></p>
+             <p><strong>The Fast Twin</strong> - Direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; color: #999; margin-top: 0.5rem;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model) • Same architecture, different training objective
+             </p>
+             <div style="margin-top: 1rem; font-size: 1.5rem;">
+                 ⚡ 🚀 💫 ✨ 🎯
              </div>
+         </footer>
+         """)
+     else:
+         gr.HTML("""
+         <footer>
+             <p><strong>SAM-Z-1</strong> - Direct response language model</p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model)
+             </p>
+         </footer>
+         """)
+
+ # Event handlers
865
+ submit_event = msg.submit(
866
+ chat_stream,
867
+ inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
868
+ outputs=[chatbot]
869
+ ).then(
870
+ lambda: "",
871
+ outputs=[msg]
872
+ )
 
 
873
 
874
+ click_event = submit_btn.click(
875
+ chat_stream,
876
+ inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
877
+ outputs=[chatbot]
878
  ).then(
879
+ lambda: "",
880
+ outputs=[msg]
881
  )
882
 
883
+ # Stop button
884
+ stop_btn.click(
885
+ fn=stop_gen,
886
+ inputs=None,
887
+ outputs=None,
888
+ cancels=[submit_event, click_event]
889
+ )
890
 
891
+ clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
 
892
 
893
+ def retry_last(history, max_tok, temp, topk, topp, rep_pen):
894
+ if not history:
895
+ return history
896
+ last_user_msg = history[-1][0]
897
+ history = history[:-1]
898
+ for update in chat_stream(last_user_msg, history, max_tok, temp, topk, topp, rep_pen):
899
+ yield update
900
+
901
+ retry_event = retry_btn.click(
902
+ retry_last,
903
+ inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
904
+ outputs=[chatbot]
905
+ )
906
+
907
+ stop_btn.click(
908
+ fn=stop_gen,
909
+ inputs=None,
910
+ outputs=None,
911
+ cancels=[retry_event]
912
+ )
913
 
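Note: stopping is handled twice over: the `stop_generation` flag makes `generate_stream` break out on its next loop iteration, while `cancels=[...]` tells Gradio to kill the queued event itself (this needs the queue enabled, which the launch block below does). The pattern in isolation, with a trivial generator:

```python
import time
import gradio as gr

def slow_count():
    for i in range(100):
        time.sleep(0.1)
        yield str(i)

with gr.Blocks() as demo:
    out = gr.Textbox()
    start = gr.Button("Start")
    stop = gr.Button("Stop")
    run = start.click(slow_count, outputs=out)
    stop.click(fn=None, cancels=[run])  # cancel the in-flight event

demo.queue()
demo.launch()
```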
+ # Launch
  if __name__ == "__main__":
+     demo.queue(max_size=20)
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )