Keeby-smilyai committed on
Commit 0feb44a · verified · 1 Parent(s): 10fd5b9

Update app.py

Files changed (1):
  1. app.py +592 -673
app.py CHANGED
@@ -1,8 +1,3 @@
- """
- SAM-Z-1 Production API with Gradio UI
- OpenAI-compatible API interface for Hugging Face Spaces
- """
-
  import gradio as gr
  import tensorflow as tf
  import keras
@@ -12,23 +7,23 @@ import os
  from tokenizers import Tokenizer
  import numpy as np
  import time
- from typing import Dict, Any, List

  # ============================================================================
- # Configuration
  # ============================================================================

  MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
  CACHE_DIR = "./model_cache"

- # Global model storage
- model = None
- tokenizer = None
- config = None
- eos_token_id = None
-
  # ============================================================================
- # Model Architecture (same as original)
  # ============================================================================

  @keras.saving.register_keras_serializable()
@@ -41,14 +36,18 @@ class RotaryEmbedding(keras.layers.Layer):
          self.built_cache = False

      def build(self, input_shape):
          super().build(input_shape)

      def _build_cache(self):
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True
@@ -58,13 +57,17 @@ class RotaryEmbedding(keras.layers.Layer):
          return tf.concat([-x2, x1], axis=-1)

      def call(self, q, k):
          self._build_cache()
          seq_len = tf.shape(q)[2]
          dtype = q.dtype
          cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
          sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
          q_rotated = (q * cos) + (self.rotate_half(q) * sin)
          k_rotated = (k * cos) + (self.rotate_half(k) * sin)
          return q_rotated, k_rotated

      def get_config(self):
@@ -107,20 +110,25 @@ class TransformerBlock(keras.layers.Layer):

          self.pre_attn_norm = RMSNorm()
          self.pre_ffn_norm = RMSNorm()
          self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
          self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
          self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
          self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
          self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
          self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
          self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
          self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
          self.dropout = keras.layers.Dropout(dropout)

      def call(self, x, training=None):
          B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
          dtype = x.dtype

          res = x
          y = self.pre_attn_norm(x)
@@ -129,7 +137,9 @@ class TransformerBlock(keras.layers.Layer):
          v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])

          q, k = self.rope(q, k)
          scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
          mask = tf.where(
              tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
              tf.constant(-1e9, dtype=dtype),
@@ -137,9 +147,11 @@ class TransformerBlock(keras.layers.Layer):
          )
          scores += mask
          attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
          attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
          x = res + self.dropout(self.out_proj(attn), training=training)

          res = x
          y = self.pre_ffn_norm(x)
          ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
@@ -149,9 +161,13 @@ class TransformerBlock(keras.layers.Layer):
      def get_config(self):
          config = super().get_config()
          config.update({
-             "d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
-             "dropout": self.dropout_rate, "max_len": self.max_len,
-             "rope_theta": self.rope_theta, "layer_idx": self.layer_idx
          })
          return config
@@ -171,20 +187,28 @@ class SAM1Model(keras.Model):

          ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
          block_args = {
-             'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
-             'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
-             'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
          }

-         self.blocks = [TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
-                        for i in range(self.cfg['n_layers'])]
          self.norm = RMSNorm(name="final_norm")
          self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

      def call(self, input_ids, training=None):
          x = self.embed(input_ids)
          for block in self.blocks:
              x = block(x, training=training)
          return self.lm_head(self.norm(x))

      def get_config(self):
@@ -192,43 +216,60 @@ class SAM1Model(keras.Model):
          base_config['config'] = self.cfg
          return base_config

- # ============================================================================
- # Model Loading
- # ============================================================================
-
- print("🚀 Loading SAM-Z-1 Model for API...")

  config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)

  try:
      weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
      use_checkpoint = True
-     print("✅ Found checkpoint weights")
- except:
      model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
      use_checkpoint = False
-     print("✅ Found saved model")

  with open(config_path, 'r') as f:
      config = json.load(f)

- eos_token_id = config.get('eos_token_id', 50256)
-
- # Create tokenizer
- print("📦 Creating tokenizer...")
  from transformers import AutoTokenizer
  hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
- hf_tokenizer.add_special_tokens({
-     "additional_special_tokens": ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
- })
  os.makedirs("./temp_tokenizer", exist_ok=True)
  hf_tokenizer.save_pretrained("./temp_tokenizer")
  tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

- # Load model
  if use_checkpoint:
-     print("📦 Building model and loading weights...")
      model_config = {
          'vocab_size': config['vocab_size'],
          'd_model': config['hidden_size'],
@@ -236,49 +277,119 @@ if use_checkpoint:
          'n_heads': config['num_attention_heads'],
          'ff_mult': config['intermediate_size'] / config['hidden_size'],
          'max_len': config['max_position_embeddings'],
-         'dropout': 0.1,
          'rope_theta': config['rope_theta']
      }
      model = SAM1Model(config=model_config)
      dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
      _ = model(dummy_input, training=False)
      model.load_weights(weights_path)
  else:
-     model = keras.models.load_model(model_path, compile=False)

  @tf.function(reduce_retracing=True)
  def fast_forward(input_tensor):
      return model(input_tensor, training=False)

- print(f"✅ Model loaded: {config['num_hidden_layers']} layers, ~313M params")

  # ============================================================================
- # Generation Engine
  # ============================================================================

- def generate_tokens(
-     input_ids: List[int],
      max_tokens: int = 512,
      temperature: float = 0.8,
      top_k: int = 40,
      top_p: float = 0.9,
      repetition_penalty: float = 1.1
  ):
-     """Generator that yields tokens one at a time"""
      if len(input_ids) > config['max_position_embeddings'] - max_tokens:
          input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]

      input_tensor = tf.constant([input_ids], dtype=tf.int32)
      token_freq = {}

      for step in range(max_tokens):
          logits = fast_forward(input_tensor)
          next_token_logits = logits[0, -1, :].numpy()

-         # Temperature
          next_token_logits = next_token_logits / temperature

-         # Repetition penalty
          if repetition_penalty != 1.0:
              for token_id, freq in token_freq.items():
                  if token_id < len(next_token_logits):
@@ -290,14 +401,16 @@ def generate_tokens(
          top_k_logits = next_token_logits[top_k_indices]
          top_k_probs = tf.nn.softmax(top_k_logits).numpy()

-         # Top-p sampling
          if top_p < 1.0:
              sorted_indices = np.argsort(top_k_probs)[::-1]
              cumsum = np.cumsum(top_k_probs[sorted_indices])
              cutoff_idx = np.searchsorted(cumsum, top_p)
              nucleus_indices = sorted_indices[:cutoff_idx + 1]
              nucleus_logits = top_k_logits[nucleus_indices]
              nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
              sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
              next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
          else:
@@ -307,159 +420,196 @@ def generate_tokens(
          probs = tf.nn.softmax(next_token_logits).numpy()
          next_token_id = np.random.choice(len(probs), p=probs)

          if next_token_id == eos_token_id:
              break

          token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1

-         yield next_token_id

          input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)

          if input_tensor.shape[1] > config['max_position_embeddings']:
              input_tensor = input_tensor[:, -config['max_position_embeddings']:]

  # ============================================================================
- # API Functions - FIXED FOR GRADIO
  # ============================================================================

- def chat_completion_api(
-     messages_json: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
-     top_k: int,
-     repetition_penalty: float,
-     stream: bool
- ) -> str:
-     """OpenAI-style chat completion API"""
-     try:
-         messages = json.loads(messages_json)
-
-         # Format messages
-         prompt = ""
-         for msg in messages:
-             role = msg.get("role", "user")
-             content = msg.get("content", "")
-
-             if role == "system":
-                 prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
-             elif role == "user":
-                 prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
-             elif role == "assistant":
-                 prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
-
-         prompt += "<|im_start|>assistant\n"
-
-         # Tokenize
-         input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
-
-         start_time = time.time()
-         token_count = 0
-         response_text = ""
-
-         for token_id in generate_tokens(
-             input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
-         ):
-             token_text = tokenizer.decode([token_id])
-             response_text += token_text
-             token_count += 1
-
-             if "<|im_end|>" in response_text:
-                 response_text = response_text.split("<|im_end|>")[0]
-                 break
-
-         elapsed = time.time() - start_time
-
-         result = {
-             "id": f"chatcmpl-{int(time.time())}",
-             "object": "chat.completion",
-             "created": int(time.time()),
-             "model": "sam-z-1",
-             "choices": [{
-                 "index": 0,
-                 "message": {
-                     "role": "assistant",
-                     "content": response_text.strip()
-                 },
-                 "finish_reason": "stop"
-             }],
-             "usage": {
-                 "prompt_tokens": len(input_ids),
-                 "completion_tokens": token_count,
-                 "total_tokens": len(input_ids) + token_count
-             },
-             "stats": {
-                 "elapsed_sec": round(elapsed, 2),
-                 "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
-             }
-         }
-
-         return json.dumps(result, indent=2)

-     except Exception as e:
-         return json.dumps({"error": str(e)}, indent=2)

- def text_completion_api(
-     prompt: str,
      max_tokens: int,
      temperature: float,
-     top_p: float,
      top_k: int,
-     repetition_penalty: float,
-     stream: bool
- ) -> str:
-     """OpenAI-style text completion API"""
-     try:
-         input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
-
-         start_time = time.time()
-         token_count = 0
-         response_text = ""
-
-         for token_id in generate_tokens(
-             input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
-         ):
-             token_text = tokenizer.decode([token_id])
-             response_text += token_text
-             token_count += 1
-
-         elapsed = time.time() - start_time
-
-         result = {
-             "id": f"cmpl-{int(time.time())}",
-             "object": "text_completion",
-             "created": int(time.time()),
-             "model": "sam-z-1",
-             "choices": [{
-                 "text": response_text,
-                 "index": 0,
-                 "finish_reason": "stop"
-             }],
-             "usage": {
-                 "prompt_tokens": len(input_ids),
-                 "completion_tokens": token_count,
-                 "total_tokens": len(input_ids) + token_count
-             },
-             "stats": {
-                 "elapsed_sec": round(elapsed, 2),
-                 "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
-             }
-         }
-
-         return json.dumps(result, indent=2)

-     except Exception as e:
-         return json.dumps({"error": str(e)}, indent=2)

  # ============================================================================
- # Gradio UI with API Routes
  # ============================================================================

- custom_css = """
- .api-container {
-     max-width: 1400px;
-     margin: auto;
  }

  .header {
@@ -471,525 +621,294 @@ custom_css = """
      margin-bottom: 2rem;
  }

- .endpoint-card {
      background: #f8f9fa;
-     padding: 1.5rem;
      border-radius: 8px;
      border-left: 4px solid #667eea;
      margin: 1rem 0;
  }
  """

- with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
-     gr.HTML("""
-     <div class="header">
-         <h1>🚀 SAM-Z-1 API Server</h1>
-         <p>OpenAI-Compatible API for SAM-Z-1 Language Model</p>
-         <p style="font-size: 0.9rem; opacity: 0.9;">
-             313M Parameters • 768D • 16 Layers • TensorFlow Optimized
-         </p>
-     </div>
-     """)
-
-     with gr.Tabs():
-         # ========== Chat Completion Tab ==========
-         with gr.Tab("💬 Chat Completion"):
-             gr.Markdown("""
-             ### Chat Completions API
-             OpenAI-compatible chat completion endpoint
-             """)
-
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     messages_input = gr.Code(
-                         label="Messages (JSON)",
-                         language="json",
-                         value=json.dumps([
-                             {"role": "user", "content": "Hello! Who are you?"}
-                         ], indent=2),
-                         lines=10
-                     )
-
-                     with gr.Row():
-                         chat_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
-                         chat_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
-
-                     with gr.Row():
-                         chat_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
-                         chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
-
-                     chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
-                     chat_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
-
-                     chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
-
-                 with gr.Column(scale=1):
-                     chat_output = gr.Code(
-                         label="API Response (JSON)",
-                         language="json",
-                         lines=20
-                     )
-
-             gr.Markdown("""
-             ### Python Example with Gradio Client
-             ```python
-             from gradio_client import Client
-
-             client = Client("YOUR-SPACE-URL")
-
-             messages = [
-                 {"role": "user", "content": "Hello! Who are you?"}
-             ]
-
-             result = client.predict(
-                 messages_json=json.dumps(messages),
-                 max_tokens=512,
-                 temperature=0.8,
-                 top_p=0.9,
-                 top_k=40,
-                 repetition_penalty=1.1,
-                 stream=False,
-                 api_name="/chat_completions"
             )
-
-             print(result)
-             ```
-             """)
-
-         # ========== Text Completion Tab ==========
-         with gr.Tab("📝 Text Completion"):
-             gr.Markdown("""
-             ### Text Completions API
-             OpenAI-compatible text completion endpoint
-             """)

              with gr.Row():
-                 with gr.Column(scale=1):
-                     prompt_input = gr.Textbox(
-                         label="Prompt",
-                         placeholder="Once upon a time...",
-                         lines=5
-                     )
-
-                     with gr.Row():
-                         text_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
-                         text_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
-
-                     with gr.Row():
-                         text_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
-                         text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
-
-                     text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
-                     text_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
-
-                     text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
-
-                 with gr.Column(scale=1):
-                     text_output = gr.Code(
-                         label="API Response (JSON)",
-                         language="json",
-                         lines=20
-                     )

-             gr.Markdown("""
-             ### Python Example with Gradio Client
-             ```python
-             from gradio_client import Client
-
-             client = Client("YOUR-SPACE-URL")
-
-             result = client.predict(
-                 prompt="Once upon a time",
-                 max_tokens=512,
-                 temperature=0.8,
-                 top_p=0.9,
-                 top_k=40,
-                 repetition_penalty=1.1,
-                 stream=False,
-                 api_name="/text_completions"
             )
-
-             print(result)
-             ```
-             """)
-
-         # ========== Documentation Tab ==========
-         with gr.Tab("📖 Documentation"):
-             gr.Markdown("""
-             # SAM-Z-1 API Documentation
-
-             ## Model Information
-             - **Model**: SAM-Z-1 (Direct Response Model)
-             - **Parameters**: ~313M
-             - **Architecture**: Transformer with RoPE, SwiGLU, RMSNorm
-             - **Context Length**: {config['max_position_embeddings']} tokens
-             - **Vocabulary Size**: {config['vocab_size']}
-
-             ## Using the API

-             ### Method 1: Gradio Client (Recommended)
-
-             Install the Gradio client:
-             ```bash
-             pip install gradio_client
-             ```
-
-             **Chat Completion:**
-             ```python
-             from gradio_client import Client
-             import json
-
-             client = Client("https://YOUR-SPACE.hf.space")
-
-             messages = [
-                 {{"role": "user", "content": "What is Python?"}}
-             ]
-
-             result = client.predict(
-                 messages_json=json.dumps(messages),
-                 max_tokens=512,
-                 temperature=0.8,
-                 top_p=0.9,
-                 top_k=40,
-                 repetition_penalty=1.1,
-                 stream=False,
-                 api_name="/chat_completions"
             )
-
-             response = json.loads(result)
-             print(response["choices"][0]["message"]["content"])
-             ```

-             **Text Completion:**
-             ```python
-             result = client.predict(
-                 prompt="Once upon a time",
-                 max_tokens=512,
-                 temperature=0.8,
-                 top_p=0.9,
-                 top_k=40,
-                 repetition_penalty=1.1,
-                 stream=False,
-                 api_name="/text_completions"
             )
-
-             response = json.loads(result)
-             print(response["choices"][0]["text"])
-             ```
-
-             ### Method 2: Direct HTTP Requests
-
-             **Chat Completion:**
-             ```python
-             import requests
-             import json
-
-             url = "https://YOUR-SPACE.hf.space/call/chat_completions"
-
-             payload = {{
-                 "data": [
-                     json.dumps([{{"role": "user", "content": "Hello!"}}]),  # messages_json
-                     512,    # max_tokens
-                     0.8,    # temperature
-                     0.9,    # top_p
-                     40,     # top_k
-                     1.1,    # repetition_penalty
-                     False   # stream
-                 ]
-             }}
-
-             response = requests.post(url, json=payload)
-             print(response.json())
-             ```
-
-             ## API Endpoints
-
-             ### Chat Completions
-             - **API Name**: `/chat_completions`
-             - **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`
-
-             **Parameters:**
-             1. `messages_json` (str): JSON string of messages array
-             2. `max_tokens` (int): Maximum tokens to generate (50-1024)
-             3. `temperature` (float): Sampling temperature (0.1-2.0)
-             4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
-             5. `top_k` (int): Top-K sampling (1-100)
-             6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
-             7. `stream` (bool): Stream response (UI only, not functional)
-
-             ### Text Completions
-             - **API Name**: `/text_completions`
-             - **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`

-             **Parameters:**
-             1. `prompt` (str): Text prompt
-             2. `max_tokens` (int): Maximum tokens to generate
-             3. `temperature` (float): Sampling temperature
-             4. `top_p` (float): Nucleus sampling threshold
-             5. `top_k` (int): Top-K sampling
-             6. `repetition_penalty` (float): Penalty for repetition
-             7. `stream` (bool): Stream response (UI only)
-
-             ## Response Format
-
-             **Chat Completion Response:**
-             ```json
-             {{
-                 "id": "chatcmpl-1234567890",
-                 "object": "chat.completion",
-                 "created": 1234567890,
-                 "model": "sam-z-1",
-                 "choices": [{{
-                     "index": 0,
-                     "message": {{
-                         "role": "assistant",
-                         "content": "Response text here"
-                     }},
-                     "finish_reason": "stop"
-                 }}],
-                 "usage": {{
-                     "prompt_tokens": 10,
-                     "completion_tokens": 20,
-                     "total_tokens": 30
-                 }},
-                 "stats": {{
-                     "elapsed_sec": 1.5,
-                     "tokens_per_sec": 13.3
-                 }}
-             }}
-             ```
-
-             **Text Completion Response:**
-             ```json
-             {{
-                 "id": "cmpl-1234567890",
-                 "object": "text_completion",
-                 "created": 1234567890,
-                 "model": "sam-z-1",
-                 "choices": [{{
-                     "text": "Completion text here",
-                     "index": 0,
-                     "finish_reason": "stop"
-                 }}],
-                 "usage": {{
-                     "prompt_tokens": 5,
-                     "completion_tokens": 15,
-                     "total_tokens": 20
-                 }},
-                 "stats": {{
-                     "elapsed_sec": 1.2,
-                     "tokens_per_sec": 12.5
-                 }}
-             }}
-             ```
-
-             ## Complete Example Script
-
-             ```python
-             #!/usr/bin/env python3
-             """
-             SAM-Z-1 API Client Example
-             """
-             from gradio_client import Client
-             import json
-
-             # Initialize client
-             client = Client("https://YOUR-SPACE.hf.space")
-
-             def chat(message, history=[]):
-                 \"\"\"Send a chat message\"\"\"
-                 messages = history + [{{"role": "user", "content": message}}]
-
-                 result = client.predict(
-                     messages_json=json.dumps(messages),
-                     max_tokens=512,
-                     temperature=0.8,
-                     top_p=0.9,
-                     top_k=40,
-                     repetition_penalty=1.1,
-                     stream=False,
-                     api_name="/chat_completions"
-                 )
-
-                 response = json.loads(result)
-                 assistant_msg = response["choices"][0]["message"]["content"]
-
-                 # Update history
-                 history.append({{"role": "user", "content": message}})
-                 history.append({{"role": "assistant", "content": assistant_msg}})
-
-                 return assistant_msg, history
-
-             def complete(prompt):
-                 \"\"\"Complete text\"\"\"
-                 result = client.predict(
-                     prompt=prompt,
-                     max_tokens=512,
-                     temperature=0.8,
-                     top_p=0.9,
-                     top_k=40,
-                     repetition_penalty=1.1,
-                     stream=False,
-                     api_name="/text_completions"
-                 )
-
-                 response = json.loads(result)
-                 return response["choices"][0]["text"]
-
-             # Example usage
-             if __name__ == "__main__":
-                 # Chat example
-                 print("=== Chat Example ===")
-                 history = []
-
-                 response, history = chat("Hello! Who are you?", history)
-                 print(f"Assistant: {{response}}\\n")
-
-                 response, history = chat("What can you help me with?", history)
-                 print(f"Assistant: {{response}}\\n")
-
-                 # Text completion example
-                 print("\\n=== Text Completion Example ===")
-                 completion = complete("Once upon a time in a distant galaxy")
-                 print(f"Completion: {{completion}}")
-             ```
-
-             ## Parameters Guide
-
-             ### Temperature (0.1 - 2.0)
-             - **Low (0.1-0.5)**: More focused, deterministic, factual
-             - **Medium (0.6-0.9)**: Balanced creativity and coherence
-             - **High (1.0-2.0)**: More creative, diverse, unpredictable
-
-             ### Top-P (0.1 - 1.0)
-             - Controls diversity via nucleus sampling
-             - **0.9** (default): Good balance
-             - Lower values = more focused
-             - Higher values = more diverse
-
-             ### Top-K (1 - 100)
-             - Limits vocabulary to top K tokens
-             - **40** (default): Good balance
-             - Lower values = more focused
-             - Higher values = more diverse
-
-             ### Repetition Penalty (1.0 - 2.0)
-             - **1.0**: No penalty
-             - **1.1** (default): Slight penalty
-             - **1.5+**: Strong penalty (use if model repeats)
-
-             ## Rate Limits & Performance

-             - **Concurrent Requests**: Supported via Gradio queue
-             - **Average Speed**: 10-20 tokens/sec on CPU
-             - **Context Window**: {config['max_position_embeddings']} tokens
-             - **Queue Size**: Up to 20 concurrent requests

-             ## Error Handling

-             ```python
-             try:
-                 result = client.predict(
-                     messages_json=json.dumps(messages),
-                     max_tokens=512,
-                     temperature=0.8,
-                     top_p=0.9,
-                     top_k=40,
-                     repetition_penalty=1.1,
-                     stream=False,
-                     api_name="/chat_completions"
-                 )
-                 response = json.loads(result)
-
-                 if "error" in response:
-                     print(f"API Error: {{response['error']}}")
-                 else:
-                     print(response["choices"][0]["message"]["content"])

-             except Exception as e:
-                 print(f"Request failed: {{e}}")
-             ```
-
-             ## Troubleshooting
-
-             **Connection Issues:**
-             - Verify Space URL is correct
-             - Check if Space is running
-             - Ensure gradio_client is installed
-
-             **Slow Responses:**
-             - Reduce `max_tokens`
-             - Lower `top_k` value
-             - Use shorter prompts
-
-             **Repetitive Output:**
-             - Increase `repetition_penalty` (try 1.2-1.5)
-             - Adjust `temperature` higher
-             - Use `top_p` sampling
-
-             **Incoherent Output:**
-             - Lower `temperature` (try 0.5-0.7)
-             - Reduce `top_k` (try 20-30)
-             - Ensure prompt is clear and well-formatted
-
-             ## Chat Template Format
-
-             The model uses ChatML format:
-             ```
-             <|im_start|>system
-             System message here<|im_end|>
-             <|im_start|>user
-             User message here<|im_end|>
-             <|im_start|>assistant
-             Assistant response here<|im_end|>
-             ```
-
-             ## Tips for Best Results
-
-             1. **Use clear, specific prompts**
-             2. **Lower temperature for factual tasks**
-             3. **Higher temperature for creative tasks**
-             4. **Adjust repetition penalty if model repeats phrases**
-             5. **Keep context under {config['max_position_embeddings']} tokens**
-             6. **Use system messages to set behavior**
-
-             ## Model Capabilities
-
-             ✅ General conversation
-             ✅ Question answering
-             ✅ Code generation
-             ✅ Creative writing
-             ✅ Text completion
-             ✅ Instruction following
-
-             ❌ Does NOT use reasoning tokens (`<think>` tags)
-             ❌ Not fine-tuned for specific domains
-
-             ---
-
-             **Model**: SAM-Z-1 | **API Version**: 1.0
-             **Support**: Open an issue on the Space for bugs or questions
-             """)
-
-     # ========== API Routes - MUST USE api_name parameter ==========
-     chat_btn.click(
-         fn=chat_completion_api,
-         inputs=[
-             messages_input, chat_max_tokens, chat_temperature,
-             chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
          ],
-         outputs=[chat_output],
-         api_name="chat_completions"  # This creates /call/chat_completions endpoint
      )

-     text_btn.click(
-         fn=text_completion_api,
-         inputs=[
-             prompt_input, text_max_tokens, text_temperature,
-             text_top_p, text_top_k, text_rep_penalty, text_stream
-         ],
-         outputs=[text_output],
-         api_name="text_completions"  # This creates /call/text_completions endpoint
      )

  # Launch

  import gradio as gr
  import tensorflow as tf
  import keras

  from tokenizers import Tokenizer
  import numpy as np
  import time

  # ============================================================================
+ # 🎊 FESTIVE MODE TOGGLE 🎊
  # ============================================================================
+ FESTIVE = True  # Set to False for production-only mode
+
+ # ============================================================================
+ # Configuration & Model Loading
+ # ============================================================================
+
+ print("🚀 Loading SAM-Z-1 Model...")

  MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
  CACHE_DIR = "./model_cache"

  # ============================================================================
+ # Model Architecture Definitions (FIXED for model loading)
  # ============================================================================

  @keras.saving.register_keras_serializable()

          self.built_cache = False

      def build(self, input_shape):
+         # Use the ORIGINAL training code - compute cache on first call, not in build
          super().build(input_shape)

      def _build_cache(self):
+         """Build RoPE cache on first forward pass"""
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
+
+             # Store as numpy arrays to avoid graph issues
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True

          return tf.concat([-x2, x1], axis=-1)

      def call(self, q, k):
+         # Build cache on first call (avoids build-time issues)
          self._build_cache()
+
          seq_len = tf.shape(q)[2]
          dtype = q.dtype
          cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
          sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
+
          q_rotated = (q * cos) + (self.rotate_half(q) * sin)
          k_rotated = (k * cos) + (self.rotate_half(k) * sin)
+
          return q_rotated, k_rotated

      def get_config(self):

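A side note on the rotary math behind `_build_cache` and `rotate_half` above: in the concat-half layout, each pair (x_i, x_{i+dim/2}) is rotated by a position-dependent angle, so the transform is a pure rotation. A minimal NumPy sketch (standalone illustration, not part of the commit) that mirrors the same construction and checks the norm-preserving property:

```python
import numpy as np

dim, pos, theta = 8, 5, 10000.0
inv_freq = 1.0 / theta ** (np.arange(0, dim, 2) / dim)   # same inv_freq as _build_cache
ang = pos * inv_freq
cos = np.cos(np.concatenate([ang, ang]))
sin = np.sin(np.concatenate([ang, ang]))

q = np.random.randn(dim)
x1, x2 = q[:dim // 2], q[dim // 2:]
rotate_half = np.concatenate([-x2, x1])                  # mirrors RotaryEmbedding.rotate_half
q_rot = q * cos + rotate_half * sin
print(np.allclose(np.linalg.norm(q_rot), np.linalg.norm(q)))  # True: a pure rotation
```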

          self.pre_attn_norm = RMSNorm()
          self.pre_ffn_norm = RMSNorm()
+
          self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
          self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
          self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
          self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
+
          self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
+
          self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
          self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
          self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
+
          self.dropout = keras.layers.Dropout(dropout)

      def call(self, x, training=None):
          B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
          dtype = x.dtype

+         # Attention
          res = x
          y = self.pre_attn_norm(x)

          v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])

          q, k = self.rope(q, k)
+
          scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
+
          mask = tf.where(
              tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
              tf.constant(-1e9, dtype=dtype),
          )
          scores += mask
          attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
+
          attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
          x = res + self.dropout(self.out_proj(attn), training=training)

+         # FFN (SwiGLU)
          res = x
          y = self.pre_ffn_norm(x)
          ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))

      def get_config(self):
          config = super().get_config()
          config.update({
+             "d_model": self.d_model,
+             "n_heads": self.n_heads,
+             "ff_dim": self.ff_dim,
+             "dropout": self.dropout_rate,
+             "max_len": self.max_len,
+             "rope_theta": self.rope_theta,
+             "layer_idx": self.layer_idx
          })
          return config

          ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
          block_args = {
+             'd_model': self.cfg['d_model'],
+             'n_heads': self.cfg['n_heads'],
+             'ff_dim': ff_dim,
+             'dropout': self.cfg['dropout'],
+             'max_len': self.cfg['max_len'],
+             'rope_theta': self.cfg['rope_theta']
          }

+         self.blocks = []
+         for i in range(self.cfg['n_layers']):
+             block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
+             self.blocks.append(block)
+
          self.norm = RMSNorm(name="final_norm")
          self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

      def call(self, input_ids, training=None):
          x = self.embed(input_ids)
+
          for block in self.blocks:
              x = block(x, training=training)
+
          return self.lm_head(self.norm(x))

      def get_config(self):
          base_config['config'] = self.cfg
          return base_config

+ print("✅ Model architecture registered")

+ # Download model files
  config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)

+ # Try to download checkpoint weights first (more reliable)
  try:
      weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
+     print("✅ Found checkpoint weights (ckpt.weights.h5)")
      use_checkpoint = True
+ except Exception as e:
+     print(f"⚠️ Checkpoint not found, falling back to model.keras: {e}")
      model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
      use_checkpoint = False

+ # Load config
  with open(config_path, 'r') as f:
      config = json.load(f)

+ # Create tokenizer from scratch
+ print("📦 Creating tokenizer from GPT-2 base...")
  from transformers import AutoTokenizer
+
  hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")

+ # Add custom tokens to match model's vocab size
+ custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
+ hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
+
+ # Save and reload as tokenizers format
  os.makedirs("./temp_tokenizer", exist_ok=True)
  hf_tokenizer.save_pretrained("./temp_tokenizer")
  tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

+ print(f"✅ Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
+ print(f"   Custom tokens added: {custom_tokens}")
+ print(f"   Model vocab size: {config.get('vocab_size', 'unknown')}")
+
+ # Verify vocab sizes match
+ if tokenizer.get_vocab_size() != config.get('vocab_size'):
+     print(f"⚠️ WARNING: Tokenizer vocab ({tokenizer.get_vocab_size()}) != Model vocab ({config.get('vocab_size')})")
+     print(f"   Model was trained with these tokens, but SAM-Z-1 doesn't use <think> tags in generation")
+
+ eos_token_id = config.get('eos_token_id', 50256)
+
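A quick way to confirm the save/reload round trip kept the four specials as single tokens, using the `tokenizer` built above (an illustrative check, not part of the commit; the ids assume GPT-2's 50257-token base, so the specials should land at 50257-50260):

```python
for tok in ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]:
    ids = tokenizer.encode(tok).ids
    print(tok, ids)  # each special should come back as exactly one id
```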
+ # ==============================================================================
+ # Load Model - Priority: checkpoint weights > saved model
+ # ==============================================================================
+ print("\n🔄 Loading model...")
+
  if use_checkpoint:
+     print("📦 Building model from config and loading checkpoint weights...")
+
+     # Build model from scratch with config
      model_config = {
          'vocab_size': config['vocab_size'],
          'd_model': config['hidden_size'],
          'n_heads': config['num_attention_heads'],
          'ff_mult': config['intermediate_size'] / config['hidden_size'],
          'max_len': config['max_position_embeddings'],
+         'dropout': 0.1,  # Default dropout
          'rope_theta': config['rope_theta']
      }
+
      model = SAM1Model(config=model_config)
+
+     # Build model by running a dummy forward pass
      dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
      _ = model(dummy_input, training=False)
+
+     print(f"✅ Model architecture built: {model.count_params():,} parameters")
+
+     # Load checkpoint weights
+     print(f"📥 Loading checkpoint weights from: {weights_path}")
      model.load_weights(weights_path)
+     print("✅ Checkpoint weights loaded successfully!")
+
  else:
+     print("📦 Loading full saved model...")
+     try:
+         model = keras.models.load_model(model_path, compile=False)
+         print("✅ Model loaded successfully")
+     except Exception as e:
+         print(f"❌ Failed to load model: {e}")
+         print("\n🔄 Trying alternative: building from config + loading weights...")
+
+         # Fallback to building model
+         model_config = {
+             'vocab_size': config['vocab_size'],
+             'd_model': config['hidden_size'],
+             'n_layers': config['num_hidden_layers'],
+             'n_heads': config['num_attention_heads'],
+             'ff_mult': config['intermediate_size'] / config['hidden_size'],
+             'max_len': config['max_position_embeddings'],
+             'dropout': 0.1,
+             'rope_theta': config['rope_theta']
+         }
+
+         model = SAM1Model(config=model_config)
+         dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
+         _ = model(dummy_input, training=False)
+
+         # Try to load weights from model.keras
+         try:
+             temp_model = keras.models.load_model(model_path, compile=False)
+             model.set_weights(temp_model.get_weights())
+             print("✅ Weights transferred successfully")
+         except:
+             print("❌ Could not load weights - model may not work correctly!")
+             raise

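The build-then-load pattern used above (instantiate from config, run a dummy batch so every variable is created, then `load_weights`) is the standard recipe for restoring subclassed Keras models whose variable shapes are only known after the first call. A stripped-down sketch of the same idea, using a hypothetical `TinyModel` rather than the repo's class:

```python
import tensorflow as tf
import keras

class TinyModel(keras.Model):
    def __init__(self, vocab=100, dim=16, **kwargs):
        super().__init__(**kwargs)
        self.embed = keras.layers.Embedding(vocab, dim)
        self.head = keras.layers.Dense(vocab, use_bias=False)

    def call(self, ids, training=None):
        return self.head(self.embed(ids))

m = TinyModel()
_ = m(tf.zeros((1, 8), dtype=tf.int32), training=False)   # dummy pass creates the variables
m.save_weights("tiny.weights.h5")

m2 = TinyModel()
_ = m2(tf.zeros((1, 8), dtype=tf.int32), training=False)  # build first so shapes exist...
m2.load_weights("tiny.weights.h5")                        # ...then the load succeeds
```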
+ # Create optimized inference function
  @tf.function(reduce_retracing=True)
  def fast_forward(input_tensor):
+     """TF-optimized forward pass for faster generation"""
      return model(input_tensor, training=False)

+ print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
+ print(f"✅ TF function optimization enabled for faster inference")
+
+ # Global stop flag
+ stop_generation = False

  # ============================================================================
+ # Generation Function with Streaming & Stop Button
  # ============================================================================

+ def generate_stream(
+     prompt: str,
      max_tokens: int = 512,
      temperature: float = 0.8,
      top_k: int = 40,
      top_p: float = 0.9,
      repetition_penalty: float = 1.1
  ):
+     """Generate text with streaming output and stop support"""
+     global stop_generation
+     stop_generation = False
+
+     # Tokenize prompt
+     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
+
+     if len(input_ids) == 0:
+         yield "⚠️ Empty prompt after tokenization"
+         return
+
      if len(input_ids) > config['max_position_embeddings'] - max_tokens:
          input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]

      input_tensor = tf.constant([input_ids], dtype=tf.int32)
+     generated_text = ""
+     token_count = 0
+
+     # Track token frequencies for repetition penalty
      token_freq = {}

+     start_time = time.time()
+
      for step in range(max_tokens):
+         # Check stop flag
+         if stop_generation:
+             generated_text += "\n\n*[Generation stopped by user]*"
+             yield generated_text
+             break
+
+         # Get logits using optimized TF function
          logits = fast_forward(input_tensor)
          next_token_logits = logits[0, -1, :].numpy()

+         # Apply temperature
          next_token_logits = next_token_logits / temperature

+         # Apply repetition penalty
          if repetition_penalty != 1.0:
              for token_id, freq in token_freq.items():
                  if token_id < len(next_token_logits):

          top_k_logits = next_token_logits[top_k_indices]
          top_k_probs = tf.nn.softmax(top_k_logits).numpy()

+         # Top-p (nucleus) sampling
          if top_p < 1.0:
              sorted_indices = np.argsort(top_k_probs)[::-1]
              cumsum = np.cumsum(top_k_probs[sorted_indices])
              cutoff_idx = np.searchsorted(cumsum, top_p)
              nucleus_indices = sorted_indices[:cutoff_idx + 1]
+
              nucleus_logits = top_k_logits[nucleus_indices]
              nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
+
              sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
              next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
          else:

              probs = tf.nn.softmax(next_token_logits).numpy()
              next_token_id = np.random.choice(len(probs), p=probs)

+         # Stop on EOS
          if next_token_id == eos_token_id:
              break

+         # Update token frequency
          token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1

+         # Decode and yield
+         token_text = tokenizer.decode([next_token_id])
+         generated_text += token_text
+         token_count += 1

+         # Yield progressive output
+         yield generated_text
+
+         # Update input
          input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)

+         # Truncate if too long
          if input_tensor.shape[1] > config['max_position_embeddings']:
              input_tensor = input_tensor[:, -config['max_position_embeddings']:]
+
+     # Calculate stats
+     elapsed = time.time() - start_time
+     tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
+
+     # Add generation stats
+     if token_count > 0 and not stop_generation:
+         generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({tokens_per_sec:.1f} tok/s)]*"
+
+     yield generated_text

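The loop above chains temperature scaling, top-k filtering, and top-p (nucleus) truncation before sampling. The same decision rule, condensed into a pure-NumPy helper (a sketch mirroring the code above, not the committed implementation):

```python
import numpy as np

def sample_token(logits, temperature=0.8, top_k=40, top_p=0.9):
    logits = logits / temperature
    top_k_idx = np.argpartition(logits, -top_k)[-top_k:]      # indices of the k largest logits
    top_k_logits = logits[top_k_idx]
    probs = np.exp(top_k_logits - top_k_logits.max())
    probs /= probs.sum()                                      # softmax over the top-k
    order = np.argsort(probs)[::-1]
    cutoff = np.searchsorted(np.cumsum(probs[order]), top_p)  # smallest nucleus covering top_p
    keep = order[:cutoff + 1]
    kept = probs[keep] / probs[keep].sum()                    # renormalize inside the nucleus
    return int(top_k_idx[keep[np.random.choice(len(kept), p=kept)]])

print(sample_token(np.random.randn(50257)))
```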
  # ============================================================================
+ # Chat Interface Logic
  # ============================================================================

+ def format_chat_prompt(message: str, history: list) -> str:
+     """Format message history into chat prompt"""
+     prompt = ""
+
+     # Add history
+     for user_msg, assistant_msg in history:
+         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
+         if assistant_msg:
+             prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
+
+     # Add current message
+     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+
+     return prompt

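For reference, the exact ChatML-style string `format_chat_prompt` builds for a one-turn history (derived directly from the function body above):

```python
history = [("Hi", "Hello! How can I help?")]
print(format_chat_prompt("Who are you?", history))
# <|im_start|>user
# Hi<|im_end|>
# <|im_start|>assistant
# Hello! How can I help?<|im_end|>
# <|im_start|>user
# Who are you?<|im_end|>
# <|im_start|>assistant
```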
+ def chat_stream(
+     message: str,
+     history: list,
      max_tokens: int,
      temperature: float,
      top_k: int,
+     top_p: float,
+     repetition_penalty: float
+ ):
+     """Streaming chat response"""
+     if not message.strip():
+         yield history
+         return
+
+     # Format prompt
+     prompt = format_chat_prompt(message, history)
+
+     # Generate with streaming
+     partial_response = ""
+     for generated in generate_stream(
+         prompt,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty
+     ):
+         partial_response = generated
+
+         # Stop at end tags
+         if "<|im_end|>" in partial_response:
+             partial_response = partial_response.split("<|im_end|>")[0]
+
+         # Update history
+         yield history + [[message, partial_response.strip()]]
+
+ def stop_gen():
+     """Stop generation callback"""
+     global stop_generation
+     stop_generation = True
+     return None

  # ============================================================================
+ # Gradio UI
  # ============================================================================

+ # Festive CSS
+ festive_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header {
+     text-align: center;
+     padding: 2rem;
+     background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+     color: white;
+     border-radius: 12px;
+     margin-bottom: 2rem;
+     box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
+     animation: pulse 2s ease-in-out infinite;
+ }
+
+ @keyframes pulse {
+     0%, 100% { transform: scale(1); }
+     50% { transform: scale(1.02); }
+ }
+
+ .header h1 {
+     font-size: 2.8rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
+     text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
+ }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
+ }
+
+ .celebration {
+     font-size: 2rem;
+     margin: 0.5rem;
+     animation: bounce 1s ease infinite;
+ }
+
+ @keyframes bounce {
+     0%, 100% { transform: translateY(0); }
+     50% { transform: translateY(-10px); }
+ }
+
+ .stats-card {
+     background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+     padding: 1.5rem;
+     border-radius: 12px;
+     border-left: 4px solid #f5576c;
+     margin: 1rem 0;
+     box-shadow: 0 4px 16px rgba(252, 182, 159, 0.3);
+ }
+
+ .twin-badge {
+     display: inline-block;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     padding: 0.5rem 1rem;
+     border-radius: 20px;
+     font-weight: bold;
+     margin: 0.5rem;
+     box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
+ }
+
+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
+
+ .confetti {
+     position: fixed;
+     width: 10px;
+     height: 10px;
+     background: #f5576c;
+     position: absolute;
+     animation: confetti-fall 3s linear infinite;
+ }
+
+ @keyframes confetti-fall {
+     to { transform: translateY(100vh) rotate(360deg); }
+ }
+ """
+
+ # Production CSS
+ production_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
  }

  .header {
      margin-bottom: 2rem;
  }

+ .header h1 {
+     font-size: 2.5rem;
+     margin-bottom: 0.5rem;
+     font-weight: 700;
+ }
+
+ .header p {
+     font-size: 1.1rem;
+     opacity: 0.95;
+ }
+
+ .stats-card {
      background: #f8f9fa;
+     padding: 1rem;
      border-radius: 8px;
      border-left: 4px solid #667eea;
      margin: 1rem 0;
  }
+
+ footer {
+     text-align: center;
+     padding: 2rem;
+     color: #666;
+     border-top: 1px solid #eee;
+     margin-top: 2rem;
+ }
  """

+ # Select CSS based on mode
+ custom_css = festive_css if FESTIVE else production_css
+
+ # Build interface
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+     # Header
+     if FESTIVE:
+         gr.HTML("""
+         <div class="header">
+             <div class="celebration">🎉 🎊 ✨ 🎈 🎆</div>
+             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
+                  alt="SAM-Z-1"
+                  style="max-width: 400px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 8px 24px rgba(0,0,0,0.2);">
+             <h1>🤖 SAM-Z-1 Chat 🤖</h1>
+             <p><strong>LATEST RELEASE!</strong> Our <strong>Best</strong> non-reasoning model</p>
+             <div class="twin-badge">Twin of SAM-X-1 (Reasoning Model)</div>
+             <p style="font-size: 0.9rem; margin-top: 1rem;">
+                 768D • 16 Layers • 12 Heads • ~313M Parameters • Trained on TPU v5e-8
+             </p>
+             <div class="celebration">🚀 💫 🎯 ⚡ 🔥</div>
+         </div>
+         """)
+     else:
+         gr.HTML("""
+         <div class="header">
+             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
+                  alt="SAM-Z-1"
+                  style="max-width: 300px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 4px 16px rgba(0,0,0,0.15);">
+             <h1>🤖 SAM-Z-1 Chat</h1>
+             <p>Fast, direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; margin-top: 0.5rem;">
+                 768D • 16 Layers • 12 Heads • Trained on TPU v5e-8
+             </p>
+         </div>
+         """)
+
+     with gr.Row():
+         with gr.Column(scale=4):
+             # Chat interface with bot avatar
+             chatbot = gr.Chatbot(
+                 height=600,
+                 show_label=False,
+                 avatar_images=(
+                     None,
+                     "https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/KtiMi-aDUOOeN--YNT-Fu.jpeg"
+                 ),
+                 bubble_full_width=False
              )

              with gr.Row():
+                 msg = gr.Textbox(
+                     placeholder="Type your message here..." if not FESTIVE else "Ask me anything! I'm the fast twin! ⚡",
+                     show_label=False,
+                     scale=8,
+                     container=False
+                 )
+                 submit_btn = gr.Button("Send 🚀" if FESTIVE else "Send", variant="primary", scale=1)
+                 stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)

+             with gr.Row():
+                 clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
+                 retry_btn = gr.Button("🔄 Retry", size="sm")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### ⚙️ Generation Settings")
+
+             max_tokens = gr.Slider(
+                 minimum=50,
+                 maximum=1024,
+                 value=512,
+                 step=50,
+                 label="Max Tokens",
+                 info="Maximum length of response"
              )

+             temperature = gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=0.8,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher = more creative"
              )

+             top_k = gr.Slider(
+                 minimum=1,
+                 maximum=100,
+                 value=40,
+                 step=1,
+                 label="Top-K",
+                 info="Sample from top K tokens"
              )

+             top_p = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top-P",
+                 info="Nucleus sampling threshold"
+             )

+             repetition_penalty = gr.Slider(
+                 minimum=1.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.1,
+                 label="Repetition Penalty",
+                 info="Penalize repeated tokens"
+             )

+             gr.Markdown("---")

+             # Model info
+             if FESTIVE:
+                 gr.Markdown(f"""
+                 ### 🎊 SAM-Z-1 Model Info

+                 **🎯 The Fast Twin!**
+
+                 **Type:** Direct Response Model
+                 **Parameters:** ~313M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+                 **Speed:** ⚡ Optimized with TF Functions
+
+                 **Twin Model:**
+                 - **SAM-X-1**: Reasoning model (uses `<think>` tags)
+                 - **SAM-Z-1**: Fast model (no thinking, direct answers! 🎉)
+
+                 **Note:** Model includes `<think>` tokens in vocab but doesn't use them. Training used same tokenizer as SAM-X-1.
+
+                 **Architecture:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - No bias terms (efficient!)
+
+                 **Training:**
+                 - Trained from scratch
+                 - TPU v5e-8 (8 cores)
+                 - Mixed precision (bfloat16)
+                 - Cosine decay schedule
+                 """)
+             else:
+                 gr.Markdown(f"""
+                 ### 📊 Model Info
+
+                 **Architecture:** SAM-Z-1 (Direct Response)
+                 **Parameters:** ~313M
+                 **Context:** {config['max_position_embeddings']} tokens
+                 **Vocab:** {config['vocab_size']}
+
+                 **Twin Models:**
+                 - SAM-X-1: Reasoning model (uses `<think>` tags)
+                 - SAM-Z-1: Direct response model (no thinking)
+
+                 **Note:** Vocab includes `<think>` tokens but model doesn't use them in generation.
+
+                 **Features:**
+                 - RoPE positional encoding
+                 - SwiGLU activation
+                 - RMSNorm layers
+                 - TF-optimized inference
+                 """)
+
+     # Example prompts
+     gr.Examples(
+         examples=[
+             "Hi! What can you do?",
+             "Explain quantum computing in simple terms",
+             "Write a short poem about AI",
+             "What's the capital of France?",
+             "How do I learn programming?",
+             "Tell me an interesting fact about space",
+             "What's the difference between you and SAM-X-1?",
+             "Why are you called the fast twin?",
          ],
+         inputs=msg,
+         label="💡 Try these examples" if not FESTIVE else "🎯 Try these examples!"
      )

+     # Footer
+     if FESTIVE:
+         gr.HTML("""
+         <footer>
+             <p style="font-size: 1.2rem;"><strong>🎉 SAM-Z-1 - LATEST RELEASE! 🎉</strong></p>
+             <p><strong>The Fast Twin</strong> - Direct responses without reasoning overhead</p>
+             <p style="font-size: 0.9rem; color: #999; margin-top: 0.5rem;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model) • Same architecture, different training objective
+             </p>
+             <div style="margin-top: 1rem; font-size: 1.5rem;">
+                 ⚡ 🚀 💫 ✨ 🎯
+             </div>
+         </footer>
+         """)
+     else:
+         gr.HTML("""
+         <footer>
+             <p><strong>SAM-Z-1</strong> - Direct response language model</p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
+             </p>
+             <p style="font-size: 0.9rem; color: #999;">
+                 Twin of SAM-X-1 (reasoning model)
+             </p>
+         </footer>
+         """)
+
+     # Event handlers
+     submit_event = msg.submit(
+         chat_stream,
+         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     ).then(
+         lambda: "",
+         outputs=[msg]
+     )
+
+     click_event = submit_btn.click(
+         chat_stream,
+         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     ).then(
+         lambda: "",
+         outputs=[msg]
+     )
+
+     # Stop button
+     stop_btn.click(
+         fn=stop_gen,
+         inputs=None,
+         outputs=None,
+         cancels=[submit_event, click_event]
+     )
+
+     clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
+
+     def retry_last(history, max_tok, temp, topk, topp, rep_pen):
+         if not history:
+             return history
+         last_user_msg = history[-1][0]
+         history = history[:-1]
+         for update in chat_stream(last_user_msg, history, max_tok, temp, topk, topp, rep_pen):
+             yield update
+
+     retry_event = retry_btn.click(
+         retry_last,
+         inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
+         outputs=[chatbot]
+     )
+
+     stop_btn.click(
+         fn=stop_gen,
+         inputs=None,
+         outputs=None,
+         cancels=[retry_event]
      )

  # Launch
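The view cuts off at the launch section. For a streaming Blocks app like this one, the usual closing lines enable the request queue before starting the server (a sketch of the common pattern; the queue size is illustrative, echoing the 20-request queue the old docs advertised, and these are not necessarily the committed lines):

```python
demo.queue(max_size=20)  # generator-based events need the queue so streamed updates reach the browser
demo.launch()
```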