Update app.py

app.py CHANGED
@@ -1,3 +1,28 @@
+import os
+
+# === CPU Threading Optimization ===
+# Set these BEFORE importing TensorFlow
+NUM_CORES = os.cpu_count() or 4
+
+os.environ['TF_NUM_INTEROP_THREADS'] = str(NUM_CORES)
+os.environ['TF_NUM_INTRAOP_THREADS'] = str(NUM_CORES)
+
+# Disable GPU (ensures CPU-only, avoids GPU detection overhead)
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+
+import tensorflow as tf
+
+# Configure threading after import
+tf.config.threading.set_inter_op_parallelism_threads(NUM_CORES)
+tf.config.threading.set_intra_op_parallelism_threads(NUM_CORES)
+
+# Enable oneDNN optimizations (significant on Intel CPUs)
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
+
+# Optional: XLA JIT compilation (can help, test it)
+# tf.config.optimizer.set_jit(True)
+
+print(f"✅ CPU optimized: {NUM_CORES} threads, oneDNN enabled")
 import gradio as gr
 import tensorflow as tf
 import keras
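A quick way to verify the threading setup took effect (a minimal sketch; the printed values depend on the host, and the GPU list should be empty given CUDA_VISIBLE_DEVICES='-1'):

    import tensorflow as tf

    print("interop threads:", tf.config.threading.get_inter_op_parallelism_threads())
    print("intraop threads:", tf.config.threading.get_intra_op_parallelism_threads())
    print("visible GPUs:", tf.config.list_physical_devices('GPU'))  # expect [] on CPU-only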
@@ -52,13 +77,20 @@ class RotaryEmbedding(keras.layers.Layer):
         x1, x2 = tf.split(x, 2, axis=-1)
         return tf.concat([-x2, x1], axis=-1)

-    def call(self, q, k):
+    def call(self, q, k, offset=0):
+        """Apply rotary embeddings with position offset for KV-cache."""
         self._build_cache()
         seq_len = tf.shape(q)[2]
         dtype = q.dtype
-
-
-
+
+        # For q: positions are [offset, offset+seq_len)
+        # For k: same positions (k is only the new tokens, past_k already has RoPE applied)
+        cos = tf.cast(self.cos_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
+        sin = tf.cast(self.sin_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
+
+        q_embed = (q * cos) + (self.rotate_half(q) * sin)
+        k_embed = (k * cos) + (self.rotate_half(k) * sin)
+        return q_embed, k_embed

     def get_config(self):
         config = super().get_config()
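The offset parameter is what makes cached decoding consistent with a full-sequence pass: rotating a single token at absolute position t with offset=t gives the same embedding it would get inside a full pass. A minimal NumPy sketch of that property; the cos/sin cache built here is an assumption (the cos_cached/sin_cached construction is not shown in this diff), using the usual half-split layout:

    import numpy as np

    head_dim, max_len, theta = 8, 32, 10000.0
    inv_freq = 1.0 / (theta ** (np.arange(0, head_dim, 2) / head_dim))
    pos = np.arange(max_len)[:, None] * inv_freq[None, :]
    cos = np.concatenate([np.cos(pos), np.cos(pos)], axis=-1)  # [max_len, head_dim]
    sin = np.concatenate([np.sin(pos), np.sin(pos)], axis=-1)

    def rotate_half(x):
        x1, x2 = np.split(x, 2, axis=-1)
        return np.concatenate([-x2, x1], axis=-1)

    def rope(x, offset):  # x: [T, head_dim]
        T = x.shape[0]
        c, s = cos[offset:offset + T], sin[offset:offset + T]
        return x * c + rotate_half(x) * s

    q = np.random.randn(max_len, head_dim)
    t = 5
    # One token rotated at offset=t matches position t of the full pass.
    assert np.allclose(rope(q, 0)[t], rope(q[t:t + 1], offset=t)[0])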
@@ -110,29 +142,82 @@ class TransformerBlock(keras.layers.Layer):
         self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
         self.dropout = keras.layers.Dropout(dropout)

-    def call(self, x, training=None):
-
+    def call(self, x, training=None, past_kv=None, use_cache=False):
+        """
+        Args:
+            x: input tensor [B, T, D] (T=1 during cached generation)
+            past_kv: tuple of (past_k, past_v) each [B, n_heads, past_len, head_dim]
+            use_cache: whether to return updated kv cache
+        Returns:
+            output, (new_k, new_v) if use_cache else output, None
+        """
+        B = tf.shape(x)[0]
+        T = tf.shape(x)[1]
         dtype = x.dtype
+
         res = x
         y = self.pre_attn_norm(x)
-
-
-
-        q
+
+        # Project Q, K, V for current input
+        q = tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim])
+        q = tf.transpose(q, [0, 2, 1, 3])  # [B, n_heads, T, head_dim]
+
+        k = tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim])
+        k = tf.transpose(k, [0, 2, 1, 3])
+
+        v = tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim])
+        v = tf.transpose(v, [0, 2, 1, 3])
+
+        # Determine position offset for RoPE
+        if past_kv is not None:
+            past_len = tf.shape(past_kv[0])[2]
+        else:
+            past_len = 0
+
+        # Apply RoPE with position offset
+        q, k = self.rope(q, k, offset=past_len)
+
+        # Concatenate with past KV
+        if past_kv is not None:
+            k = tf.concat([past_kv[0], k], axis=2)
+            v = tf.concat([past_kv[1], v], axis=2)
+
+        new_kv = (k, v) if use_cache else None
+
+        # Attention
+        full_len = tf.shape(k)[2]
         scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
-
-
-
-
-
+
+        # Causal mask: q attends to all of k (including past)
+        # Shape: [T, full_len] where each query position can attend to positions <= its absolute position
+        q_positions = tf.range(past_len, past_len + T)
+        k_positions = tf.range(full_len)
+        mask = tf.cast(q_positions[:, None] >= k_positions[None, :], dtype)
+        mask = tf.where(mask == 0, tf.constant(-1e9, dtype=dtype), tf.constant(0.0, dtype=dtype))
+        scores = scores + mask[None, None, :, :]
+
+        attn = tf.nn.softmax(scores, axis=-1)
+        attn_out = tf.matmul(attn, v)
+        attn_out = tf.transpose(attn_out, [0, 2, 1, 3])
+        attn_out = tf.reshape(attn_out, [B, T, self.d_model])
+
+        x = res + self.dropout(self.out_proj(attn_out), training=training)
+
+        # FFN
         res = x
         y = self.pre_ffn_norm(x)
         ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
-
+        output = res + self.dropout(ffn, training=training)
+
+        return output, new_kv

     def get_config(self):
         config = super().get_config()
-        config.update({
+        config.update({
+            "d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
+            "dropout": self.dropout_rate, "max_len": self.max_len,
+            "rope_theta": self.rope_theta, "layer_idx": self.layer_idx
+        })
         return config


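For intuition, here is what that mask evaluates to on a single cached decode step; a standalone NumPy sketch with illustrative sizes (past_len=4, T=1). The new token's absolute position is past_len, so it may attend to every cached key plus itself, and nothing gets masked:

    import numpy as np

    past_len, T = 4, 1
    full_len = past_len + T
    q_positions = np.arange(past_len, past_len + T)  # absolute query positions
    k_positions = np.arange(full_len)                # absolute key positions
    allowed = q_positions[:, None] >= k_positions[None, :]
    mask = np.where(allowed, 0.0, -1e9)
    print(mask)  # [[0. 0. 0. 0. 0.]] -> the decode token attends to all positions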
@@ -149,25 +234,44 @@ class SAM1Model(keras.Model):

         self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
         ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
-        block_args = {
-
-
-
-
+        block_args = {
+            'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
+            'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
+            'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
+        }
+        self.blocks = [
+            TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
+            for i in range(self.cfg['n_layers'])
+        ]
         self.norm = RMSNorm(name="final_norm")
         self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

-    def call(self, input_ids, training=None):
+    def call(self, input_ids, training=None, past_kv=None, use_cache=False):
+        """
+        Args:
+            input_ids: [B, T]
+            past_kv: list of (k, v) tuples, one per layer
+            use_cache: whether to return updated cache
+        Returns:
+            logits, new_past_kv (or None)
+        """
         x = self.embed(input_ids)
-
-
-
+
+        new_past_kv = [] if use_cache else None
+
+        for i, block in enumerate(self.blocks):
+            layer_past = past_kv[i] if past_kv is not None else None
+            x, layer_kv = block(x, training=training, past_kv=layer_past, use_cache=use_cache)
+            if use_cache:
+                new_past_kv.append(layer_kv)
+
+        logits = self.lm_head(self.norm(x))
+        return logits, new_past_kv

     def get_config(self):
         base_config = super().get_config()
         base_config['config'] = self.cfg
         return base_config
-
 # --- Model and Tokenizer Loading ---

 config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
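Given this call signature, cached and uncached decoding can be checked against each other. A sketch only: `model` is assumed to be a constructed SAM1Model and the token ids are arbitrary; the check holds because training=False keeps dropout inactive:

    import numpy as np
    import tensorflow as tf

    ids = tf.constant([[5, 17, 42, 9]], dtype=tf.int32)  # arbitrary token ids

    # Full pass over all 4 tokens, no cache.
    full_logits, _ = model(ids, training=False, use_cache=False)

    # Prefill on the first 3 tokens, then decode the 4th with the cache.
    prefill_logits, past_kv = model(ids[:, :3], training=False, use_cache=True)
    step_logits, past_kv = model(ids[:, 3:], training=False, past_kv=past_kv, use_cache=True)

    # Last-position logits should agree up to float tolerance.
    np.testing.assert_allclose(full_logits[0, -1].numpy(),
                               step_logits[0, -1].numpy(), rtol=1e-4, atol=1e-4)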
@@ -255,96 +359,101 @@ def generate_stream(
     top_p: float = 0.9,
     repetition_penalty: float = 1.1
 ):
-    """Generate text with
+    """Generate text with KV-cache for fast CPU inference."""
     global stop_generation
     stop_generation = False

-    # Tokenize prompt
     prompt_ids = tokenizer.encode(prompt).ids
     input_ids = [i for i in prompt_ids if i != eos_token_id]

-    input_tensor = tf.constant([input_ids], dtype=tf.int32)
     generated_text = ""
     token_count = 0
     token_freq = {}

     start_time = time.time()

-    #
+    # === PREFILL PHASE ===
+    # Process entire prompt, build initial KV cache
+    input_tensor = tf.constant([input_ids], dtype=tf.int32)
+    logits, past_kv = model(input_tensor, training=False, use_cache=True)
+
+    # Get logits for last position
+    next_token_logits = logits[0, -1, :].numpy()
+
+    # === GENERATION LOOP ===
     for step in range(max_tokens):
         if stop_generation:
             yield generated_text + "\n\n*[Generation stopped]*"
             break

-        #
-
-        next_token_logits = logits[0, -1, :].numpy()
-
-        # 2. Temperature
-        next_token_logits = next_token_logits / temperature
+        # Temperature
+        scaled_logits = next_token_logits / temperature

-        #
+        # Repetition penalty
         if repetition_penalty != 1.0:
             for token_id, freq in token_freq.items():
-                if token_id < len(
-
+                if token_id < len(scaled_logits):
+                    scaled_logits[token_id] /= (repetition_penalty ** freq)

-        #
-        # Top-K
+        # Top-K sampling
         if top_k > 0:
-            top_k_indices = np.argpartition(
-            top_k_logits =
-            top_k_probs =
+            top_k_indices = np.argpartition(scaled_logits, -top_k)[-top_k:]
+            top_k_logits = scaled_logits[top_k_indices]
+            top_k_probs = np.exp(top_k_logits - np.max(top_k_logits))
+            top_k_probs /= top_k_probs.sum()

-            # Top-P (
+            # Top-P (nucleus) sampling
             if top_p < 1.0:
-
-                cumsum = np.cumsum(top_k_probs[
-
-
-
-
-
-                sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
-                next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
+                sorted_idx = np.argsort(top_k_probs)[::-1]
+                cumsum = np.cumsum(top_k_probs[sorted_idx])
+                cutoff = np.searchsorted(cumsum, top_p) + 1
+                nucleus_idx = sorted_idx[:cutoff]
+                nucleus_probs = top_k_probs[nucleus_idx]
+                nucleus_probs /= nucleus_probs.sum()
+                sampled = np.random.choice(len(nucleus_probs), p=nucleus_probs)
+                next_token_id = int(top_k_indices[nucleus_idx[sampled]])
             else:
-
-                next_token_id = int(top_k_indices[
+                sampled = np.random.choice(len(top_k_probs), p=top_k_probs)
+                next_token_id = int(top_k_indices[sampled])
         else:
-            probs =
+            probs = np.exp(scaled_logits - np.max(scaled_logits))
+            probs /= probs.sum()
             next_token_id = np.random.choice(len(probs), p=probs)
-
-        #
-        if next_token_id == eos_token_id
-            next_token_id == tokenizer.token_to_id("<|im_end|>") or \
-            next_token_id == tokenizer.token_to_id("<im end for model tun>"):
+
+        # Stop conditions
+        if next_token_id == eos_token_id:
             break
-
-
+        im_end_id = tokenizer.token_to_id("<|im_end|>")
+        model_end_id = tokenizer.token_to_id("<im end for model tun>")
+        if next_token_id in (im_end_id, model_end_id):
+            break
+
+        # Update frequency tracking
         token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1

+        # Decode and yield
         token_text = tokenizer.decode([next_token_id])
         generated_text += token_text
         token_count += 1
-
         yield generated_text

-        #
-
+        # === DECODE PHASE (single token, reuse cache) ===
+        next_input = tf.constant([[next_token_id]], dtype=tf.int32)
+        logits, past_kv = model(next_input, training=False, past_kv=past_kv, use_cache=True)
+        next_token_logits = logits[0, -1, :].numpy()

-        # Truncate if
-
-
-
+        # Truncate cache if too long
+        max_len = config['max_position_embeddings']
+        if past_kv[0][0].shape[2] > max_len:
+            past_kv = [(k[:, :, -max_len:, :], v[:, :, -max_len:, :]) for k, v in past_kv]
+
     elapsed = time.time() - start_time
-
+    tps = token_count / elapsed if elapsed > 0 else 0

     if token_count > 0 and not stop_generation:
-        generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({
+        generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({tps:.1f} tok/s)]*"

     yield generated_text
-
 # ============================================================================
 # Chat Interface Logic
 # ============================================================================
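The top-k → top-p pipeline in the hunk above also works as a standalone snippet; a minimal NumPy sketch on a toy logits vector (the values are illustrative only):

    import numpy as np

    logits = np.array([2.0, 1.0, 0.5, -1.0, -3.0])
    top_k, top_p = 3, 0.9

    # Top-K: keep the k largest logits, softmax over them.
    top_k_indices = np.argpartition(logits, -top_k)[-top_k:]
    top_k_logits = logits[top_k_indices]
    top_k_probs = np.exp(top_k_logits - np.max(top_k_logits))
    top_k_probs /= top_k_probs.sum()

    # Top-P: smallest prefix of the sorted probs whose mass covers top_p.
    sorted_idx = np.argsort(top_k_probs)[::-1]
    cumsum = np.cumsum(top_k_probs[sorted_idx])
    cutoff = np.searchsorted(cumsum, top_p) + 1
    nucleus_idx = sorted_idx[:cutoff]
    nucleus_probs = top_k_probs[nucleus_idx]
    nucleus_probs /= nucleus_probs.sum()

    sampled = np.random.choice(len(nucleus_probs), p=nucleus_probs)
    print("sampled token id:", int(top_k_indices[nucleus_idx[sampled]]))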