Update app.py (#1)
opened by Bc-AI

app.py CHANGED
@@ -1,4 +1,10 @@
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -9,6 +15,10 @@ from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
import json
from abc import ABC, abstractmethod

# ==============================================================================
# Model Architecture (Must match training code)
@@ -237,6 +247,10 @@ class ModelBackend(ABC):
    @abstractmethod
    def get_info(self):
        pass


class KerasBackend(ModelBackend):
@@ -256,6 +270,7 @@ class KerasBackend(ModelBackend):
        self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))

    def predict(self, input_ids):
        inputs = np.array([input_ids], dtype=np.int32)
        logits = self.model(inputs, training=False)
        return logits[0, -1, :].numpy()
@@ -263,6 +278,9 @@ class KerasBackend(ModelBackend):
    def get_name(self):
        return self.display_name

    def get_info(self):
        info = f"{self.display_name}\n"
        info += f" Total params: {format_param_count(self.total_params)}\n"
@@ -274,186 +292,145 @@ class KerasBackend(ModelBackend):


# ==============================================================================
-#
# ==============================================================================
MODEL_REGISTRY = [
    # Format: (display_name, repo_id, weights_filename, config_filename)
-   # Smaller models are ACTUALLY faster (fewer params = real speedup!)
    ("SAM-X-1-Large", "Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5", None),
    ("SAM-X-1-Fast ⚡ (BETA)", "Smilyai-labs/Sam-X-1-fast", "sam1_fast.weights.h5", "sam1_fast_config.json"),
    ("SAM-X-1-Mini 🚀 (BETA)", "Smilyai-labs/Sam-X-1-Mini", "sam1_mini.weights.h5", "sam1_mini_config.json"),
    ("SAM-X-1-Nano ⚡⚡ (BETA)", "Smilyai-labs/Sam-X-1-Nano", "sam1_nano.weights.h5", "sam1_nano_config.json"),
]

-print("
-#
-    'dropout': base_config.get('dropout', 0.0),
-    'max_len': base_config['max_position_embeddings'],
-    'rope_theta': base_config['rope_theta'],
-    'n_layers': base_config['num_hidden_layers']
-}
-
-# Recreate tokenizer
-print("\n🔤 Recreating tokenizer...")
-tokenizer = Tokenizer.from_pretrained("gpt2")
-eos_token = ""
-eos_token_id = tokenizer.token_to_id(eos_token)
-
-if eos_token_id is None:
-    tokenizer.add_special_tokens([eos_token])
-    eos_token_id = tokenizer.token_to_id(eos_token)

-tokenizer.

-        print(f"   Weights: {weights_filename}")
-
-        # Download weights
-        weights_path = hf_hub_download(repo_id=repo_id, filename=weights_filename)
-
-        # Load custom config if specified (for pruned models)
-        if config_filename:
-            print(f"   Config: {config_filename}")
-            custom_config_path = hf_hub_download(repo_id=repo_id, filename=config_filename)
-            with open(custom_config_path, 'r') as f:
-                model_config = json.load(f)
-            print(f"   📐 Custom architecture: {model_config['n_heads']} heads, {int(model_config['d_model'] * model_config['ff_mult'])} FFN dim")
-        else:
-            model_config = base_model_config.copy()
-
-        # Create model with appropriate config
-        model = SAM1Model(**model_config)
-        model(dummy_input)
-        model.load_weights(weights_path)
-        model.trainable = False
-
-        # Create backend
-        backend = KerasBackend(model, display_name, display_name)
-        available_models[display_name] = backend
-
-        # Print stats
-        print(f"   ✅ Loaded successfully!")
-        print(f"   📊 Parameters: {format_param_count(backend.total_params)}")
-        print(f"   📊 Attention heads: {backend.n_heads}")
-        print(f"   📊 FFN dimension: {backend.ff_dim}")
-
-    except Exception as e:
-        print(f"   ⚠️ Failed to load: {e}")
-        print(f"      Skipping {display_name}...")

-    raise RuntimeError("❌ No models loaded! Check your MODEL_REGISTRY configuration.")

-# Important Note About Pruning and Speed
-# ==============================================================================
-print("\n" + "="*80)
-print("💡 ABOUT PRUNING & SPEED".center(80))
-print("="*80)
-print("""
-📌 Does pruning reduce parameter count?
-   YES and NO:
-   • Total param count stays the same (architecture unchanged)
-   • BUT pruned weights are set to ZERO (sparse weights)
-   • Active/non-zero params are reduced significantly
-
-📌 Does pruning speed up inference?
-   IT DEPENDS:
-   • Dense operations (regular matrix multiply): NO speedup by default
-   • Need sparse kernels or hardware support for actual speedup
-   • HOWEVER: Smaller active weights = better cache utilization
-   • Less computation on zeros = potential speedup on some hardware
-
-📌 What DOES speed things up reliably?
-   ✅ Quantization (FP16, INT8) - smaller types = faster compute
-   ✅ Fewer layers (layer pruning)
-   ✅ Smaller hidden dimensions (width reduction)
-   ✅ Knowledge distillation to smaller architecture
-
-📌 Why use structured pruning then?
-   ✅ Reduces memory footprint (especially with sparse storage)
-   ✅ Can be combined with quantization for real speedups
-   ✅ Preserves quality better than aggressive dimension reduction
-   ✅ Foundation for converting to truly smaller architecture
-""")
-
-def generate_response_stream(prompt, temperature=0.7, backend=None):
    """Generate response and yield tokens one by one for streaming."""
-    if backend is None:
-        backend = current_backend

    encoded_prompt = tokenizer.encode(prompt)
    input_ids = [i for i in encoded_prompt.ids if i != eos_token_id]
    generated = input_ids.copy()

    current_text = ""
-    # Get max_len from the backend's model config
    max_len = backend.model.cfg['max_len']

-    for _ in range(

        # Get logits from selected backend
        next_token_logits = backend.predict(current_input)

        if temperature > 0:
            next_token_logits = next_token_logits / temperature
            top_k_logits = next_token_logits[top_k_indices]
            top_k_probs = np.exp(top_k_logits - np.max(top_k_logits))
            top_k_probs /= top_k_probs.sum()
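The removed banner above makes a point worth illustrating: zeroing weights ("pruning") does not make a dense matrix multiply faster by itself, while a sparse storage format at least shrinks memory. The following is a minimal sketch of that claim, not part of app.py; it assumes numpy and scipy are installed and uses arbitrary sizes.

import time
import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
w = rng.standard_normal((2048, 2048)).astype(np.float32)
x = rng.standard_normal((1, 2048)).astype(np.float32)

# "Prune" 90% of the weights by zeroing them: parameter count is unchanged.
pruned = w.copy()
pruned[rng.random(w.shape) < 0.9] = 0.0

for name, mat in [("dense", w), ("pruned, still dense", pruned)]:
    t0 = time.perf_counter()
    for _ in range(100):
        _ = x @ mat            # the dense kernel does the same work either way
    print(f"{name}: {time.perf_counter() - t0:.3f}s")

# A sparse format only stores the non-zeros, so memory actually drops;
# compute only drops if a sparse kernel or supporting hardware is used.
pruned_csr = sparse.csr_matrix(pruned)
csr_bytes = pruned_csr.data.nbytes + pruned_csr.indices.nbytes + pruned_csr.indptr.nbytes
print("dense bytes:", pruned.nbytes, "csr bytes:", csr_bytes)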
@@ -466,299 +443,280 @@ def generate_response_stream(prompt, temperature=0.7, backend=None):

        generated.append(int(next_token))

        new_text = tokenizer.decode(generated[len(input_ids):])
        if len(new_text) > len(current_text):
            new_chunk = new_text[len(current_text):]
            current_text = new_text

-            in_thinking = False
            yield new_chunk, in_thinking

# ==============================================================================
-#
# ==============================================================================
-    height: 600px;
-    overflow-y: auto;
-    padding: 20px;
-    background: #ffffff;
-}
-
-.user-message {
-    background: #f7f7f8;
-    padding: 16px;
-    margin: 12px 0;
-    border-radius: 8px;
-}
-
-.assistant-message {
-    background: #ffffff;
-    padding: 16px;
-    margin: 12px 0;
-    border-radius: 8px;
-    border-left: 3px solid #10a37f;
-}
-
-.message-content {
-    color: #353740;
-    line-height: 1.6;
-    font-size: 15px;
-}
-
-.message-header {
-    font-weight: 600;
-    margin-bottom: 8px;
-    color: #353740;
-    font-size: 14px;
-}
-
-.thinking-content {
-    color: #6b7280;
-    font-style: italic;
-    border-left: 3px solid #d1d5db;
-    padding-left: 12px;
-    margin: 8px 0;
-    background: #f9fafb;
-    padding: 8px 12px;
-    border-radius: 4px;
-}
-
-.input-row {
-    background: #ffffff;
-    padding: 12px;
-    border-radius: 8px;
-    margin-top: 12px;
-    border: 1px solid #e5e7eb;
-}
-
-.gradio-container {
-    max-width: 900px !important;
-    margin: auto !important;
-}
-
-.announcement-banner {
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    color: white;
-    padding: 16px 24px;
-    border-radius: 12px;
-    margin-bottom: 20px;
-    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
-    text-align: center;
-    font-size: 16px;
-    font-weight: 500;
-    animation: slideIn 0.5s ease-out;
-}

-    margin-top: 8px;
-    font-size: 13px;
-    font-family: monospace;
-    white-space: pre-line;
-}
-"""

-def format_message_html(role, content, show_thinking):
-    """Format a single message as HTML."""
-    role_class = "user-message" if role == "user" else "assistant-message"
-    role_name = "You" if role == "user" else "SAM-X-1"
-
-    thinking = ""
-    answer = ""
-
-    if "<think>" in content:
-        parts = content.split("<think>", 1)
-        before_think = parts[0].strip()
-
-        if len(parts) > 1:
-            after_think = parts[1]
-
-            if "</think>" in after_think:
-                think_parts = after_think.split("</think>", 1)
-                thinking = think_parts[0].strip()
-                answer = (before_think + " " + think_parts[1]).strip()
-            elif "<think/>" in after_think:
-                think_parts = after_think.split("<think/>", 1)
-                thinking = think_parts[0].strip()
-                answer = (before_think + " " + think_parts[1]).strip()
-            else:
-                thinking = after_think.strip()
-                answer = before_think
-        else:
-            answer = before_think
-    else:
-        answer = content
-
-    html = f'<div class="{role_class}">'
-    html += f'<div class="message-header">{role_name}</div>'
-    html += f'<div class="message-content">'
-
-    if thinking and show_thinking:
-        html += f'<div class="thinking-content">💭 {thinking}</div>'
-
-    if answer:
-        html += f'<div>{answer}</div>'
-
-    html += '</div></div>'
-    return html
-
-def render_history(history, show_thinking):
-    """Render chat history as HTML."""
-    html = ""
-    for msg in history:
-        html += format_message_html(msg["role"], msg["content"], show_thinking)
-    return html
-
-def send_message(message, history, show_thinking, temperature, model_choice):
-    if not message.strip():
-        yield history, "", render_history(history, show_thinking), ""
-        return
-
-    # Switch backend based on selection
-    backend = available_models[model_choice]
-
-    # Add user message
-    history.append({"role": "user", "content": message})
-    yield history, "", render_history(history, show_thinking), backend.get_info()
-
-    # Generate prompt
-    prompt = f"User: {message}\nSam: <think>"
-
-    # Start assistant message
-    history.append({"role": "assistant", "content": "<think>"})
-
-    # Stream response
-    for new_chunk, in_thinking in generate_response_stream(prompt, temperature, backend):
-        history[-1]["content"] += new_chunk
-        yield history, "", render_history(history, show_thinking), backend.get_info()
-
-# Create Gradio interface
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="slate")) as demo:
-    # Announcement Banner
-    gr.HTML("""
-    <div class="announcement-banner">
-        🎉 <strong>NEW UPDATE:</strong> Multiple model variants now available!
-        Choose Fast/Mini/Nano for <strong>30-250% speed boost</strong>! ⚡
-        The models marked with (BETA) are not useful yet. <strong>They are still in development!</strong>
-    </div>
-    """)
-
-    gr.Markdown("# 🤖 SAM-X-1 Multi-Model Chat")
-
-    # Settings panel
-    with gr.Accordion("⚙️ Settings", open=False):
-        with gr.Row():
-            model_selector = gr.Dropdown(
-                choices=list(available_models.keys()),
-                value=list(available_models.keys())[0],
-                label="Model Selection",
-                info="Choose your speed/quality tradeoff"
-            )
-
-            model_info_box = gr.Textbox(
-                label="Selected Model Info",
-                value=list(available_models.values())[0].get_info(),
-                interactive=False,
-                lines=4,
-                elem_classes=["model-info"]
-            )

-                label="Temperature",
-                info="Higher = more creative, Lower = more focused"
-            )
-            show_thinking_checkbox = gr.Checkbox(
-                label="Show Thinking Process",
-                value=True,
-                info="Display model's reasoning"
            )

-    # Chat state and display
-    chatbot_state = gr.State([])
-    chat_html = gr.HTML(value="", elem_classes=["chat-container"])
-
-    # Input area
-    with gr.Row(elem_classes=["input-row"]):
-        msg_input = gr.Textbox(
-            placeholder="Ask me anything...",
-            show_label=False,
-            container=False,
-            scale=9
        )
-        send_btn = gr.Button("Send", variant="primary", scale=1)

    )

    )

    )

-        outputs=[chat_html]
-    )

-    #

    )
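The removed format_message_html above splits raw model output into a "thinking" part and an "answer" part using the <think> and </think> (or <think/>) tags. A condensed, hypothetical illustration of that convention follows; the sample string is invented for the example and is not part of app.py.

raw = "<think>The user greeted me, so I should greet back.</think> Hello! How can I help?"

thinking, answer = "", raw
if "<think>" in raw:
    before, after = raw.split("<think>", 1)
    for closer in ("</think>", "<think/>"):
        if closer in after:
            # Everything before the closing tag is the model's reasoning.
            thinking, rest = after.split(closer, 1)
            answer = (before + " " + rest).strip()
            break
    else:
        # No closing tag yet: the whole tail is still "thinking".
        thinking, answer = after.strip(), before

print("thinking:", thinking.strip())  # The user greeted me, so I should greet back.
print("answer:", answer)              # Hello! How can I help?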
@@ -1,4 +1,10 @@
import os
+import time
+import uuid
+from datetime import datetime
+from typing import List, Optional, Union, Dict, Any, Generator, Tuple
+
+# Set environment variables for Keras/TensorFlow
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

@@ -9,6 +15,10 @@ from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
import json
from abc import ABC, abstractmethod
+from fastapi import FastAPI, HTTPException, status
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, JSONResponse
+from pydantic import BaseModel, Field

# ==============================================================================
# Model Architecture (Must match training code)

@@ -237,6 +247,10 @@ class ModelBackend(ABC):
    @abstractmethod
    def get_info(self):
        pass
+
+    @abstractmethod
+    def get_model(self) -> SAM1Model:
+        pass


class KerasBackend(ModelBackend):

@@ -256,6 +270,7 @@ class KerasBackend(ModelBackend):
        self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))

    def predict(self, input_ids):
+        # NOTE: This predicts the next token based on the input sequence
        inputs = np.array([input_ids], dtype=np.int32)
        logits = self.model(inputs, training=False)
        return logits[0, -1, :].numpy()

@@ -263,6 +278,9 @@ class KerasBackend(ModelBackend):
    def get_name(self):
        return self.display_name

+    def get_model(self) -> SAM1Model:
+        return self.model
+
    def get_info(self):
        info = f"{self.display_name}\n"
        info += f" Total params: {format_param_count(self.total_params)}\n"
@@ -274,186 +292,145 @@ class KerasBackend(ModelBackend):


# ==============================================================================
+# Model Registry and Asset Loading
# ==============================================================================
MODEL_REGISTRY = [
    # Format: (display_name, repo_id, weights_filename, config_filename)
    ("SAM-X-1-Large", "Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5", None),
    ("SAM-X-1-Fast ⚡ (BETA)", "Smilyai-labs/Sam-X-1-fast", "sam1_fast.weights.h5", "sam1_fast_config.json"),
    ("SAM-X-1-Mini 🚀 (BETA)", "Smilyai-labs/Sam-X-1-Mini", "sam1_mini.weights.h5", "sam1_mini_config.json"),
    ("SAM-X-1-Nano ⚡⚡ (BETA)", "Smilyai-labs/Sam-X-1-Nano", "sam1_nano.weights.h5", "sam1_nano_config.json"),
]

+CONFIG_TOKENIZER_REPO_ID = "Smilyai-labs/Sam-1-large-it-0002"
+available_models: Dict[str, KerasBackend] = {}
+tokenizer: Optional[Tokenizer] = None
+eos_token_id: Optional[int] = None
+DEFAULT_SYSTEM_PROMPT = "You are a helpful and friendly assistant named SAM-X-1. Answer the user's request. You must prepend your answer with '<think>' and end your thoughts with '</think>' or '<think/>' followed by your actual response."


+def load_all_assets():
+    """Load config, tokenizer, and all models."""
+    global tokenizer, eos_token_id, available_models, DEFAULT_SYSTEM_PROMPT
+
+    print("="*80)
+    print("🤖 SAM-X-1 API Backend Loading".center(80))
+    print("="*80)

+    # Download config and tokenizer
+    print(f"\n📦 Downloading config/tokenizer from: {CONFIG_TOKENIZER_REPO_ID}")
+    config_path = hf_hub_download(repo_id=CONFIG_TOKENIZER_REPO_ID, filename="config.json")
+
+    # Load config
+    with open(config_path, 'r') as f:
+        base_config = json.load(f)
+
+    print(f"✅ Base config loaded")
+
+    # Build base model config
+    base_model_config = {
+        'vocab_size': base_config['vocab_size'],
+        'd_model': base_config['hidden_size'],
+        'n_heads': base_config['num_attention_heads'],
+        'ff_mult': base_config['intermediate_size'] / base_config['hidden_size'],
+        'dropout': base_config.get('dropout', 0.0),
+        'max_len': base_config['max_position_embeddings'],
+        'rope_theta': base_config['rope_theta'],
+        'n_layers': base_config['num_hidden_layers']
+    }

+    # Recreate tokenizer
+    print("\n🔤 Recreating tokenizer...")
+    # NOTE: The original code uses "gpt2" to load the tokenizer architecture.
+    tokenizer = Tokenizer.from_pretrained("gpt2")
+    eos_token = ""
+    eos_token_id = tokenizer.token_to_id(eos_token)

+    if eos_token_id is None:
+        tokenizer.add_special_tokens([eos_token])
+        eos_token_id = tokenizer.token_to_id(eos_token)

+    custom_tokens = ["<think>", "<think/>", "</think>"]
+    for token in custom_tokens:
+        if tokenizer.token_to_id(token) is None:
+            tokenizer.add_special_tokens([token])

+    tokenizer.no_padding()
+    tokenizer.enable_truncation(max_length=base_config['max_position_embeddings'])
+    print(f"✅ Tokenizer ready (vocab size: {tokenizer.get_vocab_size()})")

+    # Load all models from registry
+    print("\n" + "="*80)
+    print("📦 LOADING MODELS".center(80))
+    print("="*80)

+    dummy_input = tf.zeros((1, 1), dtype=tf.int32)

+    for display_name, repo_id, weights_filename, config_filename in MODEL_REGISTRY:
+        try:
+            print(f"\n⏳ Loading: {display_name}")
+
+            # Download weights
+            weights_path = hf_hub_download(repo_id=repo_id, filename=weights_filename)
+
+            # Load custom config if specified (for pruned models)
+            if config_filename:
+                custom_config_path = hf_hub_download(repo_id=repo_id, filename=config_filename)
+                with open(custom_config_path, 'r') as f:
+                    model_config = json.load(f)
+            else:
+                model_config = base_model_config.copy()
+
+            # Create model with appropriate config
+            model = SAM1Model(**model_config)
+            model(dummy_input)
+            model.load_weights(weights_path)
+            model.trainable = False
+
+            # Create backend
+            backend = KerasBackend(model, display_name, display_name)
+            available_models[display_name] = backend
+
+            # Print stats
+            print(f"   ✅ Loaded successfully!")
+            print(f"   📊 Parameters: {format_param_count(backend.total_params)}")
+
+        except Exception as e:
+            print(f"   ⚠️ Failed to load {display_name}: {e}")
+            print(f"      Skipping {display_name}...")

+    if not available_models:
+        # NOTE: In a real system, you might want a graceful fallback. Here, we must exit.
+        print("FATAL: No models loaded! Check your MODEL_REGISTRY configuration.")
+        # We raise a RuntimeError but let the startup event handle the final failure
+        # to ensure the FastAPI application runs the event loop.

+def generate_response_stream(prompt: str, temperature: float, backend: KerasBackend, max_new_tokens: int = 512) -> Generator[Tuple[str, bool], None, None]:
    """Generate response and yield tokens one by one for streaming."""

+    if tokenizer is None or eos_token_id is None:
+        raise RuntimeError("Tokenizer not loaded.")
+
    encoded_prompt = tokenizer.encode(prompt)
    input_ids = [i for i in encoded_prompt.ids if i != eos_token_id]
    generated = input_ids.copy()

    current_text = ""
+    # Use max_len from the model config
    max_len = backend.model.cfg['max_len']

+    for _ in range(max_new_tokens):
+        # Sliding window for context
+        current_input = generated[-max_len:]

        # Get logits from selected backend
        next_token_logits = backend.predict(current_input)

        if temperature > 0:
+            # Top-K sampling
            next_token_logits = next_token_logits / temperature
+            top_k = 50
+            top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
            top_k_logits = next_token_logits[top_k_indices]
            top_k_probs = np.exp(top_k_logits - np.max(top_k_logits))
            top_k_probs /= top_k_probs.sum()
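The new load_all_assets() above derives the SAM1Model keyword arguments from a Hugging Face style config.json, with ff_mult computed as intermediate_size / hidden_size. A short worked example of that mapping, using made-up values purely for illustration (the real values come from the Smilyai-labs/Sam-1-large-it-0002 config):

# Hypothetical config.json contents, for illustration only.
base_config = {
    "vocab_size": 50257,
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 1024,
    "rope_theta": 10000.0,
    "num_hidden_layers": 12,
}

base_model_config = {
    "vocab_size": base_config["vocab_size"],
    "d_model": base_config["hidden_size"],
    "n_heads": base_config["num_attention_heads"],
    "ff_mult": base_config["intermediate_size"] / base_config["hidden_size"],  # 3072 / 768 = 4.0
    "dropout": base_config.get("dropout", 0.0),
    "max_len": base_config["max_position_embeddings"],
    "rope_theta": base_config["rope_theta"],
    "n_layers": base_config["num_hidden_layers"],
}
print(base_model_config["ff_mult"])  # 4.0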
@@ -466,299 +443,280 @@ def generate_response_stream(prompt, temperature=0.7, backend=None):

        generated.append(int(next_token))

+        # Decode the newly generated part
        new_text = tokenizer.decode(generated[len(input_ids):])
+
        if len(new_text) > len(current_text):
            new_chunk = new_text[len(current_text):]
            current_text = new_text

+            # Simple check for thinking tags
+            in_thinking = "<think>" in current_text and not ("</think>" in current_text or "<think/>" in current_text)
+
            yield new_chunk, in_thinking
+
+    yield "", False  # End of stream

# ==============================================================================
+# FastAPI API & Pydantic Schemas (OpenAI Style)
# ==============================================================================
+
+# --- Pydantic Schemas for OpenAI API Compatibility ---
+class ChatMessage(BaseModel):
+    role: str
+    content: str

+class ChatCompletionRequest(BaseModel):
+    model: str = Field(..., description="The ID of the model to use.")
+    messages: List[ChatMessage] = Field(..., description="A list of messages comprising the conversation.")
+    temperature: Optional[float] = Field(0.7, ge=0.0, le=2.0, description="Sampling temperature.")
+    max_tokens: Optional[int] = Field(512, ge=1, description="The maximum number of tokens to generate.")
+    stream: Optional[bool] = Field(False, description="Whether to stream the response.")
+
+# OpenAI Response Structure: Chunk for Streaming
+class ChatCompletionChunkChoice(BaseModel):
+    index: int = 0
+    delta: Dict[str, Optional[str]]
+    finish_reason: Optional[str] = None
+
+class ChatCompletionChunk(BaseModel):
+    id: str
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionChunkChoice]
+
+# OpenAI Response Structure: Full Response
+class ChatCompletionUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+class ChatCompletionChoice(BaseModel):
+    index: int = 0
+    message: ChatMessage
+    finish_reason: Optional[str] = None
+
+class ChatCompletion(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionChoice]
+    usage: ChatCompletionUsage
+
+# Model Listing Response
+class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "SAM-X-1 Team"
+
+class ModelList(BaseModel):
+    object: str = "list"
+    data: List[ModelCard]
+
+
+# --- FastAPI Application ---
+app = FastAPI(
+    title="SAM-X-1 Keras API (OpenAI-Style)",
+    description="A production-ready FastAPI backend for the SAM-X-1 Keras model.",
+    version="1.0.0",
+)
+
+# Production-grade CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Load models and tokenizer when the FastAPI app starts."""
+    try:
+        load_all_assets()
+    except Exception as e:
+        # Print the error and allow FastAPI to start, but subsequent requests will fail
+        print(f"FATAL: Failed to load assets during startup: {e}")
+        pass
+
+@app.get("/v1/models", response_model=ModelList)
+async def list_models():
+    """Endpoint to list all available models."""
+    models_data = [
+        ModelCard(id=name, created=int(time.time()))
+        for name in available_models.keys()
+    ]
+    return ModelList(data=models_data)
+
+
+def build_prompt_from_messages(messages: List[ChatMessage], system_prompt: str) -> str:
+    """Constructs the model's instruction-style prompt from a list of messages."""
+    prompt = f"System: {system_prompt}\n"

+    for message in messages:
+        role = message.role.capitalize()
+        content = message.content.strip()
+
+        if role == "User":
+            prompt += f"{role}: {content}\n"
+        elif role == "Assistant":
+            prompt += f"Sam: {content}\n"
+
+    prompt += "Sam: <think>"
+    return prompt
+
+
+def format_sse_chunk(chunk: ChatCompletionChunk) -> str:
+    """Formats a Pydantic object as a Server-Sent Event (SSE) data block."""
+    return f"data: {chunk.model_dump_json(exclude_none=True)}\n\n"
+
+def streaming_generator(request: ChatCompletionRequest, backend: KerasBackend, full_prompt: str) -> Generator[str, None, None]:
+    """Generator function to stream LLM output in OpenAI SSE format."""
+    model_name = request.model
+    chat_id = f"chatcmpl-{uuid.uuid4().hex}"
+    max_new_tokens = request.max_tokens or 512

+    # 1. Send initial chunk with role
+    yield format_sse_chunk(
+        ChatCompletionChunk(
+            id=chat_id,
+            model=model_name,
+            choices=[ChatCompletionChunkChoice(index=0, delta={"role": "assistant"})]
+        )
+    )
+
+    full_response_text = ""

+    # 2. Stream tokens
+    try:
+        for new_chunk, _ in generate_response_stream(full_prompt, request.temperature, backend, max_new_tokens):
+            if not new_chunk:
+                continue

+            full_response_text += new_chunk

+            # Yield token chunk
+            yield format_sse_chunk(
+                ChatCompletionChunk(
+                    id=chat_id,
+                    model=model_name,
+                    choices=[ChatCompletionChunkChoice(index=0, delta={"content": new_chunk})]
                )
            )

+    except Exception as e:
+        print(f"Error during streaming generation: {e}")
+        # A full production implementation would handle error chunks.
+        pass
+
+    # 3. Final chunk indicating the stream is finished
+    # NOTE: Calculating accurate token counts requires a dedicated token counter within the generation loop.
+    prompt_token_count = len(tokenizer.encode(full_prompt).ids) if tokenizer else 0
+    completion_token_count = len(tokenizer.encode(full_response_text).ids) if tokenizer else 0
+
+    yield format_sse_chunk(
+        ChatCompletionChunk(
+            id=chat_id,
+            model=model_name,
+            choices=[ChatCompletionChunkChoice(index=0, delta={}, finish_reason="stop")],
+            # Adding a usage object to the final chunk is non-standard but useful
+            # The official OpenAI spec includes usage in the final full response, not chunks.
+            # We'll omit it from the chunk for strict compatibility.
        )
+    )
+    # The required end-of-stream delimiter for SSE
+    yield "data: [DONE]\n\n"
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    """Main endpoint for chat completions, supporting both streaming and non-streaming."""
+
+    # 1. Model Validation
+    if request.model not in available_models:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Model '{request.model}' not found in registry. Available models: {list(available_models.keys())}"
        )
+    if tokenizer is None:
+        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Model assets not loaded.")

+    backend = available_models[request.model]
+    max_new_tokens = request.max_tokens or 512
+
+    # 2. Prompt Formatting
+    if not request.messages:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Messages array cannot be empty.")
+
+    full_prompt = build_prompt_from_messages(request.messages, DEFAULT_SYSTEM_PROMPT)
+
+    # 3. Streaming Response
+    if request.stream:
+        return StreamingResponse(
+            streaming_generator(request, backend, full_prompt),
+            media_type="text/event-stream"
        )

+    # 4. Non-Streaming Response (Blocking)
+    else:
+        full_response_text = ""

+        # Generator is forced to completion
+        for new_chunk, _ in generate_response_stream(full_prompt, request.temperature, backend, max_new_tokens):
+            full_response_text += new_chunk
+
+        # Build the final ChatCompletion response object
+        response_message = ChatMessage(role="assistant", content=full_response_text.strip())
+
+        # Token count approximation
+        prompt_token_count = len(tokenizer.encode(full_prompt).ids)
+        completion_token_count = len(tokenizer.encode(full_response_text).ids)
+
+        completion_response = ChatCompletion(
+            id=f"chatcmpl-{uuid.uuid4().hex}",
+            model=request.model,
+            choices=[ChatCompletionChoice(
+                message=response_message,
+                finish_reason="stop"  # Simplified, could be "length" if max_tokens was hit precisely
+            )],
+            usage=ChatCompletionUsage(
+                prompt_tokens=prompt_token_count,
+                completion_tokens=completion_token_count,
+                total_tokens=prompt_token_count + completion_token_count
+            )
        )
+
+        return JSONResponse(content=completion_response.model_dump(exclude_none=True))

+# ==============================================================================
+# Execution Block
+# ==============================================================================
+if __name__ == "__main__":
+    # Ensure all models are loaded before running uvicorn
+    # This block is here for standalone execution and initial error checking
+    try:
+        load_all_assets()
+    except RuntimeError as e:
+        # If loading fails, print the error and exit gracefully
+        print(e)
+        exit(1)
+
+    import uvicorn
+
+    # Run the application
+    # NOTE: Set workers=1 for TensorFlow/Keras stability in standalone scripts.
+    # For robust production, use gunicorn to manage multiple uvicorn processes.
+    uvicorn.run(
+        "__main__:app",
+        host="0.0.0.0",
+        port=8000,
+        log_level="info",
+        workers=1,
+        # reload=True  # Uncomment for development
+    )
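Once the server is running (uvicorn on 0.0.0.0:8000 as configured above), the OpenAI-style endpoints can be exercised with any HTTP client. Below is a minimal sketch using the requests package; it assumes the server is reachable on localhost:8000 and uses the "SAM-X-1-Large" name from MODEL_REGISTRY. Internally, the messages are flattened by build_prompt_from_messages into "System: ...\nUser: Hello!\nSam: <think>". With "stream": True the endpoint instead emits Server-Sent Events of the form "data: {json chunk}" terminated by "data: [DONE]".

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "SAM-X-1-Large",
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "max_tokens": 128,
        "stream": False,
    },
    timeout=300,
)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])

# Listing the available model variants:
print(requests.get("http://localhost:8000/v1/models", timeout=30).json())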