Keeby-smilyai committed on
Commit 2e319c6 · verified · 1 Parent(s): 3319054

Update app.py

Files changed (1)
  1. app.py +386 -344
app.py CHANGED
@@ -13,7 +13,6 @@ from tokenizers import Tokenizer
  import numpy as np
  import time
  from typing import Dict, Any, List
- import asyncio
 
  # ============================================================================
  # Configuration
@@ -263,8 +262,7 @@ def generate_tokens(
  temperature: float = 0.8,
  top_k: int = 40,
  top_p: float = 0.9,
- repetition_penalty: float = 1.1,
- stop_sequences: List[str] = None
  ):
  """Generator that yields tokens one at a time"""
  if len(input_ids) > config['max_position_embeddings'] - max_tokens:
@@ -322,17 +320,17 @@ def generate_tokens(
  input_tensor = input_tensor[:, -config['max_position_embeddings']:]
 
  # ============================================================================
- # API Functions
  # ============================================================================
 
  def chat_completion_api(
  messages_json: str,
- max_tokens: int = 512,
- temperature: float = 0.8,
- top_p: float = 0.9,
- top_k: int = 40,
- repetition_penalty: float = 1.1,
- stream: bool = False
  ) -> str:
  """OpenAI-style chat completion API"""
  try:
@@ -358,83 +356,58 @@ def chat_completion_api(
 
  start_time = time.time()
  token_count = 0
 
- if stream:
- # Streaming response
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- # Check for end token
- if "<|im_end|>" in response_text:
- response_text = response_text.split("<|im_end|>")[0]
- break
-
- # Yield streaming chunk (SSE format)
- yield f"data: {json.dumps({'choices': [{'delta': {'content': token_text}, 'index': 0}]})}\n\n"
-
- elapsed = time.time() - start_time
-
- # Final chunk
- yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop', 'index': 0}], 'usage': {'completion_tokens': token_count, 'total_tokens': len(input_ids) + token_count}, 'stats': {'elapsed_sec': round(elapsed, 2), 'tokens_per_sec': round(token_count / elapsed if elapsed > 0 else 0, 1)}})}\n\n"
- yield "data: [DONE]\n\n"
- else:
- # Non-streaming response
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- if "<|im_end|>" in response_text:
- response_text = response_text.split("<|im_end|>")[0]
- break
-
- elapsed = time.time() - start_time
-
- result = {
- "id": f"chatcmpl-{int(time.time())}",
- "object": "chat.completion",
- "created": int(time.time()),
- "model": "sam-z-1",
- "choices": [{
- "index": 0,
- "message": {
- "role": "assistant",
- "content": response_text.strip()
- },
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": len(input_ids),
- "completion_tokens": token_count,
- "total_tokens": len(input_ids) + token_count
  },
- "stats": {
- "elapsed_sec": round(elapsed, 2),
- "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
- }
  }
-
- return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  def text_completion_api(
  prompt: str,
- max_tokens: int = 512,
- temperature: float = 0.8,
- top_p: float = 0.9,
- top_k: int = 40,
- repetition_penalty: float = 1.1,
- stream: bool = False
  ) -> str:
  """OpenAI-style text completion API"""
  try:
@@ -442,61 +415,45 @@ def text_completion_api(
 
  start_time = time.time()
  token_count = 0
 
- if stream:
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- yield f"data: {json.dumps({'choices': [{'text': token_text, 'index': 0}]})}\n\n"
-
- elapsed = time.time() - start_time
-
- yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop', 'index': 0}], 'usage': {'completion_tokens': token_count, 'total_tokens': len(input_ids) + token_count}, 'stats': {'elapsed_sec': round(elapsed, 2), 'tokens_per_sec': round(token_count / elapsed if elapsed > 0 else 0, 1)}})}\n\n"
- yield "data: [DONE]\n\n"
- else:
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- elapsed = time.time() - start_time
-
- result = {
- "id": f"cmpl-{int(time.time())}",
- "object": "text_completion",
- "created": int(time.time()),
- "model": "sam-z-1",
- "choices": [{
- "text": response_text,
- "index": 0,
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": len(input_ids),
- "completion_tokens": token_count,
- "total_tokens": len(input_ids) + token_count
- },
- "stats": {
- "elapsed_sec": round(elapsed, 2),
- "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
- }
  }
-
- return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  # ============================================================================
- # Gradio UI
  # ============================================================================
 
  custom_css = """
@@ -521,16 +478,6 @@ custom_css = """
  border-left: 4px solid #667eea;
  margin: 1rem 0;
  }
-
- .code-block {
- background: #282c34;
- color: #abb2bf;
- padding: 1rem;
- border-radius: 6px;
- font-family: 'Monaco', 'Menlo', monospace;
- font-size: 0.9rem;
- overflow-x: auto;
- }
  """
 
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
@@ -548,8 +495,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  # ========== Chat Completion Tab ==========
  with gr.Tab("💬 Chat Completion"):
  gr.Markdown("""
- ### `/v1/chat/completions` Endpoint
- OpenAI-compatible chat completion API with streaming support
  """)
 
  with gr.Row():
@@ -572,7 +519,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
- chat_stream = gr.Checkbox(label="Stream Response", value=False)
 
  chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
@@ -584,48 +531,36 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  )
 
  gr.Markdown("""
- ### Python Example
  ```python
- import requests
- import json
 
- # For Hugging Face Spaces
- API_URL = "https://YOUR-SPACE.hf.space" # Your Space URL
 
  messages = [
  {"role": "user", "content": "Hello! Who are you?"}
  ]
 
- response = requests.post(
- f"{API_URL}/chat/completions",
- json={
- "messages": messages,
- "max_tokens": 512,
- "temperature": 0.8,
- "stream": False
- }
  )
 
- print(response.json())
- ```
-
- ### cURL Example
- ```bash
- curl -X POST "https://YOUR-SPACE.hf.space/chat/completions" \\
- -H "Content-Type: application/json" \\
- -d '{
- "messages": [{"role": "user", "content": "Hello!"}],
- "max_tokens": 512,
- "temperature": 0.8
- }'
  ```
  """)
 
  # ========== Text Completion Tab ==========
  with gr.Tab("📝 Text Completion"):
  gr.Markdown("""
- ### `/v1/completions` Endpoint
- OpenAI-compatible text completion API
  """)
 
  with gr.Row():
@@ -645,7 +580,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
- text_stream = gr.Checkbox(label="Stream Response", value=False)
 
  text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
@@ -657,22 +592,24 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  )
 
  gr.Markdown("""
- ### Python Example
  ```python
- import requests
-
- API_URL = "https://YOUR-SPACE.hf.space"
-
- response = requests.post(
- f"{API_URL}/completions",
- json={
- "prompt": "Once upon a time",
- "max_tokens": 512,
- "temperature": 0.8
- }
  )
 
- print(response.json())
  ```
  """)
 
@@ -688,30 +625,117 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  - **Context Length**: {config['max_position_embeddings']} tokens
  - **Vocabulary Size**: {config['vocab_size']}
 
- ## Available Endpoints
 
- ### 1. Chat Completions
- **Endpoint**: `/v1/chat/completions` (OpenAI compatible)
 
- **Request Format**:
- ```json
- {{
- "messages": [
- {{"role": "user", "content": "Hello!"}}
- ],
- "max_tokens": 512,
- "temperature": 0.8,
- "top_p": 0.9,
- "top_k": 40,
- "repetition_penalty": 1.1,
- "stream": false
  }}
  ```
 
- **Response Format**:
  ```json
  {{
- "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
@@ -719,7 +743,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  "index": 0,
  "message": {{
  "role": "assistant",
- "content": "Hello! I'm SAM-Z-1..."
  }},
  "finish_reason": "stop"
  }}],
@@ -735,31 +759,15 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  }}
  ```
 
- ### 2. Text Completions
- **Endpoint**: `/v1/completions`
-
- **Request Format**:
- ```json
- {{
- "prompt": "Once upon a time",
- "max_tokens": 512,
- "temperature": 0.8,
- "top_p": 0.9,
- "top_k": 40,
- "repetition_penalty": 1.1,
- "stream": false
- }}
- ```
-
- **Response Format**:
  ```json
  {{
- "id": "cmpl-123",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
- "text": " in a distant galaxy...",
  "index": 0,
  "finish_reason": "stop"
  }}],
@@ -767,187 +775,221 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  "prompt_tokens": 5,
  "completion_tokens": 15,
  "total_tokens": 20
  }}
  }}
  ```
 
- ## Parameters
 
- | Parameter | Type | Default | Description |
- |-----------|------|---------|-------------|
- | `max_tokens` | int | 512 | Maximum tokens to generate |
- | `temperature` | float | 0.8 | Sampling temperature (0.1-2.0) |
- | `top_p` | float | 0.9 | Nucleus sampling threshold |
- | `top_k` | int | 40 | Top-K sampling |
- | `repetition_penalty` | float | 1.1 | Penalty for repeated tokens |
- | `stream` | bool | false | Enable streaming responses |
-
- ## Streaming Responses
 
- When `stream=true`, the API returns Server-Sent Events (SSE):
 
- ```
- data: {{"choices": [{{"delta": {{"content": "Hello"}}, "index": 0}}]}}
 
- data: {{"choices": [{{"delta": {{"content": " there"}}, "index": 0}}]}}
 
- data: {{"choices": [{{"finish_reason": "stop", "index": 0}}]}}
 
- data: [DONE]
- ```
 
  ## Rate Limits & Performance
 
- - **Optimized for CPU**: Uses TensorFlow graph optimization
  - **Average Speed**: 10-20 tokens/sec on CPU
  - **Context Window**: {config['max_position_embeddings']} tokens
- - **Concurrent Requests**: Supported via Gradio queue
 
  ## Error Handling
 
- Errors return JSON with error description:
- ```json
- {{
- "error": "Error message here"
- }}
- ```
-
- ## Usage Tips
-
- 1. **Lower temperature** (0.3-0.5) for factual responses
- 2. **Higher temperature** (0.8-1.2) for creative content
- 3. **Use streaming** for better UX in production
- 4. **Adjust top_k/top_p** to control diversity
- 5. **Increase repetition_penalty** if model repeats phrases
-
- ## Model Capabilities
-
- ✅ General conversation
- ✅ Question answering
- ✅ Code generation
- ✅ Creative writing
- ✅ Text completion
- ✅ Instruction following
-
- ❌ Does NOT use reasoning tokens (`<think>` tags)
- ❌ Not fine-tuned for specific domains
-
- ## Integration Examples
-
- ### Python (requests)
- ```python
- import requests
-
- def chat(message, history=[]):
- messages = history + [{{"role": "user", "content": message}}]
-
- response = requests.post(
- "https://YOUR-SPACE.hf.space/chat/completions",
- json={{"messages": messages, "temperature": 0.8}}
- )
-
- return response.json()["choices"][0]["message"]["content"]
- ```
-
- ### Python (streaming)
  ```python
- import requests
-
- def chat_stream(message):
- response = requests.post(
- "https://YOUR-SPACE.hf.space/chat/completions",
- json={{
- "messages": [{{"role": "user", "content": message}}],
- "stream": True
- }},
- stream=True
  )
 
- for line in response.iter_lines():
- if line:
- line = line.decode('utf-8')
- if line.startswith('data: '):
- data = line[6:]
- if data != '[DONE]':
- import json
- chunk = json.loads(data)
- if 'choices' in chunk:
- delta = chunk['choices'][0].get('delta', {{}})
- if 'content' in delta:
- print(delta['content'], end='', flush=True)
- ```
-
- ### JavaScript (fetch)
- ```javascript
- async function chat(message) {{
- const response = await fetch('https://YOUR-SPACE.hf.space/chat/completions', {{
- method: 'POST',
- headers: {{'Content-Type': 'application/json'}},
- body: JSON.stringify({{
- messages: [{{role: 'user', content: message}}],
- temperature: 0.8
- }})
- }});
-
- const data = await response.json();
- return data.choices[0].message.content;
- }}
- ```
-
- ### cURL
- ```bash
- curl -X POST https://YOUR-SPACE.hf.space/chat/completions \\
- -H "Content-Type: application/json" \\
- -d '{{
- "messages": [
- {{"role": "user", "content": "What is the capital of France?"}}
- ],
- "max_tokens": 100,
- "temperature": 0.7
- }}'
  ```
 
  ## Troubleshooting
 
- **Slow responses?**
  - Reduce `max_tokens`
  - Lower `top_k` value
- - Use smaller prompts
 
- **Repetitive output?**
  - Increase `repetition_penalty` (try 1.2-1.5)
  - Adjust `temperature` higher
  - Use `top_p` sampling
 
- **Incoherent output?**
  - Lower `temperature` (try 0.5-0.7)
  - Reduce `top_k` (try 20-30)
- - Ensure prompt is clear
 
  ---
 
- **Model**: SAM-Z-1 | **License**: Check model card on Hugging Face
  **Support**: Open an issue on the Space for bugs or questions
  """)
 
- # Event handlers for Chat Completion
  chat_btn.click(
  fn=chat_completion_api,
  inputs=[
  messages_input, chat_max_tokens, chat_temperature,
  chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
  ],
- outputs=[chat_output]
  )
 
- # Event handlers for Text Completion
  text_btn.click(
  fn=text_completion_api,
  inputs=[
  prompt_input, text_max_tokens, text_temperature,
  text_top_p, text_top_k, text_rep_penalty, text_stream
  ],
- outputs=[text_output]
  )
 
  # Launch
 
  import numpy as np
  import time
  from typing import Dict, Any, List
 
  # ============================================================================
  # Configuration
  temperature: float = 0.8,
  top_k: int = 40,
  top_p: float = 0.9,
+ repetition_penalty: float = 1.1
  ):
  """Generator that yields tokens one at a time"""
  if len(input_ids) > config['max_position_embeddings'] - max_tokens:
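
This hunk only touches the signature: `repetition_penalty` becomes the final parameter and the unused `stop_sequences` argument is dropped. The penalty itself acts on the logits inside the generator; as a point of reference, a minimal sketch of the conventional CTRL-style rule (illustrative NumPy code, not the actual loop from app.py, which this diff does not show):

```python
import numpy as np

def apply_repetition_penalty(logits: np.ndarray, seen_ids: set, penalty: float = 1.1) -> np.ndarray:
    """Push down the logits of tokens that were already generated (CTRL-style)."""
    out = logits.copy()
    for tok in seen_ids:
        # Dividing a positive logit and multiplying a negative one
        # both reduce that token's probability after softmax.
        out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out
```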
 
  input_tensor = input_tensor[:, -config['max_position_embeddings']:]
 
  # ============================================================================
+ # API Functions - FIXED FOR GRADIO
  # ============================================================================
 
  def chat_completion_api(
  messages_json: str,
+ max_tokens: int,
+ temperature: float,
+ top_p: float,
+ top_k: int,
+ repetition_penalty: float,
+ stream: bool
  ) -> str:
  """OpenAI-style chat completion API"""
  try:
 
  start_time = time.time()
  token_count = 0
+ response_text = ""
 
+ for token_id in generate_tokens(
+ input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
+ ):
+ token_text = tokenizer.decode([token_id])
+ response_text += token_text
+ token_count += 1
+
+ if "<|im_end|>" in response_text:
+ response_text = response_text.split("<|im_end|>")[0]
+ break
+
+ elapsed = time.time() - start_time
+
+ result = {
+ "id": f"chatcmpl-{int(time.time())}",
+ "object": "chat.completion",
+ "created": int(time.time()),
+ "model": "sam-z-1",
+ "choices": [{
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": response_text.strip()
  },
+ "finish_reason": "stop"
+ }],
+ "usage": {
+ "prompt_tokens": len(input_ids),
+ "completion_tokens": token_count,
+ "total_tokens": len(input_ids) + token_count
+ },
+ "stats": {
+ "elapsed_sec": round(elapsed, 2),
+ "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
  }
+ }
+
+ return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  def text_completion_api(
  prompt: str,
+ max_tokens: int,
+ temperature: float,
+ top_p: float,
+ top_k: int,
+ repetition_penalty: float,
+ stream: bool
  ) -> str:
  """OpenAI-style text completion API"""
  try:
 
  start_time = time.time()
  token_count = 0
+ response_text = ""
 
+ for token_id in generate_tokens(
+ input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
+ ):
+ token_text = tokenizer.decode([token_id])
+ response_text += token_text
+ token_count += 1
+
+ elapsed = time.time() - start_time
+
+ result = {
+ "id": f"cmpl-{int(time.time())}",
+ "object": "text_completion",
+ "created": int(time.time()),
+ "model": "sam-z-1",
+ "choices": [{
+ "text": response_text,
+ "index": 0,
+ "finish_reason": "stop"
+ }],
+ "usage": {
+ "prompt_tokens": len(input_ids),
+ "completion_tokens": token_count,
+ "total_tokens": len(input_ids) + token_count
+ },
+ "stats": {
+ "elapsed_sec": round(elapsed, 2),
+ "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
  }
+ }
+
+ return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  # ============================================================================
+ # Gradio UI with API Routes
  # ============================================================================
 
  custom_css = """
  border-left: 4px solid #667eea;
  margin: 1rem 0;
  }
  """
 
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
 
  # ========== Chat Completion Tab ==========
  with gr.Tab("💬 Chat Completion"):
  gr.Markdown("""
+ ### Chat Completions API
+ OpenAI-compatible chat completion endpoint
  """)
 
  with gr.Row():
  chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
+ chat_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
 
  chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
 
  )
 
  gr.Markdown("""
+ ### Python Example with Gradio Client
  ```python
+ from gradio_client import Client
+ import json
 
+ client = Client("YOUR-SPACE-URL")
 
  messages = [
  {"role": "user", "content": "Hello! Who are you?"}
  ]
 
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
  )
 
+ print(result)
  ```
  """)
 
  # ========== Text Completion Tab ==========
  with gr.Tab("📝 Text Completion"):
  gr.Markdown("""
+ ### Text Completions API
+ OpenAI-compatible text completion endpoint
  """)
 
  with gr.Row():
  text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
+ text_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
 
  text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
 
  )
 
  gr.Markdown("""
+ ### Python Example with Gradio Client
  ```python
+ from gradio_client import Client
+
+ client = Client("YOUR-SPACE-URL")
+
+ result = client.predict(
+ prompt="Once upon a time",
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
  )
 
+ print(result)
  ```
  """)
 
  - **Context Length**: {config['max_position_embeddings']} tokens
  - **Vocabulary Size**: {config['vocab_size']}
 
+ ## Using the API
 
+ ### Method 1: Gradio Client (Recommended)
 
+ Install the Gradio client:
+ ```bash
+ pip install gradio_client
+ ```
+
+ **Chat Completion:**
+ ```python
+ from gradio_client import Client
+ import json
+
+ client = Client("https://YOUR-SPACE.hf.space")
+
+ messages = [
+ {{"role": "user", "content": "What is Python?"}}
+ ]
+
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
+ )
+
+ response = json.loads(result)
+ print(response["choices"][0]["message"]["content"])
+ ```
+
+ **Text Completion:**
+ ```python
+ result = client.predict(
+ prompt="Once upon a time",
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
+ )
+
+ response = json.loads(result)
+ print(response["choices"][0]["text"])
+ ```
+
+ ### Method 2: Direct HTTP Requests
+
+ **Chat Completion:**
+ ```python
+ import requests
+ import json
+
+ url = "https://YOUR-SPACE.hf.space/call/chat_completions"
+
+ payload = {{
+ "data": [
+ json.dumps([{{"role": "user", "content": "Hello!"}}]), # messages_json
+ 512, # max_tokens
+ 0.8, # temperature
+ 0.9, # top_p
+ 40, # top_k
+ 1.1, # repetition_penalty
+ False # stream
+ ]
  }}
+
+ response = requests.post(url, json=payload)
+ print(response.json())
  ```
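
One caveat on the `/call/...` route, based on Gradio's documented REST protocol rather than this Space's code: on recent Gradio versions the exchange is two-step. The POST returns only an `event_id`, and the output is then read from a follow-up GET as a server-sent event stream. If the single POST above returns no choices, a sketch of the two-step form:

```python
import requests
import json

url = "https://YOUR-SPACE.hf.space/call/chat_completions"
payload = {"data": [json.dumps([{"role": "user", "content": "Hello!"}]),
                    512, 0.8, 0.9, 40, 1.1, False]}

# Step 1: the POST only returns an event id
event_id = requests.post(url, json=payload).json()["event_id"]

# Step 2: the result is streamed back as server-sent events from a GET
with requests.get(f"{url}/{event_id}", stream=True) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data:"):
            print(line[5:].decode().strip())
```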
 
+ ## API Endpoints
+
+ ### Chat Completions
+ - **API Name**: `/chat_completions`
+ - **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`
+
+ **Parameters:**
+ 1. `messages_json` (str): JSON string of messages array
+ 2. `max_tokens` (int): Maximum tokens to generate (50-1024)
+ 3. `temperature` (float): Sampling temperature (0.1-2.0)
+ 4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
+ 5. `top_k` (int): Top-K sampling (1-100)
+ 6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
+ 7. `stream` (bool): Stream response (UI only, not functional)
+
+ ### Text Completions
+ - **API Name**: `/text_completions`
+ - **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`
+
+ **Parameters:**
+ 1. `prompt` (str): Text prompt
+ 2. `max_tokens` (int): Maximum tokens to generate
+ 3. `temperature` (float): Sampling temperature
+ 4. `top_p` (float): Nucleus sampling threshold
+ 5. `top_k` (int): Top-K sampling
+ 6. `repetition_penalty` (float): Penalty for repetition
+ 7. `stream` (bool): Stream response (UI only)
+
+ ## Response Format
+
+ **Chat Completion Response:**
  ```json
  {{
+ "id": "chatcmpl-1234567890",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
 
  "index": 0,
  "message": {{
  "role": "assistant",
+ "content": "Response text here"
  }},
  "finish_reason": "stop"
  }}],
 
  }}
  ```
 
+ **Text Completion Response:**
  ```json
  {{
+ "id": "cmpl-1234567890",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
+ "text": "Completion text here",
  "index": 0,
  "finish_reason": "stop"
  }}],
 
  "prompt_tokens": 5,
  "completion_tokens": 15,
  "total_tokens": 20
+ }},
+ "stats": {{
+ "elapsed_sec": 1.2,
+ "tokens_per_sec": 12.5
  }}
  }}
  ```
 
+ ## Complete Example Script
 
+ ```python
+ #!/usr/bin/env python3
+ """
+ SAM-Z-1 API Client Example
+ """
+ from gradio_client import Client
+ import json
+
+ # Initialize client
+ client = Client("https://YOUR-SPACE.hf.space")
+
+ def chat(message, history=[]):
+ \"\"\"Send a chat message\"\"\"
+ messages = history + [{{"role": "user", "content": message}}]
+
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
+ )
+
+ response = json.loads(result)
+ assistant_msg = response["choices"][0]["message"]["content"]
+
+ # Update history
+ history.append({{"role": "user", "content": message}})
+ history.append({{"role": "assistant", "content": assistant_msg}})
+
+ return assistant_msg, history
+
+ def complete(prompt):
+ \"\"\"Complete text\"\"\"
+ result = client.predict(
+ prompt=prompt,
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
+ )
+
+ response = json.loads(result)
+ return response["choices"][0]["text"]
+
+ # Example usage
+ if __name__ == "__main__":
+ # Chat example
+ print("=== Chat Example ===")
+ history = []
+
+ response, history = chat("Hello! Who are you?", history)
+ print(f"Assistant: {{response}}\\n")
+
+ response, history = chat("What can you help me with?", history)
+ print(f"Assistant: {{response}}\\n")
+
+ # Text completion example
+ print("\\n=== Text Completion Example ===")
+ completion = complete("Once upon a time in a distant galaxy")
+ print(f"Completion: {{completion}}")
+ ```
 
+ ## Parameters Guide
 
+ ### Temperature (0.1 - 2.0)
+ - **Low (0.1-0.5)**: More focused, deterministic, factual
+ - **Medium (0.6-0.9)**: Balanced creativity and coherence
+ - **High (1.0-2.0)**: More creative, diverse, unpredictable
 
+ ### Top-P (0.1 - 1.0)
+ - Controls diversity via nucleus sampling
+ - **0.9** (default): Good balance
+ - Lower values = more focused
+ - Higher values = more diverse
 
+ ### Top-K (1 - 100)
+ - Limits vocabulary to top K tokens
+ - **40** (default): Good balance
+ - Lower values = more focused
+ - Higher values = more diverse
 
+ ### Repetition Penalty (1.0 - 2.0)
+ - **1.0**: No penalty
+ - **1.1** (default): Slight penalty
+ - **1.5+**: Strong penalty (use if model repeats)
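
These knobs are applied to the same logit vector before a token is drawn. A compact sketch of how temperature, top-k, and top-p conventionally compose (illustrative NumPy only; the diff does not show the internals of the Space's generate_tokens):

```python
import numpy as np

def sample_token(logits: np.ndarray, temperature: float = 0.8,
                 top_k: int = 40, top_p: float = 0.9) -> int:
    logits = logits / temperature              # <1 sharpens, >1 flattens the distribution
    cand = np.argsort(logits)[-top_k:]         # top-k: keep the k best candidates
    p = np.exp(logits[cand] - logits[cand].max())
    p /= p.sum()
    order = np.argsort(p)[::-1]                # top-p: smallest set with mass >= top_p
    keep = order[np.cumsum(p[order]) - p[order] < top_p]
    p = p[keep] / p[keep].sum()
    return int(np.random.choice(cand[keep], p=p))
```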
 
  ## Rate Limits & Performance
 
+ - **Concurrent Requests**: Supported via Gradio queue
  - **Average Speed**: 10-20 tokens/sec on CPU
  - **Context Window**: {config['max_position_embeddings']} tokens
+ - **Queue Size**: Up to 20 concurrent requests
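
In Gradio, that queue limit is normally set when the queue is enabled; a sketch of the usual launch wiring (the actual launch call sits outside the hunks shown in this diff):

```python
# Cap the number of waiting requests before launching the app.
demo.queue(max_size=20)
demo.launch()
```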
 
  ## Error Handling
 
  ```python
+ try:
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
  )
+ response = json.loads(result)
 
+ if "error" in response:
+ print(f"API Error: {{response['error']}}")
+ else:
+ print(response["choices"][0]["message"]["content"])
+
+ except Exception as e:
+ print(f"Request failed: {{e}}")
  ```
 
  ## Troubleshooting
 
+ **Connection Issues:**
+ - Verify Space URL is correct
+ - Check if Space is running
+ - Ensure gradio_client is installed
+
+ **Slow Responses:**
  - Reduce `max_tokens`
  - Lower `top_k` value
+ - Use shorter prompts
 
+ **Repetitive Output:**
  - Increase `repetition_penalty` (try 1.2-1.5)
  - Adjust `temperature` higher
  - Use `top_p` sampling
 
+ **Incoherent Output:**
  - Lower `temperature` (try 0.5-0.7)
  - Reduce `top_k` (try 20-30)
+ - Ensure prompt is clear and well-formatted
+
+ ## Chat Template Format
+
+ The model uses ChatML format:
+ ```
+ <|im_start|>system
+ System message here<|im_end|>
+ <|im_start|>user
+ User message here<|im_end|>
+ <|im_start|>assistant
+ Assistant response here<|im_end|>
+ ```
+
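A small helper makes the template above concrete: render a messages list into ChatML, leaving the assistant header open so generation continues from it (a sketch consistent with the format shown, not code from app.py):

```python
def build_chatml_prompt(messages: list) -> str:
    """Render [{'role': ..., 'content': ...}, ...] as a ChatML prompt."""
    parts = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in messages]
    parts.append("<|im_start|>assistant\n")  # left open for the model to fill
    return "\n".join(parts)

# A system turn is how "use system messages to set behavior" is expressed:
prompt = build_chatml_prompt([
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello!"},
])
```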
+ ## Tips for Best Results
+
+ 1. **Use clear, specific prompts**
+ 2. **Lower temperature for factual tasks**
+ 3. **Higher temperature for creative tasks**
+ 4. **Adjust repetition penalty if model repeats phrases**
+ 5. **Keep context under {config['max_position_embeddings']} tokens**
+ 6. **Use system messages to set behavior**
+
+ ## Model Capabilities
+
+ ✅ General conversation
+ ✅ Question answering
+ ✅ Code generation
+ ✅ Creative writing
+ ✅ Text completion
+ ✅ Instruction following
+
+ ❌ Does NOT use reasoning tokens (`<think>` tags)
+ ❌ Not fine-tuned for specific domains
 
  ---
 
+ **Model**: SAM-Z-1 | **API Version**: 1.0
  **Support**: Open an issue on the Space for bugs or questions
  """)
 
+ # ========== API Routes - MUST USE api_name parameter ==========
  chat_btn.click(
  fn=chat_completion_api,
  inputs=[
  messages_input, chat_max_tokens, chat_temperature,
  chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
  ],
+ outputs=[chat_output],
+ api_name="chat_completions" # This creates /call/chat_completions endpoint
  )
 
  text_btn.click(
  fn=text_completion_api,
  inputs=[
  prompt_input, text_max_tokens, text_temperature,
  text_top_p, text_top_k, text_rep_penalty, text_stream
  ],
+ outputs=[text_output],
+ api_name="text_completions" # This creates /call/text_completions endpoint
  )
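
Once `api_name` is set on both click handlers, the published endpoints can be verified from the client side; `gradio_client` can print the signatures it sees (Space URL is a placeholder):

```python
from gradio_client import Client

client = Client("https://YOUR-SPACE.hf.space")
client.view_api()  # should list /chat_completions and /text_completions with their parameters
```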
 
  # Launch