"""
SAM-Z-1 Production API with Gradio UI
OpenAI-compatible API interface for Hugging Face Spaces
"""
import gradio as gr
import tensorflow as tf
import keras
from huggingface_hub import hf_hub_download
import json
import os
from tokenizers import Tokenizer
import numpy as np
import time
from typing import Dict, Any, List
# ============================================================================
# Configuration
# ============================================================================
MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
CACHE_DIR = "./model_cache"
# Global model storage
model = None
tokenizer = None
config = None
eos_token_id = None
# ============================================================================
# Model Architecture (must match the training-time definition so the weights load)
# ============================================================================
@keras.saving.register_keras_serializable()
class RotaryEmbedding(keras.layers.Layer):
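    """Rotary position embedding (RoPE): caches cos/sin tables up to max_len and
    rotates query/key vectors by position-dependent angles."""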
def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
super().__init__(**kwargs)
self.dim = dim
self.max_len = max_len
self.theta = theta
self.built_cache = False
def build(self, input_shape):
super().build(input_shape)
def _build_cache(self):
if not self.built_cache:
inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
t = tf.range(self.max_len, dtype=tf.float32)
freqs = tf.einsum("i,j->ij", t, inv_freq)
emb = tf.concat([freqs, freqs], axis=-1)
            self.cos_cached = tf.cos(emb)
            self.sin_cached = tf.sin(emb)
self.built_cache = True
def rotate_half(self, x):
x1, x2 = tf.split(x, 2, axis=-1)
return tf.concat([-x2, x1], axis=-1)
def call(self, q, k):
self._build_cache()
seq_len = tf.shape(q)[2]
dtype = q.dtype
cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
q_rotated = (q * cos) + (self.rotate_half(q) * sin)
k_rotated = (k * cos) + (self.rotate_half(k) * sin)
return q_rotated, k_rotated
def get_config(self):
config = super().get_config()
config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
return config
@keras.saving.register_keras_serializable()
class RMSNorm(keras.layers.Layer):
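    """RMS LayerNorm: scales by 1/RMS(x) with a learned per-channel gain (no mean centering)."""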
def __init__(self, epsilon=1e-5, **kwargs):
super().__init__(**kwargs)
self.epsilon = epsilon
def build(self, input_shape):
self.scale = self.add_weight(name="scale", shape=(input_shape[-1],), initializer="ones")
def call(self, x):
variance = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
return x * tf.math.rsqrt(variance + self.epsilon) * self.scale
def get_config(self):
config = super().get_config()
config.update({"epsilon": self.epsilon})
return config
@keras.saving.register_keras_serializable()
class TransformerBlock(keras.layers.Layer):
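    """Pre-norm decoder block: RMSNorm -> causal multi-head attention with RoPE,
    then RMSNorm -> SwiGLU feed-forward (silu(gate) * up -> down), each with a residual."""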
def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
super().__init__(**kwargs)
self.d_model = d_model
self.n_heads = n_heads
self.ff_dim = ff_dim
self.dropout_rate = dropout
self.max_len = max_len
self.rope_theta = rope_theta
self.head_dim = d_model // n_heads
self.layer_idx = layer_idx
self.pre_attn_norm = RMSNorm()
self.pre_ffn_norm = RMSNorm()
self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
self.dropout = keras.layers.Dropout(dropout)
def call(self, x, training=None):
B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
dtype = x.dtype
res = x
y = self.pre_attn_norm(x)
q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
q, k = self.rope(q, k)
scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
mask = tf.where(
tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
tf.constant(-1e9, dtype=dtype),
tf.constant(0.0, dtype=dtype)
)
scores += mask
attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
x = res + self.dropout(self.out_proj(attn), training=training)
res = x
y = self.pre_ffn_norm(x)
ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
return res + self.dropout(ffn, training=training)
def get_config(self):
config = super().get_config()
config.update({
"d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
"dropout": self.dropout_rate, "max_len": self.max_len,
"rope_theta": self.rope_theta, "layer_idx": self.layer_idx
})
return config
@keras.saving.register_keras_serializable()
class SAM1Model(keras.Model):
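    """Decoder-only LM: token embedding -> n_layers TransformerBlocks -> final RMSNorm -> LM head."""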
def __init__(self, **kwargs):
super().__init__()
if 'config' in kwargs and isinstance(kwargs['config'], dict):
self.cfg = kwargs['config']
elif 'vocab_size' in kwargs:
self.cfg = kwargs
else:
self.cfg = kwargs.get('cfg', kwargs)
self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
block_args = {
'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
}
self.blocks = [TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
for i in range(self.cfg['n_layers'])]
self.norm = RMSNorm(name="final_norm")
self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
def call(self, input_ids, training=None):
x = self.embed(input_ids)
for block in self.blocks:
x = block(x, training=training)
return self.lm_head(self.norm(x))
def get_config(self):
base_config = super().get_config()
base_config['config'] = self.cfg
return base_config
# ============================================================================
# Model Loading
# ============================================================================
print("πŸš€ Loading SAM-Z-1 Model for API...")
config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
try:
weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
use_checkpoint = True
print("βœ… Found checkpoint weights")
except Exception:
model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
use_checkpoint = False
print("βœ… Found saved model")
with open(config_path, 'r') as f:
config = json.load(f)
eos_token_id = config.get('eos_token_id', 50256)
# Create tokenizer
print("πŸ“¦ Creating tokenizer...")
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_tokenizer.add_special_tokens({
"additional_special_tokens": ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
})
os.makedirs("./temp_tokenizer", exist_ok=True)
hf_tokenizer.save_pretrained("./temp_tokenizer")
tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
# Load model
if use_checkpoint:
print("πŸ“¦ Building model and loading weights...")
model_config = {
'vocab_size': config['vocab_size'],
'd_model': config['hidden_size'],
'n_layers': config['num_hidden_layers'],
'n_heads': config['num_attention_heads'],
'ff_mult': config['intermediate_size'] / config['hidden_size'],
'max_len': config['max_position_embeddings'],
'dropout': 0.1,
'rope_theta': config['rope_theta']
}
model = SAM1Model(config=model_config)
dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
_ = model(dummy_input, training=False)
model.load_weights(weights_path)
else:
model = keras.models.load_model(model_path, compile=False)
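# Compile the forward pass as a tf.function; reduce_retracing=True keeps TensorFlow
# from re-tracing the graph every time the (growing) input sequence length changes.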
@tf.function(reduce_retracing=True)
def fast_forward(input_tensor):
return model(input_tensor, training=False)
print(f"βœ… Model loaded: {config['num_hidden_layers']} layers, ~313M params")
# ============================================================================
# Generation Engine
# ============================================================================
def generate_tokens(
input_ids: List[int],
max_tokens: int = 512,
temperature: float = 0.8,
top_k: int = 40,
top_p: float = 0.9,
repetition_penalty: float = 1.1
):
"""Generator that yields tokens one at a time"""
if len(input_ids) > config['max_position_embeddings'] - max_tokens:
input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
input_tensor = tf.constant([input_ids], dtype=tf.int32)
token_freq = {}
for step in range(max_tokens):
logits = fast_forward(input_tensor)
next_token_logits = logits[0, -1, :].numpy()
# Temperature
next_token_logits = next_token_logits / temperature
        # Repetition penalty (CTRL-style): divide positive logits and multiply
        # negative ones, so repeated tokens always become less likely
        if repetition_penalty != 1.0:
            for token_id, freq in token_freq.items():
                if token_id < len(next_token_logits):
                    penalty = repetition_penalty ** freq
                    if next_token_logits[token_id] > 0:
                        next_token_logits[token_id] /= penalty
                    else:
                        next_token_logits[token_id] *= penalty
# Top-k filtering
if top_k > 0:
top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
top_k_logits = next_token_logits[top_k_indices]
top_k_probs = tf.nn.softmax(top_k_logits).numpy()
# Top-p sampling
if top_p < 1.0:
sorted_indices = np.argsort(top_k_probs)[::-1]
cumsum = np.cumsum(top_k_probs[sorted_indices])
cutoff_idx = np.searchsorted(cumsum, top_p)
nucleus_indices = sorted_indices[:cutoff_idx + 1]
nucleus_logits = top_k_logits[nucleus_indices]
nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
else:
sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
next_token_id = int(top_k_indices[sampled_idx])
else:
probs = tf.nn.softmax(next_token_logits).numpy()
            next_token_id = int(np.random.choice(len(probs), p=probs))
if next_token_id == eos_token_id:
break
token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
yield next_token_id
input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
if input_tensor.shape[1] > config['max_position_embeddings']:
input_tensor = input_tensor[:, -config['max_position_embeddings']:]
# ============================================================================
# API Functions (exposed as Gradio endpoints)
# ============================================================================
def chat_completion_api(
messages_json: str,
max_tokens: int,
temperature: float,
top_p: float,
top_k: int,
repetition_penalty: float,
stream: bool
) -> str:
"""OpenAI-style chat completion API"""
try:
messages = json.loads(messages_json)
# Format messages
prompt = ""
for msg in messages:
role = msg.get("role", "user")
content = msg.get("content", "")
if role == "system":
prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
elif role == "user":
prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
elif role == "assistant":
prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
prompt += "<|im_start|>assistant\n"
# Tokenize
input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
start_time = time.time()
token_count = 0
response_text = ""
for token_id in generate_tokens(
input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
):
token_text = tokenizer.decode([token_id])
response_text += token_text
token_count += 1
if "<|im_end|>" in response_text:
response_text = response_text.split("<|im_end|>")[0]
break
elapsed = time.time() - start_time
result = {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "sam-z-1",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": response_text.strip()
},
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": len(input_ids),
"completion_tokens": token_count,
"total_tokens": len(input_ids) + token_count
},
"stats": {
"elapsed_sec": round(elapsed, 2),
"tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
}
}
return json.dumps(result, indent=2)
except Exception as e:
return json.dumps({"error": str(e)}, indent=2)
def text_completion_api(
prompt: str,
max_tokens: int,
temperature: float,
top_p: float,
top_k: int,
repetition_penalty: float,
stream: bool
) -> str:
"""OpenAI-style text completion API"""
try:
input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
start_time = time.time()
token_count = 0
response_text = ""
for token_id in generate_tokens(
input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
):
token_text = tokenizer.decode([token_id])
response_text += token_text
token_count += 1
elapsed = time.time() - start_time
result = {
"id": f"cmpl-{int(time.time())}",
"object": "text_completion",
"created": int(time.time()),
"model": "sam-z-1",
"choices": [{
"text": response_text,
"index": 0,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": len(input_ids),
"completion_tokens": token_count,
"total_tokens": len(input_ids) + token_count
},
"stats": {
"elapsed_sec": round(elapsed, 2),
"tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
}
}
return json.dumps(result, indent=2)
except Exception as e:
return json.dumps({"error": str(e)}, indent=2)
# ============================================================================
# Gradio UI with API Routes
# ============================================================================
custom_css = """
.api-container {
max-width: 1400px;
margin: auto;
}
.header {
text-align: center;
padding: 2rem;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 12px;
margin-bottom: 2rem;
}
.endpoint-card {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 8px;
border-left: 4px solid #667eea;
margin: 1rem 0;
}
"""
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
gr.HTML("""
<div class="header">
<h1>πŸš€ SAM-Z-1 API Server</h1>
<p>OpenAI-Compatible API for SAM-Z-1 Language Model</p>
<p style="font-size: 0.9rem; opacity: 0.9;">
313M Parameters β€’ 768D β€’ 16 Layers β€’ TensorFlow Optimized
</p>
</div>
""")
with gr.Tabs():
# ========== Chat Completion Tab ==========
with gr.Tab("πŸ’¬ Chat Completion"):
gr.Markdown("""
### Chat Completions API
OpenAI-compatible chat completion endpoint
""")
with gr.Row():
with gr.Column(scale=1):
messages_input = gr.Code(
label="Messages (JSON)",
language="json",
value=json.dumps([
{"role": "user", "content": "Hello! Who are you?"}
], indent=2),
lines=10
)
with gr.Row():
chat_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
chat_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
with gr.Row():
chat_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    chat_stream = gr.Checkbox(label="Stream Response (not implemented)", value=False)
chat_btn = gr.Button("πŸš€ Generate", variant="primary", size="lg")
with gr.Column(scale=1):
chat_output = gr.Code(
label="API Response (JSON)",
language="json",
lines=20
)
gr.Markdown("""
### Python Example with Gradio Client
```python
            from gradio_client import Client
            import json

            client = Client("YOUR-SPACE-URL")
messages = [
{"role": "user", "content": "Hello! Who are you?"}
]
result = client.predict(
messages_json=json.dumps(messages),
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/chat_completions"
)
print(result)
```
""")
# ========== Text Completion Tab ==========
with gr.Tab("πŸ“ Text Completion"):
gr.Markdown("""
### Text Completions API
OpenAI-compatible text completion endpoint
""")
with gr.Row():
with gr.Column(scale=1):
prompt_input = gr.Textbox(
label="Prompt",
placeholder="Once upon a time...",
lines=5
)
with gr.Row():
text_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
text_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
with gr.Row():
text_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    text_stream = gr.Checkbox(label="Stream Response (not implemented)", value=False)
text_btn = gr.Button("πŸš€ Generate", variant="primary", size="lg")
with gr.Column(scale=1):
text_output = gr.Code(
label="API Response (JSON)",
language="json",
lines=20
)
gr.Markdown("""
### Python Example with Gradio Client
```python
from gradio_client import Client
client = Client("YOUR-SPACE-URL")
result = client.predict(
prompt="Once upon a time",
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/text_completions"
)
print(result)
```
""")
# ========== Documentation Tab ==========
with gr.Tab("πŸ“– Documentation"):
gr.Markdown(f"""
# SAM-Z-1 API Documentation
## Model Information
- **Model**: SAM-Z-1 (Direct Response Model)
- **Parameters**: ~313M
- **Architecture**: Transformer with RoPE, SwiGLU, RMSNorm
- **Context Length**: {config['max_position_embeddings']} tokens
- **Vocabulary Size**: {config['vocab_size']}
## Using the API
### Method 1: Gradio Client (Recommended)
Install the Gradio client:
```bash
pip install gradio_client
```
**Chat Completion:**
```python
from gradio_client import Client
import json
client = Client("https://YOUR-SPACE.hf.space")
messages = [
{{"role": "user", "content": "What is Python?"}}
]
result = client.predict(
messages_json=json.dumps(messages),
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/chat_completions"
)
response = json.loads(result)
print(response["choices"][0]["message"]["content"])
```
**Text Completion:**
```python
result = client.predict(
prompt="Once upon a time",
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/text_completions"
)
response = json.loads(result)
print(response["choices"][0]["text"])
```
### Method 2: Direct HTTP Requests
**Chat Completion:**
```python
import requests
import json
url = "https://YOUR-SPACE.hf.space/call/chat_completions"
payload = {{
"data": [
json.dumps([{{"role": "user", "content": "Hello!"}}]), # messages_json
512, # max_tokens
0.8, # temperature
0.9, # top_p
40, # top_k
1.1, # repetition_penalty
False # stream
]
}}
response = requests.post(url, json=payload)
print(response.json())
```
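            Note: on recent Gradio versions (4.x), the POST above returns only an
            `event_id`, and the result must then be fetched with a follow-up GET that
            streams server-sent events. A minimal sketch of that two-step flow, assuming
            a Gradio 4.x Space (older versions may return the data directly):
            ```python
            resp = requests.post(url, json=payload)
            event_id = resp.json()["event_id"]  # assumption: Gradio 4.x /call response shape
            # Stream the result; the final payload arrives with the "complete" event
            with requests.get(f"{{url}}/{{event_id}}", stream=True) as stream:
                for line in stream.iter_lines():
                    if line:
                        print(line.decode())
            ```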
## API Endpoints
### Chat Completions
- **API Name**: `/chat_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`
**Parameters:**
1. `messages_json` (str): JSON string of messages array
2. `max_tokens` (int): Maximum tokens to generate (50-1024)
3. `temperature` (float): Sampling temperature (0.1-2.0)
4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
5. `top_k` (int): Top-K sampling (1-100)
6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
            7. `stream` (bool): Accepted for OpenAI-style compatibility; streaming is not implemented
### Text Completions
- **API Name**: `/text_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`
**Parameters:**
1. `prompt` (str): Text prompt
2. `max_tokens` (int): Maximum tokens to generate
3. `temperature` (float): Sampling temperature
4. `top_p` (float): Nucleus sampling threshold
5. `top_k` (int): Top-K sampling
6. `repetition_penalty` (float): Penalty for repetition
            7. `stream` (bool): Accepted for compatibility; streaming is not implemented
## Response Format
**Chat Completion Response:**
```json
{{
"id": "chatcmpl-1234567890",
"object": "chat.completion",
"created": 1234567890,
"model": "sam-z-1",
"choices": [{{
"index": 0,
"message": {{
"role": "assistant",
"content": "Response text here"
}},
"finish_reason": "stop"
}}],
"usage": {{
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}},
"stats": {{
"elapsed_sec": 1.5,
"tokens_per_sec": 13.3
}}
}}
```
**Text Completion Response:**
```json
{{
"id": "cmpl-1234567890",
"object": "text_completion",
"created": 1234567890,
"model": "sam-z-1",
"choices": [{{
"text": "Completion text here",
"index": 0,
"finish_reason": "stop"
}}],
"usage": {{
"prompt_tokens": 5,
"completion_tokens": 15,
"total_tokens": 20
}},
"stats": {{
"elapsed_sec": 1.2,
"tokens_per_sec": 12.5
}}
}}
```
## Complete Example Script
```python
#!/usr/bin/env python3
"""
SAM-Z-1 API Client Example
"""
from gradio_client import Client
import json
# Initialize client
client = Client("https://YOUR-SPACE.hf.space")
            def chat(message, history=None):
                \"\"\"Send a chat message (history defaults to a fresh list per call)\"\"\"
                if history is None:
                    history = []
messages = history + [{{"role": "user", "content": message}}]
result = client.predict(
messages_json=json.dumps(messages),
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/chat_completions"
)
response = json.loads(result)
assistant_msg = response["choices"][0]["message"]["content"]
# Update history
history.append({{"role": "user", "content": message}})
history.append({{"role": "assistant", "content": assistant_msg}})
return assistant_msg, history
def complete(prompt):
\"\"\"Complete text\"\"\"
result = client.predict(
prompt=prompt,
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/text_completions"
)
response = json.loads(result)
return response["choices"][0]["text"]
# Example usage
if __name__ == "__main__":
# Chat example
print("=== Chat Example ===")
history = []
response, history = chat("Hello! Who are you?", history)
print(f"Assistant: {{response}}\\n")
response, history = chat("What can you help me with?", history)
print(f"Assistant: {{response}}\\n")
# Text completion example
print("\\n=== Text Completion Example ===")
completion = complete("Once upon a time in a distant galaxy")
print(f"Completion: {{completion}}")
```
## Parameters Guide
### Temperature (0.1 - 2.0)
- **Low (0.1-0.5)**: More focused, deterministic, factual
- **Medium (0.6-0.9)**: Balanced creativity and coherence
- **High (1.0-2.0)**: More creative, diverse, unpredictable
### Top-P (0.1 - 1.0)
- Controls diversity via nucleus sampling
- **0.9** (default): Good balance
- Lower values = more focused
- Higher values = more diverse
### Top-K (1 - 100)
- Limits vocabulary to top K tokens
- **40** (default): Good balance
- Lower values = more focused
- Higher values = more diverse
### Repetition Penalty (1.0 - 2.0)
- **1.0**: No penalty
- **1.1** (default): Slight penalty
- **1.5+**: Strong penalty (use if model repeats)
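            Putting these together, here is a minimal NumPy sketch of a single sampling
            step in the same order this server applies them (temperature, then top-k,
            then nucleus filtering; repetition penalty omitted for brevity). The toy
            logits are made up purely for illustration:
            ```python
            import numpy as np

            logits = np.array([2.0, 1.0, 0.5, -1.0])   # toy next-token logits
            logits = logits / 0.8                       # temperature: <1 sharpens, >1 flattens
            k, top_p = 3, 0.9
            top_idx = np.argpartition(logits, -k)[-k:]  # top-k: keep the k highest logits
            probs = np.exp(logits[top_idx] - logits[top_idx].max())
            probs /= probs.sum()                        # softmax over the survivors
            order = np.argsort(probs)[::-1]
            keep = np.searchsorted(np.cumsum(probs[order]), top_p) + 1
            nucleus = order[:keep]                      # smallest set with mass >= top_p
            p = probs[nucleus] / probs[nucleus].sum()
            next_id = int(top_idx[nucleus[np.random.choice(len(p), p=p)]])
            ```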
## Rate Limits & Performance
- **Concurrent Requests**: Supported via Gradio queue
- **Average Speed**: 10-20 tokens/sec on CPU
- **Context Window**: {config['max_position_embeddings']} tokens
- **Queue Size**: Up to 20 concurrent requests
## Error Handling
```python
try:
result = client.predict(
messages_json=json.dumps(messages),
max_tokens=512,
temperature=0.8,
top_p=0.9,
top_k=40,
repetition_penalty=1.1,
stream=False,
api_name="/chat_completions"
)
response = json.loads(result)
if "error" in response:
print(f"API Error: {{response['error']}}")
else:
print(response["choices"][0]["message"]["content"])
except Exception as e:
print(f"Request failed: {{e}}")
```
## Troubleshooting
**Connection Issues:**
- Verify Space URL is correct
- Check if Space is running
- Ensure gradio_client is installed
**Slow Responses:**
- Reduce `max_tokens`
- Lower `top_k` value
- Use shorter prompts
**Repetitive Output:**
- Increase `repetition_penalty` (try 1.2-1.5)
- Adjust `temperature` higher
- Use `top_p` sampling
**Incoherent Output:**
- Lower `temperature` (try 0.5-0.7)
- Reduce `top_k` (try 20-30)
- Ensure prompt is clear and well-formatted
## Chat Template Format
The model uses ChatML format:
```
<|im_start|>system
System message here<|im_end|>
<|im_start|>user
User message here<|im_end|>
<|im_start|>assistant
Assistant response here<|im_end|>
```
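            Building that prompt from an OpenAI-style messages list is mechanical; this
            mirrors the formatting the server applies before tokenizing:
            ```python
            def to_chatml(messages):
                # messages is a list of {{"role": ..., "content": ...}} dicts
                prompt = ""
                for msg in messages:
                    prompt += f"<|im_start|>{{msg['role']}}\\n{{msg['content']}}<|im_end|>\\n"
                return prompt + "<|im_start|>assistant\\n"  # cue the model to respond
            ```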
## Tips for Best Results
1. **Use clear, specific prompts**
2. **Lower temperature for factual tasks**
3. **Higher temperature for creative tasks**
4. **Adjust repetition penalty if model repeats phrases**
5. **Keep context under {config['max_position_embeddings']} tokens**
6. **Use system messages to set behavior**
## Model Capabilities
βœ… General conversation
βœ… Question answering
βœ… Code generation
βœ… Creative writing
βœ… Text completion
βœ… Instruction following
❌ Does NOT use reasoning tokens (`<think>` tags)
❌ Not fine-tuned for specific domains
---
**Model**: SAM-Z-1 | **API Version**: 1.0
**Support**: Open an issue on the Space for bugs or questions
""")
    # ========== API Routes (api_name registers the /call/... endpoints) ==========
chat_btn.click(
fn=chat_completion_api,
inputs=[
messages_input, chat_max_tokens, chat_temperature,
chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
],
outputs=[chat_output],
api_name="chat_completions" # This creates /call/chat_completions endpoint
)
text_btn.click(
fn=text_completion_api,
inputs=[
prompt_input, text_max_tokens, text_temperature,
text_top_p, text_top_k, text_rep_penalty, text_stream
],
outputs=[text_output],
api_name="text_completions" # This creates /call/text_completions endpoint
)
# Launch
if __name__ == "__main__":
demo.queue(max_size=20)
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True
)