| """ | |
| SAM-Z-1 Production API with Gradio UI | |
| OpenAI-compatible API interface for Hugging Face Spaces | |
| """ | |
| import gradio as gr | |
| import tensorflow as tf | |
| import keras | |
| from huggingface_hub import hf_hub_download | |
| import json | |
| import os | |
| from tokenizers import Tokenizer | |
| import numpy as np | |
| import time | |
| from typing import Dict, Any, List | |
# ============================================================================
# Configuration
# ============================================================================
MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
CACHE_DIR = "./model_cache"

# Global model storage
model = None
tokenizer = None
config = None
eos_token_id = None

# ============================================================================
# Model Architecture (same as original)
# ============================================================================
class RotaryEmbedding(keras.layers.Layer):
    def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.max_len = max_len
        self.theta = theta
        self.built_cache = False

    def build(self, input_shape):
        super().build(input_shape)

    def _build_cache(self):
        # Lazily precompute cos/sin tables for every position up to max_len
        if not self.built_cache:
            inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
            t = tf.range(self.max_len, dtype=tf.float32)
            freqs = tf.einsum("i,j->ij", t, inv_freq)
            emb = tf.concat([freqs, freqs], axis=-1)
            self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
            self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
            self.built_cache = True

    def rotate_half(self, x):
        x1, x2 = tf.split(x, 2, axis=-1)
        return tf.concat([-x2, x1], axis=-1)

    def call(self, q, k):
        self._build_cache()
        seq_len = tf.shape(q)[2]
        dtype = q.dtype
        cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
        sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
        q_rotated = (q * cos) + (self.rotate_half(q) * sin)
        k_rotated = (k * cos) + (self.rotate_half(k) * sin)
        return q_rotated, k_rotated

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
        return config

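
# Illustrative sketch (not part of the serving path): RotaryEmbedding rotates
# query/key tensors shaped [batch, heads, seq, head_dim]. Sizes below are
# hypothetical; call manually as a sanity check.
def _demo_rope():
    rope = RotaryEmbedding(dim=64, max_len=128)
    q = tf.random.normal([1, 4, 16, 64])  # [B, n_heads, T, head_dim]
    k = tf.random.normal([1, 4, 16, 64])
    q_rot, k_rot = rope(q, k)
    # A rotation preserves per-position vector norms (up to float error)
    tf.debugging.assert_near(tf.norm(q, axis=-1), tf.norm(q_rot, axis=-1), atol=1e-3)
    return q_rot.shape, k_rot.shape
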
class RMSNorm(keras.layers.Layer):
    def __init__(self, epsilon=1e-5, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        self.scale = self.add_weight(name="scale", shape=(input_shape[-1],), initializer="ones")

    def call(self, x):
        variance = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
        return x * tf.math.rsqrt(variance + self.epsilon) * self.scale

    def get_config(self):
        config = super().get_config()
        config.update({"epsilon": self.epsilon})
        return config

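
# Illustrative sketch: RMSNorm rescales each feature vector to unit RMS and then
# applies the learned per-channel scale (ones at init). Hypothetical sizes.
def _demo_rmsnorm():
    norm = RMSNorm()
    x = tf.random.normal([2, 8, 16]) * 5.0
    y = norm(x)
    # RMS of the output is ~1.0 everywhere right after initialization
    return tf.sqrt(tf.reduce_mean(tf.square(y), axis=-1))
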
class TransformerBlock(keras.layers.Layer):
    def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout
        self.max_len = max_len
        self.rope_theta = rope_theta
        self.head_dim = d_model // n_heads
        self.layer_idx = layer_idx
        self.pre_attn_norm = RMSNorm()
        self.pre_ffn_norm = RMSNorm()
        self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
        self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
        self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
        self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
        self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
        self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
        self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
        self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, x, training=None):
        B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
        dtype = x.dtype
        # Pre-norm causal self-attention with rotary position embeddings
        res = x
        y = self.pre_attn_norm(x)
        q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        q, k = self.rope(q, k)
        scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
        # Causal mask: each position may attend only to itself and the past
        mask = tf.where(
            tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
            tf.constant(-1e9, dtype=dtype),
            tf.constant(0.0, dtype=dtype)
        )
        scores += mask
        attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
        attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
        x = res + self.dropout(self.out_proj(attn), training=training)
        # Pre-norm SwiGLU feed-forward network
        res = x
        y = self.pre_ffn_norm(x)
        ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
        return res + self.dropout(ffn, training=training)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
            "dropout": self.dropout_rate, "max_len": self.max_len,
            "rope_theta": self.rope_theta, "layer_idx": self.layer_idx
        })
        return config

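
# Illustrative sketch: one block maps [B, T, d_model] -> [B, T, d_model].
# Hypothetical sizes; useful as a quick shape smoke test.
def _demo_block():
    block = TransformerBlock(d_model=32, n_heads=4, ff_dim=128, dropout=0.0,
                             max_len=64, rope_theta=10000)
    x = tf.random.normal([2, 10, 32])
    return block(x, training=False).shape  # (2, 10, 32)
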
class SAM1Model(keras.Model):
    def __init__(self, **kwargs):
        super().__init__()
        # Accept either a nested 'config' dict or flat keyword arguments
        if 'config' in kwargs and isinstance(kwargs['config'], dict):
            self.cfg = kwargs['config']
        elif 'vocab_size' in kwargs:
            self.cfg = kwargs
        else:
            self.cfg = kwargs.get('cfg', kwargs)
        self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
        ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
        block_args = {
            'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
            'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
            'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
        }
        self.blocks = [TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
                       for i in range(self.cfg['n_layers'])]
        self.norm = RMSNorm(name="final_norm")
        self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

    def call(self, input_ids, training=None):
        x = self.embed(input_ids)
        for block in self.blocks:
            x = block(x, training=training)
        return self.lm_head(self.norm(x))

    def get_config(self):
        base_config = super().get_config()
        base_config['config'] = self.cfg
        return base_config

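
# Illustrative sketch: end-to-end shape check with a tiny made-up config. The
# real config is loaded from the Hub below; none of these numbers come from it.
def _demo_sam1_forward():
    tiny = {'vocab_size': 100, 'd_model': 32, 'n_heads': 4, 'ff_mult': 4.0,
            'max_len': 64, 'dropout': 0.0, 'rope_theta': 10000, 'n_layers': 2}
    m = SAM1Model(config=tiny)
    logits = m(tf.zeros([1, 8], dtype=tf.int32), training=False)
    return logits.shape  # (1, 8, 100): per-position next-token logits
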
# ============================================================================
# Model Loading
# ============================================================================
print("🚀 Loading SAM-Z-1 Model for API...")
config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)

try:
    weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
    use_checkpoint = True
    print("✅ Found checkpoint weights")
except Exception:
    model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
    use_checkpoint = False
    print("✅ Found saved model")

with open(config_path, 'r') as f:
    config = json.load(f)
eos_token_id = config.get('eos_token_id', 50256)

# Create tokenizer
print("📦 Creating tokenizer...")
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
})
os.makedirs("./temp_tokenizer", exist_ok=True)
hf_tokenizer.save_pretrained("./temp_tokenizer")
tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

# Load model
if use_checkpoint:
    print("📦 Building model and loading weights...")
    model_config = {
        'vocab_size': config['vocab_size'],
        'd_model': config['hidden_size'],
        'n_layers': config['num_hidden_layers'],
        'n_heads': config['num_attention_heads'],
        'ff_mult': config['intermediate_size'] / config['hidden_size'],
        'max_len': config['max_position_embeddings'],
        'dropout': 0.1,
        'rope_theta': config['rope_theta']
    }
    model = SAM1Model(config=model_config)
    # Build the variables with a dummy forward pass before loading weights
    dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
    _ = model(dummy_input, training=False)
    model.load_weights(weights_path)
else:
    model = keras.models.load_model(model_path, compile=False)

def fast_forward(input_tensor):
    return model(input_tensor, training=False)

print(f"✅ Model loaded: {config['num_hidden_layers']} layers, ~313M params")

# ============================================================================
# Generation Engine
# ============================================================================
def generate_tokens(
    input_ids: List[int],
    max_tokens: int = 512,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1
):
    """Generator that yields token ids one at a time."""
    # Leave room in the context window for the tokens we are about to generate
    if len(input_ids) > config['max_position_embeddings'] - max_tokens:
        input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
    input_tensor = tf.constant([input_ids], dtype=tf.int32)
    token_freq = {}
    for step in range(max_tokens):
        logits = fast_forward(input_tensor)
        next_token_logits = logits[0, -1, :].numpy()
        # Temperature
        next_token_logits = next_token_logits / temperature
        # Repetition penalty: divide positive logits and multiply negative ones,
        # so repeated tokens are always pushed down (CTRL-style)
        if repetition_penalty != 1.0:
            for token_id, freq in token_freq.items():
                if token_id < len(next_token_logits):
                    penalty = repetition_penalty ** freq
                    if next_token_logits[token_id] > 0:
                        next_token_logits[token_id] /= penalty
                    else:
                        next_token_logits[token_id] *= penalty
        # Top-k filtering
        if top_k > 0:
            top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
            top_k_logits = next_token_logits[top_k_indices]
            top_k_probs = tf.nn.softmax(top_k_logits).numpy()
            # Top-p (nucleus) sampling within the top-k candidates
            if top_p < 1.0:
                sorted_indices = np.argsort(top_k_probs)[::-1]
                cumsum = np.cumsum(top_k_probs[sorted_indices])
                cutoff_idx = np.searchsorted(cumsum, top_p)
                nucleus_indices = sorted_indices[:cutoff_idx + 1]
                nucleus_logits = top_k_logits[nucleus_indices]
                nucleus_probs = tf.nn.softmax(nucleus_logits).numpy().astype(np.float64)
                nucleus_probs /= nucleus_probs.sum()  # renormalize for np.random.choice
                sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
                next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
            else:
                top_k_probs = top_k_probs.astype(np.float64)
                top_k_probs /= top_k_probs.sum()
                sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
                next_token_id = int(top_k_indices[sampled_idx])
        else:
            probs = tf.nn.softmax(next_token_logits).numpy().astype(np.float64)
            probs /= probs.sum()
            next_token_id = int(np.random.choice(len(probs), p=probs))
        if next_token_id == eos_token_id:
            break
        token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
        yield next_token_id
        input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
        # Slide the window if generation exceeds the model's context length
        if input_tensor.shape[1] > config['max_position_embeddings']:
            input_tensor = input_tensor[:, -config['max_position_embeddings']:]

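
# Illustrative usage sketch (assumes the model, tokenizer, and config above are
# loaded): stream tokens for a raw prompt and print them as they arrive. Not
# called anywhere in the app; invoke manually when debugging.
def _demo_generate(prompt: str = "Once upon a time"):
    ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
    for token_id in generate_tokens(ids, max_tokens=32, temperature=0.8):
        print(tokenizer.decode([token_id]), end="", flush=True)
    print()
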
# ============================================================================
# API Functions - FIXED FOR GRADIO
# ============================================================================
def chat_completion_api(
    messages_json: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
    stream: bool
) -> str:
    """OpenAI-style chat completion API"""
    try:
        messages = json.loads(messages_json)
        # Format messages
        prompt = ""
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if role == "system":
                prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
            elif role == "user":
                prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
            elif role == "assistant":
                prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        # Tokenize
        input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
        start_time = time.time()
        token_count = 0
        response_text = ""
        for token_id in generate_tokens(
            input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
        ):
            token_text = tokenizer.decode([token_id])
            response_text += token_text
            token_count += 1
            if "<|im_end|>" in response_text:
                response_text = response_text.split("<|im_end|>")[0]
                break
        elapsed = time.time() - start_time
        result = {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "sam-z-1",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text.strip()
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": token_count,
                "total_tokens": len(input_ids) + token_count
            },
            "stats": {
                "elapsed_sec": round(elapsed, 2),
                "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
            }
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        return json.dumps({"error": str(e)}, indent=2)

def text_completion_api(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
    stream: bool
) -> str:
    """OpenAI-style text completion API"""
    try:
        input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
        start_time = time.time()
        token_count = 0
        response_text = ""
        for token_id in generate_tokens(
            input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
        ):
            token_text = tokenizer.decode([token_id])
            response_text += token_text
            token_count += 1
        elapsed = time.time() - start_time
        result = {
            "id": f"cmpl-{int(time.time())}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": "sam-z-1",
            "choices": [{
                "text": response_text,
                "index": 0,
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": token_count,
                "total_tokens": len(input_ids) + token_count
            },
            "stats": {
                "elapsed_sec": round(elapsed, 2),
                "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
            }
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        return json.dumps({"error": str(e)}, indent=2)

# ============================================================================
# Gradio UI with API Routes
# ============================================================================
custom_css = """
.api-container {
    max-width: 1400px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 2rem;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 12px;
    margin-bottom: 2rem;
}
.endpoint-card {
    background: #f8f9fa;
    padding: 1.5rem;
    border-radius: 8px;
    border-left: 4px solid #667eea;
    margin: 1rem 0;
}
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
    gr.HTML("""
    <div class="header">
        <h1>🚀 SAM-Z-1 API Server</h1>
        <p>OpenAI-Compatible API for SAM-Z-1 Language Model</p>
        <p style="font-size: 0.9rem; opacity: 0.9;">
            313M Parameters • 768D • 16 Layers • TensorFlow Optimized
        </p>
    </div>
    """)

    with gr.Tabs():
        # ========== Chat Completion Tab ==========
        with gr.Tab("💬 Chat Completion"):
            gr.Markdown("""
### Chat Completions API
OpenAI-compatible chat completion endpoint
""")
            with gr.Row():
                with gr.Column(scale=1):
                    messages_input = gr.Code(
                        label="Messages (JSON)",
                        language="json",
                        value=json.dumps([
                            {"role": "user", "content": "Hello! Who are you?"}
                        ], indent=2),
                        lines=10
                    )
                    with gr.Row():
                        chat_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
                        chat_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
                    with gr.Row():
                        chat_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
                        chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
                    chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    chat_stream = gr.Checkbox(label="Stream Response (not implemented in UI)", value=False)
                    chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    chat_output = gr.Code(
                        label="API Response (JSON)",
                        language="json",
                        lines=20
                    )
            gr.Markdown("""
### Python Example with Gradio Client
```python
from gradio_client import Client
import json

client = Client("YOUR-SPACE-URL")
messages = [
    {"role": "user", "content": "Hello! Who are you?"}
]
result = client.predict(
    messages_json=json.dumps(messages),
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/chat_completions"
)
print(result)
```
""")
        # ========== Text Completion Tab ==========
        with gr.Tab("📝 Text Completion"):
            gr.Markdown("""
### Text Completions API
OpenAI-compatible text completion endpoint
""")
            with gr.Row():
                with gr.Column(scale=1):
                    prompt_input = gr.Textbox(
                        label="Prompt",
                        placeholder="Once upon a time...",
                        lines=5
                    )
                    with gr.Row():
                        text_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
                        text_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
                    with gr.Row():
                        text_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
                        text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
                    text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    text_stream = gr.Checkbox(label="Stream Response (not implemented in UI)", value=False)
                    text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    text_output = gr.Code(
                        label="API Response (JSON)",
                        language="json",
                        lines=20
                    )
            gr.Markdown("""
### Python Example with Gradio Client
```python
from gradio_client import Client

client = Client("YOUR-SPACE-URL")
result = client.predict(
    prompt="Once upon a time",
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/text_completions"
)
print(result)
```
""")
        # ========== Documentation Tab ==========
        with gr.Tab("📚 Documentation"):
            gr.Markdown(f"""
# SAM-Z-1 API Documentation

## Model Information
- **Model**: SAM-Z-1 (Direct Response Model)
- **Parameters**: ~313M
- **Architecture**: Transformer with RoPE, SwiGLU, RMSNorm
- **Context Length**: {config['max_position_embeddings']} tokens
- **Vocabulary Size**: {config['vocab_size']}

## Using the API

### Method 1: Gradio Client (Recommended)
Install the Gradio client:
```bash
pip install gradio_client
```

**Chat Completion:**
```python
from gradio_client import Client
import json

client = Client("https://YOUR-SPACE.hf.space")
messages = [
    {{"role": "user", "content": "What is Python?"}}
]
result = client.predict(
    messages_json=json.dumps(messages),
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/chat_completions"
)
response = json.loads(result)
print(response["choices"][0]["message"]["content"])
```

**Text Completion:**
```python
result = client.predict(
    prompt="Once upon a time",
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/text_completions"
)
response = json.loads(result)
print(response["choices"][0]["text"])
```

### Method 2: Direct HTTP Requests
**Chat Completion:**
```python
import requests
import json

url = "https://YOUR-SPACE.hf.space/call/chat_completions"
payload = {{
    "data": [
        json.dumps([{{"role": "user", "content": "Hello!"}}]),  # messages_json
        512,    # max_tokens
        0.8,    # temperature
        0.9,    # top_p
        40,     # top_k
        1.1,    # repetition_penalty
        False   # stream
    ]
}}
response = requests.post(url, json=payload)
print(response.json())
```
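
On recent Gradio versions the POST above returns only an event ID; the actual
output is then fetched with a follow-up GET on the same route. A minimal sketch,
assuming Gradio 4.x `/call` semantics (check the Gradio version your Space runs):
```python
event_id = requests.post(url, json=payload).json()["event_id"]
# The follow-up GET streams server-sent events; the final "data:" line holds the output
with requests.get(f"{{url}}/{{event_id}}", stream=True) as stream:
    for line in stream.iter_lines():
        if line:
            print(line.decode())
```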

## API Endpoints

### Chat Completions
- **API Name**: `/chat_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`

**Parameters:**
1. `messages_json` (str): JSON string of the messages array
2. `max_tokens` (int): Maximum tokens to generate (50-1024)
3. `temperature` (float): Sampling temperature (0.1-2.0)
4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
5. `top_k` (int): Top-K sampling (1-100)
6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
7. `stream` (bool): Stream response (accepted but not yet implemented)

### Text Completions
- **API Name**: `/text_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`

**Parameters:**
1. `prompt` (str): Text prompt
2. `max_tokens` (int): Maximum tokens to generate
3. `temperature` (float): Sampling temperature
4. `top_p` (float): Nucleus sampling threshold
5. `top_k` (int): Top-K sampling
6. `repetition_penalty` (float): Penalty for repetition
7. `stream` (bool): Stream response (accepted but not yet implemented)

## Response Format

**Chat Completion Response:**
```json
{{
  "id": "chatcmpl-1234567890",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
    "index": 0,
    "message": {{
      "role": "assistant",
      "content": "Response text here"
    }},
    "finish_reason": "stop"
  }}],
  "usage": {{
    "prompt_tokens": 10,
    "completion_tokens": 20,
    "total_tokens": 30
  }},
  "stats": {{
    "elapsed_sec": 1.5,
    "tokens_per_sec": 13.3
  }}
}}
```

**Text Completion Response:**
```json
{{
  "id": "cmpl-1234567890",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
    "text": "Completion text here",
    "index": 0,
    "finish_reason": "stop"
  }}],
  "usage": {{
    "prompt_tokens": 5,
    "completion_tokens": 15,
    "total_tokens": 20
  }},
  "stats": {{
    "elapsed_sec": 1.2,
    "tokens_per_sec": 12.5
  }}
}}
```

## Complete Example Script
```python
#!/usr/bin/env python3
\"\"\"
SAM-Z-1 API Client Example
\"\"\"
from gradio_client import Client
import json

# Initialize client
client = Client("https://YOUR-SPACE.hf.space")

def chat(message, history=None):
    \"\"\"Send a chat message\"\"\"
    history = list(history or [])  # avoid sharing a mutable default argument
    messages = history + [{{"role": "user", "content": message}}]
    result = client.predict(
        messages_json=json.dumps(messages),
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/chat_completions"
    )
    response = json.loads(result)
    assistant_msg = response["choices"][0]["message"]["content"]
    # Update history
    history.append({{"role": "user", "content": message}})
    history.append({{"role": "assistant", "content": assistant_msg}})
    return assistant_msg, history

def complete(prompt):
    \"\"\"Complete text\"\"\"
    result = client.predict(
        prompt=prompt,
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/text_completions"
    )
    response = json.loads(result)
    return response["choices"][0]["text"]

# Example usage
if __name__ == "__main__":
    # Chat example
    print("=== Chat Example ===")
    history = []
    response, history = chat("Hello! Who are you?", history)
    print(f"Assistant: {{response}}\\n")
    response, history = chat("What can you help me with?", history)
    print(f"Assistant: {{response}}\\n")

    # Text completion example
    print("\\n=== Text Completion Example ===")
    completion = complete("Once upon a time in a distant galaxy")
    print(f"Completion: {{completion}}")
```

## Parameters Guide

### Temperature (0.1 - 2.0)
- **Low (0.1-0.5)**: More focused, deterministic, factual
- **Medium (0.6-0.9)**: Balanced creativity and coherence
- **High (1.0-2.0)**: More creative, diverse, unpredictable

(A numeric illustration of the temperature effect follows this guide.)

### Top-P (0.1 - 1.0)
- Controls diversity via nucleus sampling
- **0.9** (default): Good balance
- Lower values = more focused
- Higher values = more diverse

### Top-K (1 - 100)
- Limits sampling to the K most likely tokens
- **40** (default): Good balance
- Lower values = more focused
- Higher values = more diverse

### Repetition Penalty (1.0 - 2.0)
- **1.0**: No penalty
- **1.1** (default): Slight penalty
- **1.5+**: Strong penalty (use if the model repeats itself)
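
To see numerically what temperature does to the sampling distribution, here is a
small self-contained sketch (hypothetical logits, independent of this model):
```python
import numpy as np

def softmax_with_temperature(logits, T):
    z = np.asarray(logits, dtype=np.float64) / T
    e = np.exp(z - z.max())  # subtract the max for numerical stability
    return e / e.sum()

logits = [2.0, 1.0, 0.5]
print(softmax_with_temperature(logits, 0.3))  # sharp: mass piles onto the top token
print(softmax_with_temperature(logits, 1.5))  # flat: moves toward uniform
```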

## Rate Limits & Performance
- **Concurrent Requests**: Supported via the Gradio queue
- **Average Speed**: 10-20 tokens/sec on CPU
- **Context Window**: {config['max_position_embeddings']} tokens
- **Queue Size**: Up to 20 queued requests

## Error Handling
```python
try:
    result = client.predict(
        messages_json=json.dumps(messages),
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/chat_completions"
    )
    response = json.loads(result)
    if "error" in response:
        print(f"API Error: {{response['error']}}")
    else:
        print(response["choices"][0]["message"]["content"])
except Exception as e:
    print(f"Request failed: {{e}}")
```

## Troubleshooting

**Connection Issues:**
- Verify the Space URL is correct
- Check that the Space is running
- Ensure gradio_client is installed

**Slow Responses:**
- Reduce `max_tokens`
- Lower the `top_k` value
- Use shorter prompts

**Repetitive Output:**
- Increase `repetition_penalty` (try 1.2-1.5)
- Raise `temperature`
- Lower `top_p`

**Incoherent Output:**
- Lower `temperature` (try 0.5-0.7)
- Reduce `top_k` (try 20-30)
- Make sure the prompt is clear and well-formatted

## Chat Template Format
The model uses the ChatML format:
```
<|im_start|>system
System message here<|im_end|>
<|im_start|>user
User message here<|im_end|>
<|im_start|>assistant
Assistant response here<|im_end|>
```
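
A minimal sketch of building this template in Python (mirrors what the
`/chat_completions` endpoint does server-side before tokenizing):
```python
def build_chatml_prompt(messages):
    prompt = ""
    for msg in messages:
        prompt += "<|im_start|>" + msg["role"] + "\\n" + msg["content"] + "<|im_end|>\\n"
    # Leave the assistant turn open so the model continues from here
    return prompt + "<|im_start|>assistant\\n"

print(build_chatml_prompt([{{"role": "user", "content": "Hi"}}]))
```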

## Tips for Best Results
1. **Use clear, specific prompts**
2. **Lower temperature for factual tasks**
3. **Higher temperature for creative tasks**
4. **Adjust repetition penalty if the model repeats phrases**
5. **Keep context under {config['max_position_embeddings']} tokens**
6. **Use system messages to set behavior**

## Model Capabilities
✅ General conversation
✅ Question answering
✅ Code generation
✅ Creative writing
✅ Text completion
✅ Instruction following
❌ Does NOT use reasoning tokens (`<think>` tags)
❌ Not fine-tuned for specific domains

---
**Model**: SAM-Z-1 | **API Version**: 1.0

**Support**: Open an issue on the Space for bugs or questions
""")

    # ========== API Routes - MUST USE api_name parameter ==========
    chat_btn.click(
        fn=chat_completion_api,
        inputs=[
            messages_input, chat_max_tokens, chat_temperature,
            chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
        ],
        outputs=[chat_output],
        api_name="chat_completions"  # creates the /call/chat_completions endpoint
    )
    text_btn.click(
        fn=text_completion_api,
        inputs=[
            prompt_input, text_max_tokens, text_temperature,
            text_top_p, text_top_k, text_rep_penalty, text_stream
        ],
        outputs=[text_output],
        api_name="text_completions"  # creates the /call/text_completions endpoint
    )

# Launch
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )