| """ | |
| SAM-Z-1 Production API with Gradio UI | |
| OpenAI-compatible API interface for Hugging Face Spaces | |
| """ | |
| import gradio as gr | |
| import tensorflow as tf | |
| import keras | |
| from huggingface_hub import hf_hub_download | |
| import json | |
| import os | |
| from tokenizers import Tokenizer | |
| import numpy as np | |
| import time | |
| from typing import Dict, Any, List | |
# ============================================================================
# Configuration
# ============================================================================
MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
CACHE_DIR = "./model_cache"

# Global model storage
model = None
tokenizer = None
config = None
eos_token_id = None

# ============================================================================
# Model Architecture (same as original)
# ============================================================================
class RotaryEmbedding(keras.layers.Layer):
    def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.max_len = max_len
        self.theta = theta
        self.built_cache = False

    def build(self, input_shape):
        super().build(input_shape)

    def _build_cache(self):
        # Lazily precompute cos/sin tables for every position up to max_len
        if not self.built_cache:
            inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
            t = tf.range(self.max_len, dtype=tf.float32)
            freqs = tf.einsum("i,j->ij", t, inv_freq)
            emb = tf.concat([freqs, freqs], axis=-1)
            self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
            self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
            self.built_cache = True

    def rotate_half(self, x):
        x1, x2 = tf.split(x, 2, axis=-1)
        return tf.concat([-x2, x1], axis=-1)

    def call(self, q, k):
        self._build_cache()
        seq_len = tf.shape(q)[2]
        dtype = q.dtype
        cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
        sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
        q_rotated = (q * cos) + (self.rotate_half(q) * sin)
        k_rotated = (k * cos) + (self.rotate_half(k) * sin)
        return q_rotated, k_rotated

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
        return config

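
# Illustrative sketch (not part of the serving path): RotaryEmbedding rotates
# query/key tensors shaped [batch, heads, seq, head_dim]. Sizes below are
# hypothetical; call manually as a sanity check.
def _demo_rope():
    rope = RotaryEmbedding(dim=64, max_len=128)
    q = tf.random.normal([1, 4, 16, 64])  # [B, n_heads, T, head_dim]
    k = tf.random.normal([1, 4, 16, 64])
    q_rot, k_rot = rope(q, k)
    # A rotation preserves per-position vector norms (up to float error)
    tf.debugging.assert_near(tf.norm(q, axis=-1), tf.norm(q_rot, axis=-1), atol=1e-3)
    return q_rot.shape, k_rot.shape
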
class RMSNorm(keras.layers.Layer):
    def __init__(self, epsilon=1e-5, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        self.scale = self.add_weight(name="scale", shape=(input_shape[-1],), initializer="ones")

    def call(self, x):
        variance = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
        return x * tf.math.rsqrt(variance + self.epsilon) * self.scale

    def get_config(self):
        config = super().get_config()
        config.update({"epsilon": self.epsilon})
        return config

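
# Illustrative sketch: RMSNorm rescales each feature vector to unit RMS and then
# applies the learned per-channel scale (ones at init). Hypothetical sizes.
def _demo_rmsnorm():
    norm = RMSNorm()
    x = tf.random.normal([2, 8, 16]) * 5.0
    y = norm(x)
    # RMS of the output is ~1.0 everywhere right after initialization
    return tf.sqrt(tf.reduce_mean(tf.square(y), axis=-1))
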
class TransformerBlock(keras.layers.Layer):
    def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout
        self.max_len = max_len
        self.rope_theta = rope_theta
        self.head_dim = d_model // n_heads
        self.layer_idx = layer_idx
        self.pre_attn_norm = RMSNorm()
        self.pre_ffn_norm = RMSNorm()
        self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
        self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
        self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
        self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
        self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
        self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
        self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
        self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, x, training=None):
        B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
        dtype = x.dtype
        # Pre-norm causal self-attention with rotary position embeddings
        res = x
        y = self.pre_attn_norm(x)
        q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        q, k = self.rope(q, k)
        scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
        # Causal mask: each position may attend only to itself and the past
        mask = tf.where(
            tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
            tf.constant(-1e9, dtype=dtype),
            tf.constant(0.0, dtype=dtype)
        )
        scores += mask
        attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
        attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
        x = res + self.dropout(self.out_proj(attn), training=training)
        # Pre-norm SwiGLU feed-forward network
        res = x
        y = self.pre_ffn_norm(x)
        ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
        return res + self.dropout(ffn, training=training)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
            "dropout": self.dropout_rate, "max_len": self.max_len,
            "rope_theta": self.rope_theta, "layer_idx": self.layer_idx
        })
        return config

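
# Illustrative sketch: one block maps [B, T, d_model] -> [B, T, d_model].
# Hypothetical sizes; useful as a quick shape smoke test.
def _demo_block():
    block = TransformerBlock(d_model=32, n_heads=4, ff_dim=128, dropout=0.0,
                             max_len=64, rope_theta=10000)
    x = tf.random.normal([2, 10, 32])
    return block(x, training=False).shape  # (2, 10, 32)
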
class SAM1Model(keras.Model):
    def __init__(self, **kwargs):
        super().__init__()
        # Accept either a nested 'config' dict or flat keyword arguments
        if 'config' in kwargs and isinstance(kwargs['config'], dict):
            self.cfg = kwargs['config']
        elif 'vocab_size' in kwargs:
            self.cfg = kwargs
        else:
            self.cfg = kwargs.get('cfg', kwargs)
        self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
        ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
        block_args = {
            'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
            'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
            'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
        }
        self.blocks = [TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
                       for i in range(self.cfg['n_layers'])]
        self.norm = RMSNorm(name="final_norm")
        self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

    def call(self, input_ids, training=None):
        x = self.embed(input_ids)
        for block in self.blocks:
            x = block(x, training=training)
        return self.lm_head(self.norm(x))

    def get_config(self):
        base_config = super().get_config()
        base_config['config'] = self.cfg
        return base_config

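
# Illustrative sketch: end-to-end shape check with a tiny made-up config. The
# real config is loaded from the Hub below; none of these numbers come from it.
def _demo_sam1_forward():
    tiny = {'vocab_size': 100, 'd_model': 32, 'n_heads': 4, 'ff_mult': 4.0,
            'max_len': 64, 'dropout': 0.0, 'rope_theta': 10000, 'n_layers': 2}
    m = SAM1Model(config=tiny)
    logits = m(tf.zeros([1, 8], dtype=tf.int32), training=False)
    return logits.shape  # (1, 8, 100): per-position next-token logits
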
# ============================================================================
# Model Loading
# ============================================================================
print("🚀 Loading SAM-Z-1 Model for API...")
config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)

try:
    weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
    use_checkpoint = True
    print("✅ Found checkpoint weights")
except Exception:
    model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
    use_checkpoint = False
    print("✅ Found saved model")

with open(config_path, 'r') as f:
    config = json.load(f)
eos_token_id = config.get('eos_token_id', 50256)

# Create tokenizer
print("📦 Creating tokenizer...")
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
})
os.makedirs("./temp_tokenizer", exist_ok=True)
hf_tokenizer.save_pretrained("./temp_tokenizer")
tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

# Load model
if use_checkpoint:
    print("📦 Building model and loading weights...")
    model_config = {
        'vocab_size': config['vocab_size'],
        'd_model': config['hidden_size'],
        'n_layers': config['num_hidden_layers'],
        'n_heads': config['num_attention_heads'],
        'ff_mult': config['intermediate_size'] / config['hidden_size'],
        'max_len': config['max_position_embeddings'],
        'dropout': 0.1,
        'rope_theta': config['rope_theta']
    }
    model = SAM1Model(config=model_config)
    # Build the variables with a dummy forward pass before loading weights
    dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
    _ = model(dummy_input, training=False)
    model.load_weights(weights_path)
else:
    model = keras.models.load_model(model_path, compile=False)

def fast_forward(input_tensor):
    return model(input_tensor, training=False)

print(f"✅ Model loaded: {config['num_hidden_layers']} layers, ~313M params")

# ============================================================================
# Generation Engine
# ============================================================================
def generate_tokens(
    input_ids: List[int],
    max_tokens: int = 512,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1
):
    """Generator that yields token ids one at a time."""
    # Leave room in the context window for the tokens we are about to generate
    if len(input_ids) > config['max_position_embeddings'] - max_tokens:
        input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
    input_tensor = tf.constant([input_ids], dtype=tf.int32)
    token_freq = {}
    for step in range(max_tokens):
        logits = fast_forward(input_tensor)
        next_token_logits = logits[0, -1, :].numpy()
        # Temperature
        next_token_logits = next_token_logits / temperature
        # Repetition penalty: divide positive logits and multiply negative ones,
        # so repeated tokens are always pushed down (CTRL-style)
        if repetition_penalty != 1.0:
            for token_id, freq in token_freq.items():
                if token_id < len(next_token_logits):
                    penalty = repetition_penalty ** freq
                    if next_token_logits[token_id] > 0:
                        next_token_logits[token_id] /= penalty
                    else:
                        next_token_logits[token_id] *= penalty
        # Top-k filtering
        if top_k > 0:
            top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
            top_k_logits = next_token_logits[top_k_indices]
            top_k_probs = tf.nn.softmax(top_k_logits).numpy()
            # Top-p (nucleus) sampling within the top-k candidates
            if top_p < 1.0:
                sorted_indices = np.argsort(top_k_probs)[::-1]
                cumsum = np.cumsum(top_k_probs[sorted_indices])
                cutoff_idx = np.searchsorted(cumsum, top_p)
                nucleus_indices = sorted_indices[:cutoff_idx + 1]
                nucleus_logits = top_k_logits[nucleus_indices]
                nucleus_probs = tf.nn.softmax(nucleus_logits).numpy().astype(np.float64)
                nucleus_probs /= nucleus_probs.sum()  # renormalize for np.random.choice
                sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
                next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
            else:
                top_k_probs = top_k_probs.astype(np.float64)
                top_k_probs /= top_k_probs.sum()
                sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
                next_token_id = int(top_k_indices[sampled_idx])
        else:
            probs = tf.nn.softmax(next_token_logits).numpy().astype(np.float64)
            probs /= probs.sum()
            next_token_id = int(np.random.choice(len(probs), p=probs))
        if next_token_id == eos_token_id:
            break
        token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
        yield next_token_id
        input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
        # Slide the window if generation exceeds the model's context length
        if input_tensor.shape[1] > config['max_position_embeddings']:
            input_tensor = input_tensor[:, -config['max_position_embeddings']:]

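
# Illustrative usage sketch (assumes the model, tokenizer, and config above are
# loaded): stream tokens for a raw prompt and print them as they arrive. Not
# called anywhere in the app; invoke manually when debugging.
def _demo_generate(prompt: str = "Once upon a time"):
    ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
    for token_id in generate_tokens(ids, max_tokens=32, temperature=0.8):
        print(tokenizer.decode([token_id]), end="", flush=True)
    print()
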
# ============================================================================
# API Functions - FIXED FOR GRADIO
# ============================================================================
def chat_completion_api(
    messages_json: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
    stream: bool
) -> str:
    """OpenAI-style chat completion API"""
    try:
        messages = json.loads(messages_json)
        # Format messages
        prompt = ""
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if role == "system":
                prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
            elif role == "user":
                prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
            elif role == "assistant":
                prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        # Tokenize
        input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
        start_time = time.time()
        token_count = 0
        response_text = ""
        for token_id in generate_tokens(
            input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
        ):
            token_text = tokenizer.decode([token_id])
            response_text += token_text
            token_count += 1
            if "<|im_end|>" in response_text:
                response_text = response_text.split("<|im_end|>")[0]
                break
        elapsed = time.time() - start_time
        result = {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "sam-z-1",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text.strip()
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": token_count,
                "total_tokens": len(input_ids) + token_count
            },
            "stats": {
                "elapsed_sec": round(elapsed, 2),
                "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
            }
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        return json.dumps({"error": str(e)}, indent=2)

def text_completion_api(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
    stream: bool
) -> str:
    """OpenAI-style text completion API"""
    try:
        input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
        start_time = time.time()
        token_count = 0
        response_text = ""
        for token_id in generate_tokens(
            input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
        ):
            token_text = tokenizer.decode([token_id])
            response_text += token_text
            token_count += 1
        elapsed = time.time() - start_time
        result = {
            "id": f"cmpl-{int(time.time())}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": "sam-z-1",
            "choices": [{
                "text": response_text,
                "index": 0,
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": token_count,
                "total_tokens": len(input_ids) + token_count
            },
            "stats": {
                "elapsed_sec": round(elapsed, 2),
                "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
            }
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        return json.dumps({"error": str(e)}, indent=2)

# ============================================================================
# Gradio UI with API Routes
# ============================================================================
custom_css = """
.api-container {
    max-width: 1400px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 2rem;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 12px;
    margin-bottom: 2rem;
}
.endpoint-card {
    background: #f8f9fa;
    padding: 1.5rem;
    border-radius: 8px;
    border-left: 4px solid #667eea;
    margin: 1rem 0;
}
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
    gr.HTML("""
    <div class="header">
        <h1>🚀 SAM-Z-1 API Server</h1>
        <p>OpenAI-Compatible API for SAM-Z-1 Language Model</p>
        <p style="font-size: 0.9rem; opacity: 0.9;">
            313M Parameters • 768D • 16 Layers • TensorFlow Optimized
        </p>
    </div>
    """)

    with gr.Tabs():
        # ========== Chat Completion Tab ==========
        with gr.Tab("💬 Chat Completion"):
            gr.Markdown("""
### Chat Completions API
OpenAI-compatible chat completion endpoint
""")
            with gr.Row():
                with gr.Column(scale=1):
                    messages_input = gr.Code(
                        label="Messages (JSON)",
                        language="json",
                        value=json.dumps([
                            {"role": "user", "content": "Hello! Who are you?"}
                        ], indent=2),
                        lines=10
                    )
                    with gr.Row():
                        chat_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
                        chat_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
                    with gr.Row():
                        chat_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
                        chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
                    chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    chat_stream = gr.Checkbox(label="Stream Response (not implemented in UI)", value=False)
                    chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    chat_output = gr.Code(
                        label="API Response (JSON)",
                        language="json",
                        lines=20
                    )
            gr.Markdown("""
### Python Example with Gradio Client
```python
from gradio_client import Client
import json

client = Client("YOUR-SPACE-URL")
messages = [
    {"role": "user", "content": "Hello! Who are you?"}
]
result = client.predict(
    messages_json=json.dumps(messages),
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/chat_completions"
)
print(result)
```
""")
        # ========== Text Completion Tab ==========
        with gr.Tab("📝 Text Completion"):
            gr.Markdown("""
### Text Completions API
OpenAI-compatible text completion endpoint
""")
            with gr.Row():
                with gr.Column(scale=1):
                    prompt_input = gr.Textbox(
                        label="Prompt",
                        placeholder="Once upon a time...",
                        lines=5
                    )
                    with gr.Row():
                        text_max_tokens = gr.Slider(50, 1024, 512, step=50, label="Max Tokens")
                        text_temperature = gr.Slider(0.1, 2.0, 0.8, step=0.1, label="Temperature")
                    with gr.Row():
                        text_top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
                        text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
                    text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
                    text_stream = gr.Checkbox(label="Stream Response (not implemented in UI)", value=False)
                    text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    text_output = gr.Code(
                        label="API Response (JSON)",
                        language="json",
                        lines=20
                    )
            gr.Markdown("""
### Python Example with Gradio Client
```python
from gradio_client import Client

client = Client("YOUR-SPACE-URL")
result = client.predict(
    prompt="Once upon a time",
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/text_completions"
)
print(result)
```
""")
        # ========== Documentation Tab ==========
        with gr.Tab("📚 Documentation"):
            gr.Markdown(f"""
# SAM-Z-1 API Documentation

## Model Information
- **Model**: SAM-Z-1 (Direct Response Model)
- **Parameters**: ~313M
- **Architecture**: Transformer with RoPE, SwiGLU, RMSNorm
- **Context Length**: {config['max_position_embeddings']} tokens
- **Vocabulary Size**: {config['vocab_size']}

## Using the API

### Method 1: Gradio Client (Recommended)
Install the Gradio client:
```bash
pip install gradio_client
```

**Chat Completion:**
```python
from gradio_client import Client
import json

client = Client("https://YOUR-SPACE.hf.space")
messages = [
    {{"role": "user", "content": "What is Python?"}}
]
result = client.predict(
    messages_json=json.dumps(messages),
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/chat_completions"
)
response = json.loads(result)
print(response["choices"][0]["message"]["content"])
```

**Text Completion:**
```python
result = client.predict(
    prompt="Once upon a time",
    max_tokens=512,
    temperature=0.8,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    stream=False,
    api_name="/text_completions"
)
response = json.loads(result)
print(response["choices"][0]["text"])
```

### Method 2: Direct HTTP Requests
**Chat Completion:**
```python
import requests
import json

url = "https://YOUR-SPACE.hf.space/call/chat_completions"
payload = {{
    "data": [
        json.dumps([{{"role": "user", "content": "Hello!"}}]),  # messages_json
        512,    # max_tokens
        0.8,    # temperature
        0.9,    # top_p
        40,     # top_k
        1.1,    # repetition_penalty
        False   # stream
    ]
}}
response = requests.post(url, json=payload)
print(response.json())
```
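
On recent Gradio versions the POST above returns only an event ID; the actual
output is then fetched with a follow-up GET on the same route. A minimal sketch,
assuming Gradio 4.x `/call` semantics (check the Gradio version your Space runs):
```python
event_id = requests.post(url, json=payload).json()["event_id"]
# The follow-up GET streams server-sent events; the final "data:" line holds the output
with requests.get(f"{{url}}/{{event_id}}", stream=True) as stream:
    for line in stream.iter_lines():
        if line:
            print(line.decode())
```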

## API Endpoints

### Chat Completions
- **API Name**: `/chat_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`

**Parameters:**
1. `messages_json` (str): JSON string of the messages array
2. `max_tokens` (int): Maximum tokens to generate (50-1024)
3. `temperature` (float): Sampling temperature (0.1-2.0)
4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
5. `top_k` (int): Top-K sampling (1-100)
6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
7. `stream` (bool): Stream response (accepted but not yet implemented)

### Text Completions
- **API Name**: `/text_completions`
- **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`

**Parameters:**
1. `prompt` (str): Text prompt
2. `max_tokens` (int): Maximum tokens to generate
3. `temperature` (float): Sampling temperature
4. `top_p` (float): Nucleus sampling threshold
5. `top_k` (int): Top-K sampling
6. `repetition_penalty` (float): Penalty for repetition
7. `stream` (bool): Stream response (accepted but not yet implemented)

## Response Format

**Chat Completion Response:**
```json
{{
  "id": "chatcmpl-1234567890",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
    "index": 0,
    "message": {{
      "role": "assistant",
      "content": "Response text here"
    }},
    "finish_reason": "stop"
  }}],
  "usage": {{
    "prompt_tokens": 10,
    "completion_tokens": 20,
    "total_tokens": 30
  }},
  "stats": {{
    "elapsed_sec": 1.5,
    "tokens_per_sec": 13.3
  }}
}}
```

**Text Completion Response:**
```json
{{
  "id": "cmpl-1234567890",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
    "text": "Completion text here",
    "index": 0,
    "finish_reason": "stop"
  }}],
  "usage": {{
    "prompt_tokens": 5,
    "completion_tokens": 15,
    "total_tokens": 20
  }},
  "stats": {{
    "elapsed_sec": 1.2,
    "tokens_per_sec": 12.5
  }}
}}
```

## Complete Example Script
```python
#!/usr/bin/env python3
\"\"\"
SAM-Z-1 API Client Example
\"\"\"
from gradio_client import Client
import json

# Initialize client
client = Client("https://YOUR-SPACE.hf.space")

def chat(message, history=None):
    \"\"\"Send a chat message\"\"\"
    history = list(history or [])  # avoid sharing a mutable default argument
    messages = history + [{{"role": "user", "content": message}}]
    result = client.predict(
        messages_json=json.dumps(messages),
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/chat_completions"
    )
    response = json.loads(result)
    assistant_msg = response["choices"][0]["message"]["content"]
    # Update history
    history.append({{"role": "user", "content": message}})
    history.append({{"role": "assistant", "content": assistant_msg}})
    return assistant_msg, history

def complete(prompt):
    \"\"\"Complete text\"\"\"
    result = client.predict(
        prompt=prompt,
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/text_completions"
    )
    response = json.loads(result)
    return response["choices"][0]["text"]

# Example usage
if __name__ == "__main__":
    # Chat example
    print("=== Chat Example ===")
    history = []
    response, history = chat("Hello! Who are you?", history)
    print(f"Assistant: {{response}}\\n")
    response, history = chat("What can you help me with?", history)
    print(f"Assistant: {{response}}\\n")

    # Text completion example
    print("\\n=== Text Completion Example ===")
    completion = complete("Once upon a time in a distant galaxy")
    print(f"Completion: {{completion}}")
```

## Parameters Guide

### Temperature (0.1 - 2.0)
- **Low (0.1-0.5)**: More focused, deterministic, factual
- **Medium (0.6-0.9)**: Balanced creativity and coherence
- **High (1.0-2.0)**: More creative, diverse, unpredictable

(A numeric illustration of the temperature effect follows this guide.)

### Top-P (0.1 - 1.0)
- Controls diversity via nucleus sampling
- **0.9** (default): Good balance
- Lower values = more focused
- Higher values = more diverse

### Top-K (1 - 100)
- Limits sampling to the K most likely tokens
- **40** (default): Good balance
- Lower values = more focused
- Higher values = more diverse

### Repetition Penalty (1.0 - 2.0)
- **1.0**: No penalty
- **1.1** (default): Slight penalty
- **1.5+**: Strong penalty (use if the model repeats itself)
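
To see numerically what temperature does to the sampling distribution, here is a
small self-contained sketch (hypothetical logits, independent of this model):
```python
import numpy as np

def softmax_with_temperature(logits, T):
    z = np.asarray(logits, dtype=np.float64) / T
    e = np.exp(z - z.max())  # subtract the max for numerical stability
    return e / e.sum()

logits = [2.0, 1.0, 0.5]
print(softmax_with_temperature(logits, 0.3))  # sharp: mass piles onto the top token
print(softmax_with_temperature(logits, 1.5))  # flat: moves toward uniform
```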

## Rate Limits & Performance
- **Concurrent Requests**: Supported via the Gradio queue
- **Average Speed**: 10-20 tokens/sec on CPU
- **Context Window**: {config['max_position_embeddings']} tokens
- **Queue Size**: Up to 20 queued requests

## Error Handling
```python
try:
    result = client.predict(
        messages_json=json.dumps(messages),
        max_tokens=512,
        temperature=0.8,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        stream=False,
        api_name="/chat_completions"
    )
    response = json.loads(result)
    if "error" in response:
        print(f"API Error: {{response['error']}}")
    else:
        print(response["choices"][0]["message"]["content"])
except Exception as e:
    print(f"Request failed: {{e}}")
```

## Troubleshooting

**Connection Issues:**
- Verify the Space URL is correct
- Check that the Space is running
- Ensure gradio_client is installed

**Slow Responses:**
- Reduce `max_tokens`
- Lower the `top_k` value
- Use shorter prompts

**Repetitive Output:**
- Increase `repetition_penalty` (try 1.2-1.5)
- Raise `temperature`
- Lower `top_p`

**Incoherent Output:**
- Lower `temperature` (try 0.5-0.7)
- Reduce `top_k` (try 20-30)
- Make sure the prompt is clear and well-formatted

## Chat Template Format
The model uses the ChatML format:
```
<|im_start|>system
System message here<|im_end|>
<|im_start|>user
User message here<|im_end|>
<|im_start|>assistant
Assistant response here<|im_end|>
```
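
A minimal sketch of building this template in Python (mirrors what the
`/chat_completions` endpoint does server-side before tokenizing):
```python
def build_chatml_prompt(messages):
    prompt = ""
    for msg in messages:
        prompt += "<|im_start|>" + msg["role"] + "\\n" + msg["content"] + "<|im_end|>\\n"
    # Leave the assistant turn open so the model continues from here
    return prompt + "<|im_start|>assistant\\n"

print(build_chatml_prompt([{{"role": "user", "content": "Hi"}}]))
```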

## Tips for Best Results
1. **Use clear, specific prompts**
2. **Lower temperature for factual tasks**
3. **Higher temperature for creative tasks**
4. **Adjust repetition penalty if the model repeats phrases**
5. **Keep context under {config['max_position_embeddings']} tokens**
6. **Use system messages to set behavior**

## Model Capabilities
✅ General conversation
✅ Question answering
✅ Code generation
✅ Creative writing
✅ Text completion
✅ Instruction following
❌ Does NOT use reasoning tokens (`<think>` tags)
❌ Not fine-tuned for specific domains

---
**Model**: SAM-Z-1 | **API Version**: 1.0

**Support**: Open an issue on the Space for bugs or questions
""")

    # ========== API Routes - MUST USE api_name parameter ==========
    chat_btn.click(
        fn=chat_completion_api,
        inputs=[
            messages_input, chat_max_tokens, chat_temperature,
            chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
        ],
        outputs=[chat_output],
        api_name="chat_completions"  # creates the /call/chat_completions endpoint
    )
    text_btn.click(
        fn=text_completion_api,
        inputs=[
            prompt_input, text_max_tokens, text_temperature,
            text_top_p, text_top_k, text_rep_penalty, text_stream
        ],
        outputs=[text_output],
        api_name="text_completions"  # creates the /call/text_completions endpoint
    )

# Launch
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )