Bc-AI commited on
Commit
666ed75
·
verified ·
1 Parent(s): 3f1837e

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Install system build tools needed to compile native wheels
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker layer cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code.
# NOTE: the original used "COPY ../shared ./shared", which is invalid --
# COPY cannot reference paths outside the build context. Build with a
# context that contains shared/ (or vendor it in) so this COPY works.
COPY worker_app.py .
COPY model_architecture.py .
COPY shared ./shared

# Expose port for the API
EXPOSE 8000

# Start the application
CMD ["python", "worker_app.py"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
- ---
2
- title: Worker Large
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ # SACCP Worker_Large Node
2
+ This is a worker_large node in the SACCP (Scalable Accelerated Compute Protocol) distributed computing network.
3
+
4
+ ## Node Type: WORKER_LARGE
5
+ - Processes tasks according to SACCP protocol
6
+ - Contributes computational resources to the network
7
+ - Earns cloud credits for resource contribution
8
+
9
+ ## Architecture
10
+ - Built with FastAPI and TensorFlow/Keras
11
+ - Implements fault-tolerant operations
12
+ - Integrated with SACCP credit system
model_architecture.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import keras
3
+ import numpy as np
4
+
5
@keras.saving.register_keras_serializable()
class RotaryEmbedding(keras.layers.Layer):
    """Rotary position embedding (RoPE) applied to query/key tensors."""

    def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.max_len = max_len
        self.theta = theta
        # cos/sin lookup tables are computed lazily on first call.
        self.built_cache = False
        self.cos_cached = None
        self.sin_cached = None

    def build(self, input_shape):
        super().build(input_shape)

    def _build_cache(self):
        """Precompute cos/sin tables for every position up to max_len (once)."""
        if self.built_cache:
            return
        exponents = tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim
        inv_freq = 1.0 / (self.theta ** exponents)
        positions = tf.range(self.max_len, dtype=tf.float32)
        angles = tf.einsum("i,j->ij", positions, inv_freq)
        table = tf.concat([angles, angles], axis=-1)
        self.cos_cached = tf.constant(np.cos(table.numpy()), dtype=tf.float32)
        self.sin_cached = tf.constant(np.sin(table.numpy()), dtype=tf.float32)
        self.built_cache = True

    def rotate_half(self, x):
        """Split the last axis in half and swap the halves as (-x2, x1)."""
        first, second = tf.split(x, 2, axis=-1)
        return tf.concat([-second, first], axis=-1)

    def call(self, q, k, offset=0):
        """Apply rotary embeddings with position offset."""
        self._build_cache()
        seq_len = tf.shape(q)[2]
        dtype = q.dtype

        cos = tf.cast(self.cos_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
        sin = tf.cast(self.sin_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]

        rotated_q = (q * cos) + (self.rotate_half(q) * sin)
        rotated_k = (k * cos) + (self.rotate_half(k) * sin)
        return rotated_q, rotated_k

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
        return config
50
+
51
+
52
@keras.saving.register_keras_serializable()
class RMSNorm(keras.layers.Layer):
    """Root-mean-square layer normalization (no mean-centering, learned gain)."""

    def __init__(self, epsilon=1e-5, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon
        self.scale = None

    def build(self, input_shape):
        # One learned gain per feature on the last axis.
        self.scale = self.add_weight(name="scale", shape=(input_shape[-1],), initializer="ones")
        super().build(input_shape)

    def call(self, x):
        mean_square = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
        normalized = x * tf.math.rsqrt(mean_square + self.epsilon)
        return normalized * self.scale

    def get_config(self):
        config = super().get_config()
        config.update({"epsilon": self.epsilon})
        return config
71
+
72
+
73
@keras.saving.register_keras_serializable()
class TransformerBlock(keras.layers.Layer):
    """Pre-norm decoder block: causal multi-head self-attention with RoPE,
    followed by a SwiGLU feed-forward network, each wrapped in a residual
    connection with dropout.
    """

    def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model          # model (embedding) width
        self.n_heads = n_heads          # number of attention heads
        self.ff_dim = ff_dim            # hidden width of the feed-forward net
        self.dropout_rate = dropout
        self.max_len = max_len          # max sequence length for the RoPE tables
        self.rope_theta = rope_theta    # RoPE base frequency
        self.head_dim = d_model // n_heads
        self.layer_idx = layer_idx      # position of this block in the stack

    def build(self, input_shape):
        # Pre-normalization (RMSNorm) before attention and before the FFN.
        self.pre_attn_norm = RMSNorm(name="pre_attn_norm")
        self.pre_ffn_norm = RMSNorm(name="pre_ffn_norm")
        # Attention projections, all bias-free.
        self.q_proj = keras.layers.Dense(self.d_model, use_bias=False, name="q_proj")
        self.k_proj = keras.layers.Dense(self.d_model, use_bias=False, name="k_proj")
        self.v_proj = keras.layers.Dense(self.d_model, use_bias=False, name="v_proj")
        self.out_proj = keras.layers.Dense(self.d_model, use_bias=False, name="o_proj")
        self.rope = RotaryEmbedding(self.head_dim, max_len=self.max_len, theta=self.rope_theta)
        # Gated (SwiGLU-style) feed-forward projections.
        self.gate_proj = keras.layers.Dense(self.ff_dim, use_bias=False, name="gate_proj")
        self.up_proj = keras.layers.Dense(self.ff_dim, use_bias=False, name="up_proj")
        self.down_proj = keras.layers.Dense(self.d_model, use_bias=False, name="down_proj")
        self.dropout = keras.layers.Dropout(self.dropout_rate)
        super().build(input_shape)

    def call(self, x, training=None, past_kv=None, use_cache=False):
        """Simplified call without KV cache for this example"""
        B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
        dtype = x.dtype

        res = x
        y = self.pre_attn_norm(x)

        # Multi-head attention: project, then reshape to [B, heads, T, head_dim].
        q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])

        # Apply RoPE (offset fixed at 0 -- no KV cache in this version).
        q, k = self.rope(q, k, offset=0)

        # Scaled dot-product attention scores.
        scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))

        # Causal mask: band_part(ones, -1, 0) keeps the LOWER triangle, so each
        # position attends only to itself and earlier positions; masked slots
        # get -1e9 before the softmax.
        mask = tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0)
        mask = tf.where(mask == 0, tf.constant(-1e9, dtype=dtype), tf.constant(0.0, dtype=dtype))
        scores = scores + mask[None, None, :, :]

        attn = tf.nn.softmax(scores, axis=-1)
        attn_out = tf.matmul(attn, v)
        attn_out = tf.transpose(attn_out, [0, 2, 1, 3])
        attn_out = tf.reshape(attn_out, [B, T, self.d_model])

        x = res + self.dropout(self.out_proj(attn_out), training=training)

        # Gated FFN: down(silu(gate(y)) * up(y)), with residual.
        res = x
        y = self.pre_ffn_norm(x)
        ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
        output = res + self.dropout(ffn, training=training)

        return output, None  # Return None for past_kv in this simplified version

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "n_heads": self.n_heads,
            "ff_dim": self.ff_dim,
            "dropout": self.dropout_rate,
            "max_len": self.max_len,
            "rope_theta": self.rope_theta,
            "layer_idx": self.layer_idx
        })
        return config
151
+
152
+
153
@keras.saving.register_keras_serializable()
class SAM1Model(keras.Model):
    """Decoder-only transformer language model: token embedding -> stack of
    TransformerBlocks -> final RMSNorm -> linear head over the vocabulary.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Accept the configuration in any of three shapes, for serialization
        # compatibility: {'config': {...}} (what get_config emits), the flat
        # kwargs themselves (detected via 'vocab_size'), or {'cfg': {...}}.
        if 'config' in kwargs and isinstance(kwargs['config'], dict):
            self.cfg = kwargs['config']
        elif 'vocab_size' in kwargs:
            self.cfg = kwargs
        else:
            self.cfg = kwargs.get('cfg', kwargs)

        self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
        # FFN width is expressed as a multiple of d_model.
        ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
        block_args = {
            'd_model': self.cfg['d_model'],
            'n_heads': self.cfg['n_heads'],
            'ff_dim': ff_dim,
            'dropout': self.cfg['dropout'],
            'max_len': self.cfg['max_len'],
            'rope_theta': self.cfg['rope_theta']
        }
        self.blocks = [
            TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
            for i in range(self.cfg['n_layers'])
        ]
        self.norm = RMSNorm(name="final_norm")
        self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

    def call(self, input_ids, training=None, past_kv=None, use_cache=False):
        """
        Simplified call without full KV cache implementation
        """
        x = self.embed(input_ids)

        for block in self.blocks:
            x, _ = block(x, training=training, past_kv=None, use_cache=False)

        logits = self.lm_head(self.norm(x))
        return logits, None  # Return None for past_kv in this simplified version

    def get_config(self):
        # Persist the raw config dict so from_config() can rebuild the model.
        base_config = super().get_config()
        base_config['config'] = self.cfg
        return base_config
197
+
198
+
199
def count_parameters(model):
    """Return the total number of scalar parameters in *model*.

    Sums the element counts of all weights from their shapes, without
    materializing each weight as a NumPy array (the original copied every
    tensor to host memory just to read its size).
    """
    return sum(int(np.prod(w.shape)) for w in model.weights)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Worker Nodes
2
+ keras==2.15.0
3
+ tensorflow==2.15.0
4
+ fastapi==0.104.1
5
+ uvicorn==0.24.0
6
+ requests==2.31.0
7
+ huggingface_hub==0.20.1
8
+ tokenizers==0.15.0
9
+ transformers==4.35.2
10
+ numpy==1.24.3
11
+ pytz==2023.3.post1
shared/chat_history.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ from datetime import datetime
5
+ from typing import List, Dict, Any
6
+ from .models import ChatMessage
7
+
8
+
9
def save_chat_history(messages: List[ChatMessage], model_name: str, response: str, filename: str = "chat.md"):
    """Append one chat session (all messages plus the final response) to a
    markdown log file, stamped with the time and model name.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Session header.
    history_content = f"""
## Chat Session: {timestamp}
**Model Used:** {model_name}

---
"""

    # One markdown paragraph per message, labelled by speaker.
    for message in messages:
        if message.role.lower() == "user":
            prefix = "**User:**"
        else:
            prefix = "**Assistant:**"
        history_content += f"\n{prefix} {message.content}\n\n"

    # Final assistant response, followed by a session separator.
    history_content += f"\n**Assistant Response:** {response}\n\n---\n\n"

    # Append so earlier sessions are preserved.
    with open(filename, "a", encoding="utf-8") as file:
        file.write(history_content)
34
+
35
+
36
def save_detailed_chat_log(request_data: Dict[str, Any], response_data: str, model_name: str, processing_time: float, filename: str = "chat.md"):
    """Append a detailed request/response log entry (with sampling metadata)
    to a markdown file.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Header with request metadata; sampling settings fall back to defaults.
    log_content = f"""
## Chat Request Log: {timestamp}
- **Model:** {model_name}
- **Processing Time:** {processing_time:.2f}s
- **Max Tokens:** {request_data.get('max_tokens', 512)}
- **Temperature:** {request_data.get('temperature', 0.8)}

### Input Messages:
"""

    # One bullet per input message, labelled by speaker.
    for entry in request_data.get('messages', []):
        speaker = "**User**" if entry.get('role', 'unknown').lower() == 'user' else "**Assistant**"
        log_content += f"- {speaker}: {entry.get('content', '')}\n"

    log_content += f"\n### Model Response:\n{response_data}\n\n---\n\n"

    # Append so earlier entries are preserved.
    with open(filename, "a", encoding="utf-8") as file:
        file.write(log_content)
65
+
66
+
67
def initialize_chat_file(filename: str = "chat.md"):
    """Create the chat history file with a header, unless it already exists."""
    if os.path.exists(filename):
        return  # never overwrite an existing history

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header = f"""# Chat History
Last updated: {stamp}

This file contains the history of all chat conversations processed by the multi-node API system.

---
"""
    with open(filename, "w", encoding="utf-8") as file:
        file.write(header)
shared/models.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional, Dict, Any
3
+
4
+
5
class ChatMessage(BaseModel):
    """One turn of a conversation."""
    role: str  # "user" or "assistant"
    content: str  # the message text
8
+
9
+
10
class ChatRequest(BaseModel):
    """Incoming chat completion request with sampling parameters."""
    messages: List[ChatMessage]  # full conversation history
    model: str = "sam-x-nano"  # which Sam-X model variant to use
    max_tokens: Optional[int] = 512  # generation length cap
    temperature: Optional[float] = 0.8  # softmax temperature
    top_k: Optional[int] = 40  # top-k sampling cutoff (0 disables)
    top_p: Optional[float] = 0.9  # nucleus sampling threshold
    repetition_penalty: Optional[float] = 1.1  # >1 discourages repetition
    stream: Optional[bool] = False  # streaming flag
19
+
20
+
21
class ChatResponse(BaseModel):
    """OpenAI-style chat completion response envelope."""
    id: str  # unique response id
    object: str = "chat.completion"
    created: int  # unix timestamp; NOTE(review): no default -- constructors must supply it
    model: str  # model name echoed from the request
    choices: List[Dict[str, Any]]  # one dict per completion choice
    usage: Dict[str, int]  # prompt/completion/total accounting
28
+
29
+
30
class WorkerStatus(BaseModel):
    """Heartbeat/status report for a worker node."""
    model_name: str  # model served by this worker
    is_active: bool  # whether the worker is accepting requests
    load: float  # current load factor
    last_heartbeat: int  # unix timestamp of the last heartbeat
space-config.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # SACCP Node Space Configuration
2
+ runtime:
3
+ cpu: "medium"
4
+ memory: "16x"
5
+ accelerator: "cpu" # Will be configured based on node type
6
+ env:
7
+ NODE_TYPE: "large"
8
+ MODEL_TYPE: "sam-x-large"
worker_app.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Performance / device configuration MUST happen before TensorFlow is imported:
# TF reads these environment variables once, at import time. In the original
# layout they were set AFTER `import tensorflow`, where the thread counts,
# CUDA_VISIBLE_DEVICES and oneDNN settings have no effect.
NUM_CORES = os.cpu_count() or 4
os.environ['TF_NUM_INTEROP_THREADS'] = str(NUM_CORES)
os.environ['TF_NUM_INTRAOP_THREADS'] = str(NUM_CORES)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Force CPU only
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'  # Intel oneDNN optimization
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Reduce TF logging

import time
import json
import asyncio
from datetime import datetime
from typing import Dict, List, Optional
from fastapi import FastAPI, HTTPException
import uvicorn
from pydantic import BaseModel
from shared.models import ChatRequest, ChatResponse, ChatMessage
import tensorflow as tf
import keras
import numpy as np
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
import requests
from transformers import GPT2Tokenizer

app = FastAPI(
    title="Worker Node for Sam-X Models",
    description="Processing node for Sam-X model inference",
    version="1.0.0"
)

# Global model state, populated by load_tokenizer()/load_model() at startup.
tokenizer = None
model = None
model_loaded = False

# Configuration (overridable via environment)
MODEL_REPO = os.getenv("MODEL_REPO", "Smilyai-labs/Sam-large-2")
MODEL_TYPE = os.getenv("MODEL_TYPE", "sam-x-nano")  # Determines which model to load
CACHE_DIR = "./model_cache"

# Explicit threading configuration (this API is effective post-import).
tf.config.threading.set_inter_op_parallelism_threads(NUM_CORES)
tf.config.threading.set_intra_op_parallelism_threads(NUM_CORES)

print(f"✅ CPU optimized: {NUM_CORES} threads, oneDNN enabled")
48
+
49
+
50
def load_tokenizer():
    """Load the tokenizer from Hugging Face or local files"""
    global tokenizer

    print("🚀 Loading tokenizer...")

    try:
        # Load the base GPT-2 tokenizer from the Hugging Face Hub.
        from transformers import AutoTokenizer
        hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")

        # Add special tokens specific to your models.
        # NOTE(review): the first four entries render as bare newlines in this
        # copy of the file -- they look like chat-role marker tokens (e.g.
        # "<|user|>"/"<|assistant|>") whose angle-bracket text was stripped in
        # transit. Confirm the intended token strings before relying on this.
        special_tokens = ["\n", "\n", "\n", "\n", "<CONTINUE>", "<im end for model tun>"]
        hf_tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

        # Save temporarily so the fast `tokenizers.Tokenizer` can be rebuilt
        # from the serialized tokenizer.json.
        os.makedirs("./temp_tokenizer", exist_ok=True)
        hf_tokenizer.save_pretrained("./temp_tokenizer")
        tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

        print(f"✅ Tokenizer loaded with vocab size: {tokenizer.get_vocab_size()}")

    except Exception as e:
        print(f"❌ Error loading tokenizer: {e}")
        raise
79
+
80
+
81
def load_model():
    """Load the model selected by the MODEL_TYPE environment variable.

    Downloads the config (and weights, when available) from the Hugging Face
    Hub, builds a SAM1Model from it, runs a warm-up pass, and sets the
    module-level ``model`` / ``model_loaded`` globals.

    Raises:
        Exception: re-raises any failure during config download or model build
        (weight-loading failures are tolerated and fall back to random init).
    """
    global model, model_loaded

    print(f"🚀 Loading {MODEL_TYPE} model...")

    # Map each known model type to its Hub repository; anything else falls
    # back to the configured default repo. (The original duplicated the
    # download-and-parse sequence once per branch.)
    repo_by_type = {
        "sam-x-nano": "Smilyai-labs/Sam-nano",
        "sam-x-mini": "Smilyai-labs/Sam-mini",
        "sam-x-fast": "Smilyai-labs/Sam-fast",
    }
    repo = repo_by_type.get(MODEL_TYPE, MODEL_REPO)

    try:
        config_path = hf_hub_download(repo, "config.json", cache_dir=CACHE_DIR)
        with open(config_path, 'r') as f:
            config = json.load(f)

        # Translate HF-style config keys into SAM1Model's constructor config.
        model_config = {
            'vocab_size': config.get('vocab_size', 50432),
            'd_model': config.get('hidden_size', 768),
            'n_layers': config.get('num_hidden_layers', 12),
            'n_heads': config.get('num_attention_heads', 12),
            'ff_mult': config.get('intermediate_size', 3072) / config.get('hidden_size', 768),
            'max_len': config.get('max_position_embeddings', 2048),
            'dropout': 0.1,
            'rope_theta': config.get('rope_theta', 10000)
        }

        from model_architecture import SAM1Model  # Import from your architecture file
        model = SAM1Model(config=model_config)

        # Build the model's variables with a dummy forward pass.
        dummy_input = tf.zeros((1, 16), dtype=tf.int32)
        _ = model(dummy_input, training=False, use_cache=False)

        print(f"✅ Model loaded: {config.get('num_hidden_layers', 12)} layers")

        # Load weights from the SAME repo the config came from. (The original
        # always fetched weights from MODEL_REPO, which could silently pair a
        # type-specific config with mismatched weights.)
        try:
            weights_path = hf_hub_download(repo, "model.weights.h5", cache_dir=CACHE_DIR)
            model.load_weights(weights_path)
            print("✅ Model weights loaded successfully!")
        except Exception as e:
            print(f"⚠️ Could not load weights, using random initialization: {e}")

        # Warm up the model so the first real request is not slow.
        print("🔥 Warming up model...")
        warmup_input = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.int32)
        _, _ = model(warmup_input, training=False, use_cache=True)
        print("✅ Model warmed up")

        model_loaded = True

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        raise
150
+
151
+
152
def format_chat_prompt(messages: List[Dict[str, str]]) -> str:
    """Format chat messages into a prompt for the model.

    NOTE(review): the role wrappers below render as bare newlines in this
    copy of the file; they were almost certainly special role-marker tokens
    (e.g. "<|user|>" / "<|assistant|>") whose angle-bracket text was
    stripped in transit. Confirm the intended markers against the
    tokenizer's special-token list before relying on this formatting.
    """
    prompt = ""

    for msg in messages:
        role = msg.get('role', 'user')
        content = msg.get('content', '')

        if role.lower() == 'user':
            # User turn wrapper (see NOTE above about the garbled marker).
            prompt += f"\n{content}\n"
        elif role.lower() == 'assistant':
            # Assistant turn wrapper (see NOTE above).
            prompt += f"\n{content}\n"
        else:
            # System or other roles pass through unwrapped.
            prompt += f"{content}\n"

    # Open the assistant turn so the model continues from here.
    prompt += "\n"

    return prompt
177
+
178
+
179
def sample_token(logits, temperature=0.8, top_k=40, top_p=0.9, repetition_penalty=1.1):
    """Sample the index of the next token from a vector of raw logits.

    Applies temperature scaling and a global repetition penalty, then draws a
    token via top-k sampling when 0 < top_k < vocab size, otherwise via
    nucleus (top-p) sampling when top_p < 1, otherwise from the full
    distribution.
    """
    # Temperature scaling.
    scaled = logits / temperature

    # Repetition penalty (note: applied to the whole vocabulary, not only to
    # previously generated tokens).
    if repetition_penalty != 1.0:
        scaled = np.where(scaled < 0, scaled * repetition_penalty, scaled / repetition_penalty)

    # Softmax with max-subtraction for numerical stability.
    exp_shifted = np.exp(scaled - np.max(scaled))
    probs = exp_shifted / np.sum(exp_shifted)

    vocab = len(probs)

    # Top-k sampling takes precedence whenever it actually restricts the vocab.
    if 0 < top_k < vocab:
        candidate_idx = np.argpartition(probs, -top_k)[-top_k:]
        candidate_probs = probs[candidate_idx]
        candidate_probs = candidate_probs / np.sum(candidate_probs)  # renormalize
        choice = np.random.choice(len(candidate_idx), p=candidate_probs)
        return candidate_idx[choice]

    # Nucleus (top-p): smallest prefix of the sorted distribution whose
    # cumulative probability reaches top_p.
    if top_p < 1.0:
        order = np.argsort(probs)[::-1]
        ordered_probs = probs[order]
        cumulative = np.cumsum(ordered_probs)
        cut = np.searchsorted(cumulative, top_p)
        cut = min(cut + 1, len(order))

        nucleus = order[:cut]
        nucleus_probs = probs[nucleus]
        nucleus_probs = nucleus_probs / np.sum(nucleus_probs)  # renormalize
        choice = np.random.choice(len(nucleus), p=nucleus_probs)
        return nucleus[choice]

    # Unrestricted sampling from the full distribution.
    return np.random.choice(vocab, p=probs)
216
+
217
+
218
def generate_response(prompt: str, max_tokens: int = 512, temperature: float = 0.8,
                      top_k: int = 40, top_p: float = 0.9, repetition_penalty: float = 1.1) -> str:
    """Generate a completion for *prompt* with the loaded model.

    Token-by-token loop without a KV cache: the full sequence seen so far is
    re-fed to the model on every step.

    NOTE(review): several stop-token strings below render as bare newlines in
    this copy of the file; they look like special end-of-turn tokens whose
    angle-bracket text was stripped. Confirm them against the tokenizer's
    special-token list.

    Raises:
        Exception: if the model has not been loaded yet.
    """
    global model, tokenizer

    if not model_loaded:
        raise Exception("Model not loaded")

    # Tokenize the prompt.
    prompt_ids = tokenizer.encode(prompt).ids
    current_ids = tf.constant([prompt_ids], dtype=tf.int32)

    generated_ids = []

    for _ in range(max_tokens):
        with tf.device('/CPU:0'):  # Use CPU for inference
            logits, _ = model(current_ids, training=False, use_cache=False)
            next_token_logits = logits[0, -1, :].numpy()

        # Sample the next token.
        next_token_id = sample_token(next_token_logits, temperature, top_k, top_p, repetition_penalty)
        generated_ids.append(next_token_id)

        # BUG FIX: without a KV cache the model must see the whole sequence,
        # so APPEND the new token. The original replaced the input with only
        # the single last token (`current_ids = [[next_token_id]]`), which
        # discarded all context after the first generated token.
        current_ids = tf.concat([current_ids, tf.constant([[next_token_id]], dtype=tf.int32)], axis=1)

        # Stop on an end-of-sequence / end-of-turn token.
        if next_token_id in [50256, tokenizer.token_to_id("\n"), tokenizer.token_to_id("<im end for model tun>")]:
            break

    # Decode the generated tokens.
    generated_text = tokenizer.decode(generated_ids)

    # Strip any stop-token text that slipped into the decoded output.
    stop_tokens = ["\n", "<im end for model tun>"]
    for token in stop_tokens:
        idx = generated_text.find(token)
        if idx != -1:
            generated_text = generated_text[:idx]

    return generated_text.strip()
265
+
266
+
267
@app.on_event("startup")
def startup_event():
    """Initialize model and tokenizer on startup"""
    global model_loaded

    print(f"Initializing worker for model type: {MODEL_TYPE}")

    try:
        load_tokenizer()
        load_model()
    except Exception as e:
        # Keep the server up but report unhealthy via /health.
        print(f"❌ Worker initialization failed: {e}")
        model_loaded = False
    else:
        print("✅ Worker initialized successfully!")
281
+
282
+
283
@app.post("/chat/completions")
async def chat_completions(request: ChatRequest):
    """Process a chat completion request and return an OpenAI-style response.

    Raises:
        HTTPException: 503 when the model is not loaded, 500 on any
        generation failure.
    """
    global model_loaded

    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # Flatten the pydantic messages into plain dicts for prompt formatting.
        messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
        prompt = format_chat_prompt(messages)

        # Generate response (blocking; runs on the event-loop thread).
        start_time = time.time()
        response_text = generate_response(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_k=request.top_k,
            top_p=request.top_p,
            repetition_penalty=request.repetition_penalty
        )
        processing_time = time.time() - start_time

        # Create response in OpenAI-compatible format.
        # BUG FIX: ChatResponse declares `created: int` with no default, so
        # omitting it (as the original did) raised a pydantic ValidationError
        # on every request.
        # NOTE(review): the "usage" numbers are *character* counts, not token
        # counts, despite the OpenAI field names -- consider counting
        # tokenizer ids instead.
        response = ChatResponse(
            id=f"chat-{int(time.time())}",
            created=int(time.time()),
            model=request.model,
            choices=[
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": response_text},
                    "finish_reason": "stop"
                }
            ],
            usage={
                "prompt_tokens": len(prompt),
                "completion_tokens": len(response_text),
                "total_tokens": len(prompt) + len(response_text)
            }
        )

        print(f"Generated response in {processing_time:.2f}s for model {request.model}")

        # .dict() is the pydantic v1 API (deprecated in v2 in favour of .model_dump()).
        return response.dict()

    except Exception as e:
        print(f"Error processing request: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
333
+
334
+
335
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    status = "healthy" if model_loaded else "unhealthy"
    return {
        "status": status,
        "model_type": MODEL_TYPE,
        "model_loaded": model_loaded,
        "timestamp": int(time.time()),
    }
344
+
345
+
346
@app.get("/model-info")
async def model_info():
    """Get information about the loaded model"""
    if not model_loaded:
        raise HTTPException(status_code=404, detail="Model not loaded")

    return {
        "model_type": MODEL_TYPE,
        "vocab_size": tokenizer.get_vocab_size() if tokenizer else 0,
        "parameters": model.count_params() if model else 0,
        "max_context_length": 2048  # Default, would be from config
    }
358
+
359
+
360
# Script entry point: serve the API on all interfaces, port from $PORT (default 8000).
if __name__ == "__main__":
    port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=port)