Spaces:

Bc-AI
/

Worker-2

Sleeping

App Files Community

Bc-AI committed on Nov 4, 2025

Commit

25388aa

verified ·

1 Parent(s): 120f320

Update app.py

Browse files

Files changed (1) hide show

app.py +191 -398

app.py CHANGED Viewed

@@ -1,9 +1,6 @@
 """
-SAM-Z-1 Distributed Worker Node v5.0
-- Supports BOTH old SAM-Z-1 AND 4 new SAM-X-1 models
-- Different tokenizers and vocabularies per model family
-- Auto version detection
-- Backward compatible with v4 head nodes
 """
 from fastapi import FastAPI, HTTPException
@@ -17,56 +14,10 @@ import os
 from tokenizers import Tokenizer
 import numpy as np
 import time
-from typing import List, Optional, Dict
 import asyncio
-app = FastAPI(title="SAM-Z-1 Distributed Worker", version="5.0.0")
-# ============================================================================
-# Configuration - ALL 5 MODELS
-# ============================================================================
-MODEL_REGISTRY = {
-    # Original SAM-Z-1 (keep this!)
-    "SAM-Z-1": {
-        "repo": "Smilyai-labs/Sam-Z-1-tensorflow",
-        "weights": "ckpt.weights.h5",
-        "config": "config.json",
-        "tokenizer_repo": "Smilyai-labs/Sam-Z-1-tensorflow",
-        "family": "sam-z"  # Different tokenizer family
-    },
-    # New SAM-X-1 family (different tokenizer!)
-    "SAM-X-1-Large": {
-        "repo": "Smilyai-labs/Sam-1x-instruct",
-        "weights": "ckpt.weights.h5",
-        "config": None,
-        "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
-        "family": "sam-x"
-    },
-    "SAM-X-1-Fast": {
-        "repo": "Smilyai-labs/Sam-X-1-fast",
-        "weights": "sam1_fast_finetuned.weights.h5",
-        "config": "sam1_fast_finetuned_config.json",
-        "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
-        "family": "sam-x"
-    },
-    "SAM-X-1-Mini": {
-        "repo": "Smilyai-labs/Sam-X-1-Mini",
-        "weights": "sam1_mini_finetuned.weights.h5",
-        "config": "sam1_mini_finetuned_config.json",
-        "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
-        "family": "sam-x"
-    },
-    "SAM-X-1-Nano": {
-        "repo": "Smilyai-labs/Sam-X-1-Nano",
-        "weights": "sam1_nano_finetuned.weights.h5",
-        "config": "sam1_nano_finetuned_config.json",
-        "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
-        "family": "sam-x"
-    }
-}
-CACHE_DIR = "./model_cache"
 # ============================================================================
 # Model Architecture
@@ -250,19 +201,24 @@ class SAM1Model(keras.Model):
         return base_config
 # ============================================================================
-# Global State - Separate tokenizers per family!
 # ============================================================================
-loaded_models = {}  # Dict[model_name, (model, fast_forward, config, tokenizer, eos_token_id)]
-tokenizer_cache = {}  # Dict[family, (tokenizer, eos_token_id)]
-current_model = None
 worker_stats = {
     "total_requests": 0,
     "total_tokens": 0,
     "decode_requests": 0,
-    "uptime_start": time.time(),
-    "model_usage": {}
 }
 # ============================================================================
@@ -278,7 +234,6 @@ class GenerateRequest(BaseModel):
     repetition_penalty: float = 1.1
     stream: bool = False
     return_token_ids: bool = False
-    model: Optional[str] = None
 class ChatMessage(BaseModel):
     role: str
@@ -293,70 +248,12 @@ class ChatRequest(BaseModel):
     repetition_penalty: float = 1.1
     stream: bool = False
     return_token_ids: bool = False
-    model: Optional[str] = None
 class DecodeRequest(BaseModel):
     token_ids: List[int]
-    model: Optional[str] = None  # Need to know which tokenizer to use!
 class BatchDecodeRequest(BaseModel):
     batches: List[List[int]]
-    model: Optional[str] = None
-# ============================================================================
-# Tokenizer Management
-# ============================================================================
-async def load_tokenizer(family: str, repo: str) -> tuple:
-    """Load tokenizer for a model family"""
-    if family in tokenizer_cache:
-        return tokenizer_cache[family]
-    print(f"   🔤 Loading tokenizer for {family} family from {repo}...")
-    try:
-        from transformers import AutoTokenizer
-        hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
-        hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
-        os.makedirs(f"./temp_tokenizer_{family}", exist_ok=True)
-        hf_tokenizer.save_pretrained(f"./temp_tokenizer_{family}")
-        tokenizer = Tokenizer.from_file(f"./temp_tokenizer_{family}/tokenizer.json")
-        eos_token = "<|endoftext|>"
-        eos_token_id = tokenizer.token_to_id(eos_token)
-        if eos_token_id is None:
-            tokenizer.add_special_tokens([eos_token])
-            eos_token_id = tokenizer.token_to_id(eos_token)
-        tokenizer_cache[family] = (tokenizer, eos_token_id)
-        print(f"   ✅ Tokenizer ready (vocab size: {tokenizer.get_vocab_size()}, EOS: {eos_token_id})")
-        return tokenizer, eos_token_id
-    except Exception as e:
-        print(f"   ⚠️ Tokenizer load failed: {e}")
-        raise
-def get_tokenizer_for_model(model_name: str):
-    """Get the correct tokenizer for a model"""
-    if not model_name or model_name not in loaded_models:
-        model_name = current_model
-    if model_name in loaded_models:
-        _, _, _, tokenizer, eos_id = loaded_models[model_name]
-        return tokenizer, eos_id
-    # Fallback to first available
-    if loaded_models:
-        first_model = list(loaded_models.keys())[0]
-        _, _, _, tokenizer, eos_id = loaded_models[first_model]
-        return tokenizer, eos_id
-    raise HTTPException(status_code=503, detail="No models loaded")
 # ============================================================================
 # Generation Functions
@@ -369,22 +266,11 @@ def generate_tokens(
     top_k: int = 40,
     top_p: float = 0.9,
     repetition_penalty: float = 1.1,
-    return_token_ids: bool = False,
-    model_name: Optional[str] = None
 ):
-    """Core generation with correct tokenizer per model"""
-    global loaded_models, current_model
-    # Select model
-    if model_name and model_name in loaded_models:
-        model, fast_forward, config, tokenizer, eos_token_id = loaded_models[model_name]
-    elif current_model:
-        model, fast_forward, config, tokenizer, eos_token_id = loaded_models[current_model]
-    else:
-        model_name = list(loaded_models.keys())[0]
-        model, fast_forward, config, tokenizer, eos_token_id = loaded_models[model_name]
-    # Encode with model's tokenizer
     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
     if len(input_ids) == 0:
@@ -463,29 +349,26 @@ def format_chat_prompt(messages: List[ChatMessage]) -> str:
 @app.get("/", response_class=HTMLResponse)
 async def status_page():
-    models_html = ""
-    for model_name in loaded_models.keys():
-        usage = worker_stats["model_usage"].get(model_name, 0)
-        _, _, _, tokenizer, _ = loaded_models[model_name]
-        vocab_size = tokenizer.get_vocab_size()
-        models_html += f'<li><strong>{model_name}</strong> - Vocab: {vocab_size} - Used: {usage}x</li>'
-    return f"""
 <!DOCTYPE html>
 <html>
 <head>
-    <title>SAM Worker v5.0 - Multi-Model</title>
     <style>
-        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
-        body {{
             font-family: 'Courier New', monospace;
             background: linear-gradient(135deg, #1a1f3a 0%, #0a0e27 100%);
             color: #00bfff;
             padding: 20px;
             min-height: 100vh;
-        }}
-        .container {{ max-width: 1000px; margin: 0 auto; }}
-        .header {{
             text-align: center;
             padding: 30px;
             background: rgba(0, 191, 255, 0.1);
@@ -493,77 +376,93 @@ async def status_page():
             border-radius: 10px;
             margin-bottom: 30px;
             box-shadow: 0 0 20px rgba(0, 191, 255, 0.3);
-        }}
-        .header h1 {{
             font-size: 2.5em;
             text-transform: uppercase;
             letter-spacing: 3px;
             animation: glow 2s ease-in-out infinite alternate;
-        }}
-        @keyframes glow {{
-            from {{ text-shadow: 0 0 10px #00bfff; }}
-            to {{ text-shadow: 0 0 20px #00bfff, 0 0 30px #00bfff; }}
-        }}
-        .badge {{
             display: inline-block;
             padding: 5px 15px;
             border-radius: 15px;
             font-size: 0.9em;
-            margin: 5px;
-        }}
-        .badge-v5 {{
             background: rgba(0, 255, 136, 0.2);
             border: 1px solid #00ff88;
             color: #00ff88;
-        }}
-        .badge-multi {{
             background: rgba(255, 165, 0, 0.2);
             border: 1px solid #ffa500;
             color: #ffa500;
-        }}
-        .stats-grid {{
             display: grid;
             grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
             gap: 20px;
             margin-bottom: 30px;
-        }}
-        .stat-card {{
             background: rgba(0, 191, 255, 0.05);
             border: 1px solid #00bfff;
             border-radius: 8px;
             padding: 20px;
             text-align: center;
-        }}
-        .stat-label {{ font-size: 0.8em; opacity: 0.7; text-transform: uppercase; margin-bottom: 10px; }}
-        .stat-value {{ font-size: 2em; font-weight: bold; }}
-        .features {{
             background: rgba(0, 191, 255, 0.05);
             border: 1px solid #00bfff;
             border-radius: 8px;
             padding: 20px;
-            margin-bottom: 20px;
-        }}
-        .features h3 {{ margin-bottom: 15px; }}
-        .feature-list {{ list-style: none; padding: 0; }}
-        .feature-list li {{
             padding: 10px;
             margin: 5px 0;
             background: rgba(0, 191, 255, 0.1);
             border-radius: 5px;
-            border-left: 3px solid #00ff88;
-        }}
-        .timestamp {{ text-align: center; margin-top: 20px; opacity: 0.5; }}
     </style>
 </head>
 <body>
     <div class="container">
         <div class="header">
             <h1>⚙️ WORKER NODE ⚙️</h1>
-            <div>SAM-Z-1 Distributed Worker v5.0</div>
-            <div>
-                <span class="badge badge-v5">V5 PROTOCOL</span>
-                <span class="badge badge-multi">{len(loaded_models)} MODELS</span>
-            </div>
         </div>
         <div class="stats-grid" id="stats">
@@ -585,23 +484,14 @@ async def status_page():
             </div>
         </div>
-        <div class="features">
-            <h3>🤖 LOADED MODELS ({len(loaded_models)})</h3>
-            <ul class="feature-list">
-                {models_html}
-            </ul>
-        </div>
         <div class="features">
             <h3>🚀 CAPABILITIES</h3>
             <ul class="feature-list">
-                <li>✅ Original SAM-Z-1 (preserved)</li>
-                <li>✅ 4 new SAM-X-1 models</li>
-                <li>✅ Separate tokenizers per family</li>
-                <li>✅ Multi-model selection</li>
-                <li>✅ Token & batch decoding</li>
-                <li>✅ Streaming support</li>
-                <li>✅ Auto version detection</li>
             </ul>
         </div>
@@ -609,8 +499,21 @@ async def status_page():
     </div>
     <script>
-        async function updateStats() {{
-            try {{
                 const statsRes = await fetch('/stats');
                 const stats = await statsRes.json();
@@ -622,15 +525,16 @@ async def status_page():
                 const h = Math.floor(uptime / 3600);
                 const m = Math.floor((uptime % 3600) / 60);
                 const s = uptime % 60;
-                document.getElementById('uptime').textContent = `${{h}}h ${{m}}m ${{s}}s`;
                 document.getElementById('timestamp').textContent =
-                    `Last update: ${{new Date().toLocaleTimeString()}}`;
-            }} catch (e) {{
                 console.error('Failed to update stats:', e);
-            }}
-        }}
         setInterval(updateStats, 1000);
         updateStats();
     </script>
@@ -645,38 +549,8 @@ async def status_page():
 @app.get("/health")
 async def health():
     return {
-        "status": "healthy" if loaded_models else "loading",
-        "model_loaded": len(loaded_models) > 0,
-        "models_count": len(loaded_models)
-    }
-@app.get("/info")
-async def worker_info():
-    """Worker information for version detection"""
-    return {
-        "version": "v5",
-        "models": list(loaded_models.keys()),
-        "features": [
-            "multi_model",
-            "model_selection",
-            "separate_tokenizers",
-            "token_generation",
-            "batch_decoding",
-            "streaming"
-        ],
-        "model_families": {
-            "sam-z": [m for m, info in MODEL_REGISTRY.items() if info["family"] == "sam-z"],
-            "sam-x": [m for m, info in MODEL_REGISTRY.items() if info["family"] == "sam-x"]
-        }
-    }
-@app.get("/models")
-async def list_models():
-    """List available models"""
-    return {
-        "models": list(loaded_models.keys()),
-        "default": current_model,
-        "count": len(loaded_models)
     }
 @app.get("/stats")
@@ -687,16 +561,17 @@ async def stats():
         "total_tokens": worker_stats["total_tokens"],
         "decode_requests": worker_stats["decode_requests"],
         "uptime": uptime,
-        "tokens_per_second": worker_stats["total_tokens"] / uptime if uptime > 0 else 0,
-        "model_usage": worker_stats["model_usage"]
     }
 @app.post("/decode")
 async def decode(request: DecodeRequest):
-    """Fast single decode - uses correct tokenizer"""
     try:
         worker_stats["decode_requests"] += 1
-        tokenizer, _ = get_tokenizer_for_model(request.model)
         text = tokenizer.decode(request.token_ids)
         return {"text": text}
     except Exception as e:
@@ -704,10 +579,12 @@ async def decode(request: DecodeRequest):
 @app.post("/decode/batch")
 async def batch_decode(request: BatchDecodeRequest):
-    """Optimized batch decoding - uses correct tokenizer"""
     try:
         worker_stats["decode_requests"] += len(request.batches)
-        tokenizer, _ = get_tokenizer_for_model(request.model)
         results = [tokenizer.decode(batch) for batch in request.batches]
         return {"texts": results}
     except Exception as e:
@@ -715,15 +592,9 @@ async def batch_decode(request: BatchDecodeRequest):
 @app.post("/generate")
 async def generate(request: GenerateRequest):
-    """Generate text with model selection"""
-    if not loaded_models:
-        raise HTTPException(status_code=503, detail="No models loaded")
-    # Track model usage
-    model_name = request.model or current_model
-    if model_name not in worker_stats["model_usage"]:
-        worker_stats["model_usage"][model_name] = 0
-    worker_stats["model_usage"][model_name] += 1
     worker_stats["total_requests"] += 1
     start_time = time.time()
@@ -741,8 +612,7 @@ async def generate(request: GenerateRequest):
                     top_k=request.top_k,
                     top_p=request.top_p,
                     repetition_penalty=request.repetition_penalty,
-                    return_token_ids=request.return_token_ids,
-                    model_name=request.model
                 ):
                     token_count += 1
                     worker_stats["total_tokens"] += 1
@@ -756,7 +626,7 @@ async def generate(request: GenerateRequest):
                     await asyncio.sleep(0.001)
                 elapsed = time.time() - start_time
-                yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed, 'model': model_name})}\n\n"
             except Exception as e:
                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
@@ -775,8 +645,7 @@ async def generate(request: GenerateRequest):
                 top_k=request.top_k,
                 top_p=request.top_p,
                 repetition_penalty=request.repetition_penalty,
-                return_token_ids=request.return_token_ids,
-                model_name=request.model
             ):
                 if not request.return_token_ids:
                     generated_text += token_text
@@ -789,8 +658,7 @@ async def generate(request: GenerateRequest):
                 "text": generated_text,
                 "tokens": token_count,
                 "time": elapsed,
-                "tokens_per_second": token_count / elapsed if elapsed > 0 else 0,
-                "model": model_name
             }
         except Exception as e:
@@ -798,15 +666,9 @@ async def generate(request: GenerateRequest):
 @app.post("/chat")
 async def chat(request: ChatRequest):
-    """Chat completion with model selection"""
-    if not loaded_models:
-        raise HTTPException(status_code=503, detail="No models loaded")
-    # Track model usage
-    model_name = request.model or current_model
-    if model_name not in worker_stats["model_usage"]:
-        worker_stats["model_usage"][model_name] = 0
-    worker_stats["model_usage"][model_name] += 1
     worker_stats["total_requests"] += 1
     prompt = format_chat_prompt(request.messages)
@@ -825,8 +687,7 @@ async def chat(request: ChatRequest):
                     top_k=request.top_k,
                     top_p=request.top_p,
                     repetition_penalty=request.repetition_penalty,
-                    return_token_ids=request.return_token_ids,
-                    model_name=request.model
                 ):
                     token_count += 1
                     worker_stats["total_tokens"] += 1
@@ -845,7 +706,7 @@ async def chat(request: ChatRequest):
                     await asyncio.sleep(0.001)
                 elapsed = time.time() - start_time
-                yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed, 'model': model_name})}\n\n"
             except Exception as e:
                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
@@ -864,8 +725,7 @@ async def chat(request: ChatRequest):
                 top_k=request.top_k,
                 top_p=request.top_p,
                 repetition_penalty=request.repetition_penalty,
-                return_token_ids=request.return_token_ids,
-                model_name=request.model
             ):
                 if not request.return_token_ids:
                     generated_text += token_text
@@ -886,8 +746,7 @@ async def chat(request: ChatRequest):
                 },
                 "tokens": token_count,
                 "time": elapsed,
-                "tokens_per_second": token_count / elapsed if elapsed > 0 else 0,
-                "model": model_name
             }
         except Exception as e:
@@ -897,152 +756,86 @@ async def chat(request: ChatRequest):
 # Model Loading
 # ============================================================================
-async def load_single_model(model_name: str, model_info: dict) -> bool:
-    """Load a single model with its tokenizer"""
-    global loaded_models, current_model
     try:
-        print(f"\n⏳ Loading: {model_name} ({model_info['family']} family)")
-        print(f"   Repo: {model_info['repo']}")
-        print(f"   Weights: {model_info['weights']}")
-        # Load tokenizer for this family
-        tokenizer, eos_token_id = await load_tokenizer(
-            model_info['family'],
-            model_info['tokenizer_repo']
-        )
-        # Load config
-        if model_info['config']:
-            print(f"   Config: {model_info['config']}")
-            config_path = hf_hub_download(
-                repo_id=model_info['repo'],
-                filename=model_info['config'],
-                cache_dir=CACHE_DIR
-            )
-            with open(config_path, 'r') as f:
-                config_raw = json.load(f)
-        else:
-            # Load base config for Large model
-            print(f"   Loading base config from tokenizer repo...")
-            config_path = hf_hub_download(
-                repo_id=model_info['tokenizer_repo'],
-                filename="config.json",
-                cache_dir=CACHE_DIR
-            )
-            with open(config_path, 'r') as f:
-                config_raw = json.load(f)
-        # Convert to model format
-        model_config = {
-            'vocab_size': config_raw['vocab_size'],
-            'd_model': config_raw['hidden_size'],
-            'n_heads': config_raw['num_attention_heads'],
-            'ff_mult': config_raw['intermediate_size'] / config_raw['hidden_size'],
-            'dropout': config_raw.get('dropout', 0.0),
-            'max_len': config_raw['max_position_embeddings'],
-            'rope_theta': config_raw['rope_theta'],
-            'n_layers': config_raw['num_hidden_layers']
-        }
-        # Add for config object
-        model_config['max_position_embeddings'] = config_raw['max_position_embeddings']
-        print(f"   📐 Architecture: {model_config['n_layers']} layers, {model_config['n_heads']} heads")
-        # Load weights
-        weights_path = hf_hub_download(
-            repo_id=model_info['repo'],
-            filename=model_info['weights'],
-            cache_dir=CACHE_DIR
-        )
-        # Build model
-        model = SAM1Model(**model_config)
-        dummy_input = tf.zeros((1, 1), dtype=tf.int32)
-        model(dummy_input)
-        model.load_weights(weights_path)
-        model.trainable = False
-        # Create optimized forward pass
-        @tf.function(
-            input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
-            jit_compile=True,
-            reduce_retracing=True
-        )
-        def fast_predict(inputs):
-            return model(inputs, training=False)
-        # Warm up
-        print(f"   🔥 Warming up...")
-        dummy = tf.constant([[1, 2, 3]], dtype=tf.int32)
-        _ = fast_predict(dummy)
-        # Store model with its tokenizer
-        loaded_models[model_name] = (model, fast_predict, model_config, tokenizer, eos_token_id)
-        # Set as default if first
-        if current_model is None:
-            current_model = model_name
-        # Count parameters
-        total_params = sum(np.prod(w.shape) for w in model.weights)
-        if total_params >= 1e9:
-            param_str = f"{total_params/1e9:.2f}B"
-        elif total_params >= 1e6:
-            param_str = f"{total_params/1e6:.2f}M"
         else:
-            param_str = f"{total_params/1e3:.2f}K"
-        print(f"   ✅ Loaded successfully!")
-        print(f"   📊 Parameters: {param_str}")
-        print(f"   🔤 Tokenizer vocab: {tokenizer.get_vocab_size()}")
-        return True
-    except Exception as e:
-        print(f"   ⚠️  Failed to load {model_name}: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-@app.on_event("startup")
-async def load_models():
-    global loaded_models, current_model
-    print("="*80)
-    print("🚀 SAM-Z-1 Worker Node v5.0 - Multi-Model with Separate Tokenizers".center(80))
-    print("="*80)
-    try:
-        # Load all models
-        print("\n" + "="*80)
-        print("📦 LOADING ALL 5 MODELS".center(80))
-        print("="*80)
-        loaded_count = 0
-        for model_name, model_info in MODEL_REGISTRY.items():
-            success = await load_single_model(model_name, model_info)
-            if success:
-                loaded_count += 1
-        if loaded_count == 0:
-            raise RuntimeError("❌ No models loaded successfully!")
-        print(f"\n{'='*80}")
-        print(f"✅ Successfully loaded {loaded_count}/{len(MODEL_REGISTRY)} models")
-        print(f"📌 Default model: {current_model}")
-        # Show tokenizer families
-        print(f"\n🔤 Tokenizer Families:")
-        print(f"   SAM-Z family: {len([m for m, i in MODEL_REGISTRY.items() if i['family'] == 'sam-z'])} model(s)")
-        print(f"   SAM-X family: {len([m for m, i in MODEL_REGISTRY.items() if i['family'] == 'sam-x'])} model(s)")
-        print(f"\n🚀 Worker ready for inference!")
-        print(f"{'='*80}\n")
     except Exception as e:
-        print(f"\n❌ Failed to initialize worker: {e}")
         import traceback
         traceback.print_exc()
         raise

 """
+SAM-Z-1 Distributed Worker Node v4.0
+Optimized for distributed gen/decode pipeline
 """
 from fastapi import FastAPI, HTTPException
 from tokenizers import Tokenizer
 import numpy as np
 import time
+from typing import List, Optional
 import asyncio
+app = FastAPI(title="SAM-Z-1 Distributed Worker", version="4.0.0")
 # ============================================================================
 # Model Architecture
         return base_config
 # ============================================================================
+# Global State
 # ============================================================================
+model = None
+tokenizer = None
+config = None
+eos_token_id = None
+fast_forward = None
+MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
+CACHE_DIR = "./model_cache"
+# Stats
 worker_stats = {
     "total_requests": 0,
     "total_tokens": 0,
     "decode_requests": 0,
+    "uptime_start": time.time()
 }
 # ============================================================================
     repetition_penalty: float = 1.1
     stream: bool = False
     return_token_ids: bool = False
 class ChatMessage(BaseModel):
     role: str
     repetition_penalty: float = 1.1
     stream: bool = False
     return_token_ids: bool = False
 class DecodeRequest(BaseModel):
     token_ids: List[int]
 class BatchDecodeRequest(BaseModel):
     batches: List[List[int]]
 # ============================================================================
 # Generation Functions
     top_k: int = 40,
     top_p: float = 0.9,
     repetition_penalty: float = 1.1,
+    return_token_ids: bool = False
 ):
+    """Core generation - yields (token_id, token_text or None)"""
+    global model, tokenizer, config, eos_token_id, fast_forward
     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
     if len(input_ids) == 0:
 @app.get("/", response_class=HTMLResponse)
 async def status_page():
+    """Worker status page"""
+    return """
 <!DOCTYPE html>
 <html>
 <head>
+    <title>SAM-Z-1 Worker Node</title>
     <style>
+        * { margin: 0; padding: 0; box-sizing: border-box; }
+        body {
             font-family: 'Courier New', monospace;
             background: linear-gradient(135deg, #1a1f3a 0%, #0a0e27 100%);
             color: #00bfff;
             padding: 20px;
             min-height: 100vh;
+        }
+        .container {
+            max-width: 900px;
+            margin: 0 auto;
+        }
+        .header {
             text-align: center;
             padding: 30px;
             background: rgba(0, 191, 255, 0.1);
             border-radius: 10px;
             margin-bottom: 30px;
             box-shadow: 0 0 20px rgba(0, 191, 255, 0.3);
+        }
+        .header h1 {
             font-size: 2.5em;
             text-transform: uppercase;
             letter-spacing: 3px;
             animation: glow 2s ease-in-out infinite alternate;
+        }
+        @keyframes glow {
+            from { text-shadow: 0 0 10px #00bfff; }
+            to { text-shadow: 0 0 20px #00bfff, 0 0 30px #00bfff; }
+        }
+        .badge {
             display: inline-block;
             padding: 5px 15px;
             border-radius: 15px;
             font-size: 0.9em;
+            margin-top: 10px;
+        }
+        .badge-ready {
             background: rgba(0, 255, 136, 0.2);
             border: 1px solid #00ff88;
             color: #00ff88;
+        }
+        .badge-loading {
             background: rgba(255, 165, 0, 0.2);
             border: 1px solid #ffa500;
             color: #ffa500;
+        }
+        .stats-grid {
             display: grid;
             grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
             gap: 20px;
             margin-bottom: 30px;
+        }
+        .stat-card {
             background: rgba(0, 191, 255, 0.05);
             border: 1px solid #00bfff;
             border-radius: 8px;
             padding: 20px;
             text-align: center;
+        }
+        .stat-label {
+            font-size: 0.8em;
+            opacity: 0.7;
+            text-transform: uppercase;
+            margin-bottom: 10px;
+        }
+        .stat-value {
+            font-size: 2em;
+            font-weight: bold;
+        }
+        .features {
             background: rgba(0, 191, 255, 0.05);
             border: 1px solid #00bfff;
             border-radius: 8px;
             padding: 20px;
+        }
+        .features h3 {
+            margin-bottom: 15px;
+        }
+        .feature-list {
+            list-style: none;
+            padding: 0;
+        }
+        .feature-list li {
             padding: 10px;
             margin: 5px 0;
             background: rgba(0, 191, 255, 0.1);
             border-radius: 5px;
+        }
+        .feature-list li:before {
+            content: "⚡ ";
+            color: #00ff88;
+        }
+        .timestamp {
+            text-align: center;
+            margin-top: 20px;
+            opacity: 0.5;
+        }
     </style>
 </head>
 <body>
     <div class="container">
         <div class="header">
             <h1>⚙️ WORKER NODE ⚙️</h1>
+            <div>SAM-Z-1 Distributed Worker v4.0</div>
+            <div class="badge" id="status-badge">CHECKING STATUS...</div>
         </div>
         <div class="stats-grid" id="stats">
             </div>
         </div>
         <div class="features">
             <h3>🚀 CAPABILITIES</h3>
             <ul class="feature-list">
+                <li>Full Text Generation</li>
+                <li>Token-Only Mode (for distributed pipeline)</li>
+                <li>High-Speed Batch Decoding</li>
+                <li>Chat Completion</li>
+                <li>Streaming & Non-Streaming</li>
             </ul>
         </div>
     </div>
     <script>
+        async function updateStats() {
+            try {
+                const response = await fetch('/health');
+                const data = await response.json();
+                const badge = document.getElementById('status-badge');
+                if (data.model_loaded) {
+                    badge.textContent = '✅ READY FOR INFERENCE';
+                    badge.className = 'badge badge-ready';
+                } else {
+                    badge.textContent = '⏳ LOADING MODEL...';
+                    badge.className = 'badge badge-loading';
+                }
+                // Fetch stats
                 const statsRes = await fetch('/stats');
                 const stats = await statsRes.json();
                 const h = Math.floor(uptime / 3600);
                 const m = Math.floor((uptime % 3600) / 60);
                 const s = uptime % 60;
+                document.getElementById('uptime').textContent = `${h}h ${m}m ${s}s`;
                 document.getElementById('timestamp').textContent =
+                    `Last update: ${new Date().toLocaleTimeString()}`;
+            } catch (e) {
                 console.error('Failed to update stats:', e);
+            }
+        }
+        // Update every second
         setInterval(updateStats, 1000);
         updateStats();
     </script>
 @app.get("/health")
 async def health():
     return {
+        "status": "healthy" if model is not None else "loading",
+        "model_loaded": model is not None
     }
 @app.get("/stats")
         "total_tokens": worker_stats["total_tokens"],
         "decode_requests": worker_stats["decode_requests"],
         "uptime": uptime,
+        "tokens_per_second": worker_stats["total_tokens"] / uptime if uptime > 0 else 0
     }
 @app.post("/decode")
 async def decode(request: DecodeRequest):
+    """Fast single decode"""
+    if tokenizer is None:
+        raise HTTPException(status_code=503, detail="Tokenizer not loaded")
     try:
         worker_stats["decode_requests"] += 1
         text = tokenizer.decode(request.token_ids)
         return {"text": text}
     except Exception as e:
 @app.post("/decode/batch")
 async def batch_decode(request: BatchDecodeRequest):
+    """Optimized batch decoding for distributed pipeline"""
+    if tokenizer is None:
+        raise HTTPException(status_code=503, detail="Tokenizer not loaded")
     try:
         worker_stats["decode_requests"] += len(request.batches)
         results = [tokenizer.decode(batch) for batch in request.batches]
         return {"texts": results}
     except Exception as e:
 @app.post("/generate")
 async def generate(request: GenerateRequest):
+    """Generate text"""
+    if model is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
     worker_stats["total_requests"] += 1
     start_time = time.time()
                     top_k=request.top_k,
                     top_p=request.top_p,
                     repetition_penalty=request.repetition_penalty,
+                    return_token_ids=request.return_token_ids
                 ):
                     token_count += 1
                     worker_stats["total_tokens"] += 1
                     await asyncio.sleep(0.001)
                 elapsed = time.time() - start_time
+                yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed})}\n\n"
             except Exception as e:
                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
                 top_k=request.top_k,
                 top_p=request.top_p,
                 repetition_penalty=request.repetition_penalty,
+                return_token_ids=request.return_token_ids
             ):
                 if not request.return_token_ids:
                     generated_text += token_text
                 "text": generated_text,
                 "tokens": token_count,
                 "time": elapsed,
+                "tokens_per_second": token_count / elapsed if elapsed > 0 else 0
             }
         except Exception as e:
 @app.post("/chat")
 async def chat(request: ChatRequest):
+    """Chat completion"""
+    if model is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
     worker_stats["total_requests"] += 1
     prompt = format_chat_prompt(request.messages)
                     top_k=request.top_k,
                     top_p=request.top_p,
                     repetition_penalty=request.repetition_penalty,
+                    return_token_ids=request.return_token_ids
                 ):
                     token_count += 1
                     worker_stats["total_tokens"] += 1
                     await asyncio.sleep(0.001)
                 elapsed = time.time() - start_time
+                yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed})}\n\n"
             except Exception as e:
                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
                 top_k=request.top_k,
                 top_p=request.top_p,
                 repetition_penalty=request.repetition_penalty,
+                return_token_ids=request.return_token_ids
             ):
                 if not request.return_token_ids:
                     generated_text += token_text
                 },
                 "tokens": token_count,
                 "time": elapsed,
+                "tokens_per_second": token_count / elapsed if elapsed > 0 else 0
             }
         except Exception as e:
 # Model Loading
 # ============================================================================
@app.on_event("startup")
async def load_model():
    """Download the SAM-Z-1 artifacts and populate the worker's globals.

    Registered as a FastAPI startup hook. In order, it:

    1. Downloads ``config.json`` from ``MODEL_REPO`` and tries to download
       ``ckpt.weights.h5``; if that download fails it falls back to
       ``model.keras``.
    2. Builds the tokenizer from the GPT-2 tokenizer plus four extra special
       tokens, round-tripping it through ``./temp_tokenizer`` so it can be
       reloaded as a ``tokenizers.Tokenizer``.
    3. Either constructs ``SAM1Model`` from the config and loads the
       checkpoint weights, or loads the full Keras model.
    4. Wraps the inference forward pass in a ``tf.function`` and stores it in
       ``fast_forward``.

    Sets the module-level ``model``, ``tokenizer``, ``config``,
    ``eos_token_id`` and ``fast_forward``.

    Raises:
        Exception: any failure during loading is printed with a traceback
            and re-raised unchanged.
    """
    global model, tokenizer, config, eos_token_id, fast_forward
    print("🚀 Loading SAM-Z-1 Model...")
    try:
        config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
        try:
            weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
            print("✅ Found checkpoint weights")
            use_checkpoint = True
        except Exception as download_error:
            # Best-effort fallback. `Exception` (rather than a bare `except:`)
            # lets KeyboardInterrupt/SystemExit propagate, and the cause is
            # printed so a network/auth failure is not mistaken for a
            # genuinely missing file.
            print("⚠️  Checkpoint not found, using model.keras")
            print(f"   Reason: {download_error}")
            model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
            use_checkpoint = False
        with open(config_path, 'r', encoding="utf-8") as f:
            config = json.load(f)
        print(f"📦 Config loaded: {config['num_hidden_layers']} layers")
        print("📦 Creating tokenizer...")
        from transformers import AutoTokenizer
        hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # NOTE(review): "<think/>" may have been intended as "</think>", but
        # the token strings determine the vocabulary the weights were trained
        # against - confirm before changing.
        custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
        hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
        # Save and reload so the rest of the app works with a
        # `tokenizers.Tokenizer` built from the written tokenizer.json.
        os.makedirs("./temp_tokenizer", exist_ok=True)
        hf_tokenizer.save_pretrained("./temp_tokenizer")
        tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
        eos_token_id = config.get('eos_token_id', 50256)
        print(f"✅ Tokenizer ready: vocab size {tokenizer.get_vocab_size()}")
        print("🔄 Loading model...")
        if use_checkpoint:
            # Translate the HF-style config keys into SAM1Model's config keys.
            model_config = {
                'vocab_size': config['vocab_size'],
                'd_model': config['hidden_size'],
                'n_layers': config['num_hidden_layers'],
                'n_heads': config['num_attention_heads'],
                # True division: ff_mult is passed on as a float.
                'ff_mult': config['intermediate_size'] / config['hidden_size'],
                'max_len': config['max_position_embeddings'],
                'dropout': 0.1,
                'rope_theta': config['rope_theta']
            }
            model = SAM1Model(config=model_config)
            # One forward pass on a dummy full-length batch builds the
            # architecture before the checkpoint weights are loaded into it.
            dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
            _ = model(dummy_input, training=False)
            print(f"✅ Architecture built: {model.count_params():,} parameters")
            model.load_weights(weights_path)
            print("✅ Weights loaded!")
        else:
            model = keras.models.load_model(model_path, compile=False)
            print("✅ Model loaded!")

        @tf.function(reduce_retracing=True)
        def optimized_forward(input_tensor):
            """Run the loaded model in inference mode."""
            return model(input_tensor, training=False)

        fast_forward = optimized_forward
        print("✅ SAM-Z-1 Distributed Worker ready! 🚀")
        print("🔥 Features enabled:")
        print("   - Full text generation")
        print("   - Token-only mode (distributed pipeline)")
        print("   - Batch decoding optimization")
        print("   - Streaming support")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        import traceback
        traceback.print_exc()
        raise