Spaces:

turtle170
/

ZeroEngine

Running

App Files Files Community

turtle170 committed 4 days ago

Commit

7ca413a

verified ·

1 Parent(s): 022b660

Update app.py

Browse files

Files changed (1) hide show

app.py +303 -32

app.py CHANGED Viewed

@@ -36,16 +36,180 @@ FLASH_ATTENTION = True          # Enable Flash Attention 2
 KV_CACHE_QUANTIZATION = True    # Quantize KV cache (4-bit)
 CONTINUOUS_BATCHING = True      # Enable continuous batching
 SPECULATIVE_DECODE = False      # Disabled for CPU (requires draft model)
-MLOCK_MODEL = True              # Lock model in RAM (prevent swap)
 USE_MMAP = True                 # Memory-mapped file loading
 OFFLOAD_KQV = False             # CPU-only, no offload needed
 OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1)  # Physical cores - 1
 ROPE_SCALING = 1.0              # RoPE frequency scaling
 NUMA_OPTIMIZE = True            # NUMA-aware memory allocation
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
 # --- TELEMETRY MODULE ---
 class TelemetryManager:
     def __init__(self, api: HfApi):
@@ -124,6 +288,55 @@ class ZeroEngine:
         self.auto_cleanup_thread = None
         self.start_idle_monitor()
     def start_idle_monitor(self):
         """Start background thread to monitor idle timeout"""
         def monitor():
@@ -137,6 +350,7 @@ class ZeroEngine:
                                 del self.llm
                                 self.llm = None
                                 self.active_model_info = {"repo": "", "file": ""}
                                 logger.info("[IDLE] Model unloaded successfully")
                             except Exception as e:
                                 logger.error(f"[IDLE] Cleanup error: {e}")
@@ -176,13 +390,16 @@ class ZeroEngine:
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
-        """HYPER-OPTIMIZED Boot kernel with all speed optimizations enabled"""
         try:
             if not repo or not filename:
                 return "🔴 ERROR: Repository or filename missing"
             logger.info(f"[BOOT] Starting download: {filename} from {repo}")
             # Download with timeout protection
             try:
                 path = hf_hub_download(
@@ -196,13 +413,17 @@ class ZeroEngine:
                 logger.error(f"[BOOT] Download failed: {e}")
                 return f"🔴 DOWNLOAD FAILED: {str(e)}"
             # Validate before loading
             valid, msg = ResourceMonitor.validate_deployment(path)
             if not valid:
                 logger.warning(f"[BOOT] Validation failed: {msg}")
                 return f"🔴 VALIDATION FAILED: {msg}"
-            logger.info("[BOOT] Validation passed, applying optimizations...")
             # Apply NUMA optimization
             if NUMA_OPTIMIZE:
@@ -210,70 +431,104 @@ class ZeroEngine:
             # Load model with MAXIMUM PERFORMANCE SETTINGS
             with self.kernel_lock:
-                # Clear previous model
                 if self.llm:
-                    logger.info("[BOOT] Clearing previous model...")
                     try:
                         del self.llm
                         self.llm = None
                     except Exception as e:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
-                # Calculate optimal batch size based on available RAM
                 vm = psutil.virtual_memory()
                 available_ram_gb = vm.available / (1024**3)
-                # Dynamic batch sizing: more RAM = larger batches
-                optimal_batch = min(512, int(128 * available_ram_gb / 4))
                 try:
-                    logger.info(f"[BOOT] Initializing with {OPTIMAL_THREADS} threads, batch={optimal_batch}")
                     # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
                     self.llm = Llama(
                         model_path=path,
-                        n_ctx=4096,                    # Increased context window
-                        n_threads=OPTIMAL_THREADS,     # Optimized thread count
-                        n_threads_batch=OPTIMAL_THREADS, # Batch processing threads
-                        use_mmap=USE_MMAP,             # Memory-mapped weights (fast loading)
-                        use_mlock=MLOCK_MODEL,         # Lock in RAM (prevent swap thrashing)
-                        n_batch=optimal_batch,         # Dynamic batch size
-                        n_gpu_layers=0,                # CPU-only mode
-                        flash_attn=FLASH_ATTENTION,    # Flash Attention (2x faster)
                         type_k=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
                         type_v=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
-                        rope_scaling_type=0,           # Linear RoPE scaling
-                        rope_freq_scale=ROPE_SCALING,  # RoPE frequency scale
-                        numa=NUMA_OPTIMIZE,            # NUMA optimization
                         verbose=False,
-                        logits_all=False,              # Only compute final logits (faster)
-                        embedding=False,               # Disable embeddings (not needed)
-                        offload_kqv=OFFLOAD_KQV,      # No offload on CPU
-                        f16_kv=False                   # Use quantized KV cache instead
                     )
-                    self.active_model_info = {"repo": repo, "file": filename}
                     self.telemetry.track_load(repo, filename)
                     # Warm-up inference to populate caches
                     logger.info("[BOOT] Warming up model caches...")
                     try:
-                        self.llm("Test", max_tokens=1, stream=False)
                     except:
                         pass
                     logger.info("[BOOT] 🚀 HYPER-OPTIMIZED MODEL READY!")
-                    return f"🟢 KERNEL ONLINE: {filename} | Threads: {OPTIMAL_THREADS} | Batch: {optimal_batch} | Flash Attn: {FLASH_ATTENTION}"
                 except Exception as e:
                     logger.error(f"[BOOT] Model loading failed: {e}")
                     self.llm = None
                     return f"🔴 LOAD FAILED: {str(e)}"
         except Exception as e:
             logger.error(f"[BOOT] Unexpected error: {e}")
             return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
         if not self.llm or not ghost_text or self.is_prefilling:
             return "Kernel Idle/Busy"
@@ -283,18 +538,22 @@ class ZeroEngine:
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
                 self.llm.eval(tokens)
                 logger.info(f"Ghost cache primed: {len(tokens)} tokens")
             except Exception as e:
                 logger.error(f"KV Cache priming failed: {e}")
             finally:
                 self.is_prefilling = False
         threading.Thread(target=_bg_eval, daemon=True).start()
-        return "⚡ Ghost Cache Primed"
     def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
         # Update activity timestamp
         self.update_activity()
         # AUTO-BOOT: If model not loaded, auto-boot default model
         if not self.llm:
             logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
@@ -378,7 +637,7 @@ class ZeroEngine:
                     self.perf_stats["peak_tps"] = tps
                 # Update history with streaming content + performance metrics
-                history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s`"
                 yield history
             # Update global performance stats
@@ -396,12 +655,16 @@ class ZeroEngine:
             self.telemetry.track_generation(tokens_count)
             logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
         except Exception as e:
             logger.error(f"Inference error: {e}")
             history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
             yield history
 # --- CUSTOM CSS ---
 CUSTOM_CSS = """
@@ -552,14 +815,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
             boot_status = gr.Markdown("Status: `STANDBY`")
             gr.Markdown("---")
-            gr.Markdown("### 👻 Ghost Cache")
             ghost_buffer = gr.Textbox(
                 label="Background Context",
-                placeholder="Queue priming tokens here...",
                 lines=3
             )
             stitch_status = gr.Markdown("Cache: `EMPTY`")
-            stitch_btn = gr.Button("STITCH", size="sm")
             log_output = gr.Code(
                 label="Kernel Logs",
@@ -623,6 +887,13 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
         [stitch_status]
     )
     # Auto-boot enabled inference - passes repo and quant for auto-boot
     inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])

 KV_CACHE_QUANTIZATION = True    # Quantize KV cache (4-bit); selects type_k/type_v=2 when loading the model
 CONTINUOUS_BATCHING = True      # Enable continuous batching
 SPECULATIVE_DECODE = False      # Disabled for CPU (requires draft model)
+MLOCK_MODEL = False             # Passed as use_mlock; off because pinning the model in RAM raises memory use (swap stays possible)
 USE_MMAP = True                 # Memory-mapped file loading (passed as use_mmap)
 OFFLOAD_KQV = False             # CPU-only, no offload needed
 OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1)  # Physical cores - 1; NOTE(review): cpu_count(logical=False) can return None when undetermined - confirm on target host
 ROPE_SCALING = 1.0              # RoPE frequency scaling (passed as rope_freq_scale)
 NUMA_OPTIMIZE = True            # NUMA-aware memory allocation
+AGGRESSIVE_GC = True            # When True, force_gc() runs a full gc.collect(2); when False it is a no-op
+# Per-quantization tuning, matched by substring against the upper-cased model filename: batch_multiplier scales n_batch, ctx_size becomes n_ctx, threads_boost scales OPTIMAL_THREADS
+QUANT_OPTIMIZATIONS = {
+    "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
+    "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
+    "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
+    "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
+    "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
+    "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
+}
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
+# --- AGGRESSIVE GARBAGE COLLECTOR ---
+import gc
+gc.enable()
+gc.set_threshold(700, 10, 10)  # NOTE(review): (700, 10, 10) equals CPython's documented default thresholds, so this is likely a no-op rather than "aggressive" - confirm intent
+def force_gc():
+    """Run a full (generation 2) collection when AGGRESSIVE_GC is enabled.
+
+    Returns the count reported by ``gc.collect``, or 0 when the flag is
+    off and no collection was attempted.
+    """
+    # Guard clause: the feature flag disables collection entirely.
+    if not AGGRESSIVE_GC:
+        return 0
+    freed = gc.collect(2)
+    logger.info(f"[GC] Collected {freed} objects")
+    return freed
+def nuclear_ram_clear():
+    """Reclaim Python-managed memory by running several full GC passes.
+
+    Returns:
+        True when every collection pass completed, False if an exception
+        was raised (the error is logged, not propagated).
+    """
+    try:
+        # Deliberately leaves functools internals and sys.modules alone:
+        # overwriting functools._CacheInfo.__call__ mutates a private stdlib
+        # class for the whole process without freeing anything, and modules
+        # carry no ``__pycache__`` attribute to delete.
+        # Several passes, because finalizers run in one pass can make more
+        # objects collectable in the next.
+        for _ in range(3):
+            gc.collect(2)
+        logger.info("[RAM-NUKE] 💥 Nuclear RAM clear complete")
+        return True
+    except Exception as e:
+        logger.error(f"[RAM-NUKE] Failed: {e}")
+        return False
+# --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
+class ModelCacheManager:
+    """Disk-backed store of small per-model signatures (first 1 MB of a model file).
+
+    Each signature is written to ``cache_dir`` as ``<model basename>.cache``.
+    ``self.cache`` additionally records the signatures saved by this process.
+    """
+    def __init__(self):
+        self.cache_dir = "/tmp/zeroengine_cache"
+        # {model_path: {"signature": bytes or None, "cached_at": float, "hits": int}}
+        self.cache = {}
+        self.max_cache_size_mb = 50  # Budget for all *.cache files combined
+        os.makedirs(self.cache_dir, exist_ok=True)
+        logger.info(f"[CACHE] Initialized at {self.cache_dir}")
+    def _cache_path(self, model_path: str) -> str:
+        """Return the signature file path for a model (keyed by file basename)."""
+        model_name = os.path.basename(model_path)
+        return os.path.join(self.cache_dir, f"{model_name}.cache")
+    def _evict_for(self, incoming_size: int, cache_path: str):
+        """Delete oldest signature files until incoming_size more bytes fit the budget."""
+        budget = self.max_cache_size_mb * 1024 * 1024
+        # The file at cache_path is about to be overwritten, so it neither
+        # counts toward the total nor is a candidate for eviction.
+        candidates = sorted(
+            (
+                path
+                for path in (
+                    os.path.join(self.cache_dir, name)
+                    for name in os.listdir(self.cache_dir)
+                    if name.endswith('.cache')
+                )
+                if path != cache_path
+            ),
+            key=os.path.getmtime,
+        )
+        total_size = sum(os.path.getsize(path) for path in candidates)
+        # Keep evicting oldest-first; removing a single file is not enough
+        # when the cache is far over budget or the new signature is large.
+        while candidates and total_size + incoming_size > budget:
+            oldest = candidates.pop(0)
+            logger.info("[CACHE] Cache full, removing oldest entry")
+            total_size -= os.path.getsize(oldest)
+            os.remove(oldest)
+            logger.info(f"[CACHE] Deleted {os.path.basename(oldest)}")
+    def extract_cache_signature(self, model_path: str) -> Optional[bytes]:
+        """Read the first 1 MB of the model file.
+
+        Returns the bytes read (shorter for small files), or None if the
+        file could not be read; the error is logged.
+        """
+        try:
+            cache_size = 1024 * 1024  # 1MB
+            with open(model_path, 'rb') as f:
+                signature = f.read(cache_size)
+            logger.info(f"[CACHE] Extracted {len(signature)} bytes signature from {os.path.basename(model_path)}")
+            return signature
+        except Exception as e:
+            logger.error(f"[CACHE] Extraction failed: {e}")
+            return None
+    def save_to_cache(self, model_path: str, signature: bytes):
+        """Write a signature to disk and record it in memory.
+
+        Older cache files are evicted first so the total stays within
+        ``max_cache_size_mb``. Caching is best-effort: failures are logged
+        and swallowed.
+        """
+        try:
+            model_name = os.path.basename(model_path)
+            cache_path = self._cache_path(model_path)
+            self._evict_for(len(signature), cache_path)
+            # Save new cache
+            with open(cache_path, 'wb') as f:
+                f.write(signature)
+            self.cache[model_path] = {
+                "signature": signature,
+                "cached_at": time.time(),
+                "hits": 0
+            }
+            logger.info(f"[CACHE] ✅ Cached {model_name} ({len(signature) / 1024:.1f}KB)")
+        except Exception as e:
+            logger.error(f"[CACHE] Save failed: {e}")
+    def is_cached(self, model_path: str) -> bool:
+        """Return True when a signature file exists on disk for this model's basename."""
+        exists = os.path.exists(self._cache_path(model_path))
+        if exists:
+            logger.info(f"[CACHE] 🎯 HIT for {os.path.basename(model_path)}")
+        return exists
+    def preload_cache(self, model_path: str):
+        """Read the cached signature from disk and bump the in-memory hit counter.
+
+        Returns True when a cache file existed and was read, False when it
+        is missing or reading failed (the error is logged).
+        """
+        try:
+            cache_path = self._cache_path(model_path)
+            if os.path.exists(cache_path):
+                with open(cache_path, 'rb') as f:
+                    signature = f.read()
+                # Hits are only tracked for entries saved by this process.
+                if model_path in self.cache:
+                    self.cache[model_path]["hits"] += 1
+                logger.info(f"[CACHE] Preloaded {len(signature) / 1024:.1f}KB signature")
+                return True
+        except Exception as e:
+            logger.error(f"[CACHE] Preload failed: {e}")
+        return False
+    def wreck_old_model_cache(self):
+        """Drop in-memory signature bytes and force garbage collection.
+
+        On-disk cache files are left in place. Returns True on success,
+        False if an exception was raised (logged, not propagated).
+        """
+        try:
+            logger.info("[WRECKER] 💣 Destroying old model caches...")
+            gc.collect()
+            # Release the signature bytes but keep each entry's metadata.
+            for model_path in list(self.cache.keys()):
+                if self.cache[model_path].get("signature"):
+                    self.cache[model_path]["signature"] = None
+            nuclear_ram_clear()
+            logger.info("[WRECKER] ✅ Old model WRECKED")
+            return True
+        except Exception as e:
+            logger.error(f"[WRECKER] Failed: {e}")
+            return False
+# Global cache manager
+model_cache = ModelCacheManager()
 # --- TELEMETRY MODULE ---
 class TelemetryManager:
     def __init__(self, api: HfApi):
         self.auto_cleanup_thread = None
         self.start_idle_monitor()
+        # Keyboard input pre-processing
+        self.typing_buffer = ""
+        self.typing_timer = None
+        self.preprocessed_tokens = None
+    def detect_quantization(self, filename: str) -> dict:
+        """Pick the tuning profile whose quantization tag appears in the filename.
+
+        The comparison is case-insensitive and the first key of
+        QUANT_OPTIMIZATIONS found in the name wins. Returns that profile
+        plus a "type" entry, falling back to the Q4_K_M profile.
+        """
+        upper_name = filename.upper()
+        matched = next((tag for tag in QUANT_OPTIMIZATIONS if tag in upper_name), None)
+        if matched is None:
+            # No known tag in the name: use the Q4_K_M profile
+            logger.warning(f"[QUANT-DETECT] Unknown quantization, using Q4_K_M defaults")
+            return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
+        logger.info(f"[QUANT-DETECT] Found {matched} in filename, applying optimizations")
+        return {"type": matched, **QUANT_OPTIMIZATIONS[matched]}
+    def preprocess_input(self, text: str):
+        """Schedule background tokenization of the text being typed.
+
+        Does nothing unless a model is loaded and the text has at least 5
+        characters. Each call cancels the pending timer and starts a new
+        one-second timer, so tokenization runs only after typing pauses;
+        the result lands in ``self.preprocessed_tokens`` (None on failure).
+        """
+        if not (self.llm and text and len(text) >= 5):
+            return
+        def _tokenize_later():
+            try:
+                logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
+                token_ids = self.llm.tokenize(text.encode("utf-8"))
+                self.preprocessed_tokens = token_ids
+                logger.info(f"[PREPROCESS] ✅ Ready: {len(token_ids)} tokens cached")
+            except Exception as exc:
+                logger.error(f"[PREPROCESS] Failed: {exc}")
+                self.preprocessed_tokens = None
+        # A newer keystroke supersedes whatever was scheduled before.
+        pending = self.typing_timer
+        if pending:
+            pending.cancel()
+        # Fire after one second without further input.
+        timer = threading.Timer(1.0, _tokenize_later)
+        self.typing_timer = timer
+        timer.daemon = True
+        timer.start()
+    def clear_preprocessed(self):
+        """Discard tokens cached by background preprocessing, then collect garbage."""
+        if not self.preprocessed_tokens:
+            return  # Nothing cached (None or empty), so nothing to do
+        self.preprocessed_tokens = None
+        force_gc()
+        logger.info("[PREPROCESS] Cleared cached tokens")
     def start_idle_monitor(self):
         """Start background thread to monitor idle timeout"""
         def monitor():
                                 del self.llm
                                 self.llm = None
                                 self.active_model_info = {"repo": "", "file": ""}
+                                force_gc()  # Aggressive cleanup
                                 logger.info("[IDLE] Model unloaded successfully")
                             except Exception as e:
                                 logger.error(f"[IDLE] Cleanup error: {e}")
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
+        """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
         try:
             if not repo or not filename:
                 return "🔴 ERROR: Repository or filename missing"
             logger.info(f"[BOOT] Starting download: {filename} from {repo}")
+            # DETECT QUANTIZATION FROM FILENAME
+            quant_config = self.detect_quantization(filename)
             # Download with timeout protection
             try:
                 path = hf_hub_download(
                 logger.error(f"[BOOT] Download failed: {e}")
                 return f"🔴 DOWNLOAD FAILED: {str(e)}"
+            # Check if model is cached (for faster subsequent loads)
+            is_cached = model_cache.is_cached(path)
+            cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
             # Validate before loading
             valid, msg = ResourceMonitor.validate_deployment(path)
             if not valid:
                 logger.warning(f"[BOOT] Validation failed: {msg}")
                 return f"🔴 VALIDATION FAILED: {msg}"
+            logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations...")
             # Apply NUMA optimization
             if NUMA_OPTIMIZE:
             # Load model with MAXIMUM PERFORMANCE SETTINGS
             with self.kernel_lock:
+                # WRECK OLD MODEL - Nuclear option
                 if self.llm:
+                    logger.info("[BOOT] 💣 WRECKING old model...")
                     try:
+                        # Wreck the cache first
+                        model_cache.wreck_old_model_cache()
+                        # Delete the model
                         del self.llm
                         self.llm = None
+                        # Nuclear RAM clear
+                        nuclear_ram_clear()
+                        logger.info("[BOOT] ✅ Old model DESTROYED")
                     except Exception as e:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
+                # Calculate optimal batch size based on quantization and available RAM
                 vm = psutil.virtual_memory()
                 available_ram_gb = vm.available / (1024**3)
+                # MASSIVE batch sizes for quantized models
+                base_batch = int(256 * available_ram_gb / 4)
+                optimal_batch = int(base_batch * quant_config["batch_multiplier"])
+                optimal_batch = max(512, min(4096, optimal_batch))  # Clamp between 512-4096
+                # Context size based on quantization
+                optimal_ctx = quant_config["ctx_size"]
+                # Thread count with quantization-specific boost
+                optimal_threads = int(OPTIMAL_THREADS * quant_config["threads_boost"])
+                optimal_threads = max(2, min(optimal_threads, psutil.cpu_count(logical=False)))
                 try:
+                    logger.info(f"[BOOT] Initializing {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
+                    # Preload cache if available (simulates faster warmup)
+                    if is_cached:
+                        model_cache.preload_cache(path)
                     # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
                     self.llm = Llama(
                         model_path=path,
+                        n_ctx=optimal_ctx,                  # Dynamic context based on quant
+                        n_threads=optimal_threads,          # Optimized thread count
+                        n_threads_batch=optimal_threads,    # Batch processing threads
+                        use_mmap=USE_MMAP,                  # Memory-mapped weights (fast loading)
+                        use_mlock=MLOCK_MODEL,              # Lock in RAM (prevent swap thrashing)
+                        n_batch=optimal_batch,              # MASSIVE batch size
+                        n_gpu_layers=0,                     # CPU-only mode
+                        flash_attn=FLASH_ATTENTION,         # Flash Attention (2x faster)
                         type_k=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
                         type_v=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
+                        rope_scaling_type=0,                # Linear RoPE scaling
+                        rope_freq_scale=ROPE_SCALING,       # RoPE frequency scale
+                        numa=NUMA_OPTIMIZE,                 # NUMA optimization
                         verbose=False,
+                        logits_all=False,                   # Only compute final logits (faster)
+                        embedding=False,                    # Disable embeddings (not needed)
+                        offload_kqv=OFFLOAD_KQV,           # No offload on CPU
+                        f16_kv=False                        # Use quantized KV cache instead
                     )
+                    self.active_model_info = {"repo": repo, "file": filename, "quant": quant_config['type']}
                     self.telemetry.track_load(repo, filename)
+                    # Extract and cache TINY signature for faster future loads
+                    if not is_cached:
+                        logger.info("[BOOT] Extracting cache signature...")
+                        signature = model_cache.extract_cache_signature(path)
+                        if signature:
+                            model_cache.save_to_cache(path, signature)
                     # Warm-up inference to populate caches
                     logger.info("[BOOT] Warming up model caches...")
                     try:
+                        self.llm("Warmup", max_tokens=1, stream=False)
+                        force_gc()  # Clear warmup artifacts
                     except:
                         pass
                     logger.info("[BOOT] 🚀 HYPER-OPTIMIZED MODEL READY!")
+                    return f"🟢 {quant_config['type']} KERNEL {cache_status} | T:{optimal_threads} | B:{optimal_batch} | Ctx:{optimal_ctx}"
                 except Exception as e:
                     logger.error(f"[BOOT] Model loading failed: {e}")
                     self.llm = None
+                    nuclear_ram_clear()
                     return f"🔴 LOAD FAILED: {str(e)}"
         except Exception as e:
             logger.error(f"[BOOT] Unexpected error: {e}")
+            nuclear_ram_clear()
             return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
+        """Prime KV cache with ghost context"""
         if not self.llm or not ghost_text or self.is_prefilling:
             return "Kernel Idle/Busy"
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
                 self.llm.eval(tokens)
                 logger.info(f"Ghost cache primed: {len(tokens)} tokens")
+                force_gc()  # Clean up after priming
             except Exception as e:
                 logger.error(f"KV Cache priming failed: {e}")
             finally:
                 self.is_prefilling = False
         threading.Thread(target=_bg_eval, daemon=True).start()
+        return "⚡ Primed"
     def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
         # Update activity timestamp
         self.update_activity()
+        # Clear any preprocessed tokens from typing
+        self.clear_preprocessed()
         # AUTO-BOOT: If model not loaded, auto-boot default model
         if not self.llm:
             logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
                     self.perf_stats["peak_tps"] = tps
                 # Update history with streaming content + performance metrics
+                history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💾 Cache: {self.perf_stats['cache_hits']}`"
                 yield history
             # Update global performance stats
             self.telemetry.track_generation(tokens_count)
+            # Aggressive GC after generation
+            force_gc()
             logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
         except Exception as e:
             logger.error(f"Inference error: {e}")
             history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
             yield history
+            force_gc()
 # --- CUSTOM CSS ---
 CUSTOM_CSS = """
             boot_status = gr.Markdown("Status: `STANDBY`")
             gr.Markdown("---")
+            gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
             ghost_buffer = gr.Textbox(
                 label="Background Context",
+                placeholder="Add context that will be prepended to all messages...",
                 lines=3
             )
+            with gr.Row():
+                stitch_btn = gr.Button("PRIME CACHE", variant="secondary", size="sm", scale=1)
             stitch_status = gr.Markdown("Cache: `EMPTY`")
             log_output = gr.Code(
                 label="Kernel Logs",
         [stitch_status]
     )
+    # Keyboard input preprocessing (tokenize while typing)
+    user_input.change(
+        lambda x: kernel.preprocess_input(x),
+        [user_input],
+        None
+    )
     # Auto-boot enabled inference - passes repo and quant for auto-boot
     inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])