turtle170 committed on
Commit
0195768
Β·
verified Β·
1 Parent(s): 5ed5fed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +452 -67
app.py CHANGED
@@ -31,32 +31,47 @@ SYSTEM_RESERVE_MB = 500
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
 
 
 
 
 
 
34
  # --- SPEED OPTIMIZATION CONFIG ---
35
- FLASH_ATTENTION = True # Enable Flash Attention 2
36
- KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
- CONTINUOUS_BATCHING = True # Enable continuous batching
38
- SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
- MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
40
- USE_MMAP = True # Memory-mapped file loading
41
- OFFLOAD_KQV = False # CPU-only, no offload needed
42
- OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
- ROPE_SCALING = 1.0 # RoPE frequency scaling
44
- NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
- AGGRESSIVE_GC = True # Aggressive garbage collection
46
-
47
- # Quantization detection and optimization mapping
48
  QUANT_OPTIMIZATIONS = {
49
- "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
50
- "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
51
- "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
52
- "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
53
- "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
54
- "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
55
- "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
56
- "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
57
- "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
58
- "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
59
- "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -207,6 +222,100 @@ class ModelCacheManager:
207
  logger.error(f"[WRECKER] Failed: {e}")
208
  return False
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Global cache manager
211
  model_cache = ModelCacheManager()
212
 
@@ -272,7 +381,7 @@ class ZeroEngine:
272
  self.api = HfApi(token=HF_TOKEN)
273
  self.telemetry = TelemetryManager(self.api)
274
  self.llm: Optional[Llama] = None
275
- self.active_model_info = {"repo": "", "file": ""}
276
  self.kernel_lock = threading.Lock()
277
  self.is_prefilling = False
278
  self.perf_stats = {
@@ -282,9 +391,9 @@ class ZeroEngine:
282
  "peak_tps": 0.0,
283
  "cache_hits": 0
284
  }
285
- self.prompt_cache = {} # Cache for repeated prompts
286
  self.last_activity = time.time()
287
- self.idle_timeout = 20 # 20 seconds idle timeout
288
  self.auto_cleanup_thread = None
289
  self.start_idle_monitor()
290
 
@@ -293,6 +402,29 @@ class ZeroEngine:
293
  self.typing_timer = None
294
  self.preprocessed_tokens = None
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def detect_quantization(self, filename: str) -> dict:
297
  """Detect quantization method from filename and return optimizations"""
298
  filename_upper = filename.upper()
@@ -389,7 +521,158 @@ class ZeroEngine:
389
  logger.error(f"Scan error: {e}")
390
  return []
391
 
392
- def boot_kernel(self, repo: str, filename: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
394
  try:
395
  if not repo or not filename:
@@ -600,23 +883,28 @@ class ZeroEngine:
600
  first_token_time = None
601
 
602
  try:
603
- # HYPER-OPTIMIZED INFERENCE SETTINGS
 
 
 
 
 
604
  stream = self.llm(
605
  formatted_prompt,
606
- max_tokens=2048, # Increased output length
607
  stop=["User:", "<|eot_id|>", "\n\n"],
608
  stream=True,
609
- temperature=0.7, # Balanced creativity
610
- top_p=0.95, # Nucleus sampling
611
- top_k=40, # Top-K sampling
612
- repeat_penalty=1.1, # Prevent repetition
613
- frequency_penalty=0.0, # No frequency penalty
614
- presence_penalty=0.0, # No presence penalty
615
- tfs_z=1.0, # Tail-free sampling
616
- typical_p=1.0, # Typical sampling
617
- mirostat_mode=2, # Mirostat v2 (perplexity control)
618
- mirostat_tau=5.0, # Target perplexity
619
- mirostat_eta=0.1, # Learning rate
620
  )
621
 
622
  for chunk in stream:
@@ -636,10 +924,19 @@ class ZeroEngine:
636
  if tps > self.perf_stats["peak_tps"]:
637
  self.perf_stats["peak_tps"] = tps
638
 
 
 
 
 
639
  # Update history with streaming content + performance metrics
640
- history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’Ύ Cache: {self.perf_stats['cache_hits']}`"
 
641
  yield history
642
 
 
 
 
 
643
  # Update global performance stats
644
  self.perf_stats["total_tokens"] += tokens_count
645
  self.perf_stats["total_time"] += elapsed
@@ -763,27 +1060,49 @@ h1, h2, h3, h4, h5, h6 {
763
  # --- UI INTERFACE ---
764
  kernel = ZeroEngine()
765
 
766
- with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
767
- gr.HTML("""
768
- <div style='text-align: center; padding: 30px; border-radius: 24px;
769
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
770
- margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
771
- <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
772
- -webkit-background-clip: text; -webkit-text-fill-color: transparent;
773
- font-family: Consolas, monospace;'>
774
- πŸ›°οΈ ZEROENGINE V0.1
775
- </h1>
776
- <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
777
- Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
778
- </p>
779
- </div>
780
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
 
782
  with gr.Row():
783
  with gr.Column(scale=8):
784
  chat_box = gr.Chatbot(
785
  label="Main Engine Feedback",
786
- height=650,
787
  show_label=False,
788
  autoscroll=True,
789
  container=True
@@ -798,12 +1117,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
798
  )
799
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
800
 
801
- with gr.Column(scale=3):
 
802
  gr.Markdown("### πŸ› οΈ Hardware Status")
803
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
804
  cpu_metric = gr.Label(label="CPU Load", value="0%")
805
 
806
  gr.Markdown("---")
 
 
807
  gr.Markdown("### πŸ“‘ Model Control")
808
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
809
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -815,6 +1137,26 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
815
  boot_status = gr.Markdown("Status: `STANDBY`")
816
 
817
  gr.Markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
819
  ghost_buffer = gr.Textbox(
820
  label="Background Context",
@@ -828,7 +1170,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
828
  log_output = gr.Code(
829
  label="Kernel Logs",
830
  language="shell",
831
- value="[INIT] System Ready.",
832
  lines=5
833
  )
834
 
@@ -836,9 +1178,11 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
836
  def update_stats():
837
  try:
838
  m = ResourceMonitor.get_metrics()
839
- return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
 
840
  except Exception as e:
841
  logger.error(f"Stats update error: {e}")
 
842
  return "Error", "Error"
843
 
844
  def on_scan(repo):
@@ -864,37 +1208,78 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
864
  return
865
 
866
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
867
- time.sleep(0.5) # Small delay for UI feedback
868
 
869
- result = kernel.boot_kernel(repo, file)
870
  yield result, gr.update()
871
 
872
  except Exception as e:
873
  logger.error(f"Boot UI error: {e}")
874
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
- # Timer for periodic stats updates
877
  timer = gr.Timer(value=2)
878
- timer.tick(update_stats, None, [ram_metric, cpu_metric])
879
 
880
  # Event handlers
881
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
882
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  stitch_btn.click(
885
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
886
  [ghost_buffer],
887
  [stitch_status]
888
  )
889
 
890
- # Keyboard input preprocessing (tokenize while typing)
891
  user_input.change(
892
  lambda x: kernel.preprocess_input(x),
893
  [user_input],
894
  None
895
  )
896
 
897
- # Auto-boot enabled inference - passes repo and quant for auto-boot
898
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
899
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
900
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
 
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
34
# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0       # free credits granted to each new session
TOKEN_COST_PER_100MS = 0.001        # inference usage price per 100 ms of generation
BATCH_UPGRADE_BASE_COST = 0.00005   # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False  # Disabled for CPU (GPU-only feature)
KV_CACHE_QUANTIZATION = True  # Keep for RAM savings
CONTINUOUS_BATCHING = False  # CPU doesn't benefit much
SPECULATIVE_DECODE = False  # CPU-only, no draft model
MLOCK_MODEL = False  # Don't lock - allow OS to manage memory
USE_MMAP = True  # Critical for CPU - fast loading
OFFLOAD_KQV = False  # CPU-only
OPTIMAL_THREADS = psutil.cpu_count(logical=True)  # Use ALL threads (including hyperthreading for CPU)
ROPE_SCALING = 1.0  # RoPE frequency scale passed straight to llama.cpp
NUMA_OPTIMIZE = False  # Disabled - can cause issues on some systems
AGGRESSIVE_GC = True

# Quantization detection - CPU-optimized batch multipliers (more aggressive)
# batch_multiplier scales the RAM-derived base batch; ctx_size caps context;
# threads_boost is kept at 1.0 here (all logical cores are already used).
QUANT_OPTIMIZATIONS = {
    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
}

# Model format/architecture detection patterns
# "pattern" substrings are matched against the lowercased repo + filename;
# "template" names the chat template family to use for that architecture.
MODEL_FORMATS = {
    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
    "gemma": {"pattern": ["gemma"], "template": "gemma"},
    "phi": {"pattern": ["phi"], "template": "phi"},
    "qwen": {"pattern": ["qwen"], "template": "chatml"},
    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
}
76
 
77
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
222
  logger.error(f"[WRECKER] Failed: {e}")
223
  return False
224
 
225
# --- TOKEN MANAGER ---
import math  # moved above the class: purchase_batch_upgrade needs it at call time


class TokenManager:
    """In-memory per-session token accounting: credits, usage charges, upgrades.

    State lives only in this process; balances reset on restart.
    NOTE(review): not thread-safe — concurrent Gradio callbacks mutate the
    same dicts without a lock; confirm single-threaded use or add locking.
    """

    def __init__(self):
        # {session_id: {"balance": float, "start_time": float,
        #               "purchases": {"batch_multiplier": int, "token_limit": int},
        #               "total_spent": float}}
        self.user_tokens = {}
        self.active_sessions = {}

    def get_session_id(self) -> str:
        """Generate a fresh 8-char hex session ID.

        Fix: uses a cryptographically random token instead of
        md5(str(time.time()))[:8] — two sessions started in the same
        instant can no longer collide, and md5 is avoided entirely.
        The returned format (8 lowercase hex chars) is unchanged.
        """
        import secrets
        return secrets.token_hex(4)  # 4 random bytes -> 8 hex chars

    def initialize_user(self, session_id: str):
        """Create the account for session_id with monthly credits (idempotent).

        NOTE(review): start_time is stored but never used for a monthly
        reset anywhere visible — confirm whether credits should expire.
        """
        if session_id not in self.user_tokens:
            self.user_tokens[session_id] = {
                "balance": MONTHLY_TOKEN_CREDITS,
                "start_time": time.time(),
                "purchases": {"batch_multiplier": 1, "token_limit": 2048},
                "total_spent": 0.0
            }
            logger.info(f"[TOKEN] New user {session_id}: {MONTHLY_TOKEN_CREDITS} tokens")

    def charge_usage(self, session_id: str, duration_ms: float) -> bool:
        """Charge the session for duration_ms of inference time.

        Returns True when the balance covered the cost; False (and no
        charge) when it did not.
        """
        self.initialize_user(session_id)

        cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS

        if self.user_tokens[session_id]["balance"] >= cost:
            self.user_tokens[session_id]["balance"] -= cost
            self.user_tokens[session_id]["total_spent"] += cost
            logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[session_id]['balance']:.2f}")
            return True
        else:
            logger.warning(f"[TOKEN] Insufficient balance! Need {cost:.4f}, have {self.user_tokens[session_id]['balance']:.2f}")
            return False

    def purchase_batch_upgrade(self, session_id: str) -> tuple:
        """Double the session's batch multiplier; price doubles per level.

        Returns (success: bool, user-facing message: str).
        """
        self.initialize_user(session_id)
        user = self.user_tokens[session_id]

        current_mult = user["purchases"]["batch_multiplier"]
        # Level 0 costs BATCH_UPGRADE_BASE_COST; each doubling doubles the price.
        upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)

        if user["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {user['balance']:.2f}"

        new_mult = current_mult * 2  # computed once (was duplicated)
        user["balance"] -= cost
        user["purchases"]["batch_multiplier"] = new_mult
        logger.info(f"[TOKEN] Batch upgrade: {current_mult}x β†’ {new_mult}x | Cost: {cost:.5f}")
        return True, f"βœ… Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"

    def purchase_token_upgrade(self, session_id: str, extra_tokens: int = 1000) -> tuple:
        """Buy extra_tokens of additional response length.

        Returns (success: bool, user-facing message: str).
        """
        self.initialize_user(session_id)
        user = self.user_tokens[session_id]

        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K

        if user["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {user['balance']:.2f}"

        user["balance"] -= cost
        user["purchases"]["token_limit"] += extra_tokens
        new_limit = user["purchases"]["token_limit"]
        logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
        return True, f"βœ… Token limit now {new_limit}! (-{cost:.5f} tokens)"

    def get_balance(self, session_id: str) -> float:
        """Return the session's balance rounded to 2 decimals."""
        self.initialize_user(session_id)
        return round(self.user_tokens[session_id]["balance"], 2)

    def get_purchases(self, session_id: str) -> dict:
        """Return the session's purchases dict (live reference, not a copy)."""
        self.initialize_user(session_id)
        return self.user_tokens[session_id]["purchases"]

    def end_session(self, session_id: str):
        """Log session stats and return a summary; state is kept for tracking."""
        if session_id in self.user_tokens:
            stats = self.user_tokens[session_id]
            logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
            # Don't delete - keep for monthly tracking
            return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session."
        return "No active session found."


# Global token manager
token_manager = TokenManager()
318
+
319
  # Global cache manager
320
  model_cache = ModelCacheManager()
321
 
 
381
  self.api = HfApi(token=HF_TOKEN)
382
  self.telemetry = TelemetryManager(self.api)
383
  self.llm: Optional[Llama] = None
384
+ self.active_model_info = {"repo": "", "file": "", "format": ""}
385
  self.kernel_lock = threading.Lock()
386
  self.is_prefilling = False
387
  self.perf_stats = {
 
391
  "peak_tps": 0.0,
392
  "cache_hits": 0
393
  }
394
+ self.prompt_cache = {}
395
  self.last_activity = time.time()
396
+ self.idle_timeout = 20
397
  self.auto_cleanup_thread = None
398
  self.start_idle_monitor()
399
 
 
402
  self.typing_timer = None
403
  self.preprocessed_tokens = None
404
 
405
+ # Custom parameters (user-configurable)
406
+ self.custom_params = {
407
+ "temperature": 0.7,
408
+ "top_p": 0.95,
409
+ "top_k": 40,
410
+ "repeat_penalty": 1.1,
411
+ "batch_size_override": None, # None = auto
412
+ "max_tokens_override": None # None = auto
413
+ }
414
+
415
def detect_model_format(self, filename: str, repo: str) -> str:
    """Identify the model architecture family from the repo id and filename.

    Scans the lowercased "repo filename" text for each MODEL_FORMATS
    pattern and returns the first family that matches; unknown models
    fall back to "llama".
    """
    haystack = f"{repo.lower()} {filename.lower()}"
    for fmt_name, fmt_info in MODEL_FORMATS.items():
        if any(needle in haystack for needle in fmt_info["pattern"]):
            logger.info(f"[FORMAT-DETECT] Detected {fmt_name.upper()} architecture")
            return fmt_name
    # Nothing matched - assume the most common template family.
    logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
    return "llama"
427
+
428
  def detect_quantization(self, filename: str) -> dict:
429
  """Detect quantization method from filename and return optimizations"""
430
  filename_upper = filename.upper()
 
521
  logger.error(f"Scan error: {e}")
522
  return []
523
 
524
def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
    """Download, validate, and load a GGUF model with CPU-tuned settings.

    Pipeline: detect quantization and architecture from the filename/repo,
    download via hf_hub_download, validate RAM headroom, destroy any
    previously loaded model, derive batch/context/thread settings, then
    construct the Llama instance and warm it up.

    Args:
        repo: HuggingFace repository id.
        filename: GGUF file within the repo; quantization is inferred from it.
        session_id: optional token-system session; when given, the user's
            purchased batch multiplier scales the batch size.

    Returns:
        A status string: 🟒-prefixed on success, πŸ”΄-prefixed on any failure.
        Errors are reported through the return value, never raised.
    """
    try:
        if not repo or not filename:
            return "πŸ”΄ ERROR: Repository or filename missing"

        # NOTE(review): "(unknown)" looks like a mangled placeholder —
        # probably meant {filename}; confirm against the original source.
        logger.info(f"[BOOT] Starting download: (unknown) from {repo}")

        # DETECT QUANTIZATION FROM FILENAME
        quant_config = self.detect_quantization(filename)

        # DETECT MODEL FORMAT/ARCHITECTURE
        model_format = self.detect_model_format(filename, repo)

        # Download with timeout protection
        try:
            path = hf_hub_download(
                repo_id=repo,
                filename=filename,
                token=HF_TOKEN,
                local_files_only=False
            )
            logger.info(f"[BOOT] Download complete: {path}")
        except Exception as e:
            logger.error(f"[BOOT] Download failed: {e}")
            return f"πŸ”΄ DOWNLOAD FAILED: {str(e)}"

        # Check if model is cached
        is_cached = model_cache.is_cached(path)
        cache_status = "🎯 CACHED" if is_cached else "πŸ†• NEW"

        # Validate before loading (RAM headroom check)
        valid, msg = ResourceMonitor.validate_deployment(path)
        if not valid:
            logger.warning(f"[BOOT] Validation failed: {msg}")
            return f"πŸ”΄ VALIDATION FAILED: {msg}"

        logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")

        # Load model with MAXIMUM PERFORMANCE SETTINGS
        with self.kernel_lock:
            # WRECK OLD MODEL: free the previous model's RAM before loading.
            if self.llm:
                logger.info("[BOOT] πŸ’£ WRECKING old model...")
                try:
                    model_cache.wreck_old_model_cache()
                    del self.llm
                    self.llm = None
                    nuclear_ram_clear()
                    logger.info("[BOOT] βœ… Old model DESTROYED")
                except Exception as e:
                    # Best-effort cleanup; loading proceeds regardless.
                    logger.warning(f"[BOOT] Cleanup warning: {e}")

            # Calculate optimal parameters with token purchases
            vm = psutil.virtual_memory()
            available_ram_gb = vm.available / (1024**3)

            # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
            # Base calculation: use more RAM for batching on CPU
            base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
            optimal_batch = int(base_batch * quant_config["batch_multiplier"])

            # Apply user's batch multiplier from token purchases
            if session_id:
                user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
                optimal_batch = int(optimal_batch * user_batch_mult)
                logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")

            # CPU can handle larger batches with quantized models
            optimal_batch = max(1024, min(8192, optimal_batch))  # 1024-8192 range for CPU

            # Context size from the quantization profile
            optimal_ctx = quant_config["ctx_size"]

            # Reduce context for Gemma models (they have 131K n_ctx_train)
            if model_format == "gemma":
                optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
                logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")

            # Thread optimization - use ALL threads on CPU (including hyperthreading)
            optimal_threads = psutil.cpu_count(logical=True)  # ALL logical cores
            logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")

            try:
                logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")

                # Preload cache if available
                if is_cached:
                    model_cache.preload_cache(path)

                # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
                init_params = {
                    "model_path": path,
                    "n_ctx": optimal_ctx,
                    "n_threads": optimal_threads,
                    "n_threads_batch": optimal_threads,
                    "use_mmap": USE_MMAP,  # Critical for CPU
                    "use_mlock": MLOCK_MODEL,  # Let OS manage memory
                    "n_batch": optimal_batch,  # MASSIVE batches for CPU
                    "n_gpu_layers": 0,  # CPU-only
                    "rope_scaling_type": 0,
                    "rope_freq_scale": ROPE_SCALING,
                    "verbose": False,
                    "logits_all": False,
                    "embedding": False,
                    "f16_kv": False  # Use quantized KV cache
                }

                # Add KV quantization only if not Gemma (Gemma can be finicky)
                if model_format != "gemma" and KV_CACHE_QUANTIZATION:
                    # NOTE(review): 2 is presumably the llama.cpp Q4_0 ggml
                    # type id — confirm against the installed llama-cpp-python.
                    init_params["type_k"] = 2
                    init_params["type_v"] = 2
                    logger.info("[OPTIM] KV cache quantization enabled (Q4)")

                self.llm = Llama(**init_params)

                self.active_model_info = {
                    "repo": repo,
                    "file": filename,
                    "quant": quant_config['type'],
                    "format": model_format
                }
                self.telemetry.track_load(repo, filename)

                # Extract and cache signature for faster future boots
                if not is_cached:
                    logger.info("[BOOT] Extracting cache signature...")
                    signature = model_cache.extract_cache_signature(path)
                    if signature:
                        model_cache.save_to_cache(path, signature)

                # Warm-up: best-effort one-token run; failures are non-fatal.
                logger.info("[BOOT] Warming up model caches...")
                try:
                    self.llm("Warmup", max_tokens=1, stream=False)
                    force_gc()
                except:
                    pass

                logger.info("[BOOT] πŸš€ CPU-OPTIMIZED MODEL READY!")
                return f"🟒 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"

            except Exception as e:
                logger.error(f"[BOOT] Model loading failed: {e}")
                self.llm = None
                nuclear_ram_clear()
                return f"πŸ”΄ LOAD FAILED: {str(e)}"

    except Exception as e:
        logger.error(f"[BOOT] Unexpected error: {e}")
        nuclear_ram_clear()
        return f"πŸ”΄ BOOT FAILURE: {str(e)}"
676
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
677
  try:
678
  if not repo or not filename:
 
883
  first_token_time = None
884
 
885
  try:
886
+ # Get max tokens from user purchases
887
+ max_tokens = 2048
888
+ if session_id:
889
+ max_tokens = token_manager.get_purchases(session_id)["token_limit"]
890
+
891
+ # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
892
  stream = self.llm(
893
  formatted_prompt,
894
+ max_tokens=max_tokens,
895
  stop=["User:", "<|eot_id|>", "\n\n"],
896
  stream=True,
897
+ temperature=self.custom_params["temperature"],
898
+ top_p=self.custom_params["top_p"],
899
+ top_k=self.custom_params["top_k"],
900
+ repeat_penalty=self.custom_params["repeat_penalty"],
901
+ frequency_penalty=0.0,
902
+ presence_penalty=0.0,
903
+ tfs_z=1.0,
904
+ typical_p=1.0,
905
+ mirostat_mode=2, # CPU benefits from mirostat
906
+ mirostat_tau=5.0,
907
+ mirostat_eta=0.1,
908
  )
909
 
910
  for chunk in stream:
 
924
  if tps > self.perf_stats["peak_tps"]:
925
  self.perf_stats["peak_tps"] = tps
926
 
927
+ # Charge tokens every second
928
+ if int(elapsed * 1000) % 1000 < 100 and session_id: # Every ~1 second
929
+ token_manager.charge_usage(session_id, elapsed * 1000)
930
+
931
  # Update history with streaming content + performance metrics
932
+ balance = token_manager.get_balance(session_id) if session_id else 0
933
+ history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’° {balance:.2f} tokens`"
934
  yield history
935
 
936
+ # Final token charge for remaining time
937
+ if session_id:
938
+ token_manager.charge_usage(session_id, elapsed * 1000)
939
+
940
  # Update global performance stats
941
  self.perf_stats["total_tokens"] += tokens_count
942
  self.perf_stats["total_time"] += elapsed
 
1060
  # --- UI INTERFACE ---
1061
  kernel = ZeroEngine()
1062
 
1063
+ # Session ID for token tracking
1064
+ session_id = token_manager.get_session_id()
1065
+
1066
+ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1067
+ # Header with Token Display
1068
+ with gr.Row():
1069
+ with gr.Column(scale=8):
1070
+ gr.HTML("""
1071
+ <div style='text-align: center; padding: 30px; border-radius: 24px;
1072
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
1073
+ margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
1074
+ <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
1075
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
1076
+ font-family: Consolas, monospace;'>
1077
+ πŸ›°οΈ ZEROENGINE V0.2
1078
+ </h1>
1079
+ <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
1080
+ CPU-Optimized | Token System | Custom Parameters | Auto-Format
1081
+ </p>
1082
+ </div>
1083
+ """)
1084
+ with gr.Column(scale=2):
1085
+ # Token Display
1086
+ gr.HTML("""
1087
+ <div style='text-align: center; padding: 20px; border-radius: 20px;
1088
+ background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
1089
+ margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
1090
+ <div style='font-size: 2em; margin-bottom: 5px;'>πŸ’°</div>
1091
+ <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
1092
+ 100.00
1093
+ </div>
1094
+ <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
1095
+ </div>
1096
+ """)
1097
+ token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1098
+ end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
1099
+ session_status = gr.Markdown("", visible=False)
1100
 
1101
  with gr.Row():
1102
  with gr.Column(scale=8):
1103
  chat_box = gr.Chatbot(
1104
  label="Main Engine Feedback",
1105
+ height=600,
1106
  show_label=False,
1107
  autoscroll=True,
1108
  container=True
 
1117
  )
1118
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
1119
 
1120
+ with gr.Column(scale=4):
1121
+ # Hardware Status
1122
  gr.Markdown("### πŸ› οΈ Hardware Status")
1123
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
1124
  cpu_metric = gr.Label(label="CPU Load", value="0%")
1125
 
1126
  gr.Markdown("---")
1127
+
1128
+ # Model Control
1129
  gr.Markdown("### πŸ“‘ Model Control")
1130
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
1131
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
 
1137
  boot_status = gr.Markdown("Status: `STANDBY`")
1138
 
1139
  gr.Markdown("---")
1140
+
1141
+ # Custom Parameters
1142
+ gr.Markdown("### βš™οΈ Custom Parameters")
1143
+ temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
1144
+ top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
1145
+ top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
1146
+ repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
1147
+
1148
+ gr.Markdown("---")
1149
+
1150
+ # Token Purchases
1151
+ gr.Markdown("### πŸ’Ž Token Upgrades")
1152
+ with gr.Row():
1153
+ batch_upgrade_btn = gr.Button("πŸš€ Batch x2", size="sm", variant="secondary")
1154
+ token_upgrade_btn = gr.Button("πŸ“ˆ +1K Tokens", size="sm", variant="secondary")
1155
+ purchase_status = gr.Markdown("Ready to upgrade!")
1156
+
1157
+ gr.Markdown("---")
1158
+
1159
+ # Ghost Cache
1160
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
1161
  ghost_buffer = gr.Textbox(
1162
  label="Background Context",
 
1170
  log_output = gr.Code(
1171
  label="Kernel Logs",
1172
  language="shell",
1173
+ value="[INIT] V0.2 System Ready.",
1174
  lines=5
1175
  )
1176
 
 
1178
def update_stats():
    """Refresh RAM/CPU metrics and the token balance for the UI timer.

    Returns a 3-tuple of display strings (ram, cpu, balance); on any
    failure returns placeholder strings so the timer callback never raises.

    Fix: removed the unreachable trailing `return "Error", "Error"` that
    the diff merge left behind — it was dead code and, if ever reached,
    would have returned the wrong arity for the timer's three outputs.
    """
    try:
        m = ResourceMonitor.get_metrics()
        balance = token_manager.get_balance(session_id)
        return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
    except Exception as e:
        logger.error(f"Stats update error: {e}")
        return "Error", "Error", "0.00"
1187
 
1188
  def on_scan(repo):
 
1208
  return
1209
 
1210
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
1211
+ time.sleep(0.5)
1212
 
1213
+ result = kernel.boot_kernel(repo, file, session_id)
1214
  yield result, gr.update()
1215
 
1216
  except Exception as e:
1217
  logger.error(f"Boot UI error: {e}")
1218
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
1219
+
1220
def on_batch_upgrade():
    """Buy a batch-size doubling and report the message plus new balance."""
    _, message = token_manager.purchase_batch_upgrade(session_id)
    return message, str(token_manager.get_balance(session_id))

def on_token_upgrade():
    """Buy +1000 response tokens and report the message plus new balance."""
    _, message = token_manager.purchase_token_upgrade(session_id, 1000)
    return message, str(token_manager.get_balance(session_id))

def on_end_session():
    """Close out the current token session and return its summary text."""
    return token_manager.end_session(session_id)

def update_custom_params(temp, top_p, top_k, repeat_pen):
    """Push the slider values into the kernel's sampling parameters."""
    kernel.custom_params.update(
        temperature=temp,
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_pen,
    )
    return "βœ… Parameters updated!"
1240
 
1241
+ # Timer for periodic stats updates (includes token balance)
1242
  timer = gr.Timer(value=2)
1243
+ timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
1244
 
1245
  # Event handlers
1246
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
1247
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
1248
 
1249
+ # Token purchases
1250
+ batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
1251
+ token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
1252
+ end_session_btn.click(on_end_session, None, [session_status])
1253
+
1254
+ # Custom parameter updates
1255
+ temperature_slider.change(update_custom_params,
1256
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1257
+ [purchase_status])
1258
+ top_p_slider.change(update_custom_params,
1259
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1260
+ [purchase_status])
1261
+ top_k_slider.change(update_custom_params,
1262
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1263
+ [purchase_status])
1264
+ repeat_penalty_slider.change(update_custom_params,
1265
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1266
+ [purchase_status])
1267
+
1268
+ # Ghost cache
1269
  stitch_btn.click(
1270
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
1271
  [ghost_buffer],
1272
  [stitch_status]
1273
  )
1274
 
1275
+ # Keyboard input preprocessing
1276
  user_input.change(
1277
  lambda x: kernel.preprocess_input(x),
1278
  [user_input],
1279
  None
1280
  )
1281
 
1282
+ # Auto-boot enabled inference
1283
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
1284
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
1285
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])