Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed on Jan 31

Commit

72e0339

verified ·

1 Parent(s): 551f9be

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -161

app.py CHANGED Viewed

@@ -785,142 +785,6 @@ class ZeroEngine:
             logger.error(f"[BOOT] Unexpected error: {e}")
             nuclear_ram_clear()
             return f"🔴 BOOT FAILURE: {str(e)}"
-        """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
-        try:
-            if not repo or not filename:
-                return "🔴 ERROR: Repository or filename missing"
-            logger.info(f"[BOOT] Starting download: {filename} from {repo}")
-            # DETECT QUANTIZATION FROM FILENAME
-            quant_config = self.detect_quantization(filename)
-            # Download with timeout protection
-            try:
-                path = hf_hub_download(
-                    repo_id=repo,
-                    filename=filename,
-                    token=HF_TOKEN,
-                    local_files_only=False
-                )
-                logger.info(f"[BOOT] Download complete: {path}")
-            except Exception as e:
-                logger.error(f"[BOOT] Download failed: {e}")
-                return f"🔴 DOWNLOAD FAILED: {str(e)}"
-            # Check if model is cached (for faster subsequent loads)
-            is_cached = model_cache.is_cached(path)
-            cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
-            # Validate before loading
-            valid, msg = ResourceMonitor.validate_deployment(path)
-            if not valid:
-                logger.warning(f"[BOOT] Validation failed: {msg}")
-                return f"🔴 VALIDATION FAILED: {msg}"
-            logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations...")
-            # Apply NUMA optimization
-            if NUMA_OPTIMIZE:
-                self.optimize_numa()
-            # Load model with MAXIMUM PERFORMANCE SETTINGS
-            with self.kernel_lock:
-                # WRECK OLD MODEL - Nuclear option
-                if self.llm:
-                    logger.info("[BOOT] 💣 WRECKING old model...")
-                    try:
-                        # Wreck the cache first
-                        model_cache.wreck_old_model_cache()
-                        # Delete the model
-                        del self.llm
-                        self.llm = None
-                        # Nuclear RAM clear
-                        nuclear_ram_clear()
-                        logger.info("[BOOT] ✅ Old model DESTROYED")
-                    except Exception as e:
-                        logger.warning(f"[BOOT] Cleanup warning: {e}")
-                # Calculate optimal batch size based on quantization and available RAM
-                vm = psutil.virtual_memory()
-                available_ram_gb = vm.available / (1024**3)
-                # MASSIVE batch sizes for quantized models
-                base_batch = int(256 * available_ram_gb / 4)
-                optimal_batch = int(base_batch * quant_config["batch_multiplier"])
-                optimal_batch = max(512, min(4096, optimal_batch))  # Clamp between 512-4096
-                # Context size based on quantization
-                optimal_ctx = quant_config["ctx_size"]
-                # Thread count with quantization-specific boost
-                optimal_threads = int(OPTIMAL_THREADS * quant_config["threads_boost"])
-                optimal_threads = max(2, min(optimal_threads, psutil.cpu_count(logical=False)))
-                try:
-                    logger.info(f"[BOOT] Initializing {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
-                    # Preload cache if available (simulates faster warmup)
-                    if is_cached:
-                        model_cache.preload_cache(path)
-                    # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
-                    self.llm = Llama(
-                        model_path=path,
-                        n_ctx=optimal_ctx,                  # Dynamic context based on quant
-                        n_threads=optimal_threads,          # Optimized thread count
-                        n_threads_batch=optimal_threads,    # Batch processing threads
-                        use_mmap=USE_MMAP,                  # Memory-mapped weights (fast loading)
-                        use_mlock=MLOCK_MODEL,              # Lock in RAM (prevent swap thrashing)
-                        n_batch=optimal_batch,              # MASSIVE batch size
-                        n_gpu_layers=0,                     # CPU-only mode
-                        flash_attn=FLASH_ATTENTION,         # Flash Attention (2x faster)
-                        type_k=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
-                        type_v=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
-                        rope_scaling_type=0,                # Linear RoPE scaling
-                        rope_freq_scale=ROPE_SCALING,       # RoPE frequency scale
-                        numa=NUMA_OPTIMIZE,                 # NUMA optimization
-                        verbose=False,
-                        logits_all=False,                   # Only compute final logits (faster)
-                        embedding=False,                    # Disable embeddings (not needed)
-                        offload_kqv=OFFLOAD_KQV,           # No offload on CPU
-                        f16_kv=False                        # Use quantized KV cache instead
-                    )
-                    self.active_model_info = {"repo": repo, "file": filename, "quant": quant_config['type']}
-                    self.telemetry.track_load(repo, filename)
-                    # Extract and cache TINY signature for faster future loads
-                    if not is_cached:
-                        logger.info("[BOOT] Extracting cache signature...")
-                        signature = model_cache.extract_cache_signature(path)
-                        if signature:
-                            model_cache.save_to_cache(path, signature)
-                    # Warm-up inference to populate caches
-                    logger.info("[BOOT] Warming up model caches...")
-                    try:
-                        self.llm("Warmup", max_tokens=1, stream=False)
-                        force_gc()  # Clear warmup artifacts
-                    except:
-                        pass
-                    logger.info("[BOOT] 🚀 HYPER-OPTIMIZED MODEL READY!")
-                    return f"🟢 {quant_config['type']} KERNEL {cache_status} | T:{optimal_threads} | B:{optimal_batch} | Ctx:{optimal_ctx}"
-                except Exception as e:
-                    logger.error(f"[BOOT] Model loading failed: {e}")
-                    self.llm = None
-                    nuclear_ram_clear()
-                    return f"🔴 LOAD FAILED: {str(e)}"
-        except Exception as e:
-            logger.error(f"[BOOT] Unexpected error: {e}")
-            nuclear_ram_clear()
-            return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
         """Prime KV cache with ghost context"""
@@ -942,7 +806,7 @@ class ZeroEngine:
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Primed"
-    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
         username = profile.username if profile else "anonymous"
         # Update activity timestamp
         self.update_activity()
@@ -1291,14 +1155,14 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
     # --- UI LOGIC ---
     def update_stats(profile: gr.OAuthProfile | None):
         try:
-        m = ResourceMonitor.get_metrics()
-        current_user = profile.username if profile else "Guest"
-        balance = token_manager.get_balance(current_user)
-        return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
-    except Exception as e:
-        logger.error(f"Stats update error: {e}")
-        return "Error", "Error", "0.00"
     def on_scan(repo):
         try:
@@ -1316,21 +1180,10 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
             logger.error(f"Scan error: {e}")
             return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
-    def on_boot(repo, file):
-        try:
-            if not repo or not file:
-                yield "🔴 ERROR: Repository and filename required", gr.update()
-                return
-            yield "⚙️ System: Initiating boot sequence...", gr.update()
-            time.sleep(0.5)
-            result = kernel.boot_kernel(repo, file, session_id)
-            yield result, gr.update()
-        except Exception as e:
-            logger.error(f"Boot UI error: {e}")
-            yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
     def on_batch_upgrade():
         success, msg = token_manager.purchase_batch_upgrade(session_id)
@@ -1359,7 +1212,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
     # Event handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
-    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
     # Token purchases
     batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])

             logger.error(f"[BOOT] Unexpected error: {e}")
             nuclear_ram_clear()
             return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
         """Prime KV cache with ghost context"""
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Primed"
+    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, profile: gr.OAuthProfile | None) -> Generator:
         username = profile.username if profile else "anonymous"
         # Update activity timestamp
         self.update_activity()
     # --- UI LOGIC ---
     def update_stats(profile: gr.OAuthProfile | None):
         try:
+            m = ResourceMonitor.get_metrics()
+            current_user = profile.username if profile else "anonymous"
+            balance = token_manager.get_balance(current_user)
+            return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
+        except Exception as e:
+            logger.error(f"Stats update error: {e}")
+            return "Error", "Error", "0.00"
     def on_scan(repo):
         try:
             logger.error(f"Scan error: {e}")
             return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
+    def on_boot(repo, file, profile: gr.OAuthProfile | None):
+        username = profile.username if profile else "anonymous"
+        result = kernel.boot_kernel(repo, file, username)
+        return result
     def on_batch_upgrade():
         success, msg = token_manager.purchase_batch_upgrade(session_id)
     # Event handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
+    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status])
     # Token purchases
     batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])