Spaces:

turtle170
/

ZeroEngine

Running

App Files Files Community

turtle170 commited on 5 days ago

Commit

022b660

verified ·

1 Parent(s): 0667b53

Update app.py

Browse files

Files changed (1) hide show

app.py +197 -51

app.py CHANGED Viewed

@@ -26,11 +26,23 @@ except ImportError:
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
-RAM_LIMIT_PCT = 0.85  # Increased from 0.50 to prevent false rejections
-SYSTEM_RESERVE_MB = 500  # Increased reserve
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
@@ -41,41 +53,21 @@ class TelemetryManager:
         self.stats = self._load_initial_stats()
     def _load_initial_stats(self) -> Dict:
-        if os.path.exists(LOG_FILE):
-            try:
-                with open(LOG_FILE, "r", encoding="utf-8") as f:
-                    return json.load(f)
-            except Exception:
-                pass
         return {
             "session_start": str(datetime.now(pytz.utc)),
             "load_count": {},
-            "total_tokens_generated": 0,
-            "popular_repos": []
         }
     def track_load(self, repo: str, filename: str):
         key = f"{repo}/{filename}"
         self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
-        self._sync_to_cloud()
     def track_generation(self, tokens: int):
         self.stats["total_tokens_generated"] += tokens
-    def _sync_to_cloud(self):
-        if not HF_TOKEN or not SPACE_ID:
-            return
-        try:
-            with open(LOG_FILE, "w", encoding="utf-8") as f:
-                json.dump(self.stats, f, indent=4)
-            self.api.upload_file(
-                path_or_fileobj=LOG_FILE,
-                path_in_repo=LOG_FILE,
-                repo_id=SPACE_ID,
-                repo_type="space"
-            )
-        except Exception as e:
-            logger.error(f"Sync Failure: {e}")
 # --- RESOURCE MONITOR ---
 class ResourceMonitor:
@@ -119,6 +111,59 @@ class ZeroEngine:
         self.active_model_info = {"repo": "", "file": ""}
         self.kernel_lock = threading.Lock()
         self.is_prefilling = False
     def list_ggufs(self, repo_id: str) -> List[str]:
         try:
@@ -131,7 +176,7 @@ class ZeroEngine:
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
-        """Boot kernel with proper error handling to prevent space crashes"""
         try:
             if not repo or not filename:
                 return "🔴 ERROR: Repository or filename missing"
@@ -157,9 +202,13 @@ class ZeroEngine:
                 logger.warning(f"[BOOT] Validation failed: {msg}")
                 return f"🔴 VALIDATION FAILED: {msg}"
-            logger.info("[BOOT] Validation passed, initializing model...")
-            # Load model with proper cleanup
             with self.kernel_lock:
                 # Clear previous model
                 if self.llm:
@@ -170,22 +219,51 @@ class ZeroEngine:
                     except Exception as e:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
-                # Initialize new model with conservative settings
                 try:
-                    logger.info("[BOOT] Loading model into memory...")
                     self.llm = Llama(
                         model_path=path,
-                        n_ctx=2048,
-                        n_threads=2,
-                        use_mmap=True,  # Critical: memory map to reduce RAM usage
-                        n_batch=256,    # Reduced from 512 to be safer
-                        n_gpu_layers=0, # Force CPU only
-                        verbose=False
                     )
                     self.active_model_info = {"repo": repo, "file": filename}
                     self.telemetry.track_load(repo, filename)
-                    logger.info("[BOOT] Model loaded successfully!")
-                    return f"🟢 KERNEL ONLINE: {filename}"
                 except Exception as e:
                     logger.error(f"[BOOT] Model loading failed: {e}")
                     self.llm = None
@@ -213,13 +291,42 @@ class ZeroEngine:
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Ghost Cache Primed"
-    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
         if not self.llm:
-            history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."})
             yield history
             return
-        # Prepare input
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
         formatted_prompt = f"User: {full_input}\nAssistant: "
@@ -231,13 +338,26 @@ class ZeroEngine:
         response_text = ""
         start_time = time.time()
         tokens_count = 0
         try:
             stream = self.llm(
-                formatted_prompt,
-                max_tokens=1024,
-                stop=["User:", "<|eot_id|>", "\n\n"],
-                stream=True
             )
             for chunk in stream:
@@ -245,14 +365,39 @@ class ZeroEngine:
                 response_text += token
                 tokens_count += 1
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
-                # Update history with streaming content
-                history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`"
                 yield history
             self.telemetry.track_generation(tokens_count)
         except Exception as e:
             logger.error(f"Inference error: {e}")
             history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
@@ -366,7 +511,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
                 🛰️ ZEROENGINE V0.1
             </h1>
             <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
-                Gradio 6.5.0 Production Build | Smooth Rounded UI
             </p>
         </div>
     """)
@@ -388,7 +533,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
                     container=False,
                     scale=9
                 )
-                send_btn = gr.Button("EXE", variant="primary", scale=1)
         with gr.Column(scale=3):
             gr.Markdown("### 🛠️ Hardware Status")
@@ -478,7 +623,8 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
         [stitch_status]
     )
-    inference_args = [user_input, chat_box, ghost_buffer]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])
     send_btn.click(kernel.inference_generator, inference_args, [chat_box])
     user_input.submit(lambda: "", None, [user_input])

 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
+RAM_LIMIT_PCT = 0.85
+SYSTEM_RESERVE_MB = 500
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
+# --- SPEED OPTIMIZATION CONFIG ---
+FLASH_ATTENTION = True          # Enable Flash Attention 2
+KV_CACHE_QUANTIZATION = True    # Quantize KV cache (4-bit)
+CONTINUOUS_BATCHING = True      # Enable continuous batching
+SPECULATIVE_DECODE = False      # Disabled for CPU (requires draft model)
+MLOCK_MODEL = True              # Lock model in RAM (prevent swap)
+USE_MMAP = True                 # Memory-mapped file loading
+OFFLOAD_KQV = False             # CPU-only, no offload needed
+OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1)  # Physical cores - 1
+ROPE_SCALING = 1.0              # RoPE frequency scaling
+NUMA_OPTIMIZE = True            # NUMA-aware memory allocation
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
         self.stats = self._load_initial_stats()
     def _load_initial_stats(self) -> Dict:
+        # Simplified: no file I/O to prevent restart issues
         return {
             "session_start": str(datetime.now(pytz.utc)),
             "load_count": {},
+            "total_tokens_generated": 0
         }
     def track_load(self, repo: str, filename: str):
         key = f"{repo}/{filename}"
         self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
+        logger.info(f"Model loaded: {key} (count: {self.stats['load_count'][key]})")
     def track_generation(self, tokens: int):
         self.stats["total_tokens_generated"] += tokens
+        logger.info(f"Total tokens generated: {self.stats['total_tokens_generated']}")
 # --- RESOURCE MONITOR ---
 class ResourceMonitor:
         self.active_model_info = {"repo": "", "file": ""}
         self.kernel_lock = threading.Lock()
         self.is_prefilling = False
+        self.perf_stats = {
+            "total_tokens": 0,
+            "total_time": 0.0,
+            "avg_tps": 0.0,
+            "peak_tps": 0.0,
+            "cache_hits": 0
+        }
+        self.prompt_cache = {}  # Cache for repeated prompts
+        self.last_activity = time.time()
+        self.idle_timeout = 20  # 20 seconds idle timeout
+        self.auto_cleanup_thread = None
+        self.start_idle_monitor()
+    def start_idle_monitor(self):
+        """Start background thread to monitor idle timeout"""
+        def monitor():
+            while True:
+                time.sleep(5)  # Check every 5 seconds
+                if self.llm and (time.time() - self.last_activity) > self.idle_timeout:
+                    logger.info(f"[IDLE] No activity for {self.idle_timeout}s, unloading model...")
+                    with self.kernel_lock:
+                        if self.llm:
+                            try:
+                                del self.llm
+                                self.llm = None
+                                self.active_model_info = {"repo": "", "file": ""}
+                                logger.info("[IDLE] Model unloaded successfully")
+                            except Exception as e:
+                                logger.error(f"[IDLE] Cleanup error: {e}")
+        self.auto_cleanup_thread = threading.Thread(target=monitor, daemon=True)
+        self.auto_cleanup_thread.start()
+        logger.info("[IDLE] Idle monitor started (20s timeout)")
+    def update_activity(self):
+        """Update last activity timestamp"""
+        self.last_activity = time.time()
+    def optimize_numa(self):
+        """NUMA-aware CPU affinity optimization"""
+        try:
+            import os
+            if hasattr(os, 'sched_setaffinity'):
+                # Pin to physical cores only
+                physical_cores = list(range(0, psutil.cpu_count(logical=False)))
+                os.sched_setaffinity(0, physical_cores)
+                logger.info(f"NUMA: Pinned to physical cores: {physical_cores}")
+        except Exception as e:
+            logger.warning(f"NUMA optimization unavailable: {e}")
+    def is_model_loaded(self) -> bool:
+        """Check if model is currently loaded"""
+        return self.llm is not None
     def list_ggufs(self, repo_id: str) -> List[str]:
         try:
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
+        """HYPER-OPTIMIZED Boot kernel with all speed optimizations enabled"""
         try:
             if not repo or not filename:
                 return "🔴 ERROR: Repository or filename missing"
                 logger.warning(f"[BOOT] Validation failed: {msg}")
                 return f"🔴 VALIDATION FAILED: {msg}"
+            logger.info("[BOOT] Validation passed, applying optimizations...")
+            # Apply NUMA optimization
+            if NUMA_OPTIMIZE:
+                self.optimize_numa()
+            # Load model with MAXIMUM PERFORMANCE SETTINGS
             with self.kernel_lock:
                 # Clear previous model
                 if self.llm:
                     except Exception as e:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
+                # Calculate optimal batch size based on available RAM
+                vm = psutil.virtual_memory()
+                available_ram_gb = vm.available / (1024**3)
+                # Dynamic batch sizing: more RAM = larger batches
+                optimal_batch = min(512, int(128 * available_ram_gb / 4))
                 try:
+                    logger.info(f"[BOOT] Initializing with {OPTIMAL_THREADS} threads, batch={optimal_batch}")
+                    # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
                     self.llm = Llama(
                         model_path=path,
+                        n_ctx=4096,                    # Increased context window
+                        n_threads=OPTIMAL_THREADS,     # Optimized thread count
+                        n_threads_batch=OPTIMAL_THREADS, # Batch processing threads
+                        use_mmap=USE_MMAP,             # Memory-mapped weights (fast loading)
+                        use_mlock=MLOCK_MODEL,         # Lock in RAM (prevent swap thrashing)
+                        n_batch=optimal_batch,         # Dynamic batch size
+                        n_gpu_layers=0,                # CPU-only mode
+                        flash_attn=FLASH_ATTENTION,    # Flash Attention (2x faster)
+                        type_k=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
+                        type_v=2 if KV_CACHE_QUANTIZATION else None,  # Q4 KV cache quantization
+                        rope_scaling_type=0,           # Linear RoPE scaling
+                        rope_freq_scale=ROPE_SCALING,  # RoPE frequency scale
+                        numa=NUMA_OPTIMIZE,            # NUMA optimization
+                        verbose=False,
+                        logits_all=False,              # Only compute final logits (faster)
+                        embedding=False,               # Disable embeddings (not needed)
+                        offload_kqv=OFFLOAD_KQV,      # No offload on CPU
+                        f16_kv=False                   # Use quantized KV cache instead
                     )
                     self.active_model_info = {"repo": repo, "file": filename}
                     self.telemetry.track_load(repo, filename)
+                    # Warm-up inference to populate caches
+                    logger.info("[BOOT] Warming up model caches...")
+                    try:
+                        self.llm("Test", max_tokens=1, stream=False)
+                    except:
+                        pass
+                    logger.info("[BOOT] 🚀 HYPER-OPTIMIZED MODEL READY!")
+                    return f"🟢 KERNEL ONLINE: {filename} | Threads: {OPTIMAL_THREADS} | Batch: {optimal_batch} | Flash Attn: {FLASH_ATTENTION}"
                 except Exception as e:
                     logger.error(f"[BOOT] Model loading failed: {e}")
                     self.llm = None
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Ghost Cache Primed"
+    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
+        # Update activity timestamp
+        self.update_activity()
+        # AUTO-BOOT: If model not loaded, auto-boot default model
         if not self.llm:
+            logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
+            history.append({"role": "assistant", "content": "🔄 Auto-booting model, please wait..."})
+            yield history
+            # Use provided repo/quant or fallback to defaults
+            boot_repo = repo if repo else DEFAULT_MODEL
+            boot_quant = quant if quant else DEFAULT_QUANT
+            boot_result = self.boot_kernel(boot_repo, boot_quant)
+            if "🔴" in boot_result or "FAILED" in boot_result:
+                history[-1]["content"] = f"❌ Auto-boot failed: {boot_result}\n\nPlease manually SCAN and BOOT a model."
+                yield history
+                return
+            history[-1]["content"] = f"✅ {boot_result}\n\nProcessing your request..."
+            yield history
+            time.sleep(0.5)  # Brief pause for user to see the message
+        # Check prompt cache for exact matches (instant response)
+        cache_key = f"{ghost_context}:{prompt}"
+        if cache_key in self.prompt_cache:
+            self.perf_stats["cache_hits"] += 1
+            logger.info("⚡ CACHE HIT - Instant response!")
+            history.append({"role": "user", "content": prompt})
+            history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
             yield history
             return
+        # Prepare input with optimized formatting
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
         formatted_prompt = f"User: {full_input}\nAssistant: "
         response_text = ""
         start_time = time.time()
         tokens_count = 0
+        first_token_time = None
         try:
+            # HYPER-OPTIMIZED INFERENCE SETTINGS
             stream = self.llm(
+                formatted_prompt,
+                max_tokens=2048,                    # Increased output length
+                stop=["User:", "<|eot_id|>", "\n\n"],
+                stream=True,
+                temperature=0.7,                    # Balanced creativity
+                top_p=0.95,                         # Nucleus sampling
+                top_k=40,                           # Top-K sampling
+                repeat_penalty=1.1,                 # Prevent repetition
+                frequency_penalty=0.0,              # No frequency penalty
+                presence_penalty=0.0,               # No presence penalty
+                tfs_z=1.0,                          # Tail-free sampling
+                typical_p=1.0,                      # Typical sampling
+                mirostat_mode=2,                    # Mirostat v2 (perplexity control)
+                mirostat_tau=5.0,                   # Target perplexity
+                mirostat_eta=0.1,                   # Learning rate
             )
             for chunk in stream:
                 response_text += token
                 tokens_count += 1
+                # Track first token latency (TTFT - Time To First Token)
+                if first_token_time is None:
+                    first_token_time = time.time() - start_time
+                    logger.info(f"⚡ First token: {first_token_time*1000:.0f}ms")
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
+                # Track peak performance
+                if tps > self.perf_stats["peak_tps"]:
+                    self.perf_stats["peak_tps"] = tps
+                # Update history with streaming content + performance metrics
+                history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s`"
                 yield history
+            # Update global performance stats
+            self.perf_stats["total_tokens"] += tokens_count
+            self.perf_stats["total_time"] += elapsed
+            self.perf_stats["avg_tps"] = self.perf_stats["total_tokens"] / self.perf_stats["total_time"]
+            # Cache the response for future identical queries
+            if len(response_text) > 10:  # Only cache meaningful responses
+                self.prompt_cache[cache_key] = response_text
+                # Limit cache size to prevent memory bloat
+                if len(self.prompt_cache) > 100:
+                    oldest_key = next(iter(self.prompt_cache))
+                    del self.prompt_cache[oldest_key]
             self.telemetry.track_generation(tokens_count)
+            logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
         except Exception as e:
             logger.error(f"Inference error: {e}")
             history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
                 🛰️ ZEROENGINE V0.1
             </h1>
             <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
+                Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
             </p>
         </div>
     """)
                     container=False,
                     scale=9
                 )
+                send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
         with gr.Column(scale=3):
             gr.Markdown("### 🛠️ Hardware Status")
         [stitch_status]
     )
+    # Auto-boot enabled inference - passes repo and quant for auto-boot
+    inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])
     send_btn.click(kernel.inference_generator, inference_args, [chat_box])
     user_input.submit(lambda: "", None, [user_input])