Update app.py
app.py
CHANGED
@@ -1,26 +1,24 @@
-"""
-ZEROENGINE KERNEL V0.1
-Target SDK: Gradio 6.5.0
-Optimized for: 2 vCPU / 16GB RAM
-Features: KV-Cache Stitching, Hard Partitioning, Resource Gatekeeper, Ghost Terminal
-"""
-
 import os
 import json
 import time
 import psutil
 import threading
 import logging
+import pytz
 from datetime import datetime
 from typing import List, Dict, Optional, Generator
 
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
-from llama_cpp import Llama
 
-
-
-
+try:
+    from llama_cpp import Llama
+except ImportError:
+    try:
+        from llama_cpp_pydist import Llama
+    except ImportError:
+        Llama = None
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
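Note on the import changes: pytz enters the requirements solely to stamp telemetry with an aware UTC time (see the session_start hunk below), and the hard llama_cpp import becomes a guarded chain so a missing wheel degrades to Llama = None instead of crashing the Space at import time. A dependency-free sketch of the same UTC stamp, using only the standard library:

    # Equivalent aware-UTC timestamp without adding pytz to requirements.txt
    from datetime import datetime, timezone

    session_start = str(datetime.now(timezone.utc))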
@@ -32,11 +30,7 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
 
-# ==========================================
-# CORE TELEMETRY & PERSISTENCE
-# ==========================================
 class TelemetryManager:
-    """Handles JSON-based usage tracking and HF Space persistence."""
     def __init__(self, api: HfApi):
         self.api = api
         self.stats = self._load_initial_stats()
@@ -49,7 +43,7 @@ class TelemetryManager:
         except Exception as e:
             logger.error(f"Failed to load telemetry: {e}")
         return {
-            "session_start": str(datetime.now()),
+            "session_start": str(datetime.now(pytz.utc)),
             "load_count": {},
             "total_tokens_generated": 0,
             "popular_repos": []
@@ -75,19 +69,13 @@ class TelemetryManager:
                 repo_id=SPACE_ID,
                 repo_type="space"
             )
-            logger.info("Telemetry synced to Space repository.")
         except Exception as e:
             logger.warning(f"Telemetry sync failed: {e}")
 
-# ==========================================
-# RESOURCE GATEKEEPER
-# ==========================================
 class ResourceMonitor:
-    """Monitors vCPU and RAM to prevent Kernel Panics."""
     @staticmethod
     def get_metrics() -> Dict:
         vm = psutil.virtual_memory()
-        cpu_freq = psutil.cpu_freq()
         return {
             "ram_used_gb": round(vm.used / (1024**3), 2),
             "ram_avail_gb": round(vm.available / (1024**3), 2),
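The truncated call above this hunk is presumably HfApi.upload_file writing the telemetry JSON back into the Space's own repo so stats survive restarts. A self-contained sketch under that assumption (the string constants mirror LOG_FILE and SPACE_ID defined at the top of app.py):

    # Hedged sketch: persist stats by committing the JSON into the Space repo.
    import json
    from huggingface_hub import HfApi

    def sync_telemetry(api: HfApi, stats: dict) -> None:
        with open("engine_telemetry.json", "w") as f:   # LOG_FILE
            json.dump(stats, f)
        api.upload_file(
            path_or_fileobj="engine_telemetry.json",
            path_in_repo="engine_telemetry.json",
            repo_id="<user>/<space>",                   # SPACE_ID in app.py
            repo_type="space",
        )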
@@ -112,9 +100,6 @@ class ResourceMonitor:
 
         return True, "Resource check passed."
 
-# ==========================================
-# THE ZEROENGINE KERNEL
-# ==========================================
 class ZeroEngine:
     def __init__(self):
         self.api = HfApi(token=HF_TOKEN)
@@ -129,15 +114,14 @@ class ZeroEngine:
             files = self.api.list_repo_files(repo_id=repo_id)
             return [f for f in files if f.endswith(".gguf")]
         except Exception as e:
-            logger.error(f"HF API Error: {e}")
             return []
 
     def boot_kernel(self, repo: str, filename: str) -> str:
-        """Downloads and initializes the llama-cpp-python instance."""
         try:
-
-
+            if Llama is None:
+                return "🔴 KERNEL ERROR: llama-cpp-python not installed correctly."
 
+            path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
             valid, msg = ResourceMonitor.validate_deployment(path)
             if not valid:
                 return msg
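The two added lines are the heart of the fix: boot now fails soft when neither llama_cpp wheel imported, and the GGUF is fetched with hf_hub_download, which caches under ~/.cache/huggingface and returns a local path, so re-booting the same quant does not re-download. Standalone sketch with illustrative repo values:

    from huggingface_hub import hf_hub_download

    # Illustrative repo/filename; DEFAULT_QUANT in app.py names the same quant.
    path = hf_hub_download(
        repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
        filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf",
        token=None,  # only needed for gated or private repos
    )
    print(path)      # local cached file, handed on to validate_deployment()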
@@ -148,11 +132,10 @@ class ZeroEngine:
 
         self.llm = Llama(
             model_path=path,
-            n_ctx=
-            n_threads=
+            n_ctx=2048,
+            n_threads=2,
             use_mmap=True,
             n_batch=512,
-            last_n_tokens_size=64,
             verbose=False
         )
         self.active_model_info = {"repo": repo, "file": filename}
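Sizing note on the now hard-coded n_ctx=2048 and n_threads=2: two threads matches the 2 vCPU budget (oversubscribing threads on a shared CPU Space tends to lower tokens/sec rather than raise it), and the KV cache at 2048 tokens is small for a 1B model. Back-of-envelope sketch, assuming Llama-3.2-1B's published shape (16 layers, 8 KV heads, head dim 64) and an f16 cache; the figures are illustrative, not read from the model file:

    # Approximate KV-cache footprint at n_ctx=2048 for a Llama-3.2-1B-class model.
    n_layers, n_kv_heads, head_dim, fp16_bytes = 16, 8, 64, 2
    per_token = 2 * n_layers * n_kv_heads * head_dim * fp16_bytes  # K and V
    print(per_token)                  # 32768 bytes, i.e. 32 KiB per token
    print(per_token * 2048 / 2**20)   # ~64 MiB total, comfortably inside 16 GB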
@@ -163,7 +146,6 @@ class ZeroEngine:
             return f"🔴 BOOT FAILURE: {str(e)}"
 
     def stitch_cache(self, ghost_text: str) -> str:
-        """KV-CACHE STITCHING: Pre-processes queue tokens in background."""
         if not self.llm or not ghost_text:
             return "Kernel Idle"
 
@@ -175,9 +157,8 @@ class ZeroEngine:
         try:
             tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
             self.llm.eval(tokens)
-
-        except Exception as e:
-            logger.error(f"Stitching failed: {e}")
+        except Exception:
+            pass
         finally:
             self.is_prefilling = False
 
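One review-style caveat on this hunk: replacing the old logger.error with a bare except/pass makes a failed prefill invisible, so "Ghost Cache Primed" can be reported even when nothing was evaluated. A sketch that keeps the quiet UX but leaves a breadcrumb (same structure as the committed code, with a debug-level log swapped in):

    # Sketch: stay quiet for users, but keep stitch failures visible in logs.
    try:
        tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
        self.llm.eval(tokens)            # pre-fills the KV cache
    except Exception as e:
        logger.debug(f"Ghost-cache stitching skipped: {e}")
    finally:
        self.is_prefilling = False       # always release the guard flag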
@@ -185,15 +166,12 @@ class ZeroEngine:
         return "⚡ Ghost Cache Primed"
 
     def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
-        """Main chat generator using prefix-matched context."""
         if not self.llm:
             yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
             return
 
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
-
         formatted_prompt = f"User: {full_input}\nAssistant: "
-
         response_text = ""
         start_time = time.time()
         tokens_count = 0
@@ -210,7 +188,6 @@ class ZeroEngine:
                 token = chunk["choices"][0]["text"]
                 response_text += token
                 tokens_count += 1
-
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
 
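For context on the tokens/sec arithmetic: llama-cpp-python streams OpenAI-shaped chunks when called with stream=True, which is presumably what the enclosing loop iterates. Minimal sketch, assuming llm is a loaded Llama instance:

    import time

    start, n_tokens, text = time.time(), 0, ""
    for chunk in llm(formatted_prompt, max_tokens=512, stream=True):
        text += chunk["choices"][0]["text"]   # one decoded piece per chunk
        n_tokens += 1
    elapsed = time.time() - start
    tps = round(n_tokens / elapsed, 1) if elapsed > 0 else 0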
@@ -224,19 +201,9 @@ class ZeroEngine:
         except Exception as e:
             yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
 
-# ==========================================
-# GRADIO INTERFACE (DASHBOARD)
-# ==========================================
 kernel = ZeroEngine()
 
-
-
-with gr.Blocks(
-    title="ZeroEngine Kernel",
-    theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
-    css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
-) as demo:
-
+with gr.Blocks(title="ZeroEngine Kernel") as demo:
     gr.HTML("""
         <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
             <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
@@ -247,7 +214,6 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
-                type="messages",
                 label="Active Slot Inference",
                 height=650,
                 show_label=False,
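Compatibility note: this hunk drops type="messages" from the Chatbot while inference_generator still yields {"role": ..., "content": ...} dicts. That only works if the installed Gradio defaults Chatbot to the messages format; on older releases that default to tuple pairs, the history would render incorrectly. The explicit spelling costs nothing:

    # Keep the history format explicit to match the dicts the generator yields.
    chat_box = gr.Chatbot(type="messages", show_label=False, height=650)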
@@ -295,12 +261,9 @@ with gr.Blocks(
         gr.Markdown("### 📉 System Logs")
         log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")
 
-    # --- UI LOGIC ---
     def update_system_stats():
         m = ResourceMonitor.get_metrics()
-        ram_str = f"{m['ram_used_gb']} / {m['ram_total_gb']} GB"
-        cpu_str = f"{m['cpu_usage_pct']}%"
-        return ram_str, cpu_str
+        return f"{m['ram_used_gb']} / {m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
 
     def on_scan(repo):
         files = kernel.list_ggufs(repo)
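The collapsed return consumes two keys (ram_total_gb, cpu_usage_pct) that come from parts of get_metrics() not shown in this diff; a self-contained sketch of how psutil would supply all four:

    import psutil

    vm = psutil.virtual_memory()
    metrics = {
        "ram_used_gb": round(vm.used / (1024**3), 2),
        "ram_avail_gb": round(vm.available / (1024**3), 2),
        "ram_total_gb": round(vm.total / (1024**3), 2),
        "cpu_usage_pct": psutil.cpu_percent(interval=None),  # non-blocking sample
    }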
@@ -318,26 +281,19 @@ with gr.Blocks(
         return f"Cache State: `{res}`"
 
     demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
-
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
-
-    boot_btn.click(
-        on_boot,
-        [repo_input, quant_dropdown],
-        [boot_status, sidebar]
-    )
-
+    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, sidebar])
     stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
 
     input_args = [user_input, chat_box, ghost_buffer]
     user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
     send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
-
     user_input.submit(lambda: "", None, [user_input])
     user_input.submit(lambda: "", None, [ghost_buffer])
 
-# ==========================================
-# KERNEL EXECUTION
-# ==========================================
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(
+    demo.queue(max_size=20).launch(
+        show_api=False,
+        theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
+        css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
+    )
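One likely bug in the relocation above: Blocks.launch() does not document theme or css parameters; those belong to the gr.Blocks() constructor, so as committed the launch call would likely fail with a TypeError (show_api is the only one of the three that launch() takes). Corrected sketch of the intended split:

    # Styling stays on the constructor; launch() keeps launch-time options only.
    with gr.Blocks(
        title="ZeroEngine Kernel",
        theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
        css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}",
    ) as demo:
        ...

    if __name__ == "__main__":
        demo.queue(max_size=20).launch(show_api=False)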