Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,152 +1,115 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import json
|
| 3 |
-
import time
|
| 4 |
-
import psutil
|
| 5 |
-
import threading
|
| 6 |
import gradio as gr
|
| 7 |
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
from llama_cpp import Llama
|
| 9 |
|
| 10 |
-
#
|
| 11 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 12 |
LOG_FILE = "engine_telemetry.json"
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
class ZeroEngine:
|
| 17 |
def __init__(self):
|
| 18 |
self.llm = None
|
| 19 |
-
self.lock = threading.Lock()
|
| 20 |
-
self.active_repo = None
|
| 21 |
-
self.telemetry = self._load_telemetry()
|
| 22 |
self.api = HfApi(token=HF_TOKEN)
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
def
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
return json.load(f)
|
| 28 |
-
return {"load_count": {}, "popular_quants": []}
|
| 29 |
-
|
| 30 |
-
def _sync_telemetry(self):
|
| 31 |
-
if not HF_TOKEN: return
|
| 32 |
-
with open(LOG_FILE, "w") as f:
|
| 33 |
-
json.dump(self.telemetry, f)
|
| 34 |
-
try:
|
| 35 |
-
repo_id = os.environ.get("SPACE_ID")
|
| 36 |
-
if repo_id:
|
| 37 |
-
self.api.upload_file(path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE, repo_id=repo_id, repo_type="space")
|
| 38 |
-
except Exception: pass
|
| 39 |
-
|
| 40 |
-
def get_system_status(self):
|
| 41 |
-
mem = psutil.virtual_memory()
|
| 42 |
-
return {
|
| 43 |
-
"ram_used": round(mem.used / (1024**3), 2),
|
| 44 |
-
"ram_total": round(mem.total / (1024**3), 2),
|
| 45 |
-
"cpu_pct": psutil.cpu_percent()
|
| 46 |
-
}
|
| 47 |
|
| 48 |
-
def
|
|
|
|
| 49 |
path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
|
| 50 |
-
|
| 51 |
-
total_ram = psutil.virtual_memory().total / (1024**3)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
with self.lock:
|
| 57 |
if self.llm: del self.llm
|
| 58 |
self.llm = Llama(
|
| 59 |
-
model_path=path,
|
| 60 |
-
|
| 61 |
-
n_threads=1, # One core per slot (2 concurrent max)
|
| 62 |
-
use_mmap=True,
|
| 63 |
-
logits_all=False,
|
| 64 |
-
verbose=False
|
| 65 |
)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
tokens = self.llm.tokenize(text.encode("utf-8"))
|
| 75 |
-
# Eval only, no generation. Internal prefix_matching handles the 'stitching'.
|
| 76 |
try:
|
| 77 |
-
self.llm.eval(tokens)
|
| 78 |
-
return "⚡
|
| 79 |
except Exception:
|
| 80 |
-
return "⚠ Cache
|
| 81 |
-
|
| 82 |
-
def chat(self, message, history, ghost_text):
|
| 83 |
-
if not self.llm:
|
| 84 |
-
yield history + [{"role": "assistant", "content": "Engine Offline. Please load a model."}]
|
| 85 |
-
return
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
delta = chunk["choices"][0]["delta"]
|
| 98 |
-
if "content" in delta:
|
| 99 |
-
response += delta["content"]
|
| 100 |
-
yield history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]
|
| 101 |
|
| 102 |
engine = ZeroEngine()
|
| 103 |
|
| 104 |
-
#
|
| 105 |
-
with gr.Blocks(theme=
|
| 106 |
-
gr.Markdown("# 🛰️ ZeroEngine
|
| 107 |
|
| 108 |
with gr.Row():
|
| 109 |
-
with gr.Column(scale=
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
with gr.Sidebar(label="System Dashboard", open=True) as sidebar:
|
| 114 |
-
gr.Markdown("### 📊 Resource Monitor")
|
| 115 |
-
ram_stat = gr.Markdown("RAM: --")
|
| 116 |
-
cpu_stat = gr.Markdown("CPU: --")
|
| 117 |
|
| 118 |
-
|
| 119 |
-
gr.
|
| 120 |
repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
|
| 121 |
-
file_drop = gr.Dropdown(label="
|
| 122 |
-
|
| 123 |
-
load_btn = gr.Button("ACTIVATE", variant="primary")
|
| 124 |
-
engine_log = gr.Markdown("Status: Ready")
|
| 125 |
|
| 126 |
gr.Markdown("---")
|
| 127 |
-
gr.Markdown("### 👻 Ghost Terminal")
|
| 128 |
-
ghost_in = gr.Textbox(label="Pre-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
def
|
| 134 |
-
|
| 135 |
-
|
|
|
|
| 136 |
|
| 137 |
def scan(repo):
|
| 138 |
files = engine.api.list_repo_files(repo_id=repo)
|
| 139 |
ggufs = [f for f in files if f.endswith(".gguf")]
|
| 140 |
return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
demo.queue().launch()
|
|
|
|
| 1 |
+
import os, json, psutil, threading, time
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from huggingface_hub import HfApi, hf_hub_download
|
| 4 |
from llama_cpp import Llama
|
| 5 |
|
| 6 |
# CONFIG
HF_TOKEN = os.environ.get("HF_TOKEN")  # HF auth token; telemetry sync is skipped when unset
LOG_FILE = "engine_telemetry.json"     # local filename AND in-repo path for telemetry JSON
RAM_LIMIT = 0.50      # 50% Max per model (fraction of total system RAM)
SYSTEM_RESERVE = 200  # MB kept free for the OS / Gradio process
|
| 11 |
|
| 12 |
class ZeroEngine:
    """Wrapper around a single llama.cpp model slot.

    Responsibilities:
      - RAM "gatekeeping": refuse to load GGUF files that would exceed the
        per-model RAM budget or starve the system reserve.
      - KV-cache pre-warming ("ghost stitching") so queued users can pre-type
        a prompt while they wait.
      - Best-effort telemetry upload back to the hosting Space repo.
    """

    def __init__(self):
        self.llm = None                      # active Llama instance; None while offline
        self.api = HfApi(token=HF_TOKEN)
        self.lock = threading.Lock()         # guards model swap in load_model
        self.ghost_cache = {}                # text -> pre-filled token count

    def get_mem(self):
        """Return (available_mb, total_mb) of system RAM."""
        m = psutil.virtual_memory()
        return m.available / (1024**2), m.total / (1024**2)

    def load_model(self, repo, file):
        """Download `file` from `repo` and boot it, enforcing RAM limits.

        Always returns a human-readable status string; never raises to the UI.
        """
        avail, total = self.get_mem()
        path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
        size_mb = os.path.getsize(path) / (1024**2)

        # GATEKEEPER RULES
        if size_mb > (total * RAM_LIMIT):
            return f"❌ DECLINED: {size_mb:.0f}MB exceeds 50% RAM limit."
        if (size_mb + SYSTEM_RESERVE) > avail:
            return f"❌ DECLINED: Insufficient RAM (Need {SYSTEM_RESERVE}MB buffer)."

        with self.lock:
            # Drop the old model first so its memory can be reclaimed.
            # NOTE: `del self.llm` would remove the attribute entirely and
            # break later `if not self.llm` checks if construction fails.
            self.llm = None
            try:
                self.llm = Llama(
                    model_path=path, n_ctx=2048, n_threads=1,  # Hard core partitioning
                    use_mmap=True, logits_all=False, verbose=False
                )
            except Exception as e:
                # Keep the status-string contract instead of crashing the handler.
                return f"❌ Engine boot failed: {e}"
        self.sync_telemetry(file)
        return f"✅ Engine Online: {file}"

    def ghost_stitch(self, text):
        """Processes queue requests in background to prime the KV-Cache."""
        if not self.llm or not text: return "Idle"
        # The 'eval' call populates the internal KV cache.
        # llama-cpp-python's prefix matching handles the 'stitching' automatically.
        # NOTE(review): eval is not serialized against chat streaming — assumed
        # acceptable for this demo; confirm if corruption is observed.
        tokens = self.llm.tokenize(text.encode("utf-8"))
        try:
            self.llm.eval(tokens)  # Pre-process tokens
            self.ghost_cache[text] = len(tokens)  # record pre-filled token count
            return f"⚡ Cache Primed ({len(tokens)} tokens)"
        except Exception:
            return "⚠ Cache Saturated"

    def sync_telemetry(self, filename):
        """Best-effort: write a small telemetry JSON and push it to the Space repo."""
        if not HF_TOKEN: return
        data = {"last_load": filename, "time": time.time()}
        with open(LOG_FILE, "w") as f:
            json.dump(data, f)
        repo_id = os.environ.get("SPACE_ID")
        if not repo_id:
            return  # not running inside a Space; nothing to sync to
        try:
            self.api.upload_file(
                path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE,
                repo_id=repo_id, repo_type="space"
            )
        except Exception:
            pass  # telemetry must never break model loading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
engine = ZeroEngine()

# UI
with gr.Blocks(theme="shivi/calm_sea", fill_height=True) as demo:
    gr.Markdown("# 🛰️ ZeroEngine V0.1")

    with gr.Row():
        with gr.Column(scale=4):
            chat = gr.Chatbot(type="messages", height=500)
            msg = gr.Textbox(placeholder="Active Slot Input...", label="Command")

    with gr.Sidebar(label="Engine Room") as sb:
        ram_bar = gr.Label(label="RAM Usage")
        repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
        file_drop = gr.Dropdown(label="Quant File")
        load_btn = gr.Button("BOOT ENGINE", variant="primary")
        # FIX: surface load_model's ✅/❌ status string (it was previously
        # discarded by wiring the .then() outputs to None).
        boot_stat = gr.Markdown("Status: Ready")

        gr.Markdown("---")
        gr.Markdown("### 👻 Ghost Terminal (Queue)")
        ghost_in = gr.Textbox(label="Pre-type Prompt", placeholder="While you wait...")
        ghost_stat = gr.Markdown("Cache: Empty")
        stitch_btn = gr.Button("Warm Up Cache", size="sm")

    # Handlers
    def update_ram():
        """Feed the RAM gauge with a label -> value (MB) mapping."""
        avail, total = engine.get_mem()
        used = total - avail
        return {"Used (MB)": used, "Free (MB)": avail}

    def scan(repo):
        """List .gguf files in the repo and preselect the first one."""
        files = engine.api.list_repo_files(repo_id=repo)
        ggufs = [f for f in files if f.endswith(".gguf")]
        return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)

    def run_chat(m, h, g):
        """Stream a chat completion; ghost text `g` is prepended as context."""
        if not engine.llm:
            yield h + [{"role": "assistant", "content": "Load model first."}]
            return
        full_p = f"{g}\n{m}" if g else m
        resp = ""
        for chunk in engine.llm.create_chat_completion(
            messages=[{"role": "user", "content": full_p}], stream=True
        ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                resp += delta["content"]
                yield h + [{"role": "user", "content": m},
                           {"role": "assistant", "content": resp}]

    demo.load(update_ram, None, ram_bar, every=2)  # poll RAM gauge every 2s
    load_btn.click(scan, [repo_in], [file_drop]).then(
        engine.load_model, [repo_in, file_drop], [boot_stat]
    )
    stitch_btn.click(engine.ghost_stitch, [ghost_in], [ghost_stat])
    msg.submit(run_chat, [msg, chat, ghost_in], [chat], concurrency_limit=2)

demo.queue().launch()
|