Create app.py
app.py (ADDED, 152 lines)

import os
import json
import time
import psutil
import threading
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download
from llama_cpp import Llama

# System-level Constants
HF_TOKEN = os.environ.get("HF_TOKEN")
LOG_FILE = "engine_telemetry.json"
RAM_SAFETY_THRESHOLD = 0.50  # 50% limit for model weights
SYSTEM_RESERVE_MB = 200

class ZeroEngine:
    def __init__(self):
        self.llm = None
        self.lock = threading.Lock()
        self.active_repo = None
        self.telemetry = self._load_telemetry()
        self.api = HfApi(token=HF_TOKEN)

    def _load_telemetry(self):
        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, "r") as f:
                return json.load(f)
        return {"load_count": {}, "popular_quants": []}

    def _sync_telemetry(self):
        if not HF_TOKEN:
            return
        with open(LOG_FILE, "w") as f:
            json.dump(self.telemetry, f)
        try:
            repo_id = os.environ.get("SPACE_ID")
            if repo_id:
                self.api.upload_file(path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE, repo_id=repo_id, repo_type="space")
        except Exception:
            pass

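    # Illustrative note (comments only, not executed): the telemetry persisted above is a
    # small JSON document. Based on the default returned by _load_telemetry(), after a few
    # model loads it would look roughly like:
    #   {"load_count": {"some-model-q4_k_m.gguf": 3}, "popular_quants": []}
    # The filename shown is hypothetical; "popular_quants" is reserved but nothing in this
    # file populates it yet.
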
    def get_system_status(self):
        mem = psutil.virtual_memory()
        return {
            "ram_used": round(mem.used / (1024**3), 2),
            "ram_total": round(mem.total / (1024**3), 2),
            "cpu_pct": psutil.cpu_percent()
        }

    def load_engine(self, repo, file):
        path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
        file_size_gb = os.path.getsize(path) / (1024**3)
        total_ram = psutil.virtual_memory().total / (1024**3)

        if file_size_gb > (total_ram * RAM_SAFETY_THRESHOLD):
            return f"⚠ DECLINED: Model size ({file_size_gb:.2f}GB) exceeds 50% RAM limit."

        with self.lock:
            if self.llm:
                # Release the previous engine before loading a new one, but keep the
                # attribute defined in case Llama() below raises.
                del self.llm
                self.llm = None
            self.llm = Llama(
                model_path=path,
                n_ctx=4096,
                n_threads=1,  # One core per slot (2 concurrent max)
                use_mmap=True,
                logits_all=False,
                verbose=False
            )
            self.active_repo = repo
            self.telemetry["load_count"][file] = self.telemetry["load_count"].get(file, 0) + 1
            self._sync_telemetry()
            return f"✅ Engine Active: {file}"

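    # Worked example (assumption: a free CPU Space with roughly 16 GB of RAM): the 0.50
    # threshold above declines any GGUF larger than about 8 GB, so a ~0.8 GB Q4_K_M quant
    # of a 1B model passes easily while a 13B Q8_0 (~14 GB) is rejected before llama.cpp
    # ever maps it. use_mmap=True keeps accepted weights memory-mapped rather than fully
    # copied into the process heap.
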
    def ghost_prefill(self, text):
        """KV-Cache Stitching: Pre-evaluates tokens to warm the cache."""
        if not self.llm or not text:
            return "Cache: Idle"  # keep the status Markdown meaningful instead of returning None
        tokens = self.llm.tokenize(text.encode("utf-8"))
        # Eval only, no generation. Internal prefix matching handles the 'stitching'.
        try:
            self.llm.eval(tokens)
            return "⚡ Ghost Cache Primed"
        except Exception:
            return "⚠ Cache Overflow"

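    # Illustrative usage sketch (comments only, not executed). The repo name is the UI
    # default below; the exact .gguf filename is hypothetical:
    #   engine.load_engine("unsloth/Llama-3.2-1B-Instruct-GGUF", "<some-quant>.gguf")
    #   engine.ghost_prefill("Summarize the following incident report:")  # warm the KV cache
    # A later chat() call can then reuse the cached prefix via llama.cpp's prefix matching,
    # to the extent the templated chat prompt shares leading tokens with what was pre-evaluated.
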
    def chat(self, message, history, ghost_text):
        if not self.llm:
            yield history + [{"role": "assistant", "content": "Engine Offline. Please load a model."}]
            return

        # Combine ghost-prefilled context with new message
        full_input = f"{ghost_text}\n{message}" if ghost_text else message
        response = ""

        # Use streaming with high-speed settings
        for chunk in self.llm.create_chat_completion(
            messages=[{"role": "user", "content": full_input}],
            stream=True,
            max_tokens=1024
        ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                response += delta["content"]
                yield history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]


engine = ZeroEngine()

# --- Gradio UI Design ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="slate", radius_size="none"), fill_height=True) as demo:
    gr.Markdown("# 🛰️ ZeroEngine Kernel V0.1")

    with gr.Row():
        with gr.Column(scale=9):
            chat_interface = gr.Chatbot(type="messages", label="Active Slot Output", height=600)
            msg_input = gr.Textbox(placeholder="Enter command...", label="Primary Input")

    with gr.Sidebar(label="System Dashboard", open=True) as sidebar:
        gr.Markdown("### 📊 Resource Monitor")
        ram_stat = gr.Markdown("RAM: --")
        cpu_stat = gr.Markdown("CPU: --")

        gr.Markdown("---")
        gr.Markdown("### 🛠 Engine Configuration")
        repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
        file_drop = gr.Dropdown(label="Quantization", choices=[])
        scan_btn = gr.Button("Scan Manifest")
        load_btn = gr.Button("ACTIVATE", variant="primary")
        engine_log = gr.Markdown("Status: Ready")

        gr.Markdown("---")
        gr.Markdown("### 👻 Ghost Terminal")
        ghost_in = gr.Textbox(label="Pre-Warm Input (Queue)", placeholder="Type here while waiting...")
        ghost_status = gr.Markdown("Cache: Idle")
        ghost_btn = gr.Button("Stitch Cache", size="sm")

    # --- Logic ---
    def update_sys():
        s = engine.get_system_status()
        return f"**RAM:** {s['ram_used']}GB / {s['ram_total']}GB", f"**CPU:** {s['cpu_pct']}%"

    def scan(repo):
        files = engine.api.list_repo_files(repo_id=repo)
        ggufs = [f for f in files if f.endswith(".gguf")]
        return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)

    # Event Wiring
    demo.load(update_sys, None, [ram_stat, cpu_stat], every=2)
    scan_btn.click(scan, [repo_in], [file_drop])
    load_btn.click(engine.load_engine, [repo_in, file_drop], [engine_log])
    ghost_btn.click(engine.ghost_prefill, [ghost_in], [ghost_status])

    msg_input.submit(engine.chat, [msg_input, chat_interface, ghost_in], [chat_interface], concurrency_limit=2)
    msg_input.submit(lambda: "", None, [msg_input])  # Reset active input
    msg_input.submit(lambda: "", None, [ghost_in])  # Clear ghost buffer after stitching

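    # Design note: concurrency_limit=2 on the chat event pairs with n_threads=1 in
    # load_engine, so at most two single-threaded generations run at once. That matches
    # a small CPU Space (assumed here to have about 2 vCPUs); raise both together if
    # more cores are available.
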
demo.queue().launch()