Spaces:

turtle170
/

ZeroEngine

Running

App Files Files Community

turtle170 committed 5 days ago

Commit

dbe6259

verified ·

1 Parent(s): 0abb106

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -98

app.py CHANGED Viewed

@@ -11,14 +11,18 @@ from typing import List, Dict, Optional, Generator
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
 try:
     from llama_cpp import Llama
 except ImportError:
     try:
         from llama_cpp_pydist import Llama
     except ImportError:
-        Llama = None
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
@@ -30,6 +34,7 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
 class TelemetryManager:
     def __init__(self, api: HfApi):
         self.api = api
@@ -38,10 +43,10 @@ class TelemetryManager:
     def _load_initial_stats(self) -> Dict:
         if os.path.exists(LOG_FILE):
             try:
-                with open(LOG_FILE, "r") as f:
                     return json.load(f)
-            except Exception as e:
-                logger.error(f"Failed to load telemetry: {e}")
         return {
             "session_start": str(datetime.now(pytz.utc)),
             "load_count": {},
@@ -61,7 +66,7 @@ class TelemetryManager:
         if not HF_TOKEN or not SPACE_ID:
             return
         try:
-            with open(LOG_FILE, "w") as f:
                 json.dump(self.stats, f, indent=4)
             self.api.upload_file(
                 path_or_fileobj=LOG_FILE,
@@ -70,8 +75,9 @@ class TelemetryManager:
                 repo_type="space"
             )
         except Exception as e:
-            logger.warning(f"Telemetry sync failed: {e}")
 class ResourceMonitor:
     @staticmethod
     def get_metrics() -> Dict:
@@ -91,15 +97,13 @@ class ResourceMonitor:
         file_size_mb = os.path.getsize(file_path) / (1024**2)
         total_ram_mb = vm.total / (1024**2)
         avail_ram_mb = vm.available / (1024**2)
         if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
-            return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
         if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
-            return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
-        return True, "Resource check passed."
 class ZeroEngine:
     def __init__(self):
         self.api = HfApi(token=HF_TOKEN)
@@ -114,22 +118,21 @@ class ZeroEngine:
             files = self.api.list_repo_files(repo_id=repo_id)
             return [f for f in files if f.endswith(".gguf")]
         except Exception as e:
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
         try:
-            if Llama is None:
-                return "🔴 KERNEL ERROR: llama-cpp-python not installed correctly."
             path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
             valid, msg = ResourceMonitor.validate_deployment(path)
             if not valid:
                 return msg
             with self.kernel_lock:
                 if self.llm:
                     del self.llm
                 self.llm = Llama(
                     model_path=path,
                     n_ctx=2048,
@@ -141,159 +144,170 @@ class ZeroEngine:
                 self.active_model_info = {"repo": repo, "file": filename}
                 self.telemetry.track_load(repo, filename)
-            return f"🟢 KERNEL ONLINE: {filename} loaded successfully."
         except Exception as e:
             return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
-        if not self.llm or not ghost_text:
-            return "Kernel Idle"
-        if self.is_prefilling:
-            return "Kernel Busy"
         def _bg_eval():
             self.is_prefilling = True
             try:
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
                 self.llm.eval(tokens)
-            except Exception:
-                pass
             finally:
                 self.is_prefilling = False
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Ghost Cache Primed"
-    def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
         if not self.llm:
-            yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
             return
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
         formatted_prompt = f"User: {full_input}\nAssistant: "
         response_text = ""
         start_time = time.time()
         tokens_count = 0
         try:
             stream = self.llm(
-                formatted_prompt,
-                max_tokens=1024,
-                stop=["User:", "\n\n"],
                 stream=True
             )
             for chunk in stream:
                 token = chunk["choices"][0]["text"]
                 response_text += token
                 tokens_count += 1
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
-                yield history + [
-                    {"role": "user", "content": prompt},
-                    {"role": "assistant", "content": f"{response_text}\n\n`[{tps} t/s]`"}
-                ]
             self.telemetry.track_generation(tokens_count)
         except Exception as e:
-            yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
 kernel = ZeroEngine()
-with gr.Blocks(title="ZeroEngine Kernel") as demo:
-    gr.HTML("""
-    <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
-        <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
-        <p style="margin: 0; font-family: monospace;">STATUS: HIGH-PERFORMANCE KERNEL / VCPU-PARTITIONED</p>
-    </div>
-    """)
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
-                label="Active Slot Inference",
-                height=650,
-                show_label=False,
-                bubble_full_width=False
             )
             with gr.Row():
-                with gr.Column(scale=9):
-                    user_input = gr.Textbox(
-                        placeholder="Input command for active processing core...",
-                        label="Active Terminal",
-                        container=False
-                    )
-                with gr.Column(scale=1, min_width=50):
-                    send_btn = gr.Button("EXE", variant="primary")
-        with gr.Sidebar(label="Engine Room", open=True) as sidebar:
-            gr.Markdown("### 📊 Resource Gauges")
-            with gr.Row():
-                ram_metric = gr.Label(label="RAM Allocation", value="0/16 GB")
-                cpu_metric = gr.Label(label="CPU Load", value="0%")
             gr.Markdown("---")
-            gr.Markdown("### 🛠️ Kernel Control")
-            repo_input = gr.Textbox(label="HF Repo ID", value=DEFAULT_MODEL)
-            quant_dropdown = gr.Dropdown(label="Quantization Target", choices=[])
             with gr.Row():
-                scan_btn = gr.Button("Scan Repo", size="sm")
-                boot_btn = gr.Button("BOOT KERNEL", variant="primary", size="sm")
-            boot_status = gr.Markdown("*Standby: Kernel not initialized.*")
             gr.Markdown("---")
-            gr.Markdown("### 👻 Ghost Terminal")
             ghost_buffer = gr.Textbox(
-                label="Pre-typing Buffer (Queue)",
-                placeholder="Queue users type here to prime KV-cache...",
                 lines=3
             )
-            stitch_status = gr.Markdown("Cache State: `EMPTY`")
-            stitch_btn = gr.Button("STITCH CACHE", size="sm")
-            gr.Markdown("---")
-            gr.Markdown("### 📉 System Logs")
-            log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")
-    def update_system_stats():
         m = ResourceMonitor.get_metrics()
-        return f"{m['ram_used_gb']} / {m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
     def on_scan(repo):
         files = kernel.list_ggufs(repo)
         if not files:
-            return gr.update(choices=[], value=None), "Repo scan failed or no GGUFs found."
         return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."
     def on_boot(repo, file):
-        yield "Initialising boot sequence...", gr.update(open=True)
         res = kernel.boot_kernel(repo, file)
-        yield res, gr.update(open=True)
-    def on_stitch(text):
-        res = kernel.stitch_cache(text)
-        return f"Cache State: `{res}`"
-    demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
-    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, sidebar])
-    stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
-    input_args = [user_input, chat_box, ghost_buffer]
-    user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
-    send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
     user_input.submit(lambda: "", None, [user_input])
-    user_input.submit(lambda: "", None, [ghost_buffer])
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(
-        show_api=False,
-        theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
-        css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
     )

 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
+# --- KERNEL INITIALIZATION ---
 try:
     from llama_cpp import Llama
 except ImportError:
     try:
         from llama_cpp_pydist import Llama
     except ImportError:
+        class Llama:
+            def __init__(self, *args, **kwargs):
+                raise ImportError("Kernel Binary Missing. Ensure llama-cpp-python is installed.")
+# --- CONFIGURATION ---
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 logger = logging.getLogger(__name__)
+# --- TELEMETRY MODULE ---
 class TelemetryManager:
     def __init__(self, api: HfApi):
         self.api = api
     def _load_initial_stats(self) -> Dict:
         if os.path.exists(LOG_FILE):
             try:
+                with open(LOG_FILE, "r", encoding="utf-8") as f:
                     return json.load(f)
+            except Exception:
+                pass
         return {
             "session_start": str(datetime.now(pytz.utc)),
             "load_count": {},
         if not HF_TOKEN or not SPACE_ID:
             return
         try:
+            with open(LOG_FILE, "w", encoding="utf-8") as f:
                 json.dump(self.stats, f, indent=4)
             self.api.upload_file(
                 path_or_fileobj=LOG_FILE,
                 repo_type="space"
             )
         except Exception as e:
+            logger.error(f"Sync Failure: {e}")
+# --- RESOURCE MONITOR ---
 class ResourceMonitor:
     @staticmethod
     def get_metrics() -> Dict:
         file_size_mb = os.path.getsize(file_path) / (1024**2)
         total_ram_mb = vm.total / (1024**2)
         avail_ram_mb = vm.available / (1024**2)
         if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
+            return False, f"Model size ({file_size_mb:.1f}MB) exceeds safety limit."
         if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
+            return False, f"Insufficient headroom for context (Need ~{file_size_mb+SYSTEM_RESERVE_MB:.1f}MB)."
+        return True, "Passed."
+# --- ENGINE CORE ---
 class ZeroEngine:
     def __init__(self):
         self.api = HfApi(token=HF_TOKEN)
             files = self.api.list_repo_files(repo_id=repo_id)
             return [f for f in files if f.endswith(".gguf")]
         except Exception as e:
+            logger.error(f"Scan error: {e}")
             return []
     def boot_kernel(self, repo: str, filename: str) -> str:
         try:
+            logger.info(f"Downloading {filename} from {repo}...")
             path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
             valid, msg = ResourceMonitor.validate_deployment(path)
             if not valid:
                 return msg
             with self.kernel_lock:
                 if self.llm:
                     del self.llm
                 self.llm = Llama(
                     model_path=path,
                     n_ctx=2048,
                 self.active_model_info = {"repo": repo, "file": filename}
                 self.telemetry.track_load(repo, filename)
+            return f"🟢 KERNEL ONLINE: {filename}"
         except Exception as e:
             return f"🔴 BOOT FAILURE: {str(e)}"
     def stitch_cache(self, ghost_text: str) -> str:
+        if not self.llm or not ghost_text or self.is_prefilling:
+            return "Kernel Idle/Busy"
         def _bg_eval():
             self.is_prefilling = True
             try:
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
                 self.llm.eval(tokens)
+            except Exception as e:
+                logger.error(f"KV Cache priming failed: {e}")
             finally:
                 self.is_prefilling = False
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Ghost Cache Primed"
+    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
         if not self.llm:
+            history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."})
+            yield history
             return
+        # Prepare input
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
         formatted_prompt = f"User: {full_input}\nAssistant: "
+        # Add User Message & Empty Assistant Message for Streaming
+        history.append({"role": "user", "content": prompt})
+        history.append({"role": "assistant", "content": "..."})
+        yield history
         response_text = ""
         start_time = time.time()
         tokens_count = 0
         try:
             stream = self.llm(
+                formatted_prompt,
+                max_tokens=1024,
+                stop=["User:", "<|eot_id|>", "\n\n"],
                 stream=True
             )
             for chunk in stream:
                 token = chunk["choices"][0]["text"]
                 response_text += token
                 tokens_count += 1
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
+                # Gradio 6.5.0: Update the last message content
+                history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`"
+                yield history
             self.telemetry.track_generation(tokens_count)
         except Exception as e:
+            history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
+            yield history
+# --- UI INTERFACE ---
 kernel = ZeroEngine()
+with gr.Blocks(title="ZeroEngine Kernel 6.5", theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none")) as demo:
+    gr.HTML("<div style='text-align: center; border-bottom: 2px solid #333; margin-bottom: 20px;'><h1>🛰️ ZEROENGINE V0.1</h1><p>Gradio 6.5.0 Production Build</p></div>")
     with gr.Row():
         with gr.Column(scale=8):
+            # Gradio 6: 'type="messages"' is required for list of dicts
             chat_box = gr.Chatbot(
+                label="Main Engine Feedback",
+                height=650,
+                show_label=False,
+                type="messages",
+                autoscroll=True
             )
             with gr.Row():
+                user_input = gr.Textbox(
+                    placeholder="Input command...",
+                    label="Terminal",
+                    container=False,
+                    scale=9
+                )
+                send_btn = gr.Button("EXE", variant="primary", scale=1)
+        # The Sidebar is a specialized Gradio 6 component
+        with gr.Sidebar(label="Engine Room", open=True, width=350) as sidebar:
+            gr.Markdown("### 🛠️ Hardware Status")
+            ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
+            cpu_metric = gr.Label(label="CPU Load", value="0%")
             gr.Markdown("---")
+            gr.Markdown("### 📡 Model Control")
+            repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
+            quant_dropdown = gr.Dropdown(label="Available Quants", choices=[])
             with gr.Row():
+                scan_btn = gr.Button("SCAN", size="sm")
+                boot_btn = gr.Button("BOOT", variant="primary", size="sm")
+            boot_status = gr.Markdown("Status: `STANDBY`")
             gr.Markdown("---")
+            gr.Markdown("### 👻 Ghost Cache")
             ghost_buffer = gr.Textbox(
+                label="Background Context",
+                placeholder="Queue priming tokens here...",
                 lines=3
             )
+            stitch_status = gr.Markdown("Cache: `EMPTY`")
+            stitch_btn = gr.Button("STITCH", size="sm")
+            log_output = gr.Code(label="Kernel Logs", language="shell", value="[INIT] System Ready.")
+    # --- UI LOGIC ---
+    def update_stats():
         m = ResourceMonitor.get_metrics()
+        return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
     def on_scan(repo):
         files = kernel.list_ggufs(repo)
         if not files:
+            return gr.update(choices=[], value=None), "No GGUFs found in repo."
         return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."
     def on_boot(repo, file):
+        """Stream boot status for the selected repo/file as (message, update) pairs.
+
+        This is a generator handler: every status shown to the user must be
+        yielded. A value attached to ``return`` inside a generator is never
+        emitted as an output, so the validation message is yielded first and
+        the function then exits with a bare ``return``.
+        """
+        if not repo or not file:
+            yield "Selection Missing", gr.update()
+            return
+        yield "System: Booting Kernel...", gr.update()
         res = kernel.boot_kernel(repo, file)
+        yield res, gr.update()
+    # Recurring updates (Gradio 6 native)
+    demo.load(update_stats, None, [ram_metric, cpu_metric], every=2)
+    # Event Handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
+    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
+    stitch_btn.click(
+        lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
+        [ghost_buffer],
+        [stitch_status]
+    )
+    # Inference Handling
+    inference_args = [user_input, chat_box, ghost_buffer]
+    user_input.submit(kernel.inference_generator, inference_args, [chat_box])
+    send_btn.click(kernel.inference_generator, inference_args, [chat_box])
+    # Clear input on submit
     user_input.submit(lambda: "", None, [user_input])
+# --- LAUNCH ---
 if __name__ == "__main__":
+    # Removed show_api=False as it's deprecated in 6.x
     demo.queue(max_size=20).launch(
+        server_name="0.0.0.0",
+        share=False
     )