turtle170 committed on
Commit
26fd9b6
·
verified ·
1 Parent(s): c9c4656

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import psutil
5
+ import threading
6
+ import gradio as gr
7
+ from huggingface_hub import HfApi, hf_hub_download
8
+ from llama_cpp import Llama
9
+
10
+ # System-level Constants
11
+ HF_TOKEN = os.environ.get("HF_TOKEN")
12
+ LOG_FILE = "engine_telemetry.json"
13
+ RAM_SAFETY_THRESHOLD = 0.50 # 50% limit for model weights
14
+ SYSTEM_RESERVE_MB = 200
15
+
16
+ class ZeroEngine:
17
+ def __init__(self):
18
+ self.llm = None
19
+ self.lock = threading.Lock()
20
+ self.active_repo = None
21
+ self.telemetry = self._load_telemetry()
22
+ self.api = HfApi(token=HF_TOKEN)
23
+
24
+ def _load_telemetry(self):
25
+ if os.path.exists(LOG_FILE):
26
+ with open(LOG_FILE, "r") as f:
27
+ return json.load(f)
28
+ return {"load_count": {}, "popular_quants": []}
29
+
30
+ def _sync_telemetry(self):
31
+ if not HF_TOKEN: return
32
+ with open(LOG_FILE, "w") as f:
33
+ json.dump(self.telemetry, f)
34
+ try:
35
+ repo_id = os.environ.get("SPACE_ID")
36
+ if repo_id:
37
+ self.api.upload_file(path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE, repo_id=repo_id, repo_type="space")
38
+ except Exception: pass
39
+
40
+ def get_system_status(self):
41
+ mem = psutil.virtual_memory()
42
+ return {
43
+ "ram_used": round(mem.used / (1024**3), 2),
44
+ "ram_total": round(mem.total / (1024**3), 2),
45
+ "cpu_pct": psutil.cpu_percent()
46
+ }
47
+
48
+ def load_engine(self, repo, file):
49
+ path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
50
+ file_size_gb = os.path.getsize(path) / (1024**3)
51
+ total_ram = psutil.virtual_memory().total / (1024**3)
52
+
53
+ if file_size_gb > (total_ram * RAM_SAFETY_THRESHOLD):
54
+ return f"⚠ DECLINED: Model size ({file_size_gb:.2f}GB) exceeds 50% RAM limit."
55
+
56
+ with self.lock:
57
+ if self.llm: del self.llm
58
+ self.llm = Llama(
59
+ model_path=path,
60
+ n_ctx=4096,
61
+ n_threads=1, # One core per slot (2 concurrent max)
62
+ use_mmap=True,
63
+ logits_all=False,
64
+ verbose=False
65
+ )
66
+ self.active_repo = repo
67
+ self.telemetry["load_count"][file] = self.telemetry["load_count"].get(file, 0) + 1
68
+ self._sync_telemetry()
69
+ return f"✅ Engine Active: {file}"
70
+
71
+ def ghost_prefill(self, text):
72
+ """KV-Cache Stitching: Pre-evaluates tokens to warm the cache."""
73
+ if not self.llm or not text: return
74
+ tokens = self.llm.tokenize(text.encode("utf-8"))
75
+ # Eval only, no generation. Internal prefix_matching handles the 'stitching'.
76
+ try:
77
+ self.llm.eval(tokens)
78
+ return "⚡ Ghost Cache Primed"
79
+ except Exception:
80
+ return "⚠ Cache Overflow"
81
+
82
+ def chat(self, message, history, ghost_text):
83
+ if not self.llm:
84
+ yield history + [{"role": "assistant", "content": "Engine Offline. Please load a model."}]
85
+ return
86
+
87
+ # Combine ghost-prefilled context with new message
88
+ full_input = f"{ghost_text}\n{message}" if ghost_text else message
89
+ response = ""
90
+
91
+ # Use streaming with high-speed settings
92
+ for chunk in self.llm.create_chat_completion(
93
+ messages=[{"role": "user", "content": full_input}],
94
+ stream=True,
95
+ max_tokens=1024
96
+ ):
97
+ delta = chunk["choices"][0]["delta"]
98
+ if "content" in delta:
99
+ response += delta["content"]
100
+ yield history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]
101
+
102
engine = ZeroEngine()

# --- Gradio UI Design ---
# Layout: main chat column + a sidebar holding the resource monitor, model
# loader and the "ghost" cache pre-warm terminal.
with gr.Blocks(theme=gr.themes.Default(primary_hue="slate", radius_size="none"), fill_height=True) as demo:
    gr.Markdown("# 🛰️ ZeroEngine Kernel V0.1")

    with gr.Row():
        with gr.Column(scale=9):
            chat_interface = gr.Chatbot(type="messages", label="Active Slot Output", height=600)
            msg_input = gr.Textbox(placeholder="Enter command...", label="Primary Input")

    with gr.Sidebar(label="System Dashboard", open=True) as sidebar:
        gr.Markdown("### 📊 Resource Monitor")
        ram_stat = gr.Markdown("RAM: --")
        cpu_stat = gr.Markdown("CPU: --")

        gr.Markdown("---")
        gr.Markdown("### 🛠 Engine Configuration")
        repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
        file_drop = gr.Dropdown(label="Quantization", choices=[])
        scan_btn = gr.Button("Scan Manifest")
        load_btn = gr.Button("ACTIVATE", variant="primary")
        engine_log = gr.Markdown("Status: Ready")

        gr.Markdown("---")
        gr.Markdown("### 👻 Ghost Terminal")
        ghost_in = gr.Textbox(label="Pre-Warm Input (Queue)", placeholder="Type here while waiting...")
        ghost_status = gr.Markdown("Cache: Idle")
        ghost_btn = gr.Button("Stitch Cache", size="sm")

    # --- Logic ---
    def update_sys():
        """Poll system stats for the two monitor widgets."""
        s = engine.get_system_status()
        return f"**RAM:** {s['ram_used']}GB / {s['ram_total']}GB", f"**CPU:** {s['cpu_pct']}%"

    def scan(repo):
        """List .gguf files of `repo` for the quantization dropdown.

        BUGFIX: an invalid/nonexistent repo id previously let the hub
        exception propagate as an unhandled UI error; now it degrades to an
        empty dropdown.
        """
        try:
            files = engine.api.list_repo_files(repo_id=repo)
        except Exception:
            return gr.update(choices=[], value=None)
        ggufs = [f for f in files if f.endswith(".gguf")]
        return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)

    # Event Wiring
    demo.load(update_sys, None, [ram_stat, cpu_stat], every=2)  # 2s polling
    scan_btn.click(scan, [repo_in], [file_drop])
    load_btn.click(engine.load_engine, [repo_in, file_drop], [engine_log])
    ghost_btn.click(engine.ghost_prefill, [ghost_in], [ghost_status])

    # Three handlers on one submit: stream the reply, then clear both inputs.
    msg_input.submit(engine.chat, [msg_input, chat_interface, ghost_in], [chat_interface], concurrency_limit=2)
    msg_input.submit(lambda: "", None, [msg_input])  # Reset active input
    msg_input.submit(lambda: "", None, [ghost_in])  # Clear ghost buffer after stitching

demo.queue().launch()