turtle170 committed on
Commit
3068971
·
verified ·
1 Parent(s): e77443a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +339 -95
app.py CHANGED
@@ -1,115 +1,359 @@
1
- import os, json, psutil, threading, time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
3
  from huggingface_hub import HfApi, hf_hub_download
4
  from llama_cpp import Llama
5
 
6
- # CONFIG
 
 
7
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
8
  LOG_FILE = "engine_telemetry.json"
9
- RAM_LIMIT = 0.50 # 50% Max per model
10
- SYSTEM_RESERVE = 200 # MB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
 
 
 
 
12
  class ZeroEngine:
13
  def __init__(self):
14
- self.llm = None
15
  self.api = HfApi(token=HF_TOKEN)
16
- self.lock = threading.Lock()
17
- self.ghost_cache = {} # Stores pre-filled token counts
18
-
19
- def get_mem(self):
20
- m = psutil.virtual_memory()
21
- return m.available / (1024**2), m.total / (1024**2)
22
-
23
- def load_model(self, repo, file):
24
- avail, total = self.get_mem()
25
- path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
26
- size_mb = os.path.getsize(path) / (1024**2)
27
-
28
- # GATEKEEPER RULES
29
- if size_mb > (total * RAM_LIMIT):
30
- return f"❌ DECLINED: {size_mb:.0f}MB exceeds 50% RAM limit."
31
- if (size_mb + SYSTEM_RESERVE) > avail:
32
- return f"❌ DECLINED: Insufficient RAM (Need {SYSTEM_RESERVE}MB buffer)."
33
-
34
- with self.lock:
35
- if self.llm: del self.llm
36
- self.llm = Llama(
37
- model_path=path, n_ctx=2048, n_threads=1, # Hard core partitioning
38
- use_mmap=True, logits_all=False, verbose=False
39
- )
40
- self.sync_telemetry(file)
41
- return f"✅ Engine Online: {file}"
42
-
43
- def ghost_stitch(self, text):
44
- """Processes queue requests in background to prime the KV-Cache."""
45
- if not self.llm or not text: return "Idle"
46
- # The 'eval' call populates the internal KV cache.
47
- # llama-cpp-python's prefix matching handles the 'stitching' automatically.
48
- tokens = self.llm.tokenize(text.encode("utf-8"))
49
  try:
50
- self.llm.eval(tokens) # Pre-process tokens
51
- return f"⚡ Cache Primed ({len(tokens)} tokens)"
52
- except Exception:
53
- return " Cache Saturated"
54
-
55
- def sync_telemetry(self, filename):
56
- if not HF_TOKEN: return
57
- data = {"last_load": filename, "time": time.time()}
58
- with open(LOG_FILE, "w") as f: json.dump(data, f)
59
  try:
60
- self.api.upload_file(
61
- path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE,
62
- repo_id=os.environ.get("SPACE_ID"), repo_type="space"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
- except: pass
65
 
66
- engine = ZeroEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # UI
69
- with gr.Blocks(theme="shivi/calm_sea", fill_height=True) as demo:
70
- gr.Markdown("# 🛰️ ZeroEngine V0.1")
 
 
 
 
71
 
 
 
 
 
 
 
 
72
  with gr.Row():
73
- with gr.Column(scale=4):
74
- chat = gr.Chatbot(type="messages", height=500)
75
- msg = gr.Textbox(placeholder="Active Slot Input...", label="Command")
 
 
 
 
 
 
76
 
77
- with gr.Sidebar(label="Engine Room") as sb:
78
- ram_bar = gr.Label(label="RAM Usage")
79
- repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
80
- file_drop = gr.Dropdown(label="Quant File")
81
- load_btn = gr.Button("BOOT ENGINE", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  gr.Markdown("---")
84
- gr.Markdown("### 👻 Ghost Terminal (Queue)")
85
- ghost_in = gr.Textbox(label="Pre-type Prompt", placeholder="While you wait...")
86
- ghost_stat = gr.Markdown("Cache: Empty")
87
- stitch_btn = gr.Button("Warm Up Cache", size="sm")
88
-
89
- # Handlers
90
- def update_ram():
91
- avail, total = engine.get_mem()
92
- used = total - avail
93
- return {"Used (MB)": used, "Free (MB)": avail}
94
-
95
- def scan(repo):
96
- files = engine.api.list_repo_files(repo_id=repo)
97
- ggufs = [f for f in files if f.endswith(".gguf")]
98
- return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)
99
-
100
- def run_chat(m, h, g):
101
- if not engine.llm: yield h + [{"role":"assistant", "content":"Load model first."}]; return
102
- full_p = f"{g}\n{m}" if g else m
103
- resp = ""
104
- for chunk in engine.llm.create_chat_completion(messages=[{"role":"user","content":full_p}], stream=True):
105
- delta = chunk["choices"][0]["delta"]
106
- if "content" in delta:
107
- resp += delta["content"]
108
- yield h + [{"role":"user", "content":m}, {"role":"assistant", "content":resp}]
109
-
110
- demo.load(update_ram, None, ram_bar, every=2)
111
- load_btn.click(scan, [repo_in], [file_drop]).then(engine.load_model, [repo_in, file_drop], None)
112
- stitch_btn.click(engine.ghost_stitch, [ghost_in], [ghost_stat])
113
- msg.submit(run_chat, [msg, chat, ghost_in], [chat], concurrency_limit=2)
114
-
115
- demo.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZEROENGINE KERNEL V0.1
3
+ Target SDK: Gradio 6.5.0
4
+ Optimized for: 2 vCPU / 16GB RAM
5
+ Features: KV-Cache Stitching, Hard Partitioning, Resource Gatekeeper, Ghost Terminal
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ import psutil
12
+ import threading
13
+ import logging
14
+ from datetime import datetime
15
+ from typing import List, Dict, Optional, Generator
16
+
17
  import gradio as gr
18
  from huggingface_hub import HfApi, hf_hub_download
19
  from llama_cpp import Llama
20
 
21
+ # ==========================================
22
+ # SYSTEM CONFIGURATION & CONSTANTS
23
+ # ==========================================
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
+ SPACE_ID = os.environ.get("SPACE_ID")
26
  LOG_FILE = "engine_telemetry.json"
27
+ RAM_LIMIT_PCT = 0.50 # Strict 50% limit for model weights
28
+ SYSTEM_RESERVE_MB = 250
29
+ DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
30
+ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
31
+
32
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # ==========================================
36
+ # CORE TELEMETRY & PERSISTENCE
37
+ # ==========================================
38
class TelemetryManager:
    """Handles JSON-based usage tracking and HF Space persistence."""

    def __init__(self, api: HfApi):
        self.api = api
        # In-memory stats, seeded from a previous session's log file if present.
        self.stats = self._load_initial_stats()

    def _load_initial_stats(self) -> Dict:
        """Load persisted stats from LOG_FILE, or return a fresh schema."""
        if os.path.exists(LOG_FILE):
            try:
                with open(LOG_FILE, "r") as f:
                    return json.load(f)
            except Exception as e:
                # Corrupt/unreadable log: fall through to a fresh schema.
                logger.error(f"Failed to load telemetry: {e}")
        return {
            "session_start": str(datetime.now()),
            "load_count": {},
            "total_tokens_generated": 0,
            "popular_repos": []
        }

    def track_load(self, repo: str, filename: str):
        """Record one successful model load and push stats to the Space repo.

        BUG FIX: the key previously ignored `filename` (it was a garbled
        literal), so different quant files of one repo were conflated.
        """
        key = f"{repo}/{filename}"
        self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
        self._sync_to_cloud()

    def track_generation(self, tokens: int):
        """Accumulate the number of tokens generated this session."""
        self.stats["total_tokens_generated"] += tokens
        # Periodic sync could be added here

    def _sync_to_cloud(self):
        """Best-effort upload of the telemetry JSON into the Space repository."""
        if not HF_TOKEN or not SPACE_ID:
            # No credentials / not running inside a Space: keep stats local only.
            return
        try:
            with open(LOG_FILE, "w") as f:
                json.dump(self.stats, f, indent=4)
            self.api.upload_file(
                path_or_fileobj=LOG_FILE,
                path_in_repo=LOG_FILE,
                repo_id=SPACE_ID,
                repo_type="space"
            )
            logger.info("Telemetry synced to Space repository.")
        except Exception as e:
            # Telemetry must never take the app down; log and continue.
            logger.warning(f"Telemetry sync failed: {e}")
82
+
83
+ # ==========================================
84
+ # RESOURCE GATEKEEPER
85
+ # ==========================================
86
class ResourceMonitor:
    """Monitors vCPU and RAM to prevent Kernel Panics."""

    @staticmethod
    def get_metrics() -> Dict:
        """Snapshot current RAM/CPU usage for the dashboard gauges."""
        vm = psutil.virtual_memory()
        return {
            "ram_used_gb": round(vm.used / (1024**3), 2),
            "ram_avail_gb": round(vm.available / (1024**3), 2),
            "ram_total_gb": round(vm.total / (1024**3), 2),
            "ram_pct": vm.percent,
            # interval=None is non-blocking: usage since the previous call.
            "cpu_usage_pct": psutil.cpu_percent(interval=None),
            # os.getloadavg is unavailable on Windows; report 0 there.
            "load_avg": os.getloadavg()[0] if hasattr(os, 'getloadavg') else 0
        }

    @staticmethod
    def validate_deployment(file_path: str) -> tuple[bool, str]:
        """Check whether the GGUF at `file_path` can be loaded safely.

        Returns (ok, message); `message` explains any refusal.
        """
        vm = psutil.virtual_memory()
        file_size_mb = os.path.getsize(file_path) / (1024**2)
        total_ram_mb = vm.total / (1024**2)
        avail_ram_mb = vm.available / (1024**2)

        # Rule 1: 50% Hard Cap — model weights may never exceed half of RAM.
        if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
            return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."

        # Rule 2: Safety Buffer — leave SYSTEM_RESERVE_MB of headroom free.
        if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
            return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."

        return True, "Resource check passed."
117
+
118
+ # ==========================================
119
+ # THE ZEROENGINE KERNEL
120
+ # ==========================================
121
class ZeroEngine:
    """Central kernel: model lifecycle, KV-cache priming and streamed inference."""

    def __init__(self):
        self.api = HfApi(token=HF_TOKEN)
        self.telemetry = TelemetryManager(self.api)
        self.llm: Optional[Llama] = None
        self.active_model_info = {"repo": "", "file": ""}
        # Serializes model swaps against background cache priming.
        self.kernel_lock = threading.Lock()
        self.is_prefilling = False

    def list_ggufs(self, repo_id: str) -> List[str]:
        """Return the .gguf filenames in `repo_id` (empty list on API error)."""
        try:
            files = self.api.list_repo_files(repo_id=repo_id)
            return [f for f in files if f.endswith(".gguf")]
        except Exception as e:
            logger.error(f"HF API Error: {e}")
            return []

    def boot_kernel(self, repo: str, filename: str) -> str:
        """Downloads and initializes the llama-cpp-python instance.

        Returns a human-readable status string for the dashboard.
        """
        try:
            logger.info(f"Booting Kernel with {repo}/{filename}...")
            path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)

            valid, msg = ResourceMonitor.validate_deployment(path)
            if not valid:
                return msg

            with self.kernel_lock:
                # Drop the old instance so its memory can be reclaimed before
                # the new mmap is created.  Assigning None (instead of
                # `del self.llm`) keeps the attribute defined even if Llama()
                # raises below — previously a failed boot left `self.llm`
                # deleted and later `if not self.llm` checks crashed.
                self.llm = None

                # Initialize new instance with CPU Affinity Partitioning
                self.llm = Llama(
                    model_path=path,
                    n_ctx=4096,
                    n_threads=1,  # Hard-partitioned to 1 vCPU for the active slot
                    use_mmap=True,
                    n_batch=512,
                    last_n_tokens_size=64,
                    verbose=False
                )
                self.active_model_info = {"repo": repo, "file": filename}
                self.telemetry.track_load(repo, filename)

            # FIX: status previously contained a garbled "(unknown)" literal.
            return f"🟢 KERNEL ONLINE: {filename} loaded successfully."
        except Exception as e:
            return f"🔴 BOOT FAILURE: {str(e)}"

    def stitch_cache(self, ghost_text: str) -> str:
        """KV-CACHE STITCHING: Pre-processes queue tokens in background."""
        if not self.llm or not ghost_text:
            return "Kernel Idle"

        if self.is_prefilling:
            return "Kernel Busy"
        # Claim the prefill slot *before* spawning the worker so two rapid
        # clicks cannot both pass the check above (check-then-act race).
        self.is_prefilling = True

        def _bg_eval():
            try:
                # Hold the kernel lock so priming never runs concurrently
                # with a model swap in boot_kernel().
                with self.kernel_lock:
                    llm = self.llm
                    if llm is None:
                        return
                    tokens = llm.tokenize(ghost_text.encode("utf-8"))
                    # Prefix matching in llama-cpp happens automatically
                    # if we evaluate tokens and store them in the KV cache.
                    llm.eval(tokens)
                    logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
            except Exception as e:
                logger.error(f"Stitching failed: {e}")
            finally:
                self.is_prefilling = False

        threading.Thread(target=_bg_eval, daemon=True).start()
        return "⚡ Ghost Cache Primed"

    def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
        """Main chat generator using prefix-matched context.

        Yields progressively longer chat histories (Gradio `messages` format)
        with a tokens/sec badge appended to the assistant turn.
        """
        if not self.llm:
            yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
            return

        # Combine Ghost Terminal context with Active Input
        full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt

        # Prepare history for Llama-3 style chat templates if needed
        # For V0.1 we use raw completion for maximum speed/minimal overhead
        formatted_prompt = f"User: {full_input}\nAssistant: "

        response_text = ""
        start_time = time.time()
        tokens_count = 0

        try:
            stream = self.llm(
                formatted_prompt,
                max_tokens=1024,
                stop=["User:", "\n\n"],
                stream=True
            )

            for chunk in stream:
                token = chunk["choices"][0]["text"]
                response_text += token
                tokens_count += 1

                # Calculate performance metrics
                elapsed = time.time() - start_time
                tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0

                yield history + [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": f"{response_text}\n\n`[{tps} t/s]`"}
                ]

            self.telemetry.track_generation(tokens_count)

        except Exception as e:
            yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
237
+
238
+ # ==========================================
239
+ # GRADIO INTERFACE (DASHBOARD)
240
+ # ==========================================
241
# Module-level singleton: every Gradio callback below shares this engine.
kernel = ZeroEngine()
242
 
243
+
244
+
245
# Dashboard layout: chat column on the left, "Engine Room" sidebar on the right.
with gr.Blocks(
    title="ZeroEngine Kernel",
    theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
    css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
) as demo:

    # Static banner shown above the whole dashboard.
    gr.HTML("""
    <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
    <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
    <p style="margin: 0; font-family: monospace;">STATUS: HIGH-PERFORMANCE KERNEL / VCPU-PARTITIONED</p>
    </div>
    """)

    with gr.Row():
        # --- LEFT: CHAT ENGINE (FOCUS MODE) ---
        with gr.Column(scale=8):
            chat_box = gr.Chatbot(
                type="messages",
                label="Active Slot Inference",
                height=650,
                show_label=False,
                # NOTE(review): `bubble_full_width` was deprecated/removed in
                # recent Gradio releases — confirm it is accepted by the
                # pinned "Gradio 6.5.0" target before shipping.
                bubble_full_width=False
            )

            with gr.Row():
                with gr.Column(scale=9):
                    user_input = gr.Textbox(
                        placeholder="Input command for active processing core...",
                        label="Active Terminal",
                        container=False
                    )
                with gr.Column(scale=1, min_width=50):
                    send_btn = gr.Button("EXE", variant="primary")

        # --- RIGHT: ENGINE ROOM (SIDEBAR) ---
        with gr.Sidebar(label="Engine Room", open=True) as sidebar:
            gr.Markdown("### 📊 Resource Gauges")
            with gr.Row():
                # Gauges refreshed by the demo.load(..., every=2) poll below.
                ram_metric = gr.Label(label="RAM Allocation", value="0/16 GB")
                cpu_metric = gr.Label(label="CPU Load", value="0%")

            gr.Markdown("---")
            gr.Markdown("### 🛠️ Kernel Control")
            repo_input = gr.Textbox(label="HF Repo ID", value=DEFAULT_MODEL)
            # Choices are populated by on_scan once a repo has been scanned.
            quant_dropdown = gr.Dropdown(label="Quantization Target", choices=[])

            with gr.Row():
                scan_btn = gr.Button("Scan Repo", size="sm")
                boot_btn = gr.Button("BOOT KERNEL", variant="primary", size="sm")

            boot_status = gr.Markdown("*Standby: Kernel not initialized.*")

            gr.Markdown("---")
            gr.Markdown("### 👻 Ghost Terminal")
            ghost_buffer = gr.Textbox(
                label="Pre-typing Buffer (Queue)",
                placeholder="Queue users type here to prime KV-cache...",
                lines=3
            )
            stitch_status = gr.Markdown("Cache State: `EMPTY`")
            stitch_btn = gr.Button("STITCH CACHE", size="sm")

            gr.Markdown("---")
            gr.Markdown("### 📉 System Logs")
            log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")

    # --- UI LOGIC ---
    def update_system_stats():
        """Poll ResourceMonitor and format the two gauge strings."""
        m = ResourceMonitor.get_metrics()
        ram_str = f"{m['ram_used_gb']} / {m['ram_total_gb']} GB"
        cpu_str = f"{m['cpu_usage_pct']}%"
        return ram_str, cpu_str

    def on_scan(repo):
        """List GGUF quants for `repo`; updates the dropdown and the log pane."""
        files = kernel.list_ggufs(repo)
        if not files:
            return gr.update(choices=[], value=None), "Repo scan failed or no GGUFs found."
        return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."

    def on_boot(repo, file):
        """Generator handler: show an interim status, then the boot result.

        Also yields gr.update(open=True) so the sidebar stays open during boot.
        """
        yield "Initialising boot sequence...", gr.update(open=True)
        res = kernel.boot_kernel(repo, file)
        yield res, gr.update(open=True)

    def on_stitch(text):
        """Trigger background KV-cache priming and echo its status."""
        res = kernel.stitch_cache(text)
        return f"Cache State: `{res}`"

    # Event Mapping
    # NOTE(review): `every=` on event listeners is deprecated in newer Gradio
    # in favor of gr.Timer — verify against the pinned SDK version.
    demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)

    scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])

    boot_btn.click(
        on_boot,
        [repo_input, quant_dropdown],
        [boot_status, sidebar]
    )

    stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])

    # Inference Flow — Enter key and the EXE button share the same handler;
    # concurrency_limit=2 caps simultaneous generations.
    input_args = [user_input, chat_box, ghost_buffer]
    user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
    send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)

    # Clear Inputs — submitting wipes both the active input and the ghost buffer.
    user_input.submit(lambda: "", None, [user_input])
    user_input.submit(lambda: "", None, [ghost_buffer])
354
+
355
# ==========================================
# KERNEL EXECUTION
# ==========================================
if __name__ == "__main__":
    # Bounded queue (20 waiting requests) shields the small Space from pile-ups;
    # the public API schema endpoint is disabled.
    demo.queue(max_size=20).launch(show_api=False)