turtle170 committed on
Commit
0abb106
·
verified ·
1 Parent(s): 6f76ef1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -69
app.py CHANGED
@@ -1,26 +1,24 @@
1
- """
2
- ZEROENGINE KERNEL V0.1
3
- Target SDK: Gradio 6.5.0
4
- Optimized for: 2 vCPU / 16GB RAM
5
- Features: KV-Cache Stitching, Hard Partitioning, Resource Gatekeeper, Ghost Terminal
6
- """
7
-
8
  import os
9
  import json
10
  import time
11
  import psutil
12
  import threading
13
  import logging
 
14
  from datetime import datetime
15
  from typing import List, Dict, Optional, Generator
16
 
17
  import gradio as gr
18
  from huggingface_hub import HfApi, hf_hub_download
19
- from llama_cpp import Llama
20
 
21
- # ==========================================
22
- # SYSTEM CONFIGURATION & CONSTANTS
23
- # ==========================================
 
 
 
 
 
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
  SPACE_ID = os.environ.get("SPACE_ID")
26
  LOG_FILE = "engine_telemetry.json"
@@ -32,11 +30,7 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
32
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
33
  logger = logging.getLogger(__name__)
34
 
35
- # ==========================================
36
- # CORE TELEMETRY & PERSISTENCE
37
- # ==========================================
38
  class TelemetryManager:
39
- """Handles JSON-based usage tracking and HF Space persistence."""
40
  def __init__(self, api: HfApi):
41
  self.api = api
42
  self.stats = self._load_initial_stats()
@@ -49,7 +43,7 @@ class TelemetryManager:
49
  except Exception as e:
50
  logger.error(f"Failed to load telemetry: {e}")
51
  return {
52
- "session_start": str(datetime.now()),
53
  "load_count": {},
54
  "total_tokens_generated": 0,
55
  "popular_repos": []
@@ -75,19 +69,13 @@ class TelemetryManager:
75
  repo_id=SPACE_ID,
76
  repo_type="space"
77
  )
78
- logger.info("Telemetry synced to Space repository.")
79
  except Exception as e:
80
  logger.warning(f"Telemetry sync failed: {e}")
81
 
82
- # ==========================================
83
- # RESOURCE GATEKEEPER
84
- # ==========================================
85
  class ResourceMonitor:
86
- """Monitors vCPU and RAM to prevent Kernel Panics."""
87
  @staticmethod
88
  def get_metrics() -> Dict:
89
  vm = psutil.virtual_memory()
90
- cpu_freq = psutil.cpu_freq()
91
  return {
92
  "ram_used_gb": round(vm.used / (1024**3), 2),
93
  "ram_avail_gb": round(vm.available / (1024**3), 2),
@@ -112,9 +100,6 @@ class ResourceMonitor:
112
 
113
  return True, "Resource check passed."
114
 
115
- # ==========================================
116
- # THE ZEROENGINE KERNEL
117
- # ==========================================
118
  class ZeroEngine:
119
  def __init__(self):
120
  self.api = HfApi(token=HF_TOKEN)
@@ -129,15 +114,14 @@ class ZeroEngine:
129
  files = self.api.list_repo_files(repo_id=repo_id)
130
  return [f for f in files if f.endswith(".gguf")]
131
  except Exception as e:
132
- logger.error(f"HF API Error: {e}")
133
  return []
134
 
135
  def boot_kernel(self, repo: str, filename: str) -> str:
136
- """Downloads and initializes the llama-cpp-python instance."""
137
  try:
138
- logger.info(f"Booting Kernel with {repo}/{filename}...")
139
- path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
140
 
 
141
  valid, msg = ResourceMonitor.validate_deployment(path)
142
  if not valid:
143
  return msg
@@ -148,11 +132,10 @@ class ZeroEngine:
148
 
149
  self.llm = Llama(
150
  model_path=path,
151
- n_ctx=4096,
152
- n_threads=1,
153
  use_mmap=True,
154
  n_batch=512,
155
- last_n_tokens_size=64,
156
  verbose=False
157
  )
158
  self.active_model_info = {"repo": repo, "file": filename}
@@ -163,7 +146,6 @@ class ZeroEngine:
163
  return f"🔴 BOOT FAILURE: {str(e)}"
164
 
165
  def stitch_cache(self, ghost_text: str) -> str:
166
- """KV-CACHE STITCHING: Pre-processes queue tokens in background."""
167
  if not self.llm or not ghost_text:
168
  return "Kernel Idle"
169
 
@@ -175,9 +157,8 @@ class ZeroEngine:
175
  try:
176
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
177
  self.llm.eval(tokens)
178
- logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
179
- except Exception as e:
180
- logger.error(f"Stitching failed: {e}")
181
  finally:
182
  self.is_prefilling = False
183
 
@@ -185,15 +166,12 @@ class ZeroEngine:
185
  return "⚡ Ghost Cache Primed"
186
 
187
  def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
188
- """Main chat generator using prefix-matched context."""
189
  if not self.llm:
190
  yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
191
  return
192
 
193
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
194
-
195
  formatted_prompt = f"User: {full_input}\nAssistant: "
196
-
197
  response_text = ""
198
  start_time = time.time()
199
  tokens_count = 0
@@ -210,7 +188,6 @@ class ZeroEngine:
210
  token = chunk["choices"][0]["text"]
211
  response_text += token
212
  tokens_count += 1
213
-
214
  elapsed = time.time() - start_time
215
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
216
 
@@ -224,19 +201,9 @@ class ZeroEngine:
224
  except Exception as e:
225
  yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
226
 
227
- # ==========================================
228
- # GRADIO INTERFACE (DASHBOARD)
229
- # ==========================================
230
  kernel = ZeroEngine()
231
 
232
-
233
-
234
- with gr.Blocks(
235
- title="ZeroEngine Kernel",
236
- theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
237
- css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
238
- ) as demo:
239
-
240
  gr.HTML("""
241
  <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
242
  <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
@@ -247,7 +214,6 @@ with gr.Blocks(
247
  with gr.Row():
248
  with gr.Column(scale=8):
249
  chat_box = gr.Chatbot(
250
- type="messages",
251
  label="Active Slot Inference",
252
  height=650,
253
  show_label=False,
@@ -295,12 +261,9 @@ with gr.Blocks(
295
  gr.Markdown("### 📉 System Logs")
296
  log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")
297
 
298
- # --- UI LOGIC ---
299
  def update_system_stats():
300
  m = ResourceMonitor.get_metrics()
301
- ram_str = f"{m['ram_used_gb']} / {m['ram_total_gb']} GB"
302
- cpu_str = f"{m['cpu_usage_pct']}%"
303
- return ram_str, cpu_str
304
 
305
  def on_scan(repo):
306
  files = kernel.list_ggufs(repo)
@@ -318,26 +281,19 @@ with gr.Blocks(
318
  return f"Cache State: `{res}`"
319
 
320
  demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
321
-
322
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
323
-
324
- boot_btn.click(
325
- on_boot,
326
- [repo_input, quant_dropdown],
327
- [boot_status, sidebar]
328
- )
329
-
330
  stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
331
 
332
  input_args = [user_input, chat_box, ghost_buffer]
333
  user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
334
  send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
335
-
336
  user_input.submit(lambda: "", None, [user_input])
337
  user_input.submit(lambda: "", None, [ghost_buffer])
338
 
339
- # ==========================================
340
- # KERNEL EXECUTION
341
- # ==========================================
342
  if __name__ == "__main__":
343
- demo.queue(max_size=20).launch(show_api=False)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import time
4
  import psutil
5
  import threading
6
  import logging
7
+ import pytz
8
  from datetime import datetime
9
  from typing import List, Dict, Optional, Generator
10
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
 
13
 
14
+ try:
15
+ from llama_cpp import Llama
16
+ except ImportError:
17
+ try:
18
+ from llama_cpp_pydist import Llama
19
+ except ImportError:
20
+ Llama = None
21
+
22
  HF_TOKEN = os.environ.get("HF_TOKEN")
23
  SPACE_ID = os.environ.get("SPACE_ID")
24
  LOG_FILE = "engine_telemetry.json"
 
30
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
31
  logger = logging.getLogger(__name__)
32
 
 
 
 
33
  class TelemetryManager:
 
34
  def __init__(self, api: HfApi):
35
  self.api = api
36
  self.stats = self._load_initial_stats()
 
43
  except Exception as e:
44
  logger.error(f"Failed to load telemetry: {e}")
45
  return {
46
+ "session_start": str(datetime.now(pytz.utc)),
47
  "load_count": {},
48
  "total_tokens_generated": 0,
49
  "popular_repos": []
 
69
  repo_id=SPACE_ID,
70
  repo_type="space"
71
  )
 
72
  except Exception as e:
73
  logger.warning(f"Telemetry sync failed: {e}")
74
 
 
 
 
75
  class ResourceMonitor:
 
76
  @staticmethod
77
  def get_metrics() -> Dict:
78
  vm = psutil.virtual_memory()
 
79
  return {
80
  "ram_used_gb": round(vm.used / (1024**3), 2),
81
  "ram_avail_gb": round(vm.available / (1024**3), 2),
 
100
 
101
  return True, "Resource check passed."
102
 
 
 
 
103
  class ZeroEngine:
104
  def __init__(self):
105
  self.api = HfApi(token=HF_TOKEN)
 
114
  files = self.api.list_repo_files(repo_id=repo_id)
115
  return [f for f in files if f.endswith(".gguf")]
116
  except Exception as e:
 
117
  return []
118
 
119
  def boot_kernel(self, repo: str, filename: str) -> str:
 
120
  try:
121
+ if Llama is None:
122
+ return "🔴 KERNEL ERROR: llama-cpp-python not installed correctly."
123
 
124
+ path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
125
  valid, msg = ResourceMonitor.validate_deployment(path)
126
  if not valid:
127
  return msg
 
132
 
133
  self.llm = Llama(
134
  model_path=path,
135
+ n_ctx=2048,
136
+ n_threads=2,
137
  use_mmap=True,
138
  n_batch=512,
 
139
  verbose=False
140
  )
141
  self.active_model_info = {"repo": repo, "file": filename}
 
146
  return f"🔴 BOOT FAILURE: {str(e)}"
147
 
148
  def stitch_cache(self, ghost_text: str) -> str:
 
149
  if not self.llm or not ghost_text:
150
  return "Kernel Idle"
151
 
 
157
  try:
158
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
159
  self.llm.eval(tokens)
160
+ except Exception:
161
+ pass
 
162
  finally:
163
  self.is_prefilling = False
164
 
 
166
  return "⚡ Ghost Cache Primed"
167
 
168
  def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
 
169
  if not self.llm:
170
  yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
171
  return
172
 
173
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
 
174
  formatted_prompt = f"User: {full_input}\nAssistant: "
 
175
  response_text = ""
176
  start_time = time.time()
177
  tokens_count = 0
 
188
  token = chunk["choices"][0]["text"]
189
  response_text += token
190
  tokens_count += 1
 
191
  elapsed = time.time() - start_time
192
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
193
 
 
201
  except Exception as e:
202
  yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
203
 
 
 
 
204
  kernel = ZeroEngine()
205
 
206
+ with gr.Blocks(title="ZeroEngine Kernel") as demo:
 
 
 
 
 
 
 
207
  gr.HTML("""
208
  <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
209
  <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
 
214
  with gr.Row():
215
  with gr.Column(scale=8):
216
  chat_box = gr.Chatbot(
 
217
  label="Active Slot Inference",
218
  height=650,
219
  show_label=False,
 
261
  gr.Markdown("### 📉 System Logs")
262
  log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")
263
 
 
264
  def update_system_stats():
265
  m = ResourceMonitor.get_metrics()
266
+ return f"{m['ram_used_gb']} / {m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
 
 
267
 
268
  def on_scan(repo):
269
  files = kernel.list_ggufs(repo)
 
281
  return f"Cache State: `{res}`"
282
 
283
  demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
 
284
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
285
+ boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, sidebar])
 
 
 
 
 
 
286
  stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
287
 
288
  input_args = [user_input, chat_box, ghost_buffer]
289
  user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
290
  send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
 
291
  user_input.submit(lambda: "", None, [user_input])
292
  user_input.submit(lambda: "", None, [ghost_buffer])
293
 
 
 
 
294
  if __name__ == "__main__":
295
+ demo.queue(max_size=20).launch(
296
+ show_api=False,
297
+ theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
298
+ css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
299
+ )