turtle170 committed on
Commit
dbe6259
·
verified ·
1 Parent(s): 0abb106

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -98
app.py CHANGED
@@ -11,14 +11,18 @@ from typing import List, Dict, Optional, Generator
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
13
 
 
14
  try:
15
  from llama_cpp import Llama
16
  except ImportError:
17
  try:
18
  from llama_cpp_pydist import Llama
19
  except ImportError:
20
- Llama = None
 
 
21
 
 
22
  HF_TOKEN = os.environ.get("HF_TOKEN")
23
  SPACE_ID = os.environ.get("SPACE_ID")
24
  LOG_FILE = "engine_telemetry.json"
@@ -30,6 +34,7 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
30
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
31
  logger = logging.getLogger(__name__)
32
 
 
33
  class TelemetryManager:
34
  def __init__(self, api: HfApi):
35
  self.api = api
@@ -38,10 +43,10 @@ class TelemetryManager:
38
  def _load_initial_stats(self) -> Dict:
39
  if os.path.exists(LOG_FILE):
40
  try:
41
- with open(LOG_FILE, "r") as f:
42
  return json.load(f)
43
- except Exception as e:
44
- logger.error(f"Failed to load telemetry: {e}")
45
  return {
46
  "session_start": str(datetime.now(pytz.utc)),
47
  "load_count": {},
@@ -61,7 +66,7 @@ class TelemetryManager:
61
  if not HF_TOKEN or not SPACE_ID:
62
  return
63
  try:
64
- with open(LOG_FILE, "w") as f:
65
  json.dump(self.stats, f, indent=4)
66
  self.api.upload_file(
67
  path_or_fileobj=LOG_FILE,
@@ -70,8 +75,9 @@ class TelemetryManager:
70
  repo_type="space"
71
  )
72
  except Exception as e:
73
- logger.warning(f"Telemetry sync failed: {e}")
74
 
 
75
  class ResourceMonitor:
76
  @staticmethod
77
  def get_metrics() -> Dict:
@@ -91,15 +97,13 @@ class ResourceMonitor:
91
  file_size_mb = os.path.getsize(file_path) / (1024**2)
92
  total_ram_mb = vm.total / (1024**2)
93
  avail_ram_mb = vm.available / (1024**2)
94
-
95
  if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
96
- return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
97
-
98
  if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
99
- return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
100
-
101
- return True, "Resource check passed."
102
 
 
103
  class ZeroEngine:
104
  def __init__(self):
105
  self.api = HfApi(token=HF_TOKEN)
@@ -114,22 +118,21 @@ class ZeroEngine:
114
  files = self.api.list_repo_files(repo_id=repo_id)
115
  return [f for f in files if f.endswith(".gguf")]
116
  except Exception as e:
 
117
  return []
118
 
119
  def boot_kernel(self, repo: str, filename: str) -> str:
120
  try:
121
- if Llama is None:
122
- return "🔴 KERNEL ERROR: llama-cpp-python not installed correctly."
123
-
124
  path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
 
125
  valid, msg = ResourceMonitor.validate_deployment(path)
126
  if not valid:
127
  return msg
128
-
129
  with self.kernel_lock:
130
  if self.llm:
131
  del self.llm
132
-
133
  self.llm = Llama(
134
  model_path=path,
135
  n_ctx=2048,
@@ -141,159 +144,170 @@ class ZeroEngine:
141
  self.active_model_info = {"repo": repo, "file": filename}
142
  self.telemetry.track_load(repo, filename)
143
 
144
- return f"🟢 KERNEL ONLINE: {filename} loaded successfully."
145
  except Exception as e:
146
  return f"🔴 BOOT FAILURE: {str(e)}"
147
 
148
  def stitch_cache(self, ghost_text: str) -> str:
149
- if not self.llm or not ghost_text:
150
- return "Kernel Idle"
151
-
152
- if self.is_prefilling:
153
- return "Kernel Busy"
154
-
155
  def _bg_eval():
156
  self.is_prefilling = True
157
  try:
158
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
159
  self.llm.eval(tokens)
160
- except Exception:
161
- pass
162
  finally:
163
  self.is_prefilling = False
164
-
165
  threading.Thread(target=_bg_eval, daemon=True).start()
166
  return "⚡ Ghost Cache Primed"
167
 
168
- def inference_generator(self, prompt: str, history: List, ghost_context: str) -> Generator:
169
  if not self.llm:
170
- yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
 
171
  return
172
 
 
173
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
174
  formatted_prompt = f"User: {full_input}\nAssistant: "
 
 
 
 
 
 
175
  response_text = ""
176
  start_time = time.time()
177
  tokens_count = 0
178
 
179
  try:
180
  stream = self.llm(
181
- formatted_prompt,
182
- max_tokens=1024,
183
- stop=["User:", "\n\n"],
184
  stream=True
185
  )
186
-
187
  for chunk in stream:
188
  token = chunk["choices"][0]["text"]
189
  response_text += token
190
  tokens_count += 1
 
191
  elapsed = time.time() - start_time
192
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
193
 
194
- yield history + [
195
- {"role": "user", "content": prompt},
196
- {"role": "assistant", "content": f"{response_text}\n\n`[{tps} t/s]`"}
197
- ]
198
 
199
  self.telemetry.track_generation(tokens_count)
200
-
201
  except Exception as e:
202
- yield history + [{"role": "assistant", "content": f"Inference Error: {str(e)}"}]
 
203
 
 
204
  kernel = ZeroEngine()
205
 
206
- with gr.Blocks(title="ZeroEngine Kernel") as demo:
207
- gr.HTML("""
208
- <div style="text-align: center; padding: 10px; border-bottom: 2px solid #000;">
209
- <h1 style="margin: 0;">🛰️ ZEROENGINE V0.1</h1>
210
- <p style="margin: 0; font-family: monospace;">STATUS: HIGH-PERFORMANCE KERNEL / VCPU-PARTITIONED</p>
211
- </div>
212
- """)
213
-
214
  with gr.Row():
215
  with gr.Column(scale=8):
 
216
  chat_box = gr.Chatbot(
217
- label="Active Slot Inference",
218
- height=650,
219
- show_label=False,
220
- bubble_full_width=False
 
221
  )
222
 
223
  with gr.Row():
224
- with gr.Column(scale=9):
225
- user_input = gr.Textbox(
226
- placeholder="Input command for active processing core...",
227
- label="Active Terminal",
228
- container=False
229
- )
230
- with gr.Column(scale=1, min_width=50):
231
- send_btn = gr.Button("EXE", variant="primary")
232
-
233
- with gr.Sidebar(label="Engine Room", open=True) as sidebar:
234
- gr.Markdown("### 📊 Resource Gauges")
235
- with gr.Row():
236
- ram_metric = gr.Label(label="RAM Allocation", value="0/16 GB")
237
- cpu_metric = gr.Label(label="CPU Load", value="0%")
238
 
239
  gr.Markdown("---")
240
- gr.Markdown("### 🛠️ Kernel Control")
241
- repo_input = gr.Textbox(label="HF Repo ID", value=DEFAULT_MODEL)
242
- quant_dropdown = gr.Dropdown(label="Quantization Target", choices=[])
243
 
244
  with gr.Row():
245
- scan_btn = gr.Button("Scan Repo", size="sm")
246
- boot_btn = gr.Button("BOOT KERNEL", variant="primary", size="sm")
247
 
248
- boot_status = gr.Markdown("*Standby: Kernel not initialized.*")
249
 
250
  gr.Markdown("---")
251
- gr.Markdown("### 👻 Ghost Terminal")
252
  ghost_buffer = gr.Textbox(
253
- label="Pre-typing Buffer (Queue)",
254
- placeholder="Queue users type here to prime KV-cache...",
255
  lines=3
256
  )
257
- stitch_status = gr.Markdown("Cache State: `EMPTY`")
258
- stitch_btn = gr.Button("STITCH CACHE", size="sm")
259
 
260
- gr.Markdown("---")
261
- gr.Markdown("### 📉 System Logs")
262
- log_output = gr.Code(label="Kernel Output", language="shell", value="[INIT] ZeroEngine Ready.")
263
 
264
- def update_system_stats():
 
265
  m = ResourceMonitor.get_metrics()
266
- return f"{m['ram_used_gb']} / {m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
267
 
268
  def on_scan(repo):
269
  files = kernel.list_ggufs(repo)
270
  if not files:
271
- return gr.update(choices=[], value=None), "Repo scan failed or no GGUFs found."
272
  return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."
273
 
274
  def on_boot(repo, file):
275
- yield "Initialising boot sequence...", gr.update(open=True)
 
 
276
  res = kernel.boot_kernel(repo, file)
277
- yield res, gr.update(open=True)
278
-
279
- def on_stitch(text):
280
- res = kernel.stitch_cache(text)
281
- return f"Cache State: `{res}`"
282
 
283
- demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
 
 
 
284
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
285
- boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, sidebar])
286
- stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
 
 
 
 
 
 
 
 
 
 
 
287
 
288
- input_args = [user_input, chat_box, ghost_buffer]
289
- user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
290
- send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
291
  user_input.submit(lambda: "", None, [user_input])
292
- user_input.submit(lambda: "", None, [ghost_buffer])
293
 
 
294
  if __name__ == "__main__":
 
295
  demo.queue(max_size=20).launch(
296
- show_api=False,
297
- theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none"),
298
- css=".gradio-container {background-color: #fafafa;} #sidebar {border-left: 1px solid #ddd;}"
299
  )
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
13
 
14
+ # --- KERNEL INITIALIZATION ---
15
  try:
16
  from llama_cpp import Llama
17
  except ImportError:
18
  try:
19
  from llama_cpp_pydist import Llama
20
  except ImportError:
21
+ class Llama:
22
+ def __init__(self, *args, **kwargs):
23
+ raise ImportError("Kernel Binary Missing. Ensure llama-cpp-python is installed.")
24
 
25
+ # --- CONFIGURATION ---
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
  SPACE_ID = os.environ.get("SPACE_ID")
28
  LOG_FILE = "engine_telemetry.json"
 
34
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
35
  logger = logging.getLogger(__name__)
36
 
37
+ # --- TELEMETRY MODULE ---
38
  class TelemetryManager:
39
  def __init__(self, api: HfApi):
40
  self.api = api
 
43
  def _load_initial_stats(self) -> Dict:
44
  if os.path.exists(LOG_FILE):
45
  try:
46
+ with open(LOG_FILE, "r", encoding="utf-8") as f:
47
  return json.load(f)
48
+ except Exception:
49
+ pass
50
  return {
51
  "session_start": str(datetime.now(pytz.utc)),
52
  "load_count": {},
 
66
  if not HF_TOKEN or not SPACE_ID:
67
  return
68
  try:
69
+ with open(LOG_FILE, "w", encoding="utf-8") as f:
70
  json.dump(self.stats, f, indent=4)
71
  self.api.upload_file(
72
  path_or_fileobj=LOG_FILE,
 
75
  repo_type="space"
76
  )
77
  except Exception as e:
78
+ logger.error(f"Sync Failure: {e}")
79
 
80
+ # --- RESOURCE MONITOR ---
81
  class ResourceMonitor:
82
  @staticmethod
83
  def get_metrics() -> Dict:
 
97
  file_size_mb = os.path.getsize(file_path) / (1024**2)
98
  total_ram_mb = vm.total / (1024**2)
99
  avail_ram_mb = vm.available / (1024**2)
 
100
  if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
101
+ return False, f"Model size ({file_size_mb:.1f}MB) exceeds safety limit."
 
102
  if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
103
+ return False, f"Insufficient headroom for context (Need ~{file_size_mb+SYSTEM_RESERVE_MB:.1f}MB)."
104
+ return True, "Passed."
 
105
 
106
+ # --- ENGINE CORE ---
107
  class ZeroEngine:
108
  def __init__(self):
109
  self.api = HfApi(token=HF_TOKEN)
 
118
  files = self.api.list_repo_files(repo_id=repo_id)
119
  return [f for f in files if f.endswith(".gguf")]
120
  except Exception as e:
121
+ logger.error(f"Scan error: {e}")
122
  return []
123
 
124
  def boot_kernel(self, repo: str, filename: str) -> str:
125
  try:
126
+ logger.info(f"Downloading {filename} from {repo}...")
 
 
127
  path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
128
+
129
  valid, msg = ResourceMonitor.validate_deployment(path)
130
  if not valid:
131
  return msg
132
+
133
  with self.kernel_lock:
134
  if self.llm:
135
  del self.llm
 
136
  self.llm = Llama(
137
  model_path=path,
138
  n_ctx=2048,
 
144
  self.active_model_info = {"repo": repo, "file": filename}
145
  self.telemetry.track_load(repo, filename)
146
 
147
+ return f"🟢 KERNEL ONLINE: {filename}"
148
  except Exception as e:
149
  return f"🔴 BOOT FAILURE: {str(e)}"
150
 
151
  def stitch_cache(self, ghost_text: str) -> str:
152
+ if not self.llm or not ghost_text or self.is_prefilling:
153
+ return "Kernel Idle/Busy"
154
+
 
 
 
155
  def _bg_eval():
156
  self.is_prefilling = True
157
  try:
158
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
159
  self.llm.eval(tokens)
160
+ except Exception as e:
161
+ logger.error(f"KV Cache priming failed: {e}")
162
  finally:
163
  self.is_prefilling = False
164
+
165
  threading.Thread(target=_bg_eval, daemon=True).start()
166
  return "⚡ Ghost Cache Primed"
167
 
168
+ def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
169
  if not self.llm:
170
+ history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."})
171
+ yield history
172
  return
173
 
174
+ # Prepare input
175
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
176
  formatted_prompt = f"User: {full_input}\nAssistant: "
177
+
178
+ # Add User Message & Empty Assistant Message for Streaming
179
+ history.append({"role": "user", "content": prompt})
180
+ history.append({"role": "assistant", "content": "..."})
181
+ yield history
182
+
183
  response_text = ""
184
  start_time = time.time()
185
  tokens_count = 0
186
 
187
  try:
188
  stream = self.llm(
189
+ formatted_prompt,
190
+ max_tokens=1024,
191
+ stop=["User:", "<|eot_id|>", "\n\n"],
192
  stream=True
193
  )
194
+
195
  for chunk in stream:
196
  token = chunk["choices"][0]["text"]
197
  response_text += token
198
  tokens_count += 1
199
+
200
  elapsed = time.time() - start_time
201
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
202
 
203
+ # Gradio 6.5.0: Update the last message content
204
+ history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`"
205
+ yield history
 
206
 
207
  self.telemetry.track_generation(tokens_count)
 
208
  except Exception as e:
209
+ history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
210
+ yield history
211
 
212
+ # --- UI INTERFACE ---
213
  kernel = ZeroEngine()
214
 
215
+ with gr.Blocks(title="ZeroEngine Kernel 6.5", theme=gr.themes.Monochrome(primary_hue="blue", radius_size="none")) as demo:
216
+ gr.HTML("<div style='text-align: center; border-bottom: 2px solid #333; margin-bottom: 20px;'><h1>🛰️ ZEROENGINE V0.1</h1><p>Gradio 6.5.0 Production Build</p></div>")
217
+
 
 
 
 
 
218
  with gr.Row():
219
  with gr.Column(scale=8):
220
+ # Gradio 6: 'type="messages"' is required for list of dicts
221
  chat_box = gr.Chatbot(
222
+ label="Main Engine Feedback",
223
+ height=650,
224
+ show_label=False,
225
+ type="messages",
226
+ autoscroll=True
227
  )
228
 
229
  with gr.Row():
230
+ user_input = gr.Textbox(
231
+ placeholder="Input command...",
232
+ label="Terminal",
233
+ container=False,
234
+ scale=9
235
+ )
236
+ send_btn = gr.Button("EXE", variant="primary", scale=1)
237
+
238
+ # The Sidebar is a specialized Gradio 6 component
239
+ with gr.Sidebar(label="Engine Room", open=True, width=350) as sidebar:
240
+ gr.Markdown("### 🛠️ Hardware Status")
241
+ ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
242
+ cpu_metric = gr.Label(label="CPU Load", value="0%")
 
243
 
244
  gr.Markdown("---")
245
+ gr.Markdown("### 📡 Model Control")
246
+ repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
247
+ quant_dropdown = gr.Dropdown(label="Available Quants", choices=[])
248
 
249
  with gr.Row():
250
+ scan_btn = gr.Button("SCAN", size="sm")
251
+ boot_btn = gr.Button("BOOT", variant="primary", size="sm")
252
 
253
+ boot_status = gr.Markdown("Status: `STANDBY`")
254
 
255
  gr.Markdown("---")
256
+ gr.Markdown("### 👻 Ghost Cache")
257
  ghost_buffer = gr.Textbox(
258
+ label="Background Context",
259
+ placeholder="Queue priming tokens here...",
260
  lines=3
261
  )
262
+ stitch_status = gr.Markdown("Cache: `EMPTY`")
263
+ stitch_btn = gr.Button("STITCH", size="sm")
264
 
265
+ log_output = gr.Code(label="Kernel Logs", language="shell", value="[INIT] System Ready.")
 
 
266
 
267
+ # --- UI LOGIC ---
268
+ def update_stats():
269
  m = ResourceMonitor.get_metrics()
270
+ return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
271
 
272
  def on_scan(repo):
273
  files = kernel.list_ggufs(repo)
274
  if not files:
275
+ return gr.update(choices=[], value=None), "No GGUFs found in repo."
276
  return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."
277
 
278
  def on_boot(repo, file):
279
+ if not repo or not file:
280
+ return "Selection Missing", gr.update()
281
+ yield "System: Booting Kernel...", gr.update()
282
  res = kernel.boot_kernel(repo, file)
283
+ yield res, gr.update()
 
 
 
 
284
 
285
+ # Recurring updates (Gradio 6 native)
286
+ demo.load(update_stats, None, [ram_metric, cpu_metric], every=2)
287
+
288
+ # Event Handlers
289
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
290
+ boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
291
+
292
+ stitch_btn.click(
293
+ lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
294
+ [ghost_buffer],
295
+ [stitch_status]
296
+ )
297
+
298
+ # Inference Handling
299
+ inference_args = [user_input, chat_box, ghost_buffer]
300
+
301
+ user_input.submit(kernel.inference_generator, inference_args, [chat_box])
302
+ send_btn.click(kernel.inference_generator, inference_args, [chat_box])
303
 
304
+ # Clear input on submit
 
 
305
  user_input.submit(lambda: "", None, [user_input])
 
306
 
307
+ # --- LAUNCH ---
308
  if __name__ == "__main__":
309
+ # Removed show_api=False as it's deprecated in 6.x
310
  demo.queue(max_size=20).launch(
311
+ server_name="0.0.0.0",
312
+ share=False
 
313
  )