turtle170 committed on
Commit
f7e811f
·
verified ·
1 Parent(s): 7de41ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -20
app.py CHANGED
@@ -16,7 +16,10 @@ from typing import List, Dict, Optional, Generator
16
 
17
  import gradio as gr
18
  from huggingface_hub import HfApi, hf_hub_download
19
- from llama_cpp import Llama
 
 
 
20
 
21
  # ==========================================
22
  # SYSTEM CONFIGURATION & CONSTANTS
@@ -24,7 +27,7 @@ from llama_cpp import Llama
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
  SPACE_ID = os.environ.get("SPACE_ID")
26
  LOG_FILE = "engine_telemetry.json"
27
- RAM_LIMIT_PCT = 0.50 # Strict 50% limit for model weights
28
  SYSTEM_RESERVE_MB = 250
29
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
30
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
@@ -62,7 +65,6 @@ class TelemetryManager:
62
 
63
  def track_generation(self, tokens: int):
64
  self.stats["total_tokens_generated"] += tokens
65
- # Periodic sync could be added here
66
 
67
  def _sync_to_cloud(self):
68
  if not HF_TOKEN or not SPACE_ID:
@@ -105,11 +107,9 @@ class ResourceMonitor:
105
  total_ram_mb = vm.total / (1024**2)
106
  avail_ram_mb = vm.available / (1024**2)
107
 
108
- # Rule 1: 50% Hard Cap
109
  if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
110
  return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
111
 
112
- # Rule 2: Safety Buffer
113
  if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
114
  return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
115
 
@@ -146,15 +146,13 @@ class ZeroEngine:
146
  return msg
147
 
148
  with self.kernel_lock:
149
- # Clean up old instance
150
  if self.llm:
151
  del self.llm
152
-
153
- # Initialize new instance with CPU Affinity Partitioning
154
  self.llm = Llama(
155
  model_path=path,
156
  n_ctx=4096,
157
- n_threads=1, # Hard-partitioned to 1 vCPU for the active slot
158
  use_mmap=True,
159
  n_batch=512,
160
  last_n_tokens_size=64,
@@ -179,8 +177,6 @@ class ZeroEngine:
179
  self.is_prefilling = True
180
  try:
181
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
182
- # Prefix matching in llama-cpp happens automatically
183
- # if we evaluate tokens and store them in the KV cache.
184
  self.llm.eval(tokens)
185
  logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
186
  except Exception as e:
@@ -197,11 +193,8 @@ class ZeroEngine:
197
  yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
198
  return
199
 
200
- # Combine Ghost Terminal context with Active Input
201
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
202
 
203
- # Prepare history for Llama-3 style chat templates if needed
204
- # For V0.1 we use raw completion for maximum speed/minimal overhead
205
  formatted_prompt = f"User: {full_input}\nAssistant: "
206
 
207
  response_text = ""
@@ -221,7 +214,6 @@ class ZeroEngine:
221
  response_text += token
222
  tokens_count += 1
223
 
224
- # Calculate performance metrics
225
  elapsed = time.time() - start_time
226
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
227
 
@@ -256,7 +248,6 @@ with gr.Blocks(
256
  """)
257
 
258
  with gr.Row():
259
- # --- LEFT: CHAT ENGINE (FOCUS MODE) ---
260
  with gr.Column(scale=8):
261
  chat_box = gr.Chatbot(
262
  type="messages",
@@ -276,7 +267,6 @@ with gr.Blocks(
276
  with gr.Column(scale=1, min_width=50):
277
  send_btn = gr.Button("EXE", variant="primary")
278
 
279
- # --- RIGHT: ENGINE ROOM (SIDEBAR) ---
280
  with gr.Sidebar(label="Engine Room", open=True) as sidebar:
281
  gr.Markdown("### 📊 Resource Gauges")
282
  with gr.Row():
@@ -330,7 +320,6 @@ with gr.Blocks(
330
  res = kernel.stitch_cache(text)
331
  return f"Cache State: `{res}`"
332
 
333
- # Event Mapping
334
  demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
335
 
336
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
@@ -343,12 +332,10 @@ with gr.Blocks(
343
 
344
  stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
345
 
346
- # Inference Flow
347
  input_args = [user_input, chat_box, ghost_buffer]
348
  user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
349
  send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
350
 
351
- # Clear Inputs
352
  user_input.submit(lambda: "", None, [user_input])
353
  user_input.submit(lambda: "", None, [ghost_buffer])
354
 
 
16
 
17
  import gradio as gr
18
  from huggingface_hub import HfApi, hf_hub_download
19
+ try:
20
+ from llama_cpp import Llama
21
+ except ImportError:
22
+ from llama_cpp_pydist import Llama
23
 
24
  # ==========================================
25
  # SYSTEM CONFIGURATION & CONSTANTS
 
27
  HF_TOKEN = os.environ.get("HF_TOKEN")
28
  SPACE_ID = os.environ.get("SPACE_ID")
29
  LOG_FILE = "engine_telemetry.json"
30
+ RAM_LIMIT_PCT = 0.50
31
  SYSTEM_RESERVE_MB = 250
32
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
33
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 
65
 
66
  def track_generation(self, tokens: int):
67
  self.stats["total_tokens_generated"] += tokens
 
68
 
69
  def _sync_to_cloud(self):
70
  if not HF_TOKEN or not SPACE_ID:
 
107
  total_ram_mb = vm.total / (1024**2)
108
  avail_ram_mb = vm.available / (1024**2)
109
 
 
110
  if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
111
  return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
112
 
 
113
  if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
114
  return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
115
 
 
146
  return msg
147
 
148
  with self.kernel_lock:
 
149
  if self.llm:
150
  del self.llm
151
+
 
152
  self.llm = Llama(
153
  model_path=path,
154
  n_ctx=4096,
155
+ n_threads=1,
156
  use_mmap=True,
157
  n_batch=512,
158
  last_n_tokens_size=64,
 
177
  self.is_prefilling = True
178
  try:
179
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
 
 
180
  self.llm.eval(tokens)
181
  logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
182
  except Exception as e:
 
193
  yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
194
  return
195
 
 
196
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
197
 
 
 
198
  formatted_prompt = f"User: {full_input}\nAssistant: "
199
 
200
  response_text = ""
 
214
  response_text += token
215
  tokens_count += 1
216
 
 
217
  elapsed = time.time() - start_time
218
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
219
 
 
248
  """)
249
 
250
  with gr.Row():
 
251
  with gr.Column(scale=8):
252
  chat_box = gr.Chatbot(
253
  type="messages",
 
267
  with gr.Column(scale=1, min_width=50):
268
  send_btn = gr.Button("EXE", variant="primary")
269
 
 
270
  with gr.Sidebar(label="Engine Room", open=True) as sidebar:
271
  gr.Markdown("### 📊 Resource Gauges")
272
  with gr.Row():
 
320
  res = kernel.stitch_cache(text)
321
  return f"Cache State: `{res}`"
322
 
 
323
  demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
324
 
325
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
 
332
 
333
  stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
334
 
 
335
  input_args = [user_input, chat_box, ghost_buffer]
336
  user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
337
  send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
338
 
 
339
  user_input.submit(lambda: "", None, [user_input])
340
  user_input.submit(lambda: "", None, [ghost_buffer])
341