turtle170 committed on
Commit
e77443a
·
verified ·
1 Parent(s): 26fd9b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -110
app.py CHANGED
@@ -1,152 +1,115 @@
1
- import os
2
- import json
3
- import time
4
- import psutil
5
- import threading
6
  import gradio as gr
7
  from huggingface_hub import HfApi, hf_hub_download
8
  from llama_cpp import Llama
9
 
10
- # System-level Constants
11
  HF_TOKEN = os.environ.get("HF_TOKEN")
12
  LOG_FILE = "engine_telemetry.json"
13
- RAM_SAFETY_THRESHOLD = 0.50 # 50% limit for model weights
14
- SYSTEM_RESERVE_MB = 200
15
 
16
  class ZeroEngine:
17
  def __init__(self):
18
  self.llm = None
19
- self.lock = threading.Lock()
20
- self.active_repo = None
21
- self.telemetry = self._load_telemetry()
22
  self.api = HfApi(token=HF_TOKEN)
 
 
23
 
24
- def _load_telemetry(self):
25
- if os.path.exists(LOG_FILE):
26
- with open(LOG_FILE, "r") as f:
27
- return json.load(f)
28
- return {"load_count": {}, "popular_quants": []}
29
-
30
- def _sync_telemetry(self):
31
- if not HF_TOKEN: return
32
- with open(LOG_FILE, "w") as f:
33
- json.dump(self.telemetry, f)
34
- try:
35
- repo_id = os.environ.get("SPACE_ID")
36
- if repo_id:
37
- self.api.upload_file(path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE, repo_id=repo_id, repo_type="space")
38
- except Exception: pass
39
-
40
- def get_system_status(self):
41
- mem = psutil.virtual_memory()
42
- return {
43
- "ram_used": round(mem.used / (1024**3), 2),
44
- "ram_total": round(mem.total / (1024**3), 2),
45
- "cpu_pct": psutil.cpu_percent()
46
- }
47
 
48
- def load_engine(self, repo, file):
 
49
  path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
50
- file_size_gb = os.path.getsize(path) / (1024**3)
51
- total_ram = psutil.virtual_memory().total / (1024**3)
52
 
53
- if file_size_gb > (total_ram * RAM_SAFETY_THRESHOLD):
54
- return f"⚠ DECLINED: Model size ({file_size_gb:.2f}GB) exceeds 50% RAM limit."
 
 
 
55
 
56
  with self.lock:
57
  if self.llm: del self.llm
58
  self.llm = Llama(
59
- model_path=path,
60
- n_ctx=4096,
61
- n_threads=1, # One core per slot (2 concurrent max)
62
- use_mmap=True,
63
- logits_all=False,
64
- verbose=False
65
  )
66
- self.active_repo = repo
67
- self.telemetry["load_count"][file] = self.telemetry["load_count"].get(file, 0) + 1
68
- self._sync_telemetry()
69
- return f"✅ Engine Active: {file}"
70
-
71
- def ghost_prefill(self, text):
72
- """KV-Cache Stitching: Pre-evaluates tokens to warm the cache."""
73
- if not self.llm or not text: return
74
  tokens = self.llm.tokenize(text.encode("utf-8"))
75
- # Eval only, no generation. Internal prefix_matching handles the 'stitching'.
76
  try:
77
- self.llm.eval(tokens)
78
- return "⚡ Ghost Cache Primed"
79
  except Exception:
80
- return "⚠ Cache Overflow"
81
-
82
- def chat(self, message, history, ghost_text):
83
- if not self.llm:
84
- yield history + [{"role": "assistant", "content": "Engine Offline. Please load a model."}]
85
- return
86
 
87
- # Combine ghost-prefilled context with new message
88
- full_input = f"{ghost_text}\n{message}" if ghost_text else message
89
- response = ""
90
-
91
- # Use streaming with high-speed settings
92
- for chunk in self.llm.create_chat_completion(
93
- messages=[{"role": "user", "content": full_input}],
94
- stream=True,
95
- max_tokens=1024
96
- ):
97
- delta = chunk["choices"][0]["delta"]
98
- if "content" in delta:
99
- response += delta["content"]
100
- yield history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]
101
 
102
  engine = ZeroEngine()
103
 
104
- # --- Gradio UI Design ---
105
- with gr.Blocks(theme=gr.themes.Default(primary_hue="slate", radius_size="none"), fill_height=True) as demo:
106
- gr.Markdown("# 🛰️ ZeroEngine Kernel V0.1")
107
 
108
  with gr.Row():
109
- with gr.Column(scale=9):
110
- chat_interface = gr.Chatbot(type="messages", label="Active Slot Output", height=600)
111
- msg_input = gr.Textbox(placeholder="Enter command...", label="Primary Input")
112
-
113
- with gr.Sidebar(label="System Dashboard", open=True) as sidebar:
114
- gr.Markdown("### 📊 Resource Monitor")
115
- ram_stat = gr.Markdown("RAM: --")
116
- cpu_stat = gr.Markdown("CPU: --")
117
 
118
- gr.Markdown("---")
119
- gr.Markdown("### 🛠 Engine Configuration")
120
  repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
121
- file_drop = gr.Dropdown(label="Quantization", choices=[])
122
- scan_btn = gr.Button("Scan Manifest")
123
- load_btn = gr.Button("ACTIVATE", variant="primary")
124
- engine_log = gr.Markdown("Status: Ready")
125
 
126
  gr.Markdown("---")
127
- gr.Markdown("### 👻 Ghost Terminal")
128
- ghost_in = gr.Textbox(label="Pre-Warm Input (Queue)", placeholder="Type here while waiting...")
129
- ghost_status = gr.Markdown("Cache: Idle")
130
- ghost_btn = gr.Button("Stitch Cache", size="sm")
131
 
132
- # --- Logic ---
133
- def update_sys():
134
- s = engine.get_system_status()
135
- return f"**RAM:** {s['ram_used']}GB / {s['ram_total']}GB", f"**CPU:** {s['cpu_pct']}%"
 
136
 
137
  def scan(repo):
138
  files = engine.api.list_repo_files(repo_id=repo)
139
  ggufs = [f for f in files if f.endswith(".gguf")]
140
  return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)
141
 
142
- # Event Wiring
143
- demo.load(update_sys, None, [ram_stat, cpu_stat], every=2)
144
- scan_btn.click(scan, [repo_in], [file_drop])
145
- load_btn.click(engine.load_engine, [repo_in, file_drop], [engine_log])
146
- ghost_btn.click(engine.ghost_prefill, [ghost_in], [ghost_status])
147
-
148
- msg_input.submit(engine.chat, [msg_input, chat_interface, ghost_in], [chat_interface], concurrency_limit=2)
149
- msg_input.submit(lambda: "", None, [msg_input]) # Reset active input
150
- msg_input.submit(lambda: "", None, [ghost_in]) # Clear ghost buffer after stitching
 
 
 
 
 
151
 
152
  demo.queue().launch()
 
1
+ import os, json, psutil, threading, time
 
 
 
 
2
  import gradio as gr
3
  from huggingface_hub import HfApi, hf_hub_download
4
  from llama_cpp import Llama
5
 
6
+ # CONFIG
7
  HF_TOKEN = os.environ.get("HF_TOKEN")
8
  LOG_FILE = "engine_telemetry.json"
9
+ RAM_LIMIT = 0.50 # 50% Max per model
10
+ SYSTEM_RESERVE = 200 # MB
11
 
12
class ZeroEngine:
    """Single-slot llama.cpp engine manager.

    Responsibilities:
      * RAM gatekeeping before a model is loaded (per-model share limit
        plus a fixed headroom reserve).
      * KV-cache pre-warming ("ghost stitching") of queued prompts.
      * Best-effort telemetry mirrored back to the hosting Space repo.
    """

    def __init__(self):
        # Active llama.cpp instance; None until load_model succeeds.
        self.llm = None
        self.api = HfApi(token=HF_TOKEN)
        # Serializes model swaps so concurrent loads cannot race.
        self.lock = threading.Lock()
        self.ghost_cache = {}  # Stores pre-filled token counts (reserved; not yet populated)

    def get_mem(self):
        """Return (available_mb, total_mb) of system RAM."""
        mem = psutil.virtual_memory()
        return mem.available / (1024 ** 2), mem.total / (1024 ** 2)

    def load_model(self, repo, file):
        """Download `file` from `repo` and boot it, enforcing RAM limits.

        Returns a human-readable status string for the UI; never raises
        for a declined load.
        """
        avail, total = self.get_mem()
        path = hf_hub_download(repo_id=repo, filename=file, token=HF_TOKEN)
        size_mb = os.path.getsize(path) / (1024 ** 2)

        # GATEKEEPER RULES: refuse models that would exceed the per-model
        # RAM share, or leave less than SYSTEM_RESERVE MB of headroom.
        if size_mb > total * RAM_LIMIT:
            return f"❌ DECLINED: {size_mb:.0f}MB exceeds 50% RAM limit."
        if (size_mb + SYSTEM_RESERVE) > avail:
            return f"❌ DECLINED: Insufficient RAM (Need {SYSTEM_RESERVE}MB buffer)."

        with self.lock:
            if self.llm:
                # Drop the previous model before allocating the new one to
                # keep peak RSS down. Reset to None so a failed Llama()
                # construction below does not leave a dangling reference.
                del self.llm
                self.llm = None
            self.llm = Llama(
                model_path=path, n_ctx=2048, n_threads=1,  # Hard core partitioning
                use_mmap=True, logits_all=False, verbose=False
            )
            self.sync_telemetry(file)
            return f"✅ Engine Online: {file}"

    def ghost_stitch(self, text):
        """Processes queue requests in background to prime the KV-Cache."""
        if not self.llm or not text:
            return "Idle"
        # The 'eval' call populates the internal KV cache.
        # llama-cpp-python's prefix matching handles the 'stitching' automatically.
        tokens = self.llm.tokenize(text.encode("utf-8"))
        try:
            self.llm.eval(tokens)  # Pre-process tokens; no generation happens here
            return f"⚡ Cache Primed ({len(tokens)} tokens)"
        except Exception:
            # Most likely the prompt overflowed the context window.
            return "⚠ Cache Saturated"

    def sync_telemetry(self, filename):
        """Record the last successful load locally and mirror it to the Space.

        Telemetry is strictly best-effort: it must never break a model load.
        """
        if not HF_TOKEN:
            return
        data = {"last_load": filename, "time": time.time()}
        with open(LOG_FILE, "w") as f:
            json.dump(data, f)
        space_id = os.environ.get("SPACE_ID")
        if not space_id:
            return  # Not running inside a HF Space; nothing to mirror.
        try:
            self.api.upload_file(
                path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE,
                repo_id=space_id, repo_type="space",
            )
        except Exception:
            # Narrowed from a bare `except:`; upload failures are ignored
            # by design, but SystemExit/KeyboardInterrupt now propagate.
            pass
 
 
 
 
65
 
66
engine = ZeroEngine()

# UI
with gr.Blocks(theme="shivi/calm_sea", fill_height=True) as demo:
    gr.Markdown("# 🛰️ ZeroEngine V0.1")

    with gr.Row():
        with gr.Column(scale=4):
            chat = gr.Chatbot(type="messages", height=500)
            msg = gr.Textbox(placeholder="Active Slot Input...", label="Command")

        with gr.Sidebar(label="Engine Room") as sb:
            ram_bar = gr.Label(label="RAM Usage")
            repo_in = gr.Textbox(label="HF Repo", value="unsloth/Llama-3.2-1B-Instruct-GGUF")
            file_drop = gr.Dropdown(label="Quant File")
            load_btn = gr.Button("BOOT ENGINE", variant="primary")
            # Surfaces load_model's status string (previously discarded).
            boot_log = gr.Markdown("Status: Ready")

            gr.Markdown("---")
            gr.Markdown("### 👻 Ghost Terminal (Queue)")
            ghost_in = gr.Textbox(label="Pre-type Prompt", placeholder="While you wait...")
            ghost_stat = gr.Markdown("Cache: Empty")
            stitch_btn = gr.Button("Warm Up Cache", size="sm")

    # Handlers
    def update_ram():
        """Feed the RAM gauge with current used/free figures (MB)."""
        avail, total = engine.get_mem()
        return {"Used (MB)": total - avail, "Free (MB)": avail}

    def scan(repo):
        """List .gguf files in `repo` and populate the quant dropdown."""
        files = engine.api.list_repo_files(repo_id=repo)
        ggufs = [f for f in files if f.endswith(".gguf")]
        return gr.update(choices=ggufs, value=ggufs[0] if ggufs else None)

    def run_chat(m, h, g):
        """Stream a chat response; ghost text `g` is prepended to the prompt."""
        if not engine.llm:
            yield h + [{"role": "assistant", "content": "Load model first."}]
            return
        full_p = f"{g}\n{m}" if g else m
        resp = ""
        # max_tokens bounds generation so one request cannot hog the slot
        # until the context window is exhausted.
        for chunk in engine.llm.create_chat_completion(
            messages=[{"role": "user", "content": full_p}],
            stream=True, max_tokens=1024,
        ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                resp += delta["content"]
                yield h + [{"role": "user", "content": m}, {"role": "assistant", "content": resp}]

    # Event wiring
    demo.load(update_ram, None, ram_bar, every=2)
    # BUG FIX: load_model's return value was routed to outputs=None, so
    # RAM-gate rejections were invisible to the user; show them in boot_log.
    load_btn.click(scan, [repo_in], [file_drop]).then(
        engine.load_model, [repo_in, file_drop], [boot_log]
    )
    stitch_btn.click(engine.ghost_stitch, [ghost_in], [ghost_stat])
    msg.submit(run_chat, [msg, chat, ghost_in], [chat], concurrency_limit=2)
    msg.submit(lambda: "", None, [msg])  # Clear the input box after send

demo.queue().launch()