Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,7 +16,10 @@ from typing import List, Dict, Optional, Generator
|
|
| 16 |
|
| 17 |
import gradio as gr
|
| 18 |
from huggingface_hub import HfApi, hf_hub_download
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# ==========================================
|
| 22 |
# SYSTEM CONFIGURATION & CONSTANTS
|
|
@@ -24,7 +27,7 @@ from llama_cpp import Llama
|
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 25 |
SPACE_ID = os.environ.get("SPACE_ID")
|
| 26 |
LOG_FILE = "engine_telemetry.json"
|
| 27 |
-
RAM_LIMIT_PCT = 0.50
|
| 28 |
SYSTEM_RESERVE_MB = 250
|
| 29 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 30 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
|
@@ -62,7 +65,6 @@ class TelemetryManager:
|
|
| 62 |
|
| 63 |
def track_generation(self, tokens: int):
|
| 64 |
self.stats["total_tokens_generated"] += tokens
|
| 65 |
-
# Periodic sync could be added here
|
| 66 |
|
| 67 |
def _sync_to_cloud(self):
|
| 68 |
if not HF_TOKEN or not SPACE_ID:
|
|
@@ -105,11 +107,9 @@ class ResourceMonitor:
|
|
| 105 |
total_ram_mb = vm.total / (1024**2)
|
| 106 |
avail_ram_mb = vm.available / (1024**2)
|
| 107 |
|
| 108 |
-
# Rule 1: 50% Hard Cap
|
| 109 |
if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
|
| 110 |
return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
|
| 111 |
|
| 112 |
-
# Rule 2: Safety Buffer
|
| 113 |
if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
|
| 114 |
return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
|
| 115 |
|
|
@@ -146,15 +146,13 @@ class ZeroEngine:
|
|
| 146 |
return msg
|
| 147 |
|
| 148 |
with self.kernel_lock:
|
| 149 |
-
# Clean up old instance
|
| 150 |
if self.llm:
|
| 151 |
del self.llm
|
| 152 |
-
|
| 153 |
-
# Initialize new instance with CPU Affinity Partitioning
|
| 154 |
self.llm = Llama(
|
| 155 |
model_path=path,
|
| 156 |
n_ctx=4096,
|
| 157 |
-
n_threads=1,
|
| 158 |
use_mmap=True,
|
| 159 |
n_batch=512,
|
| 160 |
last_n_tokens_size=64,
|
|
@@ -179,8 +177,6 @@ class ZeroEngine:
|
|
| 179 |
self.is_prefilling = True
|
| 180 |
try:
|
| 181 |
tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
|
| 182 |
-
# Prefix matching in llama-cpp happens automatically
|
| 183 |
-
# if we evaluate tokens and store them in the KV cache.
|
| 184 |
self.llm.eval(tokens)
|
| 185 |
logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
|
| 186 |
except Exception as e:
|
|
@@ -197,11 +193,8 @@ class ZeroEngine:
|
|
| 197 |
yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
|
| 198 |
return
|
| 199 |
|
| 200 |
-
# Combine Ghost Terminal context with Active Input
|
| 201 |
full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
|
| 202 |
|
| 203 |
-
# Prepare history for Llama-3 style chat templates if needed
|
| 204 |
-
# For V0.1 we use raw completion for maximum speed/minimal overhead
|
| 205 |
formatted_prompt = f"User: {full_input}\nAssistant: "
|
| 206 |
|
| 207 |
response_text = ""
|
|
@@ -221,7 +214,6 @@ class ZeroEngine:
|
|
| 221 |
response_text += token
|
| 222 |
tokens_count += 1
|
| 223 |
|
| 224 |
-
# Calculate performance metrics
|
| 225 |
elapsed = time.time() - start_time
|
| 226 |
tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
|
| 227 |
|
|
@@ -256,7 +248,6 @@ with gr.Blocks(
|
|
| 256 |
""")
|
| 257 |
|
| 258 |
with gr.Row():
|
| 259 |
-
# --- LEFT: CHAT ENGINE (FOCUS MODE) ---
|
| 260 |
with gr.Column(scale=8):
|
| 261 |
chat_box = gr.Chatbot(
|
| 262 |
type="messages",
|
|
@@ -276,7 +267,6 @@ with gr.Blocks(
|
|
| 276 |
with gr.Column(scale=1, min_width=50):
|
| 277 |
send_btn = gr.Button("EXE", variant="primary")
|
| 278 |
|
| 279 |
-
# --- RIGHT: ENGINE ROOM (SIDEBAR) ---
|
| 280 |
with gr.Sidebar(label="Engine Room", open=True) as sidebar:
|
| 281 |
gr.Markdown("### 📊 Resource Gauges")
|
| 282 |
with gr.Row():
|
|
@@ -330,7 +320,6 @@ with gr.Blocks(
|
|
| 330 |
res = kernel.stitch_cache(text)
|
| 331 |
return f"Cache State: `{res}`"
|
| 332 |
|
| 333 |
-
# Event Mapping
|
| 334 |
demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
|
| 335 |
|
| 336 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
|
@@ -343,12 +332,10 @@ with gr.Blocks(
|
|
| 343 |
|
| 344 |
stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
|
| 345 |
|
| 346 |
-
# Inference Flow
|
| 347 |
input_args = [user_input, chat_box, ghost_buffer]
|
| 348 |
user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
|
| 349 |
send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
|
| 350 |
|
| 351 |
-
# Clear Inputs
|
| 352 |
user_input.submit(lambda: "", None, [user_input])
|
| 353 |
user_input.submit(lambda: "", None, [ghost_buffer])
|
| 354 |
|
|
|
|
| 16 |
|
| 17 |
import gradio as gr
|
| 18 |
from huggingface_hub import HfApi, hf_hub_download
|
| 19 |
+
try:
|
| 20 |
+
from llama_cpp import Llama
|
| 21 |
+
except ImportError:
|
| 22 |
+
from llama_cpp_pydist import Llama
|
| 23 |
|
| 24 |
# ==========================================
|
| 25 |
# SYSTEM CONFIGURATION & CONSTANTS
|
|
|
|
| 27 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 28 |
SPACE_ID = os.environ.get("SPACE_ID")
|
| 29 |
LOG_FILE = "engine_telemetry.json"
|
| 30 |
+
RAM_LIMIT_PCT = 0.50
|
| 31 |
SYSTEM_RESERVE_MB = 250
|
| 32 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 33 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
|
|
|
| 65 |
|
| 66 |
def track_generation(self, tokens: int):
|
| 67 |
self.stats["total_tokens_generated"] += tokens
|
|
|
|
| 68 |
|
| 69 |
def _sync_to_cloud(self):
|
| 70 |
if not HF_TOKEN or not SPACE_ID:
|
|
|
|
| 107 |
total_ram_mb = vm.total / (1024**2)
|
| 108 |
avail_ram_mb = vm.available / (1024**2)
|
| 109 |
|
|
|
|
| 110 |
if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
|
| 111 |
return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
|
| 112 |
|
|
|
|
| 113 |
if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
|
| 114 |
return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
|
| 115 |
|
|
|
|
| 146 |
return msg
|
| 147 |
|
| 148 |
with self.kernel_lock:
|
|
|
|
| 149 |
if self.llm:
|
| 150 |
del self.llm
|
| 151 |
+
|
|
|
|
| 152 |
self.llm = Llama(
|
| 153 |
model_path=path,
|
| 154 |
n_ctx=4096,
|
| 155 |
+
n_threads=1,
|
| 156 |
use_mmap=True,
|
| 157 |
n_batch=512,
|
| 158 |
last_n_tokens_size=64,
|
|
|
|
| 177 |
self.is_prefilling = True
|
| 178 |
try:
|
| 179 |
tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
|
|
|
|
|
|
|
| 180 |
self.llm.eval(tokens)
|
| 181 |
logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
|
| 182 |
except Exception as e:
|
|
|
|
| 193 |
yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
|
| 194 |
return
|
| 195 |
|
|
|
|
| 196 |
full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
|
| 197 |
|
|
|
|
|
|
|
| 198 |
formatted_prompt = f"User: {full_input}\nAssistant: "
|
| 199 |
|
| 200 |
response_text = ""
|
|
|
|
| 214 |
response_text += token
|
| 215 |
tokens_count += 1
|
| 216 |
|
|
|
|
| 217 |
elapsed = time.time() - start_time
|
| 218 |
tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
|
| 219 |
|
|
|
|
| 248 |
""")
|
| 249 |
|
| 250 |
with gr.Row():
|
|
|
|
| 251 |
with gr.Column(scale=8):
|
| 252 |
chat_box = gr.Chatbot(
|
| 253 |
type="messages",
|
|
|
|
| 267 |
with gr.Column(scale=1, min_width=50):
|
| 268 |
send_btn = gr.Button("EXE", variant="primary")
|
| 269 |
|
|
|
|
| 270 |
with gr.Sidebar(label="Engine Room", open=True) as sidebar:
|
| 271 |
gr.Markdown("### 📊 Resource Gauges")
|
| 272 |
with gr.Row():
|
|
|
|
| 320 |
res = kernel.stitch_cache(text)
|
| 321 |
return f"Cache State: `{res}`"
|
| 322 |
|
|
|
|
| 323 |
demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
|
| 324 |
|
| 325 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
|
|
|
| 332 |
|
| 333 |
stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
|
| 334 |
|
|
|
|
| 335 |
input_args = [user_input, chat_box, ghost_buffer]
|
| 336 |
user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
|
| 337 |
send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
|
| 338 |
|
|
|
|
| 339 |
user_input.submit(lambda: "", None, [user_input])
|
| 340 |
user_input.submit(lambda: "", None, [ghost_buffer])
|
| 341 |
|