Update app.py

app.py CHANGED
@@ -31,32 +31,47 @@ SYSTEM_RESERVE_MB = 500
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 
+# --- TOKEN SYSTEM CONFIG ---
+MONTHLY_TOKEN_CREDITS = 100.0
+TOKEN_COST_PER_100MS = 0.001
+BATCH_UPGRADE_BASE_COST = 0.00005  # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
+TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens
+
 # --- SPEED OPTIMIZATION CONFIG ---
-FLASH_ATTENTION =
-KV_CACHE_QUANTIZATION = True #
-CONTINUOUS_BATCHING =
-SPECULATIVE_DECODE = False #
-MLOCK_MODEL = False #
-USE_MMAP = True #
-OFFLOAD_KQV = False # CPU-only
-OPTIMAL_THREADS =
-ROPE_SCALING = 1.0
-NUMA_OPTIMIZE =
-AGGRESSIVE_GC = True
-
-# Quantization detection
+FLASH_ATTENTION = False  # Disabled for CPU (GPU-only feature)
+KV_CACHE_QUANTIZATION = True  # Keep for RAM savings
+CONTINUOUS_BATCHING = False  # CPU doesn't benefit much
+SPECULATIVE_DECODE = False  # CPU-only, no draft model
+MLOCK_MODEL = False  # Don't lock - allow OS to manage memory
+USE_MMAP = True  # Critical for CPU - fast loading
+OFFLOAD_KQV = False  # CPU-only
+OPTIMAL_THREADS = psutil.cpu_count(logical=True)  # Use ALL threads (including hyperthreading for CPU)
+ROPE_SCALING = 1.0
+NUMA_OPTIMIZE = False  # Disabled - can cause issues on some systems
+AGGRESSIVE_GC = True
+
+# Quantization detection - CPU-optimized batch multipliers (more aggressive)
 QUANT_OPTIMIZATIONS = {
-    "BF16": {"batch_multiplier": 0.
-    "F16": {"batch_multiplier": 0.
-    "Q8_0": {"batch_multiplier": 0
-    "Q6_K": {"batch_multiplier":
-    "Q5_K_M": {"batch_multiplier": 1.
-    "Q5_K_S": {"batch_multiplier": 1.
-    "Q4_K_M": {"batch_multiplier":
-    "Q4_K_S": {"batch_multiplier":
-    "Q4_0": {"batch_multiplier":
-    "Q3_K_M": {"batch_multiplier":
-    "Q2_K": {"batch_multiplier":
+    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
+    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
+    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
+    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
+    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
+    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
+    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
+    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
+    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
+}
+
+# Model format/architecture detection patterns
+MODEL_FORMATS = {
+    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
+    "gemma": {"pattern": ["gemma"], "template": "gemma"},
+    "phi": {"pattern": ["phi"], "template": "phi"},
+    "qwen": {"pattern": ["qwen"], "template": "chatml"},
+    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
 }
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
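A quick worked pass over the pricing constants above (arithmetic only; it follows directly from the values as committed):

    # Time-based billing: 2.5 s of generation costs
    #   (2500 ms / 100 ms) * 0.001 = 0.025 of the 100.0 monthly credits.
    cost = (2500 / 100.0) * TOKEN_COST_PER_100MS            # 0.025

    # Batch upgrades double in price at each level, as the inline comment says:
    #   1x -> 2x: 0.00005,  2x -> 4x: 0.0001,  4x -> 8x: 0.0002, ...
    level = 0                                               # log2 of the current multiplier
    upgrade_cost = BATCH_UPGRADE_BASE_COST * (2 ** level)   # 0.00005 for 1x -> 2x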
@@ -207,6 +222,100 @@ class ModelCacheManager:
             logger.error(f"[WRECKER] Failed: {e}")
             return False
 
+# --- TOKEN MANAGER ---
+class TokenManager:
+    def __init__(self):
+        self.user_tokens = {}  # {session_id: {"balance": float, "start_time": float, "purchases": {}}}
+        self.active_sessions = {}
+
+    def get_session_id(self) -> str:
+        """Generate or retrieve session ID from Gradio request"""
+        import hashlib
+        import time
+        # Simple session ID based on timestamp (in production, use gr.Request)
+        return hashlib.md5(str(time.time()).encode()).hexdigest()[:8]
+
+    def initialize_user(self, session_id: str):
+        """Initialize new user with monthly credits"""
+        if session_id not in self.user_tokens:
+            self.user_tokens[session_id] = {
+                "balance": MONTHLY_TOKEN_CREDITS,
+                "start_time": time.time(),
+                "purchases": {"batch_multiplier": 1, "token_limit": 2048},
+                "total_spent": 0.0
+            }
+            logger.info(f"[TOKEN] New user {session_id}: {MONTHLY_TOKEN_CREDITS} tokens")
+
+    def charge_usage(self, session_id: str, duration_ms: float) -> bool:
+        """Charge user for inference time. Returns True if successful"""
+        self.initialize_user(session_id)
+
+        cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS
+
+        if self.user_tokens[session_id]["balance"] >= cost:
+            self.user_tokens[session_id]["balance"] -= cost
+            self.user_tokens[session_id]["total_spent"] += cost
+            logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[session_id]['balance']:.2f}")
+            return True
+        else:
+            logger.warning(f"[TOKEN] Insufficient balance! Need {cost:.4f}, have {self.user_tokens[session_id]['balance']:.2f}")
+            return False
+
+    def purchase_batch_upgrade(self, session_id: str) -> tuple:
+        """Purchase batch size upgrade (exponential cost)"""
+        self.initialize_user(session_id)
+
+        current_mult = self.user_tokens[session_id]["purchases"]["batch_multiplier"]
+        upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
+        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)
+
+        if self.user_tokens[session_id]["balance"] >= cost:
+            self.user_tokens[session_id]["balance"] -= cost
+            self.user_tokens[session_id]["purchases"]["batch_multiplier"] = current_mult * 2
+            new_mult = current_mult * 2
+            logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
+            return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"
+        else:
+            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[session_id]['balance']:.2f}"
+
+    def purchase_token_upgrade(self, session_id: str, extra_tokens: int = 1000) -> tuple:
+        """Purchase extra response token length"""
+        self.initialize_user(session_id)
+
+        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K
+
+        if self.user_tokens[session_id]["balance"] >= cost:
+            self.user_tokens[session_id]["balance"] -= cost
+            self.user_tokens[session_id]["purchases"]["token_limit"] += extra_tokens
+            new_limit = self.user_tokens[session_id]["purchases"]["token_limit"]
+            logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
+            return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"
+        else:
+            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[session_id]['balance']:.2f}"
+
+    def get_balance(self, session_id: str) -> float:
+        """Get user's current token balance"""
+        self.initialize_user(session_id)
+        return round(self.user_tokens[session_id]["balance"], 2)
+
+    def get_purchases(self, session_id: str) -> dict:
+        """Get user's current purchases"""
+        self.initialize_user(session_id)
+        return self.user_tokens[session_id]["purchases"]
+
+    def end_session(self, session_id: str):
+        """End user session and log stats"""
+        if session_id in self.user_tokens:
+            stats = self.user_tokens[session_id]
+            logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
+            # Don't delete - keep for monthly tracking
+            return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session."
+        return "No active session found."
+
+# Global token manager
+import math
+token_manager = TokenManager()
+
 # Global cache manager
 model_cache = ModelCacheManager()
 
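A minimal usage sketch of the TokenManager added above (the session id and timings are hypothetical; the method names and constants are exactly those in the commit):

    tm = TokenManager()
    sid = tm.get_session_id()            # e.g. "a3f91b02" - md5 prefix of the current timestamp
    tm.initialize_user(sid)              # seeds the account with MONTHLY_TOKEN_CREDITS = 100.0

    tm.charge_usage(sid, 1500)           # 1.5 s of inference: (1500/100) * 0.001 = 0.015 credits
    ok, msg = tm.purchase_batch_upgrade(sid)   # 1x -> 2x for BATCH_UPGRADE_BASE_COST = 0.00005
    print(tm.get_balance(sid))           # ~99.98 after rounding to 2 decimals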
@@ -272,7 +381,7 @@ class ZeroEngine:
         self.api = HfApi(token=HF_TOKEN)
         self.telemetry = TelemetryManager(self.api)
         self.llm: Optional[Llama] = None
-        self.active_model_info = {"repo": "", "file": ""}
+        self.active_model_info = {"repo": "", "file": "", "format": ""}
         self.kernel_lock = threading.Lock()
         self.is_prefilling = False
         self.perf_stats = {
@@ -282,9 +391,9 @@ class ZeroEngine:
             "peak_tps": 0.0,
             "cache_hits": 0
         }
-        self.prompt_cache = {}
+        self.prompt_cache = {}
         self.last_activity = time.time()
-        self.idle_timeout = 20
+        self.idle_timeout = 20
         self.auto_cleanup_thread = None
         self.start_idle_monitor()
 
@@ -293,6 +402,29 @@ class ZeroEngine:
         self.typing_timer = None
         self.preprocessed_tokens = None
 
+        # Custom parameters (user-configurable)
+        self.custom_params = {
+            "temperature": 0.7,
+            "top_p": 0.95,
+            "top_k": 40,
+            "repeat_penalty": 1.1,
+            "batch_size_override": None,  # None = auto
+            "max_tokens_override": None   # None = auto
+        }
+
+    def detect_model_format(self, filename: str, repo: str) -> str:
+        """Auto-detect model format/architecture from filename and repo"""
+        combined = f"{repo.lower()} {filename.lower()}"
+
+        for format_name, format_info in MODEL_FORMATS.items():
+            for pattern in format_info["pattern"]:
+                if pattern in combined:
+                    logger.info(f"[FORMAT-DETECT] Detected {format_name.upper()} architecture")
+                    return format_name
+
+        logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
+        return "llama"
+
     def detect_quantization(self, filename: str) -> dict:
         """Detect quantization method from filename and return optimizations"""
         filename_upper = filename.upper()
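detect_model_format is a plain substring scan over MODEL_FORMATS; a sketch with hypothetical repo/file pairs:

    kernel.detect_model_format("gemma-2-2b-it-Q4_K_M.gguf", "google/gemma-2-2b-it-GGUF")
    # -> "gemma"  (boot_kernel later caps its context at 8192)

    kernel.detect_model_format("Llama-3.2-1B-Instruct-Q4_K_M.gguf", "unsloth/Llama-3.2-1B-Instruct-GGUF")
    # -> "llama"

    kernel.detect_model_format("falcon-7b-Q4_0.gguf", "tiiuae/falcon-7b-GGUF")
    # -> "llama"  (no pattern matches, so it logs a warning and falls back)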
@@ -389,7 +521,158 @@ class ZeroEngine:
             logger.error(f"Scan error: {e}")
             return []
 
-    def boot_kernel(self, repo: str, filename: str) -> str:
+    def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
+        """HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
+        try:
+            if not repo or not filename:
+                return "🔴 ERROR: Repository or filename missing"
+
+            logger.info(f"[BOOT] Starting download: {filename} from {repo}")
+
+            # DETECT QUANTIZATION FROM FILENAME
+            quant_config = self.detect_quantization(filename)
+
+            # DETECT MODEL FORMAT/ARCHITECTURE
+            model_format = self.detect_model_format(filename, repo)
+
+            # Download with timeout protection
+            try:
+                path = hf_hub_download(
+                    repo_id=repo,
+                    filename=filename,
+                    token=HF_TOKEN,
+                    local_files_only=False
+                )
+                logger.info(f"[BOOT] Download complete: {path}")
+            except Exception as e:
+                logger.error(f"[BOOT] Download failed: {e}")
+                return f"🔴 DOWNLOAD FAILED: {str(e)}"
+
+            # Check if model is cached
+            is_cached = model_cache.is_cached(path)
+            cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
+
+            # Validate before loading
+            valid, msg = ResourceMonitor.validate_deployment(path)
+            if not valid:
+                logger.warning(f"[BOOT] Validation failed: {msg}")
+                return f"🔴 VALIDATION FAILED: {msg}"
+
+            logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
+
+            # Load model with MAXIMUM PERFORMANCE SETTINGS
+            with self.kernel_lock:
+                # WRECK OLD MODEL
+                if self.llm:
+                    logger.info("[BOOT] 💣 WRECKING old model...")
+                    try:
+                        model_cache.wreck_old_model_cache()
+                        del self.llm
+                        self.llm = None
+                        nuclear_ram_clear()
+                        logger.info("[BOOT] ✅ Old model DESTROYED")
+                    except Exception as e:
+                        logger.warning(f"[BOOT] Cleanup warning: {e}")
+
+                # Calculate optimal parameters with token purchases
+                vm = psutil.virtual_memory()
+                available_ram_gb = vm.available / (1024**3)
+
+                # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
+                # Base calculation: use more RAM for batching on CPU
+                base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
+                optimal_batch = int(base_batch * quant_config["batch_multiplier"])
+
+                # Apply user's batch multiplier from token purchases
+                if session_id:
+                    user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
+                    optimal_batch = int(optimal_batch * user_batch_mult)
+                    logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")
+
+                # CPU can handle larger batches with quantized models
+                optimal_batch = max(1024, min(8192, optimal_batch))  # 1024-8192 range for CPU
+
+                # Context size
+                optimal_ctx = quant_config["ctx_size"]
+
+                # Reduce context for Gemma models (they have 131K n_ctx_train)
+                if model_format == "gemma":
+                    optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
+                    logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
+
+                # Thread optimization - use ALL threads on CPU (including hyperthreading)
+                optimal_threads = psutil.cpu_count(logical=True)  # ALL logical cores
+                logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
+
+                try:
+                    logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
+
+                    # Preload cache if available
+                    if is_cached:
+                        model_cache.preload_cache(path)
+
+                    # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
+                    init_params = {
+                        "model_path": path,
+                        "n_ctx": optimal_ctx,
+                        "n_threads": optimal_threads,
+                        "n_threads_batch": optimal_threads,
+                        "use_mmap": USE_MMAP,  # Critical for CPU
+                        "use_mlock": MLOCK_MODEL,  # Let OS manage memory
+                        "n_batch": optimal_batch,  # MASSIVE batches for CPU
+                        "n_gpu_layers": 0,  # CPU-only
+                        "rope_scaling_type": 0,
+                        "rope_freq_scale": ROPE_SCALING,
+                        "verbose": False,
+                        "logits_all": False,
+                        "embedding": False,
+                        "f16_kv": False  # Use quantized KV cache
+                    }
+
+                    # Add KV quantization only if not Gemma (Gemma can be finicky)
+                    if model_format != "gemma" and KV_CACHE_QUANTIZATION:
+                        init_params["type_k"] = 2
+                        init_params["type_v"] = 2
+                        logger.info("[OPTIM] KV cache quantization enabled (Q4)")
+
+                    self.llm = Llama(**init_params)
+
+                    self.active_model_info = {
+                        "repo": repo,
+                        "file": filename,
+                        "quant": quant_config['type'],
+                        "format": model_format
+                    }
+                    self.telemetry.track_load(repo, filename)
+
+                    # Extract and cache signature
+                    if not is_cached:
+                        logger.info("[BOOT] Extracting cache signature...")
+                        signature = model_cache.extract_cache_signature(path)
+                        if signature:
+                            model_cache.save_to_cache(path, signature)
+
+                    # Warm-up
+                    logger.info("[BOOT] Warming up model caches...")
+                    try:
+                        self.llm("Warmup", max_tokens=1, stream=False)
+                        force_gc()
+                    except:
+                        pass
+
+                    logger.info("[BOOT] 🚀 CPU-OPTIMIZED MODEL READY!")
+                    return f"🟢 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
+
+                except Exception as e:
+                    logger.error(f"[BOOT] Model loading failed: {e}")
+                    self.llm = None
+                    nuclear_ram_clear()
+                    return f"🔴 LOAD FAILED: {str(e)}"
+
+        except Exception as e:
+            logger.error(f"[BOOT] Unexpected error: {e}")
+            nuclear_ram_clear()
+            return f"🔴 BOOT FAILURE: {str(e)}"
         """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
         try:
             if not repo or not filename:
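Tracing the batch-size math above for a hypothetical box with 12 GB of RAM free, a Q4_K_M file (multiplier 2.0 from QUANT_OPTIMIZATIONS), and one purchased 2x upgrade:

    available_ram_gb = 12.0
    base_batch    = int(512 * available_ram_gb / 8)      # 768
    optimal_batch = int(base_batch * 2.0)                # 1536 after the Q4_K_M multiplier
    optimal_batch = int(optimal_batch * 2)               # 3072 after the purchased 2x upgrade
    optimal_batch = max(1024, min(8192, optimal_batch))  # 3072, already inside the CPU clamp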
@@ -600,23 +883,28 @@ class ZeroEngine:
             first_token_time = None
 
             try:
-                #
+                # Get max tokens from user purchases
+                max_tokens = 2048
+                if session_id:
+                    max_tokens = token_manager.get_purchases(session_id)["token_limit"]
+
+                # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
                 stream = self.llm(
                     formatted_prompt,
-                    max_tokens=
+                    max_tokens=max_tokens,
                     stop=["User:", "<|eot_id|>", "\n\n"],
                     stream=True,
-                    temperature=
-                    top_p=
-                    top_k=
-                    repeat_penalty=
-                    frequency_penalty=0.0,
-                    presence_penalty=0.0,
-                    tfs_z=1.0,
-                    typical_p=1.0,
-                    mirostat_mode=2, #
-                    mirostat_tau=5.0,
-                    mirostat_eta=0.1,
+                    temperature=self.custom_params["temperature"],
+                    top_p=self.custom_params["top_p"],
+                    top_k=self.custom_params["top_k"],
+                    repeat_penalty=self.custom_params["repeat_penalty"],
+                    frequency_penalty=0.0,
+                    presence_penalty=0.0,
+                    tfs_z=1.0,
+                    typical_p=1.0,
+                    mirostat_mode=2,  # CPU benefits from mirostat
+                    mirostat_tau=5.0,
+                    mirostat_eta=0.1,
                 )
 
                 for chunk in stream:
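One detail of the sampling block is easy to miss: mirostat_mode=2 selects Mirostat 2.0, which adaptively tunes truncation toward a target entropy (mirostat_tau) at a learning rate (mirostat_eta), and while it is active the static top_p/top_k cutoffs are effectively bypassed in llama.cpp's sampling chain. The defaults these arguments fall back to (seeded in ZeroEngine.custom_params and updated live by the UI sliders):

    DEFAULT_SAMPLING = {
        "temperature": 0.7,     # slider range 0.1-2.0
        "top_p": 0.95,          # nucleus cutoff; superseded while mirostat runs
        "top_k": 40,
        "repeat_penalty": 1.1,
    }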
@@ -636,10 +924,19 @@ class ZeroEngine:
                     if tps > self.perf_stats["peak_tps"]:
                         self.perf_stats["peak_tps"] = tps
 
+                    # Charge tokens every second
+                    if int(elapsed * 1000) % 1000 < 100 and session_id:  # Every ~1 second
+                        token_manager.charge_usage(session_id, elapsed * 1000)
+
                     # Update history with streaming content + performance metrics
-
+                    balance = token_manager.get_balance(session_id) if session_id else 0
+                    history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💰 {balance:.2f} tokens`"
                     yield history
 
+                # Final token charge for remaining time
+                if session_id:
+                    token_manager.charge_usage(session_id, elapsed * 1000)
+
                 # Update global performance stats
                 self.perf_stats["total_tokens"] += tokens_count
                 self.perf_stats["total_time"] += elapsed
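The charging gate fires whenever the millisecond clock lands in the first ~100 ms of a wall-clock second; note that each trigger hands charge_usage the cumulative elapsed milliseconds rather than a one-second delta, so the sketch below reproduces the committed behaviour, not a strict per-second meter (streaming() is a hypothetical stand-in for the chunk loop):

    import time

    start = time.time()
    while streaming():
        elapsed = time.time() - start
        if int(elapsed * 1000) % 1000 < 100:    # true for ~100 ms of every second
            token_manager.charge_usage(session_id, elapsed * 1000)  # cumulative ms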
@@ -763,27 +1060,49 @@ h1, h2, h3, h4, h5, h6 {
 # --- UI INTERFACE ---
 kernel = ZeroEngine()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Session ID for token tracking
+session_id = token_manager.get_session_id()
+
+with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
+    # Header with Token Display
+    with gr.Row():
+        with gr.Column(scale=8):
+            gr.HTML("""
+                <div style='text-align: center; padding: 30px; border-radius: 24px;
+                            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+                            margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
+                    <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
+                               -webkit-background-clip: text; -webkit-text-fill-color: transparent;
+                               font-family: Consolas, monospace;'>
+                        🛰️ ZEROENGINE V0.2
+                    </h1>
+                    <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
+                        CPU-Optimized | Token System | Custom Parameters | Auto-Format
+                    </p>
+                </div>
+            """)
+        with gr.Column(scale=2):
+            # Token Display
+            gr.HTML("""
+                <div style='text-align: center; padding: 20px; border-radius: 20px;
+                            background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
+                            margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
+                    <div style='font-size: 2em; margin-bottom: 5px;'>💰</div>
+                    <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
+                        100.00
+                    </div>
+                    <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
+                </div>
+            """)
+            token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
+            end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
+            session_status = gr.Markdown("", visible=False)
 
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
                 label="Main Engine Feedback",
-                height=
+                height=600,
                 show_label=False,
                 autoscroll=True,
                 container=True
@@ -798,12 +1117,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
             )
             send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
 
-        with gr.Column(scale=
+        with gr.Column(scale=4):
+            # Hardware Status
             gr.Markdown("### 🛠️ Hardware Status")
             ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
             cpu_metric = gr.Label(label="CPU Load", value="0%")
 
             gr.Markdown("---")
+
+            # Model Control
             gr.Markdown("### 📡 Model Control")
             repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
             quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -815,6 +1137,26 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
             boot_status = gr.Markdown("Status: `STANDBY`")
 
             gr.Markdown("---")
+
+            # Custom Parameters
+            gr.Markdown("### ⚙️ Custom Parameters")
+            temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+            top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
+            top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
+            repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
+
+            gr.Markdown("---")
+
+            # Token Purchases
+            gr.Markdown("### 🚀 Token Upgrades")
+            with gr.Row():
+                batch_upgrade_btn = gr.Button("📈 Batch x2", size="sm", variant="secondary")
+                token_upgrade_btn = gr.Button("📈 +1K Tokens", size="sm", variant="secondary")
+            purchase_status = gr.Markdown("Ready to upgrade!")
+
+            gr.Markdown("---")
+
+            # Ghost Cache
             gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
             ghost_buffer = gr.Textbox(
                 label="Background Context",
@@ -828,7 +1170,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
             log_output = gr.Code(
                 label="Kernel Logs",
                 language="shell",
-                value="[INIT] System Ready.",
+                value="[INIT] V0.2 System Ready.",
                 lines=5
             )
 
@@ -836,9 +1178,11 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
     def update_stats():
         try:
             m = ResourceMonitor.get_metrics()
-
+            balance = token_manager.get_balance(session_id)
+            return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
         except Exception as e:
             logger.error(f"Stats update error: {e}")
+            return "Error", "Error", "0.00"
             return "Error", "Error"
 
     def on_scan(repo):
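update_stats now returns a 3-tuple so that each timer tick can fan out to three components; this relies on Gradio mapping the returned tuple onto the outputs list in order (the sketch assumes a Gradio release that ships gr.Timer, 4.x or later):

    timer = gr.Timer(value=2)  # fires every 2 seconds
    timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
    # e.g. update_stats() -> ("11.2/16.0 GB", "37%", "99.87")
    #      fills ram_metric, cpu_metric, token_balance, in that order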
@@ -864,37 +1208,78 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
                 return
 
             yield "⚙️ System: Initiating boot sequence...", gr.update()
-            time.sleep(0.5)
+            time.sleep(0.5)
 
-            result = kernel.boot_kernel(repo, file)
+            result = kernel.boot_kernel(repo, file, session_id)
             yield result, gr.update()
 
         except Exception as e:
             logger.error(f"Boot UI error: {e}")
             yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
+
+    def on_batch_upgrade():
+        success, msg = token_manager.purchase_batch_upgrade(session_id)
+        balance = token_manager.get_balance(session_id)
+        return msg, f"{balance}"
+
+    def on_token_upgrade():
+        success, msg = token_manager.purchase_token_upgrade(session_id, 1000)
+        balance = token_manager.get_balance(session_id)
+        return msg, f"{balance}"
+
+    def on_end_session():
+        msg = token_manager.end_session(session_id)
+        return msg
+
+    def update_custom_params(temp, top_p, top_k, repeat_pen):
+        kernel.custom_params["temperature"] = temp
+        kernel.custom_params["top_p"] = top_p
+        kernel.custom_params["top_k"] = int(top_k)
+        kernel.custom_params["repeat_penalty"] = repeat_pen
+        return "✅ Parameters updated!"
 
-    # Timer for periodic stats updates
+    # Timer for periodic stats updates (includes token balance)
     timer = gr.Timer(value=2)
-    timer.tick(update_stats, None, [ram_metric, cpu_metric])
+    timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
 
     # Event handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
     boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
 
+    # Token purchases
+    batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
+    token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
+    end_session_btn.click(on_end_session, None, [session_status])
+
+    # Custom parameter updates
+    temperature_slider.change(update_custom_params,
+                              [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+                              [purchase_status])
+    top_p_slider.change(update_custom_params,
+                        [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+                        [purchase_status])
+    top_k_slider.change(update_custom_params,
+                        [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+                        [purchase_status])
+    repeat_penalty_slider.change(update_custom_params,
+                                 [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+                                 [purchase_status])
+
+    # Ghost cache
     stitch_btn.click(
         lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
         [ghost_buffer],
         [stitch_status]
     )
 
-    # Keyboard input preprocessing
+    # Keyboard input preprocessing
     user_input.change(
         lambda x: kernel.preprocess_input(x),
         [user_input],
         None
     )
 
-    # Auto-boot enabled inference
+    # Auto-boot enabled inference
     inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])
     send_btn.click(kernel.inference_generator, inference_args, [chat_box])
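All four sliders route through the single update_custom_params handler with the same input list; an equivalent loop formulation (shown only to make the fan-in explicit; the commit writes out the four .change calls):

    sliders = [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider]
    for s in sliders:
        s.change(update_custom_params, sliders, [purchase_status])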