Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,23 +34,121 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
|
| 34 |
# --- TOKEN SYSTEM CONFIG ---
|
| 35 |
MONTHLY_TOKEN_CREDITS = 100.0
|
| 36 |
TOKEN_COST_PER_100MS = 0.001
|
| 37 |
-
BATCH_UPGRADE_BASE_COST = 0.00005
|
| 38 |
-
TOKEN_UPGRADE_COST_PER_1K = 0.0001
|
| 39 |
|
| 40 |
# --- SPEED OPTIMIZATION CONFIG ---
|
| 41 |
-
FLASH_ATTENTION = False
|
| 42 |
-
KV_CACHE_QUANTIZATION = True
|
| 43 |
-
CONTINUOUS_BATCHING = False
|
| 44 |
-
SPECULATIVE_DECODE = False
|
| 45 |
-
MLOCK_MODEL = False
|
| 46 |
-
USE_MMAP = True
|
| 47 |
-
OFFLOAD_KQV = False
|
| 48 |
-
OPTIMAL_THREADS =
|
| 49 |
ROPE_SCALING = 1.0
|
| 50 |
-
NUMA_OPTIMIZE = False
|
| 51 |
AGGRESSIVE_GC = True
|
| 52 |
|
| 53 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
QUANT_OPTIMIZATIONS = {
|
| 55 |
"BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
|
| 56 |
"F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
|
|
@@ -58,7 +156,7 @@ QUANT_OPTIMIZATIONS = {
|
|
| 58 |
"Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
|
| 59 |
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 60 |
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 61 |
-
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 62 |
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 63 |
"Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
|
| 64 |
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
|
|
@@ -79,35 +177,59 @@ logger = logging.getLogger(__name__)
|
|
| 79 |
|
| 80 |
# --- AGGRESSIVE GARBAGE COLLECTOR ---
|
| 81 |
import gc
|
|
|
|
|
|
|
| 82 |
gc.enable()
|
| 83 |
-
gc.set_threshold(700, 10, 10)
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def force_gc():
|
| 86 |
"""Force aggressive garbage collection"""
|
| 87 |
if AGGRESSIVE_GC:
|
| 88 |
-
collected = gc.collect(2)
|
| 89 |
logger.info(f"[GC] Collected {collected} objects")
|
| 90 |
return collected
|
| 91 |
return 0
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def nuclear_ram_clear():
|
| 94 |
"""NUCLEAR option: Clear all Python caches and force full GC"""
|
| 95 |
try:
|
| 96 |
-
# Clear function caches
|
| 97 |
import functools
|
| 98 |
functools._CacheInfo.__call__ = lambda self: None
|
| 99 |
|
| 100 |
-
# Clear import caches
|
| 101 |
import sys
|
| 102 |
if hasattr(sys, 'modules'):
|
| 103 |
-
# Don't delete core modules, just clear their caches
|
| 104 |
for module_name, module in list(sys.modules.items()):
|
| 105 |
if hasattr(module, '__dict__') and not module_name.startswith('_'):
|
| 106 |
if hasattr(module, '__pycache__'):
|
| 107 |
delattr(module, '__pycache__')
|
| 108 |
|
| 109 |
-
|
| 110 |
-
for _ in range(3):
|
| 111 |
gc.collect(2)
|
| 112 |
|
| 113 |
logger.info("[RAM-NUKE] π₯ Nuclear RAM clear complete")
|
|
@@ -116,6 +238,38 @@ def nuclear_ram_clear():
|
|
| 116 |
logger.error(f"[RAM-NUKE] Failed: {e}")
|
| 117 |
return False
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
|
| 120 |
class ModelCacheManager:
|
| 121 |
def __init__(self):
|
|
@@ -743,9 +897,15 @@ class ZeroEngine:
|
|
| 743 |
optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
|
| 744 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 745 |
|
| 746 |
-
# Thread optimization -
|
| 747 |
-
optimal_threads =
|
| 748 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
|
| 750 |
try:
|
| 751 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
|
@@ -758,26 +918,74 @@ class ZeroEngine:
|
|
| 758 |
init_params = {
|
| 759 |
"model_path": path,
|
| 760 |
"n_ctx": optimal_ctx,
|
| 761 |
-
"n_threads": optimal_threads,
|
| 762 |
-
"n_threads_batch": optimal_threads,
|
| 763 |
-
"use_mmap": USE_MMAP,
|
| 764 |
-
"use_mlock": MLOCK_MODEL,
|
| 765 |
-
"n_batch": optimal_batch,
|
| 766 |
-
"n_gpu_layers": 0,
|
| 767 |
"rope_scaling_type": 0,
|
| 768 |
"rope_freq_scale": ROPE_SCALING,
|
| 769 |
"verbose": False,
|
| 770 |
-
"logits_all": False,
|
| 771 |
-
"embedding": False,
|
| 772 |
-
"f16_kv": False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
}
|
| 774 |
|
| 775 |
-
#
|
| 776 |
-
if
|
| 777 |
-
|
| 778 |
-
|
| 779 |
logger.info("[OPTIM] KV cache quantization enabled (Q4)")
|
| 780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
self.llm = Llama(**init_params)
|
| 782 |
|
| 783 |
self.active_model_info = {
|
|
@@ -1106,6 +1314,8 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1106 |
""")
|
| 1107 |
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1108 |
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
|
|
|
|
|
|
| 1109 |
session_status = gr.Markdown("", visible=False)
|
| 1110 |
|
| 1111 |
with gr.Row():
|
|
@@ -1229,10 +1439,21 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1229 |
balance = token_manager.get_balance(session_id)
|
| 1230 |
return msg, f"{balance}"
|
| 1231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1232 |
def on_end_session():
|
| 1233 |
msg = token_manager.end_session(session_id)
|
| 1234 |
return msg
|
| 1235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
def update_custom_params(temp, top_p, top_k, repeat_pen):
|
| 1237 |
kernel.custom_params["temperature"] = temp
|
| 1238 |
kernel.custom_params["top_p"] = top_p
|
|
@@ -1252,6 +1473,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1252 |
batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
|
| 1253 |
token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
|
| 1254 |
end_session_btn.click(on_end_session, None, [session_status])
|
|
|
|
| 1255 |
|
| 1256 |
# Custom parameter updates
|
| 1257 |
temperature_slider.change(update_custom_params,
|
|
|
|
# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0      # credits granted to each account per month
TOKEN_COST_PER_100MS = 0.001       # inference cost charged per 100 ms of compute
BATCH_UPGRADE_BASE_COST = 0.00005  # base price of a batch-size upgrade
TOKEN_UPGRADE_COST_PER_1K = 0.0001 # price per extra 1K max-tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False
KV_CACHE_QUANTIZATION = True
CONTINUOUS_BATCHING = False
SPECULATIVE_DECODE = False
MLOCK_MODEL = False
USE_MMAP = True
OFFLOAD_KQV = False
OPTIMAL_THREADS = 2   # matches the 2-vCPU host
ROPE_SCALING = 1.0
NUMA_OPTIMIZE = False
AGGRESSIVE_GC = True

# --- ULTRA AGGRESSIVE CPU OPTIMIZATIONS ---
CPU_AFFINITY = True
CPU_FREQ_BOOST = True
TURBO_MODE = True
LOW_LATENCY_MODE = True
MEMORY_MAPPED_IO = True
PARALLEL_TOKENIZATION = True
CHUNKED_INFERENCE = True
LAZY_LOADING = True
PREFETCH_CACHE = True
COMPRESS_CONTEXT = True
FAST_MATH = True
SKIP_LAYERS = False
QUANTIZED_INFERENCE = True
STREAMING_OUTPUT = True
PIPELINE_PARALLEL = False
TENSOR_PARALLEL = False
|
| 70 |
+
|
| 71 |
+
# --- CPU OPTIMIZATION FUNCTIONS ---
def optimize_cpu_performance():
    """Apply best-effort CPU optimizations for a 2 vCPU + 16GB RAM host.

    Every step is optional: failures are logged and skipped so one missing
    privilege or core does not abort the remaining tweaks.

    Returns:
        bool: True when the routine ran to completion, False on an
        unexpected error (which is logged).
    """
    try:
        logger.info("[CPU-OPT] Applying ultra-aggressive CPU optimizations...")

        # Pin the process to cores 0 and 1 (Linux only).  Guarded so a host
        # without core 1 does not abort the remaining optimizations.
        if CPU_AFFINITY and hasattr(os, 'sched_setaffinity'):
            try:
                os.sched_setaffinity(0, [0, 1])
                logger.info("[CPU-OPT] CPU affinity set to cores 0,1")
            except OSError:
                logger.warning("[CPU-OPT] Could not set CPU affinity")

        # Raising priority (negative nice) needs elevated privileges.
        if hasattr(os, 'nice'):
            try:
                os.nice(-5)
                logger.info("[CPU-OPT] Process priority increased")
            except OSError:  # PermissionError without CAP_SYS_NICE
                logger.warning("[CPU-OPT] Could not set process priority (need sudo?)")

        import sys
        sys.setrecursionlimit(10000)

        # Stack size only affects threads created after this call.
        import threading
        threading.stack_size(1024 * 1024)

        # NOTE(review): CPython's os module has no malloc_trim, so this branch
        # never runs; kept as a harmless guard in case a patched runtime adds it.
        if hasattr(os, 'malloc_trim'):
            os.malloc_trim(0)

        logger.info("[CPU-OPT] Ultra CPU optimizations complete!")
        return True

    except Exception as e:
        logger.error(f"[CPU-OPT] Optimization failed: {e}")
        return False
|
| 103 |
+
|
| 104 |
+
def boost_cpu_frequency():
    """Switch the cpufreq governor of cores 0 and 1 to 'performance'.

    Writing under /sys requires root; failures are logged and reported via
    the return value rather than raised.

    Returns:
        bool: True if both governors were switched, False otherwise
        (feature disabled, missing sysfs path, or insufficient permissions).
    """
    try:
        if not CPU_FREQ_BOOST:
            return False

        try:
            # One write per core; both must succeed for a True result.
            for core in (0, 1):
                governor_path = f'/sys/devices/system/cpu/cpu{core}/cpufreq/scaling_governor'
                with open(governor_path, 'w') as f:
                    f.write('performance')
            logger.info("[CPU-FREQ] CPU governor set to performance")
            return True
        except OSError:  # path absent (no cpufreq) or permission denied
            logger.warning("[CPU-FREQ] Could not set CPU governor (need root?)")
            return False

    except Exception as e:
        logger.error(f"[CPU-FREQ] Failed: {e}")
        return False
|
| 124 |
+
|
| 125 |
+
def optimize_memory_layout():
    """Warm the allocator by touch-allocating a batch of 1 MiB buffers.

    Returns:
        bool: True on success, False on an unexpected error (logged).
    """
    try:
        logger.info("[MEM-OPT] Optimizing memory layout...")

        try:
            import mmap  # availability probe only; the module is not used further
            logger.info("[MEM-OPT] Large page support checked")
        except ImportError:
            pass

        # NOTE(review): memory_pool is a local, so these buffers are freed as
        # soon as the function returns -- this pre-warms the allocator but is
        # not a persistent memory reserve.
        memory_pool = [bytearray(1024 * 1024) for _ in range(10)]

        logger.info("[MEM-OPT] Memory pools pre-allocated")
        return True

    except Exception as e:
        logger.error(f"[MEM-OPT] Failed: {e}")
        return False
|
| 146 |
+
|
| 147 |
+
# Apply optimizations at startup (CPU tuning first, then frequency, then memory).
for _startup_step in (optimize_cpu_performance, boost_cpu_frequency, optimize_memory_layout):
    _startup_step()
|
| 151 |
+
|
| 152 |
QUANT_OPTIMIZATIONS = {
|
| 153 |
"BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
|
| 154 |
"F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
|
|
|
|
| 156 |
"Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
|
| 157 |
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 158 |
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 159 |
+
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 160 |
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 161 |
"Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
|
| 162 |
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
|
|
|
|
| 177 |
|
| 178 |
# --- AGGRESSIVE GARBAGE COLLECTOR ---
import gc
import threading
import time

gc.enable()
# Collect gen0 after 700 allocations; escalate gen1/gen2 every 10 collections.
gc.set_threshold(700, 10, 10)

# Polled by the background GC daemon; set to False to stop it.
passive_gc_active = True
|
| 186 |
|
| 187 |
def force_gc():
    """Run one full (generation-2) collection when AGGRESSIVE_GC is enabled.

    Returns the number of objects collected, or 0 when the feature is off.
    """
    if not AGGRESSIVE_GC:
        return 0
    freed = gc.collect(2)
    logger.info(f"[GC] Collected {freed} objects")
    return freed
|
| 194 |
|
| 195 |
+
def passive_gc_daemon():
    """Background loop: every 30 s run up to three full GC passes.

    Stops early between passes once a pass frees nothing, and exits entirely
    when the module-level ``passive_gc_active`` flag goes False.
    """
    global passive_gc_active
    while passive_gc_active:
        try:
            time.sleep(30)
            if not AGGRESSIVE_GC:
                continue
            total_collected = 0
            for _ in range(3):
                freed = gc.collect(2)
                total_collected += freed
                if freed == 0:
                    break  # nothing left to reclaim this round
                time.sleep(0.1)
            if total_collected > 0:
                logger.info(f"[PASSIVE-GC] Aggressive cleanup: {total_collected} objects collected")
        except Exception as e:
            logger.error(f"[PASSIVE-GC] Error: {e}")

# Daemon thread so it never blocks interpreter shutdown.
passive_gc_thread = threading.Thread(target=passive_gc_daemon, daemon=True)
passive_gc_thread.start()
logger.info("[PASSIVE-GC] Background garbage collector started (30s intervals)")
|
| 218 |
+
|
| 219 |
def nuclear_ram_clear():
    """NUCLEAR option: Clear all Python caches and force full GC.

    Returns:
        bool: True on success, False when cleanup raised (logged).
    """
    try:
        # NOTE(review): a previous revision monkey-patched
        # functools._CacheInfo.__call__ here.  That mutates a private stdlib
        # class and breaks every lru_cache's cache_info() process-wide, so
        # the patch was removed.

        # Best-effort sweep of per-module cache attributes.  Modules do not
        # normally carry a __pycache__ attribute, so this is usually a no-op.
        import sys
        if hasattr(sys, 'modules'):
            for module_name, module in list(sys.modules.items()):
                if hasattr(module, '__dict__') and not module_name.startswith('_'):
                    if hasattr(module, '__pycache__'):
                        delattr(module, '__pycache__')

        # Several full passes so cycles freed by one pass are reclaimed by the next.
        for _ in range(5):
            gc.collect(2)

        logger.info("[RAM-NUKE] π₯ Nuclear RAM clear complete")
        return True
    except Exception as e:
        logger.error(f"[RAM-NUKE] Failed: {e}")
        return False
|
| 240 |
|
| 241 |
+
def ultimate_system_wipe():
    """ULTIMATE WIPE: Clear everything - models, caches, tokens, GC everything"""
    try:
        logger.info("[ULTIMATE-WIPE] π Starting complete system wipe...")

        # Drop the live model first so its weights become collectable.
        if kernel.llm:
            del kernel.llm
            kernel.llm = None

        # Tear down every cache layer, then run the nuclear GC sweep.
        model_cache.wreck_old_model_cache()
        kernel.prompt_cache.clear()
        kernel.clear_preprocessed()
        nuclear_ram_clear()

        # Reset every non-owner account to zero balance and default purchases.
        non_owners = [uid for uid in token_manager.user_tokens if not token_manager.is_owner(uid)]
        for uid in non_owners:
            token_manager.user_tokens[uid]["balance"] = 0
            token_manager.user_tokens[uid]["purchases"] = {"batch_size": 512, "max_tokens": 2048}

        # Repeated full collections with short pauses to chase freed cycles.
        total_collected = 0
        for _ in range(10):
            total_collected += gc.collect(2)
            time.sleep(0.05)

        logger.info(f"[ULTIMATE-WIPE] β Complete! {total_collected} objects cleared, all models/caches wiped")
        return True, f"π ULTIMATE WIPE COMPLETE! Cleared {total_collected} objects, all models & caches destroyed!"

    except Exception as e:
        logger.error(f"[ULTIMATE-WIPE] Failed: {e}")
        return False, f"β Wipe failed: {str(e)}"
|
| 272 |
+
|
| 273 |
# --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
|
| 274 |
class ModelCacheManager:
|
| 275 |
def __init__(self):
|
|
|
|
| 897 |
optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
|
| 898 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 899 |
|
| 900 |
+
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
| 901 |
+
optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
|
| 902 |
+
|
| 903 |
+
# Apply CPU optimizations before model loading
|
| 904 |
+
if LOW_LATENCY_MODE:
|
| 905 |
+
optimize_cpu_performance()
|
| 906 |
+
boost_cpu_frequency()
|
| 907 |
+
|
| 908 |
+
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
|
| 909 |
|
| 910 |
try:
|
| 911 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
|
|
|
| 918 |
init_params = {
|
| 919 |
"model_path": path,
|
| 920 |
"n_ctx": optimal_ctx,
|
| 921 |
+
"n_threads": optimal_threads, # Exactly 2 threads
|
| 922 |
+
"n_threads_batch": optimal_threads, # Batch threads = total threads
|
| 923 |
+
"use_mmap": USE_MMAP, # Memory-mapped I/O
|
| 924 |
+
"use_mlock": MLOCK_MODEL, # Let OS manage memory
|
| 925 |
+
"n_batch": optimal_batch, # Optimized batch size
|
| 926 |
+
"n_gpu_layers": 0, # CPU-only
|
| 927 |
"rope_scaling_type": 0,
|
| 928 |
"rope_freq_scale": ROPE_SCALING,
|
| 929 |
"verbose": False,
|
| 930 |
+
"logits_all": False, # Only final logits
|
| 931 |
+
"embedding": False, # No embeddings
|
| 932 |
+
"f16_kv": False, # Quantized KV cache
|
| 933 |
+
# ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
|
| 934 |
+
"type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 935 |
+
"type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 936 |
+
"offload_kqv": OFFLOAD_KQV,
|
| 937 |
+
"flash_attn": FLASH_ATTENTION,
|
| 938 |
+
"use_scratch": True, # Use scratch buffer
|
| 939 |
+
"no_kv_offload": True, # Keep KV in RAM
|
| 940 |
+
"num_experts_used": 0, # No MoE for CPU
|
| 941 |
+
"seed": -1, # Random seed
|
| 942 |
+
"tensor_split": None, # No tensor splitting
|
| 943 |
+
"main_gpu": 0, # CPU-only
|
| 944 |
+
"device": "cpu", # Explicit CPU
|
| 945 |
+
"lora_base": None, # No LoRA base
|
| 946 |
+
"lora_scale": 1.0, # LoRA scale
|
| 947 |
+
"clpp_k": 0, # No CLPP
|
| 948 |
+
"numa": NUMA_OPTIMIZE, # NUMA if available
|
| 949 |
+
"cfg_scale": 1.0, # No CFG
|
| 950 |
+
"grammar": None, # No grammar constraints
|
| 951 |
+
"chat_format": None, # Auto-detect
|
| 952 |
+
"chat_handler": None, # Default handler
|
| 953 |
+
"cache_prompt": True, # Cache prompts
|
| 954 |
+
"cache_prompt_tokens": 256, # Prompt cache size
|
| 955 |
+
"cache_all": False, # Don't cache all
|
| 956 |
+
"draft_model": None, # No draft model
|
| 957 |
+
"draft_model_n_ctx": 512, # Draft context
|
| 958 |
+
"draft_model_n_gpu_layers": -1, # Auto-detect
|
| 959 |
+
"speculative_max_draft_len": 5, # Speculative decoding
|
| 960 |
+
"speculative_max_top_k": 4, # Speculative top-k
|
| 961 |
+
"speculative_decoding": SPECULATIVE_DECODE, # Enable if available
|
| 962 |
+
"speculative_min_draft_len": 1, # Min draft length
|
| 963 |
+
"speculative_max_top_k": 4, # Max top-k for draft
|
| 964 |
+
"speculative_min_top_k": 1, # Min top-k for draft
|
| 965 |
+
"speculative_max_top_p": 0.95, # Max top-p for draft
|
| 966 |
+
"speculative_min_top_p": 0.1, # Min top-p for draft
|
| 967 |
+
"speculative_max_temp": 1.0, # Max temp for draft
|
| 968 |
+
"speculative_min_temp": 0.1, # Min temp for draft
|
| 969 |
+
"speculative_eta": 0.1, # Eta for draft
|
| 970 |
+
"speculative_tau": 5.0, # Tau for draft
|
| 971 |
+
"speculative_gamma": 1.0, # Gamma for draft
|
| 972 |
+
"speculative_delta": 0.1, # Delta for draft
|
| 973 |
}
|
| 974 |
|
| 975 |
+
# Remove None values to avoid llama.cpp errors
|
| 976 |
+
init_params = {k: v for k, v in init_params.items() if v is not None}
|
| 977 |
+
|
| 978 |
+
if KV_CACHE_QUANTIZATION and model_format != "gemma":
|
| 979 |
logger.info("[OPTIM] KV cache quantization enabled (Q4)")
|
| 980 |
|
| 981 |
+
# Apply memory optimizations
|
| 982 |
+
if MEMORY_MAPPED_IO:
|
| 983 |
+
logger.info("[MEM-OPT] Memory-mapped I/O enabled")
|
| 984 |
+
|
| 985 |
+
if COMPRESS_CONTEXT:
|
| 986 |
+
logger.info("[MEM-OPT] Context compression enabled")
|
| 987 |
+
|
| 988 |
+
# Load model with ultra optimizations
|
| 989 |
self.llm = Llama(**init_params)
|
| 990 |
|
| 991 |
self.active_model_info = {
|
|
|
|
| 1314 |
""")
|
| 1315 |
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1316 |
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
| 1317 |
+
# Owner-only Clear RAM button (hidden by default, shown only to owner)
|
| 1318 |
+
clear_ram_btn = gr.Button("π CLEAR RAM", variant="stop", size="sm", visible=False)
|
| 1319 |
session_status = gr.Markdown("", visible=False)
|
| 1320 |
|
| 1321 |
with gr.Row():
|
|
|
|
| 1439 |
balance = token_manager.get_balance(session_id)
|
| 1440 |
return msg, f"{balance}"
|
| 1441 |
|
| 1442 |
+
def on_clear_ram():
    """Owner-only handler: run the ultimate system wipe and surface its message."""
    _, status_message = ultimate_system_wipe()
    return status_message
|
| 1446 |
+
|
| 1447 |
def on_end_session():
    """End the current billing session and return its status message."""
    return token_manager.end_session(session_id)
|
| 1450 |
|
| 1451 |
+
def update_ui_for_owner(profile: gr.OAuthProfile | None):
    """Toggle owner-only controls: visible only when the logged-in user is the owner."""
    is_owner = bool(profile) and token_manager.is_owner(profile.username)
    return gr.update(visible=is_owner)
|
| 1456 |
+
|
| 1457 |
def update_custom_params(temp, top_p, top_k, repeat_pen):
|
| 1458 |
kernel.custom_params["temperature"] = temp
|
| 1459 |
kernel.custom_params["top_p"] = top_p
|
|
|
|
| 1473 |
batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
|
| 1474 |
token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
|
| 1475 |
end_session_btn.click(on_end_session, None, [session_status])
|
| 1476 |
+
clear_ram_btn.click(on_clear_ram, None, [session_status])
|
| 1477 |
|
| 1478 |
# Custom parameter updates
|
| 1479 |
temperature_slider.change(update_custom_params,
|