Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,32 +31,47 @@ SYSTEM_RESERVE_MB = 500
|
|
| 31 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 32 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# --- SPEED OPTIMIZATION CONFIG ---
|
| 35 |
-
FLASH_ATTENTION =
|
| 36 |
-
KV_CACHE_QUANTIZATION = True #
|
| 37 |
-
CONTINUOUS_BATCHING =
|
| 38 |
-
SPECULATIVE_DECODE = False #
|
| 39 |
-
MLOCK_MODEL = False #
|
| 40 |
-
USE_MMAP = True #
|
| 41 |
-
OFFLOAD_KQV = False # CPU-only
|
| 42 |
-
OPTIMAL_THREADS =
|
| 43 |
-
ROPE_SCALING = 1.0
|
| 44 |
-
NUMA_OPTIMIZE =
|
| 45 |
-
AGGRESSIVE_GC = True
|
| 46 |
-
|
| 47 |
-
# Quantization detection
|
| 48 |
QUANT_OPTIMIZATIONS = {
|
| 49 |
-
"BF16": {"batch_multiplier": 0.
|
| 50 |
-
"F16": {"batch_multiplier": 0.
|
| 51 |
-
"Q8_0": {"batch_multiplier": 0
|
| 52 |
-
"Q6_K": {"batch_multiplier":
|
| 53 |
-
"Q5_K_M": {"batch_multiplier": 1.
|
| 54 |
-
"Q5_K_S": {"batch_multiplier": 1.
|
| 55 |
-
"Q4_K_M": {"batch_multiplier":
|
| 56 |
-
"Q4_K_S": {"batch_multiplier":
|
| 57 |
-
"Q4_0": {"batch_multiplier":
|
| 58 |
-
"Q3_K_M": {"batch_multiplier":
|
| 59 |
-
"Q2_K": {"batch_multiplier":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
@@ -207,6 +222,212 @@ class ModelCacheManager:
|
|
| 207 |
logger.error(f"[WRECKER] Failed: {e}")
|
| 208 |
return False
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# Global cache manager
|
| 211 |
model_cache = ModelCacheManager()
|
| 212 |
|
|
@@ -272,7 +493,7 @@ class ZeroEngine:
|
|
| 272 |
self.api = HfApi(token=HF_TOKEN)
|
| 273 |
self.telemetry = TelemetryManager(self.api)
|
| 274 |
self.llm: Optional[Llama] = None
|
| 275 |
-
self.active_model_info = {"repo": "", "file": ""}
|
| 276 |
self.kernel_lock = threading.Lock()
|
| 277 |
self.is_prefilling = False
|
| 278 |
self.perf_stats = {
|
|
@@ -282,9 +503,9 @@ class ZeroEngine:
|
|
| 282 |
"peak_tps": 0.0,
|
| 283 |
"cache_hits": 0
|
| 284 |
}
|
| 285 |
-
self.prompt_cache = {}
|
| 286 |
self.last_activity = time.time()
|
| 287 |
-
self.idle_timeout = 20
|
| 288 |
self.auto_cleanup_thread = None
|
| 289 |
self.start_idle_monitor()
|
| 290 |
|
|
@@ -293,6 +514,29 @@ class ZeroEngine:
|
|
| 293 |
self.typing_timer = None
|
| 294 |
self.preprocessed_tokens = None
|
| 295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
def detect_quantization(self, filename: str) -> dict:
|
| 297 |
"""Detect quantization method from filename and return optimizations"""
|
| 298 |
filename_upper = filename.upper()
|
|
@@ -389,7 +633,158 @@ class ZeroEngine:
|
|
| 389 |
logger.error(f"Scan error: {e}")
|
| 390 |
return []
|
| 391 |
|
| 392 |
-
def boot_kernel(self, repo: str, filename: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
"""HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
|
| 394 |
try:
|
| 395 |
if not repo or not filename:
|
|
@@ -547,7 +942,7 @@ class ZeroEngine:
|
|
| 547 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 548 |
return "β‘ Primed"
|
| 549 |
|
| 550 |
-
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
|
| 551 |
# Update activity timestamp
|
| 552 |
self.update_activity()
|
| 553 |
|
|
@@ -600,23 +995,28 @@ class ZeroEngine:
|
|
| 600 |
first_token_time = None
|
| 601 |
|
| 602 |
try:
|
| 603 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
stream = self.llm(
|
| 605 |
formatted_prompt,
|
| 606 |
-
max_tokens=
|
| 607 |
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 608 |
stream=True,
|
| 609 |
-
temperature=
|
| 610 |
-
top_p=
|
| 611 |
-
top_k=
|
| 612 |
-
repeat_penalty=
|
| 613 |
-
frequency_penalty=0.0,
|
| 614 |
-
presence_penalty=0.0,
|
| 615 |
-
tfs_z=1.0,
|
| 616 |
-
typical_p=1.0,
|
| 617 |
-
mirostat_mode=2, #
|
| 618 |
-
mirostat_tau=5.0,
|
| 619 |
-
mirostat_eta=0.1,
|
| 620 |
)
|
| 621 |
|
| 622 |
for chunk in stream:
|
|
@@ -636,10 +1036,19 @@ class ZeroEngine:
|
|
| 636 |
if tps > self.perf_stats["peak_tps"]:
|
| 637 |
self.perf_stats["peak_tps"] = tps
|
| 638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
# Update history with streaming content + performance metrics
|
| 640 |
-
|
|
|
|
| 641 |
yield history
|
| 642 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
# Update global performance stats
|
| 644 |
self.perf_stats["total_tokens"] += tokens_count
|
| 645 |
self.perf_stats["total_time"] += elapsed
|
|
@@ -763,27 +1172,49 @@ h1, h2, h3, h4, h5, h6 {
|
|
| 763 |
# --- UI INTERFACE ---
|
| 764 |
kernel = ZeroEngine()
|
| 765 |
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
with gr.Row():
|
| 783 |
with gr.Column(scale=8):
|
| 784 |
chat_box = gr.Chatbot(
|
| 785 |
label="Main Engine Feedback",
|
| 786 |
-
height=
|
| 787 |
show_label=False,
|
| 788 |
autoscroll=True,
|
| 789 |
container=True
|
|
@@ -798,12 +1229,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 798 |
)
|
| 799 |
send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
|
| 800 |
|
| 801 |
-
with gr.Column(scale=
|
|
|
|
| 802 |
gr.Markdown("### π οΈ Hardware Status")
|
| 803 |
ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
|
| 804 |
cpu_metric = gr.Label(label="CPU Load", value="0%")
|
| 805 |
|
| 806 |
gr.Markdown("---")
|
|
|
|
|
|
|
| 807 |
gr.Markdown("### π‘ Model Control")
|
| 808 |
repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
|
| 809 |
quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
|
|
@@ -815,6 +1249,26 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 815 |
boot_status = gr.Markdown("Status: `STANDBY`")
|
| 816 |
|
| 817 |
gr.Markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
gr.Markdown("### π» Ghost Cache (Pre-Context)")
|
| 819 |
ghost_buffer = gr.Textbox(
|
| 820 |
label="Background Context",
|
|
@@ -828,7 +1282,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 828 |
log_output = gr.Code(
|
| 829 |
label="Kernel Logs",
|
| 830 |
language="shell",
|
| 831 |
-
value="[INIT] System Ready.",
|
| 832 |
lines=5
|
| 833 |
)
|
| 834 |
|
|
@@ -836,9 +1290,11 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 836 |
def update_stats():
|
| 837 |
try:
|
| 838 |
m = ResourceMonitor.get_metrics()
|
| 839 |
-
|
|
|
|
| 840 |
except Exception as e:
|
| 841 |
logger.error(f"Stats update error: {e}")
|
|
|
|
| 842 |
return "Error", "Error"
|
| 843 |
|
| 844 |
def on_scan(repo):
|
|
@@ -864,37 +1320,78 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 864 |
return
|
| 865 |
|
| 866 |
yield "βοΈ System: Initiating boot sequence...", gr.update()
|
| 867 |
-
time.sleep(0.5)
|
| 868 |
|
| 869 |
-
result = kernel.boot_kernel(repo, file)
|
| 870 |
yield result, gr.update()
|
| 871 |
|
| 872 |
except Exception as e:
|
| 873 |
logger.error(f"Boot UI error: {e}")
|
| 874 |
yield f"π΄ BOOT ERROR: {str(e)}", gr.update()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
-
# Timer for periodic stats updates
|
| 877 |
timer = gr.Timer(value=2)
|
| 878 |
-
timer.tick(update_stats, None, [ram_metric, cpu_metric])
|
| 879 |
|
| 880 |
# Event handlers
|
| 881 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
| 882 |
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
|
| 883 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
stitch_btn.click(
|
| 885 |
lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
|
| 886 |
[ghost_buffer],
|
| 887 |
[stitch_status]
|
| 888 |
)
|
| 889 |
|
| 890 |
-
# Keyboard input preprocessing
|
| 891 |
user_input.change(
|
| 892 |
lambda x: kernel.preprocess_input(x),
|
| 893 |
[user_input],
|
| 894 |
None
|
| 895 |
)
|
| 896 |
|
| 897 |
-
# Auto-boot enabled inference
|
| 898 |
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
|
| 899 |
user_input.submit(kernel.inference_generator, inference_args, [chat_box])
|
| 900 |
send_btn.click(kernel.inference_generator, inference_args, [chat_box])
|
|
|
|
| 31 |
# Model booted when the user has not picked one.
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0
TOKEN_COST_PER_100MS = 0.001
BATCH_UPGRADE_BASE_COST = 0.00005  # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False  # Disabled for CPU (GPU-only feature)
KV_CACHE_QUANTIZATION = True  # Keep for RAM savings
CONTINUOUS_BATCHING = False  # CPU doesn't benefit much
SPECULATIVE_DECODE = False  # CPU-only, no draft model
MLOCK_MODEL = False  # Don't lock - allow OS to manage memory
USE_MMAP = True  # Critical for CPU - fast loading
OFFLOAD_KQV = False  # CPU-only
# Use ALL logical threads (including hyperthreading). psutil.cpu_count() can
# return None when the count cannot be determined, so fall back to one thread
# instead of leaving None in a setting that is expected to be an integer.
OPTIMAL_THREADS = psutil.cpu_count(logical=True) or 1
ROPE_SCALING = 1.0
NUMA_OPTIMIZE = False  # Disabled - can cause issues on some systems
AGGRESSIVE_GC = True

# Quantization detection - CPU-optimized batch multipliers (more aggressive).
# Keyed by the quantization tag; each entry carries the batch-size multiplier,
# the context size and a thread boost factor used for that quantization.
QUANT_OPTIMIZATIONS = {
    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
}

# Model format/architecture detection patterns. Each "pattern" entry is a
# lowercase fragment looked for in the repo id and filename; "template" names
# the prompt template associated with that architecture.
MODEL_FORMATS = {
    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
    "gemma": {"pattern": ["gemma"], "template": "gemma"},
    "phi": {"pattern": ["phi"], "template": "phi"},
    "qwen": {"pattern": ["qwen"], "template": "chatml"},
    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
}
|
| 76 |
|
| 77 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
|
|
| 222 |
logger.error(f"[WRECKER] Failed: {e}")
|
| 223 |
return False
|
| 224 |
|
| 225 |
+
# --- TOKEN MANAGER ---
|
| 226 |
+
class TokenManager:
    """Track per-user token credits and purchased upgrades in memory.

    Each user has one record in ``user_tokens`` keyed by username; an empty
    username is treated as ``"anonymous"`` by the balance/usage methods.
    Regular users start with ``MONTHLY_TOKEN_CREDITS`` tokens, restored once
    30 days have passed since their last reset.  The owner account has an
    infinite balance and is never charged.  Nothing is persisted: records
    live only as long as this object does.

    NOTE(review): the emoji in the messages below were reconstructed from a
    mis-encoded copy of this file -- confirm the exact glyphs.
    """

    # Length of one billing period (30 days), in seconds.
    _RESET_PERIOD_SECONDS = 30 * 24 * 60 * 60

    def __init__(self):
        # {username: {"balance": float, "start_time": float, "purchases": {}, ...}}
        self.user_tokens = {}
        self.owner_username = "turtle170"  # Owner gets infinite tokens

    def is_owner(self, username: str) -> bool:
        """Return True when *username* matches the owner, ignoring case."""
        if not username:
            return False
        return username.lower() == self.owner_username.lower()

    def _new_record(self, username: str, owner: bool) -> dict:
        """Build the initial ledger record for a user."""
        now = time.time()
        record = {
            "balance": float('inf') if owner else MONTHLY_TOKEN_CREDITS,
            "start_time": now,
            "purchases": {"batch_multiplier": 1, "token_limit": 2048},
            "total_spent": 0.0,
            "is_owner": owner,
            "username": username,
        }
        if not owner:
            # Only regular users are subject to the monthly reset.
            record["last_reset"] = now
        return record

    def initialize_user(self, username: str):
        """Create the record for *username* if it does not exist yet.

        Regular users receive ``MONTHLY_TOKEN_CREDITS``; the owner receives
        an infinite balance.  Existing records are left untouched.
        """
        if not username:
            username = "anonymous"

        if username in self.user_tokens:
            return

        owner = self.is_owner(username)
        self.user_tokens[username] = self._new_record(username, owner)
        if owner:
            logger.info(f"[TOKEN] 👑 OWNER {username} initialized with INFINITE tokens!")
        else:
            logger.info(f"[TOKEN] New user {username}: {MONTHLY_TOKEN_CREDITS} tokens")

    def check_monthly_reset(self, username: str):
        """Restore the monthly credits when more than 30 days have passed.

        Unknown users and the owner are ignored.  A reset also zeroes
        ``total_spent``; purchased upgrades are kept.
        """
        if not username or username not in self.user_tokens:
            return

        record = self.user_tokens[username]
        if record.get("is_owner", False):
            return  # Owner never needs reset

        last_reset = record.get("last_reset", time.time())
        if time.time() - last_reset > self._RESET_PERIOD_SECONDS:
            record["balance"] = MONTHLY_TOKEN_CREDITS
            record["last_reset"] = time.time()
            record["total_spent"] = 0.0
            logger.info(f"[TOKEN] Monthly reset for {username}: {MONTHLY_TOKEN_CREDITS} tokens")

    def charge_usage(self, username: str, duration_ms: float) -> bool:
        """Charge *username* for *duration_ms* milliseconds of inference.

        Returns True when the charge was covered (always for the owner) and
        False when the user had no tokens or could not afford the full cost;
        in the latter case the balance is set to 0.
        """
        if not username:
            username = "anonymous"

        self.initialize_user(username)
        self.check_monthly_reset(username)
        record = self.user_tokens[username]

        # Owner never gets charged
        if record.get("is_owner", False):
            return True

        if record["balance"] <= 0:
            logger.warning(f"[TOKEN] ❌ {username} has 0 tokens! Access denied.")
            return False

        # A negative duration must never turn into a credit.
        cost = (max(0.0, duration_ms) / 100.0) * TOKEN_COST_PER_100MS

        if record["balance"] < cost:
            # Insufficient balance: whatever was left is consumed, then deny.
            record["total_spent"] += record["balance"]
            record["balance"] = 0
            logger.warning(f"[TOKEN] ❌ Insufficient balance! {username} now at 0 tokens.")
            return False

        record["balance"] -= cost
        record["total_spent"] += cost
        logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {record['balance']:.2f}")
        return True

    def can_use_engine(self, username: str) -> tuple:
        """Return ``(allowed, message)`` telling whether the user may run inference."""
        if not username:
            username = "anonymous"

        self.initialize_user(username)
        self.check_monthly_reset(username)
        record = self.user_tokens[username]

        if record.get("is_owner", False):
            return True, "👑 Owner access granted"

        balance = record["balance"]
        if balance <= 0:
            last_reset = record.get("last_reset", time.time())
            time_until_reset = self._RESET_PERIOD_SECONDS - (time.time() - last_reset)
            days_left = int(time_until_reset / (24 * 60 * 60))  # whole days, rounded down
            return False, f"❌ Out of tokens! Resets in {days_left} days. Current balance: 0.00"

        return True, f"✅ Access granted. Balance: {balance:.2f} tokens"

    def purchase_batch_upgrade(self, username: str) -> tuple:
        """Double the user's batch multiplier; returns ``(success, message)``.

        The price doubles with every level already bought
        (``BATCH_UPGRADE_BASE_COST * 2 ** level``).  Free for the owner.
        """
        if not username:
            return False, "❌ Please login first"

        self.initialize_user(username)
        self.check_monthly_reset(username)
        record = self.user_tokens[username]
        purchases = record["purchases"]

        current_mult = purchases["batch_multiplier"]
        new_mult = current_mult * 2

        # Owner gets free upgrades
        if record.get("is_owner", False):
            purchases["batch_multiplier"] = new_mult
            logger.info(f"[TOKEN] 👑 OWNER free batch upgrade: {current_mult}x → {new_mult}x")
            return True, f"👑 FREE UPGRADE! Batch now {new_mult}x!"

        upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)

        if record["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {record['balance']:.2f}"

        record["balance"] -= cost
        record["total_spent"] += cost  # purchases count as spending too
        purchases["batch_multiplier"] = new_mult
        logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
        return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"

    def purchase_token_upgrade(self, username: str, extra_tokens: int = 1000) -> tuple:
        """Raise the user's response token limit by *extra_tokens*.

        Returns ``(success, message)``.  Costs ``TOKEN_UPGRADE_COST_PER_1K``
        per 1000 tokens; free for the owner.  Zero or negative amounts are
        rejected, since they would otherwise refund tokens and shrink the
        limit.
        """
        if not username:
            return False, "❌ Please login first"

        if extra_tokens <= 0:
            return False, "❌ Extra tokens must be a positive number"

        self.initialize_user(username)
        self.check_monthly_reset(username)
        record = self.user_tokens[username]
        purchases = record["purchases"]

        # Owner gets free upgrades
        if record.get("is_owner", False):
            purchases["token_limit"] += extra_tokens
            new_limit = purchases["token_limit"]
            logger.info(f"[TOKEN] 👑 OWNER free token upgrade: +{extra_tokens} tokens")
            return True, f"👑 FREE UPGRADE! Token limit now {new_limit}!"

        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K

        if record["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {record['balance']:.2f}"

        record["balance"] -= cost
        record["total_spent"] += cost  # purchases count as spending too
        purchases["token_limit"] += extra_tokens
        new_limit = purchases["token_limit"]
        logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
        return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"

    def get_balance(self, username: str) -> float:
        """Return the user's balance rounded to 2 decimals (``inf`` for the owner)."""
        if not username:
            username = "anonymous"

        self.initialize_user(username)
        self.check_monthly_reset(username)

        balance = self.user_tokens[username]["balance"]

        # Show ∞ for owner
        if balance == float('inf'):
            return balance

        return round(max(0, balance), 2)  # Never show negative

    def get_purchases(self, username: str) -> dict:
        """Return the user's purchases dict (the live record, not a copy)."""
        if not username:
            username = "anonymous"

        self.initialize_user(username)
        return self.user_tokens[username]["purchases"]

    def end_session(self, username: str):
        """Return a farewell message summarising the user's spending.

        The record itself is kept; only a message is produced.
        """
        if not username or username not in self.user_tokens:
            return "No active session found."

        stats = self.user_tokens[username]

        if stats.get("is_owner", False):
            return f"👑 Owner session ended. Welcome back anytime, {stats['username']}!"

        logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
        return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
| 426 |
+
|
| 427 |
+
# Global token manager
|
| 428 |
+
import math
|
| 429 |
+
token_manager = TokenManager()
|
| 430 |
+
|
| 431 |
# Global cache manager
|
| 432 |
model_cache = ModelCacheManager()
|
| 433 |
|
|
|
|
| 493 |
self.api = HfApi(token=HF_TOKEN)
|
| 494 |
self.telemetry = TelemetryManager(self.api)
|
| 495 |
self.llm: Optional[Llama] = None
|
| 496 |
+
self.active_model_info = {"repo": "", "file": "", "format": ""}
|
| 497 |
self.kernel_lock = threading.Lock()
|
| 498 |
self.is_prefilling = False
|
| 499 |
self.perf_stats = {
|
|
|
|
| 503 |
"peak_tps": 0.0,
|
| 504 |
"cache_hits": 0
|
| 505 |
}
|
| 506 |
+
self.prompt_cache = {}
|
| 507 |
self.last_activity = time.time()
|
| 508 |
+
self.idle_timeout = 20
|
| 509 |
self.auto_cleanup_thread = None
|
| 510 |
self.start_idle_monitor()
|
| 511 |
|
|
|
|
| 514 |
self.typing_timer = None
|
| 515 |
self.preprocessed_tokens = None
|
| 516 |
|
| 517 |
+
# Custom parameters (user-configurable)
|
| 518 |
+
self.custom_params = {
|
| 519 |
+
"temperature": 0.7,
|
| 520 |
+
"top_p": 0.95,
|
| 521 |
+
"top_k": 40,
|
| 522 |
+
"repeat_penalty": 1.1,
|
| 523 |
+
"batch_size_override": None, # None = auto
|
| 524 |
+
"max_tokens_override": None # None = auto
|
| 525 |
+
}
|
| 526 |
+
|
| 527 |
+
def detect_model_format(self, filename: str, repo: str) -> str:
    """Guess the model architecture from the repo id and filename.

    The lowercase repo id and filename are searched for each fragment listed
    in ``MODEL_FORMATS``, in the dict's order; the first architecture with a
    matching fragment wins.  A fragment that sits in the middle of a longer
    word (letters on both sides) is ignored, so the "phi" inside "dolphin"
    no longer makes every Dolphin fine-tune look like a Phi model, while
    compounds such as "codellama" or "tinyllama" still match.

    Returns:
        The matching key of ``MODEL_FORMATS``, or ``"llama"`` when nothing
        matches.
    """
    import re  # local import: only needed here

    # Tolerate None so a missing value falls through to the default.
    combined = f"{(repo or '').lower()} {(filename or '').lower()}"

    for format_name, format_info in MODEL_FORMATS.items():
        for pattern in format_info["pattern"]:
            fragment = re.escape(pattern)
            # Accept the hit unless it has a letter directly on BOTH sides.
            if re.search(rf"(?<![a-z]){fragment}|{fragment}(?![a-z])", combined):
                logger.info(f"[FORMAT-DETECT] Detected {format_name.upper()} architecture")
                return format_name

    logger.warning("[FORMAT-DETECT] Unknown format, defaulting to llama")
    return "llama"
|
| 539 |
+
|
| 540 |
def detect_quantization(self, filename: str) -> dict:
|
| 541 |
"""Detect quantization method from filename and return optimizations"""
|
| 542 |
filename_upper = filename.upper()
|
|
|
|
| 633 |
logger.error(f"Scan error: {e}")
|
| 634 |
return []
|
| 635 |
|
| 636 |
+
def boot_kernel(self, repo: str, filename: str, session_id: Optional[str] = None) -> str:
    """Download a GGUF model and load it into the engine for CPU inference.

    Steps, in order: detect the quantization and architecture from the
    names, download the file from the Hub, validate it against available
    resources, discard any previously loaded model, size batch/context/
    threads, create the ``Llama`` instance and run a one-token warm-up.

    Args:
        repo: Hugging Face repository id.
        filename: Name of the GGUF file inside *repo*.
        session_id: Optional user key passed to ``token_manager`` to look up
            a purchased batch multiplier.

    Returns:
        A human-readable status line.  Failures are reported through the
        returned text (and the log) rather than raised.

    NOTE(review): emoji in the status/log strings were reconstructed from a
    mis-encoded copy of this file -- confirm the exact glyphs.
    NOTE(review): in the reviewed diff the previous implementation's body
    appears to follow this function; if so it is unreachable and should be
    removed -- confirm against the full file.
    """
    try:
        if not repo or not filename:
            return "🔴 ERROR: Repository or filename missing"

        logger.info(f"[BOOT] Starting download: {filename} from {repo}")

        # Detect quantization and model architecture from the names alone.
        quant_config = self.detect_quantization(filename)
        model_format = self.detect_model_format(filename, repo)

        # Download; a failure here is reported without touching the current model.
        try:
            path = hf_hub_download(
                repo_id=repo,
                filename=filename,
                token=HF_TOKEN,
                local_files_only=False
            )
            logger.info(f"[BOOT] Download complete: {path}")
        except Exception as e:
            logger.error(f"[BOOT] Download failed: {e}")
            return f"🔴 DOWNLOAD FAILED: {str(e)}"

        # Check if model is cached
        is_cached = model_cache.is_cached(path)
        cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"

        # Validate before loading
        valid, msg = ResourceMonitor.validate_deployment(path)
        if not valid:
            logger.warning(f"[BOOT] Validation failed: {msg}")
            return f"🔴 VALIDATION FAILED: {msg}"

        logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")

        with self.kernel_lock:
            # Drop the previous model first so its memory can be reclaimed.
            if self.llm:
                logger.info("[BOOT] 💣 WRECKING old model...")
                try:
                    model_cache.wreck_old_model_cache()
                    del self.llm
                    self.llm = None
                    nuclear_ram_clear()
                    logger.info("[BOOT] ✅ Old model DESTROYED")
                except Exception as e:
                    logger.warning(f"[BOOT] Cleanup warning: {e}")

            # Batch size scales with the RAM that is free right now.
            vm = psutil.virtual_memory()
            available_ram_gb = vm.available / (1024**3)

            base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
            optimal_batch = int(base_batch * quant_config["batch_multiplier"])

            # Apply user's batch multiplier from token purchases
            if session_id:
                user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
                optimal_batch = int(optimal_batch * user_batch_mult)
                logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")

            # Clamp to the 1024-8192 range used for CPU.
            optimal_batch = max(1024, min(8192, optimal_batch))

            # Context size
            optimal_ctx = quant_config["ctx_size"]

            # Gemma gets a capped context.
            if model_format == "gemma":
                optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
                logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")

            # Use all logical cores.  psutil.cpu_count() may return None when
            # the count is undetermined; fall back to a single thread then.
            optimal_threads = psutil.cpu_count(logical=True) or 1
            logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")

            try:
                logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")

                # Preload cache if available
                if is_cached:
                    model_cache.preload_cache(path)

                # CPU-only initialization parameters.
                init_params = {
                    "model_path": path,
                    "n_ctx": optimal_ctx,
                    "n_threads": optimal_threads,
                    "n_threads_batch": optimal_threads,
                    "use_mmap": USE_MMAP,  # Critical for CPU
                    "use_mlock": MLOCK_MODEL,  # Let OS manage memory
                    "n_batch": optimal_batch,
                    "n_gpu_layers": 0,  # CPU-only
                    "rope_scaling_type": 0,
                    "rope_freq_scale": ROPE_SCALING,
                    "verbose": False,
                    "logits_all": False,
                    "embedding": False,
                    "f16_kv": False
                }

                # KV quantization is skipped for Gemma (Gemma can be finicky).
                if model_format != "gemma" and KV_CACHE_QUANTIZATION:
                    kv_quant_type = 2  # presumably GGML's Q4_0 type id -- TODO confirm
                    init_params["type_k"] = kv_quant_type
                    if FLASH_ATTENTION:
                        init_params["flash_attn"] = True
                        init_params["type_v"] = kv_quant_type
                        logger.info("[OPTIM] KV cache quantization enabled (Q4)")
                    else:
                        # llama.cpp refuses to create a context with a quantized
                        # V cache unless flash attention is on, which would turn
                        # every load into a failure; quantize only K here.
                        logger.info("[OPTIM] K cache quantization enabled (Q4); V cache left unquantized (needs flash attention)")

                self.llm = Llama(**init_params)

                self.active_model_info = {
                    "repo": repo,
                    "file": filename,
                    "quant": quant_config['type'],
                    "format": model_format
                }
                self.telemetry.track_load(repo, filename)

                # Extract and cache signature
                if not is_cached:
                    logger.info("[BOOT] Extracting cache signature...")
                    signature = model_cache.extract_cache_signature(path)
                    if signature:
                        model_cache.save_to_cache(path, signature)

                # Warm-up is best-effort: a failure must not abort the boot,
                # but it is logged instead of being silently swallowed.
                logger.info("[BOOT] Warming up model caches...")
                try:
                    self.llm("Warmup", max_tokens=1, stream=False)
                    force_gc()
                except Exception as warmup_error:
                    logger.warning(f"[BOOT] Warm-up skipped: {warmup_error}")

                logger.info("[BOOT] 🚀 CPU-OPTIMIZED MODEL READY!")
                return f"🟢 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"

            except Exception as e:
                logger.error(f"[BOOT] Model loading failed: {e}")
                self.llm = None
                nuclear_ram_clear()
                return f"🔴 LOAD FAILED: {str(e)}"

    except Exception as e:
        logger.error(f"[BOOT] Unexpected error: {e}")
        nuclear_ram_clear()
        return f"🔴 BOOT FAILURE: {str(e)}"
|
| 788 |
"""HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
|
| 789 |
try:
|
| 790 |
if not repo or not filename:
|
|
|
|
| 942 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 943 |
return "β‘ Primed"
|
| 944 |
|
| 945 |
+
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
|
| 946 |
# Update activity timestamp
|
| 947 |
self.update_activity()
|
| 948 |
|
|
|
|
| 995 |
first_token_time = None
|
| 996 |
|
| 997 |
try:
|
| 998 |
+
# Get max tokens from user purchases
|
| 999 |
+
max_tokens = 2048
|
| 1000 |
+
if username:
|
| 1001 |
+
max_tokens = token_manager.get_purchases(username)["token_limit"]
|
| 1002 |
+
|
| 1003 |
+
# HYPER-OPTIMIZED CPU INFERENCE SETTINGS
|
| 1004 |
stream = self.llm(
|
| 1005 |
formatted_prompt,
|
| 1006 |
+
max_tokens=max_tokens,
|
| 1007 |
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 1008 |
stream=True,
|
| 1009 |
+
temperature=self.custom_params["temperature"],
|
| 1010 |
+
top_p=self.custom_params["top_p"],
|
| 1011 |
+
top_k=self.custom_params["top_k"],
|
| 1012 |
+
repeat_penalty=self.custom_params["repeat_penalty"],
|
| 1013 |
+
frequency_penalty=0.0,
|
| 1014 |
+
presence_penalty=0.0,
|
| 1015 |
+
tfs_z=1.0,
|
| 1016 |
+
typical_p=1.0,
|
| 1017 |
+
mirostat_mode=2, # CPU benefits from mirostat
|
| 1018 |
+
mirostat_tau=5.0,
|
| 1019 |
+
mirostat_eta=0.1,
|
| 1020 |
)
|
| 1021 |
|
| 1022 |
for chunk in stream:
|
|
|
|
| 1036 |
if tps > self.perf_stats["peak_tps"]:
|
| 1037 |
self.perf_stats["peak_tps"] = tps
|
| 1038 |
|
| 1039 |
+
# Charge tokens every second
|
| 1040 |
+
if int(elapsed * 1000) % 1000 < 100 and username: # Every ~1 second
|
| 1041 |
+
token_manager.charge_usage(username, elapsed * 1000)
|
| 1042 |
+
|
| 1043 |
# Update history with streaming content + performance metrics
|
| 1044 |
+
balance = token_manager.get_balance(username) if username else 0
|
| 1045 |
+
history[-1]["content"] = f"{response_text}\n\n`β‘ {tps} t/s | π― Peak: {self.perf_stats['peak_tps']:.1f} t/s | π° {balance:.2f} tokens`"
|
| 1046 |
yield history
|
| 1047 |
|
| 1048 |
+
# Final token charge for remaining time
|
| 1049 |
+
if username:
|
| 1050 |
+
token_manager.charge_usage(username, elapsed * 1000)
|
| 1051 |
+
|
| 1052 |
# Update global performance stats
|
| 1053 |
self.perf_stats["total_tokens"] += tokens_count
|
| 1054 |
self.perf_stats["total_time"] += elapsed
|
|
|
|
| 1172 |
# --- UI INTERFACE ---
|
| 1173 |
kernel = ZeroEngine()

# Session ID for token tracking
username = token_manager.get_username()
# The UI callbacks (stats refresh, boot, purchases, end-session) look this
# identity up under the name `session_id`; bind it so those lookups resolve
# instead of raising NameError.
# NOTE(review): confirm no other `session_id` binding exists elsewhere in the file.
session_id = username
|
| 1177 |
+
|
| 1178 |
+
with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
| 1179 |
+
# Header with Token Display
|
| 1180 |
+
with gr.Row():
|
| 1181 |
+
with gr.Column(scale=8):
|
| 1182 |
+
gr.HTML("""
|
| 1183 |
+
<div style='text-align: center; padding: 30px; border-radius: 24px;
|
| 1184 |
+
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
|
| 1185 |
+
margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
|
| 1186 |
+
<h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
|
| 1187 |
+
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
|
| 1188 |
+
font-family: Consolas, monospace;'>
|
| 1189 |
+
π°οΈ ZEROENGINE V0.2
|
| 1190 |
+
</h1>
|
| 1191 |
+
<p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
|
| 1192 |
+
CPU-Optimized | Token System | Custom Parameters | Auto-Format
|
| 1193 |
+
</p>
|
| 1194 |
+
</div>
|
| 1195 |
+
""")
|
| 1196 |
+
with gr.Column(scale=2):
|
| 1197 |
+
# Token Display
|
| 1198 |
+
gr.HTML("""
|
| 1199 |
+
<div style='text-align: center; padding: 20px; border-radius: 20px;
|
| 1200 |
+
background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
|
| 1201 |
+
margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
|
| 1202 |
+
<div style='font-size: 2em; margin-bottom: 5px;'>π°</div>
|
| 1203 |
+
<div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
|
| 1204 |
+
100.00
|
| 1205 |
+
</div>
|
| 1206 |
+
<div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
|
| 1207 |
+
</div>
|
| 1208 |
+
""")
|
| 1209 |
+
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1210 |
+
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
| 1211 |
+
session_status = gr.Markdown("", visible=False)
|
| 1212 |
|
| 1213 |
with gr.Row():
|
| 1214 |
with gr.Column(scale=8):
|
| 1215 |
chat_box = gr.Chatbot(
|
| 1216 |
label="Main Engine Feedback",
|
| 1217 |
+
height=600,
|
| 1218 |
show_label=False,
|
| 1219 |
autoscroll=True,
|
| 1220 |
container=True
|
|
|
|
| 1229 |
)
|
| 1230 |
send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
|
| 1231 |
|
| 1232 |
+
with gr.Column(scale=4):
|
| 1233 |
+
# Hardware Status
|
| 1234 |
gr.Markdown("### π οΈ Hardware Status")
|
| 1235 |
ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
|
| 1236 |
cpu_metric = gr.Label(label="CPU Load", value="0%")
|
| 1237 |
|
| 1238 |
gr.Markdown("---")
|
| 1239 |
+
|
| 1240 |
+
# Model Control
|
| 1241 |
gr.Markdown("### π‘ Model Control")
|
| 1242 |
repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
|
| 1243 |
quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
|
|
|
|
| 1249 |
boot_status = gr.Markdown("Status: `STANDBY`")
|
| 1250 |
|
| 1251 |
gr.Markdown("---")
|
| 1252 |
+
|
| 1253 |
+
# Custom Parameters
|
| 1254 |
+
gr.Markdown("### βοΈ Custom Parameters")
|
| 1255 |
+
temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
|
| 1256 |
+
top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
|
| 1257 |
+
top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
|
| 1258 |
+
repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
|
| 1259 |
+
|
| 1260 |
+
gr.Markdown("---")
|
| 1261 |
+
|
| 1262 |
+
# Token Purchases
|
| 1263 |
+
gr.Markdown("### π Token Upgrades")
|
| 1264 |
+
with gr.Row():
|
| 1265 |
+
batch_upgrade_btn = gr.Button("π Batch x2", size="sm", variant="secondary")
|
| 1266 |
+
token_upgrade_btn = gr.Button("π +1K Tokens", size="sm", variant="secondary")
|
| 1267 |
+
purchase_status = gr.Markdown("Ready to upgrade!")
|
| 1268 |
+
|
| 1269 |
+
gr.Markdown("---")
|
| 1270 |
+
|
| 1271 |
+
# Ghost Cache
|
| 1272 |
gr.Markdown("### π» Ghost Cache (Pre-Context)")
|
| 1273 |
ghost_buffer = gr.Textbox(
|
| 1274 |
label="Background Context",
|
|
|
|
| 1282 |
log_output = gr.Code(
|
| 1283 |
label="Kernel Logs",
|
| 1284 |
language="shell",
|
| 1285 |
+
value="[INIT] V0.2 System Ready.",
|
| 1286 |
lines=5
|
| 1287 |
)
|
| 1288 |
|
|
|
|
| 1290 |
def update_stats():
    """Collect the values shown by the periodic stats refresh.

    Returns:
        A 3-tuple of strings: RAM usage as "<used>/<total> GB", CPU load as
        "<pct>%", and the current token balance. If anything raises, the
        error is logged and ("Error", "Error", "0.00") is returned, so the
        caller always receives three values (the timer tick is wired to
        three outputs).
    """
    try:
        m = ResourceMonitor.get_metrics()
        # NOTE(review): `session_id` is resolved from the enclosing scope;
        # verify it is bound there, otherwise this always takes the except path.
        balance = token_manager.get_balance(session_id)
        return (
            f"{m['ram_used_gb']}/{m['ram_total_gb']} GB",
            f"{m['cpu_usage_pct']}%",
            f"{balance}",
        )
    except Exception as e:
        logger.error(f"Stats update error: {e}")
        # Single fallback; the former trailing 2-tuple return was unreachable.
        return "Error", "Error", "0.00"
|
| 1299 |
|
| 1300 |
def on_scan(repo):
|
|
|
|
| 1320 |
return
|
| 1321 |
|
| 1322 |
yield "βοΈ System: Initiating boot sequence...", gr.update()
|
| 1323 |
+
time.sleep(0.5)
|
| 1324 |
|
| 1325 |
+
result = kernel.boot_kernel(repo, file, session_id)
|
| 1326 |
yield result, gr.update()
|
| 1327 |
|
| 1328 |
except Exception as e:
|
| 1329 |
logger.error(f"Boot UI error: {e}")
|
| 1330 |
yield f"π΄ BOOT ERROR: {str(e)}", gr.update()
|
| 1331 |
+
|
| 1332 |
+
def on_batch_upgrade():
    """Buy the batch upgrade for the current session.

    Returns:
        (status message from the token manager, refreshed balance as text).
    """
    # The success flag is not used by the UI; only the message is displayed.
    _, status_msg = token_manager.purchase_batch_upgrade(session_id)
    # Balance is read after the purchase attempt so the display reflects it.
    new_balance = token_manager.get_balance(session_id)
    return status_msg, f"{new_balance}"
|
| 1336 |
+
|
| 1337 |
+
def on_token_upgrade():
    """Buy a 1000-token upgrade for the current session.

    Returns:
        (status message from the token manager, refreshed balance as text).
    """
    # The success flag is not used by the UI; only the message is displayed.
    _, status_msg = token_manager.purchase_token_upgrade(session_id, 1000)
    # Balance is read after the purchase attempt so the display reflects it.
    new_balance = token_manager.get_balance(session_id)
    return status_msg, f"{new_balance}"
|
| 1341 |
+
|
| 1342 |
+
def on_end_session():
    """End the current token session and return the token manager's message."""
    return token_manager.end_session(session_id)
|
| 1345 |
+
|
| 1346 |
+
def update_custom_params(temp, top_p, top_k, repeat_pen):
    """Store the sampling-slider values on the shared kernel.

    Args:
        temp: Value for the "temperature" entry.
        top_p: Value for the "top_p" entry.
        top_k: Value for the "top_k" entry; coerced to int before storing.
        repeat_pen: Value for the "repeat_penalty" entry.

    Returns:
        A confirmation message for the status display.
    """
    params = kernel.custom_params
    params["temperature"] = temp
    params["top_p"] = top_p
    params["top_k"] = int(top_k)
    params["repeat_penalty"] = repeat_pen
    # The check-mark had been mis-decoded and split the literal across two
    # lines (unterminated string); restored as a single valid literal.
    return "✅ Parameters updated!"
|
| 1352 |
|
| 1353 |
+
# Timer for periodic stats updates (includes token balance)
timer = gr.Timer(value=2)
timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])

# Event handlers
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])

# Token purchases
batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
end_session_btn.click(on_end_session, None, [session_status])

# Custom parameter updates: moving any one slider re-sends all four values,
# so every slider shares the same handler, inputs and output.
param_sliders = [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider]
for slider in param_sliders:
    slider.change(update_custom_params, param_sliders, [purchase_status])

# Ghost cache
stitch_btn.click(
    lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
    [ghost_buffer],
    [stitch_status]
)

# Keyboard input preprocessing
user_input.change(
    lambda x: kernel.preprocess_input(x),
    [user_input],
    None
)


def run_inference(prompt, history, ghost_context, repo, quant):
    """Stream chat updates from the kernel for the current user.

    kernel.inference_generator takes a sixth positional argument, `username`,
    which has no UI component. Wiring the method directly with five inputs
    leaves that argument unfilled, so it is supplied here from the
    module-level `username`.
    """
    yield from kernel.inference_generator(
        prompt, history, ghost_context, repo, quant, username
    )


# Auto-boot enabled inference
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
user_input.submit(run_inference, inference_args, [chat_box])
send_btn.click(run_inference, inference_args, [chat_box])
|