Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed 6 days ago

Commit

f7e811f

verified ·

1 Parent(s): 7de41ed

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -20

app.py CHANGED Viewed

@@ -16,7 +16,10 @@ from typing import List, Dict, Optional, Generator
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
-from llama_cpp import Llama
 # ==========================================
 # SYSTEM CONFIGURATION & CONSTANTS
@@ -24,7 +27,7 @@ from llama_cpp import Llama
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
-RAM_LIMIT_PCT = 0.50  # Strict 50% limit for model weights
 SYSTEM_RESERVE_MB = 250
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
@@ -62,7 +65,6 @@ class TelemetryManager:
     def track_generation(self, tokens: int):
         self.stats["total_tokens_generated"] += tokens
-        # Periodic sync could be added here
     def _sync_to_cloud(self):
         if not HF_TOKEN or not SPACE_ID:
@@ -105,11 +107,9 @@ class ResourceMonitor:
         total_ram_mb = vm.total / (1024**2)
         avail_ram_mb = vm.available / (1024**2)
-        # Rule 1: 50% Hard Cap
         if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
             return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
-        # Rule 2: Safety Buffer
         if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
             return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
@@ -146,15 +146,13 @@ class ZeroEngine:
                 return msg
             with self.kernel_lock:
-                # Clean up old instance
                 if self.llm:
                     del self.llm
-                # Initialize new instance with CPU Affinity Partitioning
                 self.llm = Llama(
                     model_path=path,
                     n_ctx=4096,
-                    n_threads=1, # Hard-partitioned to 1 vCPU for the active slot
                     use_mmap=True,
                     n_batch=512,
                     last_n_tokens_size=64,
@@ -179,8 +177,6 @@ class ZeroEngine:
             self.is_prefilling = True
             try:
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
-                # Prefix matching in llama-cpp happens automatically
-                # if we evaluate tokens and store them in the KV cache.
                 self.llm.eval(tokens)
                 logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
             except Exception as e:
@@ -197,11 +193,8 @@ class ZeroEngine:
             yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
             return
-        # Combine Ghost Terminal context with Active Input
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
-        # Prepare history for Llama-3 style chat templates if needed
-        # For V0.1 we use raw completion for maximum speed/minimal overhead
         formatted_prompt = f"User: {full_input}\nAssistant: "
         response_text = ""
@@ -221,7 +214,6 @@ class ZeroEngine:
                 response_text += token
                 tokens_count += 1
-                # Calculate performance metrics
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
@@ -256,7 +248,6 @@ with gr.Blocks(
     """)
     with gr.Row():
-        # --- LEFT: CHAT ENGINE (FOCUS MODE) ---
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
                 type="messages",
@@ -276,7 +267,6 @@ with gr.Blocks(
                 with gr.Column(scale=1, min_width=50):
                     send_btn = gr.Button("EXE", variant="primary")
-        # --- RIGHT: ENGINE ROOM (SIDEBAR) ---
         with gr.Sidebar(label="Engine Room", open=True) as sidebar:
             gr.Markdown("### 📊 Resource Gauges")
             with gr.Row():
@@ -330,7 +320,6 @@ with gr.Blocks(
         res = kernel.stitch_cache(text)
         return f"Cache State: `{res}`"
-    # Event Mapping
     demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
@@ -343,12 +332,10 @@ with gr.Blocks(
     stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
-    # Inference Flow
     input_args = [user_input, chat_box, ghost_buffer]
     user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
     send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
-    # Clear Inputs
     user_input.submit(lambda: "", None, [user_input])
     user_input.submit(lambda: "", None, [ghost_buffer])

 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
+try:
+    from llama_cpp import Llama
+except ImportError:
+    from llama_cpp_pydist import Llama
 # ==========================================
 # SYSTEM CONFIGURATION & CONSTANTS
 HF_TOKEN = os.environ.get("HF_TOKEN")
 SPACE_ID = os.environ.get("SPACE_ID")
 LOG_FILE = "engine_telemetry.json"
+RAM_LIMIT_PCT = 0.50
 SYSTEM_RESERVE_MB = 250
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
     def track_generation(self, tokens: int):
         self.stats["total_tokens_generated"] += tokens
     def _sync_to_cloud(self):
         if not HF_TOKEN or not SPACE_ID:
         total_ram_mb = vm.total / (1024**2)
         avail_ram_mb = vm.available / (1024**2)
         if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
             return False, f"Model size ({file_size_mb:.1f}MB) exceeds 50% System RAM limit."
         if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
             return False, f"Insufficient headroom. Need {SYSTEM_RESERVE_MB}MB buffer."
                 return msg
             with self.kernel_lock:
                 if self.llm:
                     del self.llm
                 self.llm = Llama(
                     model_path=path,
                     n_ctx=4096,
+                    n_threads=1,
                     use_mmap=True,
                     n_batch=512,
                     last_n_tokens_size=64,
             self.is_prefilling = True
             try:
                 tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
                 self.llm.eval(tokens)
                 logger.info(f"KV-Cache stitched for {len(tokens)} tokens.")
             except Exception as e:
             yield history + [{"role": "assistant", "content": "Engine offline. Please load a model in the Sidebar."}]
             return
         full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
         formatted_prompt = f"User: {full_input}\nAssistant: "
         response_text = ""
                 response_text += token
                 tokens_count += 1
                 elapsed = time.time() - start_time
                 tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
     """)
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
                 type="messages",
                 with gr.Column(scale=1, min_width=50):
                     send_btn = gr.Button("EXE", variant="primary")
         with gr.Sidebar(label="Engine Room", open=True) as sidebar:
             gr.Markdown("### 📊 Resource Gauges")
             with gr.Row():
         res = kernel.stitch_cache(text)
         return f"Cache State: `{res}`"
     demo.load(update_system_stats, None, [ram_metric, cpu_metric], every=2)
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
     stitch_btn.click(on_stitch, [ghost_buffer], [stitch_status])
     input_args = [user_input, chat_box, ghost_buffer]
     user_input.submit(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
     send_btn.click(kernel.inference_generator, input_args, [chat_box], concurrency_limit=2)
     user_input.submit(lambda: "", None, [user_input])
     user_input.submit(lambda: "", None, [ghost_buffer])