turtle170 committed on
Commit
40c03f1
·
verified ·
1 Parent(s): 45e66b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -12
app.py CHANGED
@@ -10,6 +10,115 @@ from typing import List, Dict, Optional, Generator
10
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Initialize logger early for startup functions
15
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -610,7 +719,7 @@ class TokenManager:
610
  return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
611
  return "No active session found."
612
 
613
- # Global token manager
614
  import math
615
  token_manager = TokenManager()
616
 
@@ -737,13 +846,20 @@ class ZeroEngine:
737
  return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
738
 
739
  def preprocess_input(self, text: str):
740
- """Pre-process keyboard input in background (tensors ready before submit)"""
741
- if not self.llm or not text or len(text) < 5:
 
 
 
 
 
 
 
742
  return
743
 
744
  def _preprocess():
745
  try:
746
- logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
747
  tokens = self.llm.tokenize(text.encode("utf-8"))
748
  self.preprocessed_tokens = tokens
749
  logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
@@ -751,15 +867,13 @@ class ZeroEngine:
751
  logger.error(f"[PREPROCESS] Failed: {e}")
752
  self.preprocessed_tokens = None
753
 
754
- # Cancel previous timer if user is still typing
755
  if self.typing_timer:
756
  self.typing_timer.cancel()
757
 
758
- # Start new timer - preprocess after 1 second of no typing
759
  self.typing_timer = threading.Timer(1.0, _preprocess)
760
  self.typing_timer.daemon = True
761
  self.typing_timer.start()
762
-
763
  def clear_preprocessed(self):
764
  """Clear preprocessed tokens and force GC"""
765
  if self.preprocessed_tokens:
@@ -1085,12 +1199,11 @@ class ZeroEngine:
1085
  time.sleep(0.5) # Brief pause for user to see the message
1086
 
1087
  # Check prompt cache for exact matches (instant response)
1088
- cache_key = f"{ghost_context}:{prompt}"
1089
- if cache_key in self.prompt_cache:
1090
- self.perf_stats["cache_hits"] += 1
1091
- logger.info(" CACHE HIT - Instant response!")
1092
  history.append({"role": "user", "content": prompt})
1093
- history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
1094
  yield history
1095
  return
1096
 
@@ -1180,6 +1293,12 @@ class ZeroEngine:
1180
 
1181
  # Aggressive GC after generation
1182
  force_gc()
 
 
 
 
 
 
1183
 
1184
  logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
1185
 
 
10
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
13
+ from gradio_client import Client
14
+ import hashlib
15
+
16
+ # Backend processor connection
17
+ BACKEND_URL = "turtle170/ZeroEngine-Backend"
18
+
19
class BackendProcessor:
    """Client for ZeroEngine-Backend processing.

    Write-style operations (tokenize, cache write, token charging) run in
    daemon threads so the UI never blocks on the backend; the cache read is
    synchronous because the caller needs the result immediately. Connection
    is lazy and rate-limited by a cooldown so a down backend does not stall
    the app with a reconnect attempt on every call.
    """

    def __init__(self):
        self.client = None              # gradio_client.Client, set on first successful connect
        self.connected = False          # last known connection state
        self.last_connect_attempt = 0   # wall-clock time of the last connect try
        self.connect_cooldown = 30      # seconds between reconnect attempts
        # connect() is invoked from several daemon threads (tokenize_async,
        # cache_response, charge_tokens_async); serialize it so two threads
        # never construct duplicate Clients concurrently.
        self._connect_lock = threading.Lock()

    def connect(self):
        """Lazy connection with cooldown; safe to call from any thread.

        Returns True when a usable client exists, False when disconnected
        and still inside the cooldown window or the connect attempt failed.
        """
        with self._connect_lock:
            if self.connected:
                return True

            current_time = time.time()
            if current_time - self.last_connect_attempt < self.connect_cooldown:
                return False

            try:
                self.last_connect_attempt = current_time
                self.client = Client(BACKEND_URL)
                self.connected = True
                logger.info("[BACKEND] ✅ Connected to ZeroEngine-Backend")
                return True
            except Exception as e:
                logger.error(f"[BACKEND] ❌ Connection failed: {e}")
                self.connected = False
                return False

    def _predict(self, *args, api_name):
        """Call the backend, marking the connection stale on failure.

        Re-raises so each caller keeps its existing logging; flipping
        `connected` off lets the cooldown gate the next reconnect attempt
        instead of repeating a failing call against a dead backend.
        """
        try:
            return self.client.predict(*args, api_name=api_name)
        except Exception:
            self.connected = False
            raise

    def tokenize_async(self, text: str):
        """Background tokenization (fire-and-forget); no-op for short input."""
        if not text or len(text) < 5:
            return

        def _background():
            try:
                if self.connect():
                    result = self._predict(text, api_name="/predict")
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Tokenized: ~{data['estimated_tokens']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Tokenize failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def cache_response(self, prompt: str, response: str):
        """Cache a response for instant retrieval (fire-and-forget)."""
        # md5 is acceptable here: the hash is only a cache key, not a
        # security boundary.
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        def _background():
            try:
                if self.connect():
                    result = self._predict(
                        prompt_hash,
                        response,
                        api_name="/predict_3"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Cached response: {prompt_hash}")
            except Exception as e:
                logger.warning(f"[BACKEND] Cache failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def get_cached_response(self, prompt: str) -> Optional[str]:
        """Try to get cached response (synchronous).

        Returns the cached text on a hit, None on miss, connection failure,
        or any backend error — callers fall through to normal generation.
        """
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        try:
            if self.connect():
                result = self._predict(
                    prompt_hash,
                    api_name="/predict_4"
                )
                data = json.loads(result)
                if data.get("success"):
                    logger.info(f"[BACKEND] ⚡ CACHE HIT: {prompt_hash}")
                    return data["response"]
        except Exception as e:
            logger.warning(f"[BACKEND] Cache retrieval failed: {e}")

        return None

    def charge_tokens_async(self, username: str, duration_ms: float):
        """Calculate token cost asynchronously (fire-and-forget)."""
        def _background():
            try:
                if self.connect():
                    result = self._predict(
                        username,
                        duration_ms,
                        api_name="/predict_5"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Charged {username}: {data['cost']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Charge failed: {e}")

        threading.Thread(target=_background, daemon=True).start()
122
 
123
  # Initialize logger early for startup functions
124
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
719
  return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
720
  return "No active session found."
721
 
722
+ backend = BackendProcessor()
723
  import math
724
  token_manager = TokenManager()
725
 
 
846
  return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
847
 
848
  def preprocess_input(self, text: str):
849
+ """Pre-process keyboard input with backend support"""
850
+ if not text or len(text) < 5:
851
+ return
852
+
853
+ # Send to backend for async tokenization
854
+ backend.tokenize_async(text)
855
+
856
+ # Also do local preprocessing if model loaded
857
+ if not self.llm:
858
  return
859
 
860
  def _preprocess():
861
  try:
862
+ logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars locally...")
863
  tokens = self.llm.tokenize(text.encode("utf-8"))
864
  self.preprocessed_tokens = tokens
865
  logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
 
867
  logger.error(f"[PREPROCESS] Failed: {e}")
868
  self.preprocessed_tokens = None
869
 
 
870
  if self.typing_timer:
871
  self.typing_timer.cancel()
872
 
 
873
  self.typing_timer = threading.Timer(1.0, _preprocess)
874
  self.typing_timer.daemon = True
875
  self.typing_timer.start()
876
+
877
  def clear_preprocessed(self):
878
  """Clear preprocessed tokens and force GC"""
879
  if self.preprocessed_tokens:
 
1199
  time.sleep(0.5) # Brief pause for user to see the message
1200
 
1201
  # Check prompt cache for exact matches (instant response)
1202
+ cached_response = backend.get_cached_response(full_input)
1203
+ if cached_response:
1204
+ logger.info(" BACKEND CACHE HIT - Instant response!")
 
1205
  history.append({"role": "user", "content": prompt})
1206
+ history.append({"role": "assistant", "content": cached_response})
1207
  yield history
1208
  return
1209
 
 
1293
 
1294
  # Aggressive GC after generation
1295
  force_gc()
1296
+ # Cache this response in backend for future use
1297
+ backend.cache_response(full_input, response_text)
1298
+
1299
+ # Send token charge to backend (async)
1300
+ if username:
1301
+ backend.charge_tokens_async(username, elapsed * 1000)
1302
 
1303
  logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
1304