turtle170 committed on
Commit
40c03f1
·
verified ·
1 Parent(s): 45e66b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -12
app.py CHANGED
@@ -10,6 +10,115 @@ from typing import List, Dict, Optional, Generator
10
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Initialize logger early for startup functions
15
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -610,7 +719,7 @@ class TokenManager:
610
  return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
611
  return "No active session found."
612
 
613
- # Global token manager
614
  import math
615
  token_manager = TokenManager()
616
 
@@ -737,13 +846,20 @@ class ZeroEngine:
737
  return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
738
 
739
  def preprocess_input(self, text: str):
740
- """Pre-process keyboard input in background (tensors ready before submit)"""
741
- if not self.llm or not text or len(text) < 5:
 
 
 
 
 
 
 
742
  return
743
 
744
  def _preprocess():
745
  try:
746
- logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
747
  tokens = self.llm.tokenize(text.encode("utf-8"))
748
  self.preprocessed_tokens = tokens
749
  logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
@@ -751,15 +867,13 @@ class ZeroEngine:
751
  logger.error(f"[PREPROCESS] Failed: {e}")
752
  self.preprocessed_tokens = None
753
 
754
- # Cancel previous timer if user is still typing
755
  if self.typing_timer:
756
  self.typing_timer.cancel()
757
 
758
- # Start new timer - preprocess after 1 second of no typing
759
  self.typing_timer = threading.Timer(1.0, _preprocess)
760
  self.typing_timer.daemon = True
761
  self.typing_timer.start()
762
-
763
  def clear_preprocessed(self):
764
  """Clear preprocessed tokens and force GC"""
765
  if self.preprocessed_tokens:
@@ -1085,12 +1199,11 @@ class ZeroEngine:
1085
  time.sleep(0.5) # Brief pause for user to see the message
1086
 
1087
  # Check prompt cache for exact matches (instant response)
1088
- cache_key = f"{ghost_context}:{prompt}"
1089
- if cache_key in self.prompt_cache:
1090
- self.perf_stats["cache_hits"] += 1
1091
- logger.info(" CACHE HIT - Instant response!")
1092
  history.append({"role": "user", "content": prompt})
1093
- history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
1094
  yield history
1095
  return
1096
 
@@ -1180,6 +1293,12 @@ class ZeroEngine:
1180
 
1181
  # Aggressive GC after generation
1182
  force_gc()
 
 
 
 
 
 
1183
 
1184
  logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
1185
 
 
10
 
11
  import gradio as gr
12
  from huggingface_hub import HfApi, hf_hub_download
13
+ from gradio_client import Client
14
+ import hashlib
15
+
16
+ # Backend processor connection
17
+ BACKEND_URL = "turtle170/ZeroEngine-Backend"
18
+
19
class BackendProcessor:
    """Client for ZeroEngine-Backend processing.

    Write-style operations (tokenize, cache write, token charging) run in
    daemon threads so the UI never blocks on the backend; the cache read is
    synchronous because the caller needs the result immediately. Connection
    is lazy and rate-limited by a cooldown so a down backend does not stall
    the app with a reconnect attempt on every call.
    """

    def __init__(self):
        self.client = None              # gradio_client.Client, set on first successful connect
        self.connected = False          # last known connection state
        self.last_connect_attempt = 0   # wall-clock time of the last connect try
        self.connect_cooldown = 30      # seconds between reconnect attempts
        # connect() is invoked from several daemon threads (tokenize_async,
        # cache_response, charge_tokens_async); serialize it so two threads
        # never construct duplicate Clients concurrently.
        self._connect_lock = threading.Lock()

    def connect(self):
        """Lazy connection with cooldown; safe to call from any thread.

        Returns True when a usable client exists, False when disconnected
        and still inside the cooldown window or the connect attempt failed.
        """
        with self._connect_lock:
            if self.connected:
                return True

            current_time = time.time()
            if current_time - self.last_connect_attempt < self.connect_cooldown:
                return False

            try:
                self.last_connect_attempt = current_time
                self.client = Client(BACKEND_URL)
                self.connected = True
                logger.info("[BACKEND] ✅ Connected to ZeroEngine-Backend")
                return True
            except Exception as e:
                logger.error(f"[BACKEND] ❌ Connection failed: {e}")
                self.connected = False
                return False

    def _predict(self, *args, api_name):
        """Call the backend, marking the connection stale on failure.

        Re-raises so each caller keeps its existing logging; flipping
        `connected` off lets the cooldown gate the next reconnect attempt
        instead of repeating a failing call against a dead backend.
        """
        try:
            return self.client.predict(*args, api_name=api_name)
        except Exception:
            self.connected = False
            raise

    def tokenize_async(self, text: str):
        """Background tokenization (fire-and-forget); no-op for short input."""
        if not text or len(text) < 5:
            return

        def _background():
            try:
                if self.connect():
                    result = self._predict(text, api_name="/predict")
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Tokenized: ~{data['estimated_tokens']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Tokenize failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def cache_response(self, prompt: str, response: str):
        """Cache a response for instant retrieval (fire-and-forget)."""
        # md5 is acceptable here: the hash is only a cache key, not a
        # security boundary.
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        def _background():
            try:
                if self.connect():
                    result = self._predict(
                        prompt_hash,
                        response,
                        api_name="/predict_3"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Cached response: {prompt_hash}")
            except Exception as e:
                logger.warning(f"[BACKEND] Cache failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def get_cached_response(self, prompt: str) -> Optional[str]:
        """Try to get cached response (synchronous).

        Returns the cached text on a hit, None on miss, connection failure,
        or any backend error — callers fall through to normal generation.
        """
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        try:
            if self.connect():
                result = self._predict(
                    prompt_hash,
                    api_name="/predict_4"
                )
                data = json.loads(result)
                if data.get("success"):
                    logger.info(f"[BACKEND] ⚡ CACHE HIT: {prompt_hash}")
                    return data["response"]
        except Exception as e:
            logger.warning(f"[BACKEND] Cache retrieval failed: {e}")

        return None

    def charge_tokens_async(self, username: str, duration_ms: float):
        """Calculate token cost asynchronously (fire-and-forget)."""
        def _background():
            try:
                if self.connect():
                    result = self._predict(
                        username,
                        duration_ms,
                        api_name="/predict_5"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Charged {username}: {data['cost']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Charge failed: {e}")

        threading.Thread(target=_background, daemon=True).start()
122
 
123
  # Initialize logger early for startup functions
124
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
719
  return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
720
  return "No active session found."
721
 
722
+ backend = BackendProcessor()
723
  import math
724
  token_manager = TokenManager()
725
 
 
846
  return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
847
 
848
  def preprocess_input(self, text: str):
849
+ """Pre-process keyboard input with backend support"""
850
+ if not text or len(text) < 5:
851
+ return
852
+
853
+ # Send to backend for async tokenization
854
+ backend.tokenize_async(text)
855
+
856
+ # Also do local preprocessing if model loaded
857
+ if not self.llm:
858
  return
859
 
860
  def _preprocess():
861
  try:
862
+ logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars locally...")
863
  tokens = self.llm.tokenize(text.encode("utf-8"))
864
  self.preprocessed_tokens = tokens
865
  logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
 
867
  logger.error(f"[PREPROCESS] Failed: {e}")
868
  self.preprocessed_tokens = None
869
 
 
870
  if self.typing_timer:
871
  self.typing_timer.cancel()
872
 
 
873
  self.typing_timer = threading.Timer(1.0, _preprocess)
874
  self.typing_timer.daemon = True
875
  self.typing_timer.start()
876
+
877
  def clear_preprocessed(self):
878
  """Clear preprocessed tokens and force GC"""
879
  if self.preprocessed_tokens:
 
1199
  time.sleep(0.5) # Brief pause for user to see the message
1200
 
1201
  # Check prompt cache for exact matches (instant response)
1202
+ cached_response = backend.get_cached_response(full_input)
1203
+ if cached_response:
1204
+ logger.info(" BACKEND CACHE HIT - Instant response!")
 
1205
  history.append({"role": "user", "content": prompt})
1206
+ history.append({"role": "assistant", "content": cached_response})
1207
  yield history
1208
  return
1209
 
 
1293
 
1294
  # Aggressive GC after generation
1295
  force_gc()
1296
+ # Cache this response in backend for future use
1297
+ backend.cache_response(full_input, response_text)
1298
+
1299
+ # Send token charge to backend (async)
1300
+ if username:
1301
+ backend.charge_tokens_async(username, elapsed * 1000)
1302
 
1303
  logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
1304