ScottzillaSystems commited on
Commit
15095ff
·
verified ·
1 Parent(s): 210ab91

Deploy failsafe watchdog: app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -0
app.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 🛡️ FAILSAFE WATCHDOG - Immortal Guardian Space
4
+ Monitors Agent Zero workspace, auto-heals on failure, persists state via Mem0
5
+ Chimera Protocol: ZeroGPU/FreeCPU only - no credit consumption
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import time
12
+ import hashlib
13
+ import asyncio
14
+ import threading
15
+ import requests
16
+ from datetime import datetime, timedelta
17
+ from typing import Dict, List, Optional, Any
18
+ from dataclasses import dataclass, asdict
19
+ from pathlib import Path
20
+
21
+ import gradio as gr
22
+ from huggingface_hub import HfApi
23
+
24
+ # Optional: Mem0 for persistent memory
25
+ try:
26
+ from mem0 import Memory
27
+ MEM0_AVAILABLE = True
28
+ except ImportError:
29
+ MEM0_AVAILABLE = False
30
+ print("⚠️ Mem0 not installed - using local JSON fallback")
31
+
32
+ # Configuration
33
+ WATCHDOG_CONFIG = {
34
+ "target_workspace": "ScottzillaSystems/agent-zero",
35
+ "check_interval": 30, # seconds
36
+ "failure_threshold": 3,
37
+ "mem0_enabled": True,
38
+ "self_heal_enabled": True,
39
+ "backup_watchdog_count": 2,
40
+ "hf_token": os.environ.get("HF_TOKEN"),
41
+ "mem0_api_key": os.environ.get("MEM0_API_KEY")
42
+ }
43
+
44
+ @dataclass
45
+ class HealthCheck:
46
+ timestamp: str
47
+ status: str # healthy, degraded, failed
48
+ latency_ms: float
49
+ error_patterns: List[str]
50
+ action_taken: Optional[str] = None
51
+
52
+ class Mem0Persistence:
53
+ """Cross-restart memory via Mem0 or local fallback"""
54
+
55
+ def __init__(self, api_key: Optional[str] = None):
56
+ self.api_key = api_key or os.environ.get("MEM0_API_KEY")
57
+ self.user_id = "failsafe_watchdog"
58
+
59
+ if MEM0_AVAILABLE and self.api_key:
60
+ try:
61
+ self.memory = Memory(api_key=self.api_key)
62
+ self.mode = "mem0_cloud"
63
+ print("✅ Mem0 cloud persistence active")
64
+ except Exception as e:
65
+ print(f"⚠️ Mem0 init failed: {e}, using local fallback")
66
+ self.mode = "local_json"
67
+ else:
68
+ self.mode = "local_json"
69
+ print("✅ Local JSON persistence active")
70
+
71
+ self.local_path = Path("/data/watchdog_memory.json")
72
+ self._ensure_local_storage()
73
+
74
+ def _ensure_local_storage(self):
75
+ if self.mode == "local_json":
76
+ self.local_path.parent.mkdir(parents=True, exist_ok=True)
77
+ if not self.local_path.exists():
78
+ self.local_path.write_text(json.dumps({"incidents": [], "state": {}}))
79
+
80
+ def add(self, message: str, metadata: Dict = None):
81
+ """Store memory entry"""
82
+ if self.mode == "mem0_cloud":
83
+ try:
84
+ self.memory.add(message, user_id=self.user_id, metadata=metadata or {})
85
+ return True
86
+ except Exception as e:
87
+ print(f"Mem0 add failed: {e}, falling back to local")
88
+
89
+ # Local fallback
90
+ data = json.loads(self.local_path.read_text())
91
+ data["incidents"].append({
92
+ "timestamp": datetime.utcnow().isoformat(),
93
+ "message": message,
94
+ "metadata": metadata or {}
95
+ })
96
+ self.local_path.write_text(json.dumps(data, indent=2))
97
+ return True
98
+
99
+ def search(self, query: str, limit: int = 10) -> List[Dict]:
100
+ """Search memory"""
101
+ if self.mode == "mem0_cloud":
102
+ try:
103
+ return self.memory.search(query, user_id=self.user_id, limit=limit)
104
+ except Exception as e:
105
+ print(f"Mem0 search failed: {e}")
106
+
107
+ # Local fallback - simple text search
108
+ data = json.loads(self.local_path.read_text())
109
+ results = []
110
+ for incident in data["incidents"]:
111
+ if query.lower() in incident["message"].lower():
112
+ results.append(incident)
113
+ return results[-limit:]
114
+
115
+ def get_state(self) -> Dict:
116
+ """Get current watchdog state"""
117
+ if self.mode == "local_json":
118
+ data = json.loads(self.local_path.read_text())
119
+ return data.get("state", {})
120
+ return {}
121
+
122
+ def save_state(self, state: Dict):
123
+ """Save watchdog state"""
124
+ if self.mode == "local_json":
125
+ data = json.loads(self.local_path.read_text())
126
+ data["state"] = state
127
+ self.local_path.write_text(json.dumps(data, indent=2))
128
+
129
+ class WorkspaceMonitor:
130
+ """Monitors Agent Zero workspace health"""
131
+
132
+ FAILURE_PATTERNS = {
133
+ "rate_limit": ["Key limit exceeded", "429", "quota exceeded", "rate limit"],
134
+ "fd_exhaustion": ["Too many open files", "EMFILE", "Errno 24"],
135
+ "memory_pressure": ["MemoryError", "OOM", "Killed process", "out of memory"],
136
+ "api_blackout": ["Connection refused", "Timeout", "Name resolution failed"],
137
+ "auth_failure": ["401", "403", "Unauthorized", "Invalid token"]
138
+ }
139
+
140
+ def __init__(self, target_space: str, hf_token: str):
141
+ self.target = target_space
142
+ self.hf_token = hf_token
143
+ self.api = HfApi(token=hf_token)
144
+ self.history: List[HealthCheck] = []
145
+ self.consecutive_failures = 0
146
+
147
+ async def check_health(self) -> HealthCheck:
148
+ """Perform health check on target workspace"""
149
+ start = time.time()
150
+ error_patterns = []
151
+ status = "healthy"
152
+ action = None
153
+
154
+ try:
155
+ # Check HF Space runtime status
156
+ space_info = self.api.space_info(self.target)
157
+ runtime = getattr(space_info, 'runtime', {})
158
+ stage = runtime.get('stage', 'UNKNOWN')
159
+
160
+ if stage == 'RUNNING':
161
+ status = "healthy"
162
+ elif stage == 'BUILD_ERROR':
163
+ status = "failed"
164
+ error_patterns.append("build_error")
165
+ action = "trigger_rebuild"
166
+ elif stage == 'SLEEPING':
167
+ status = "degraded"
168
+ action = "wake_space"
169
+ else:
170
+ status = "degraded"
171
+
172
+ except Exception as e:
173
+ status = "failed"
174
+ error_msg = str(e).lower()
175
+
176
+ # Pattern matching
177
+ for pattern_type, patterns in self.FAILURE_PATTERNS.items():
178
+ if any(p.lower() in error_msg for p in patterns):
179
+ error_patterns.append(pattern_type)
180
+
181
+ if not error_patterns:
182
+ error_patterns.append("unknown")
183
+
184
+ action = self._determine_action(error_patterns)
185
+
186
+ latency = (time.time() - start) * 1000
187
+
188
+ check = HealthCheck(
189
+ timestamp=datetime.utcnow().isoformat(),
190
+ status=status,
191
+ latency_ms=latency,
192
+ error_patterns=error_patterns,
193
+ action_taken=action
194
+ )
195
+
196
+ self.history.append(check)
197
+ if len(self.history) > 100:
198
+ self.history = self.history[-100:]
199
+
200
+ # Track consecutive failures
201
+ if status == "failed":
202
+ self.consecutive_failures += 1
203
+ else:
204
+ self.consecutive_failures = 0
205
+
206
+ return check
207
+
208
+ def _determine_action(self, error_patterns: List[str]) -> str:
209
+ """Determine remediation action from error patterns"""
210
+ if "fd_exhaustion" in error_patterns:
211
+ return "restart_container"
212
+ elif "rate_limit" in error_patterns:
213
+ return "switch_to_local_inference"
214
+ elif "memory_pressure" in error_patterns:
215
+ return "scale_down_agents"
216
+ elif "build_error" in error_patterns:
217
+ return "trigger_rebuild"
218
+ else:
219
+ return "restart_container"
220
+
221
+ async def execute_remediation(self, action: str) -> bool:
222
+ """Execute remediation action"""
223
+ print(f"🔧 Executing remediation: {action}")
224
+
225
+ try:
226
+ if action == "restart_container":
227
+ # Trigger space restart via empty commit
228
+ self.api.upload_file(
229
+ repo_id=self.target,
230
+ repo_type="space",
231
+ path_in_repo=".watchdog_trigger",
232
+ path_or_fileobj=f"restart_{int(time.time())}".encode(),
233
+ commit_message="[FAILSAFE] Auto-restart due to health check failure"
234
+ )
235
+ return True
236
+
237
+ elif action == "trigger_rebuild":
238
+ # Touch README to trigger rebuild
239
+ self.api.upload_file(
240
+ repo_id=self.target,
241
+ repo_type="space",
242
+ path_in_repo="README.md",
243
+ path_or_fileobj=b"# Trigger rebuild\n",
244
+ commit_message="[FAILSAFE] Trigger rebuild"
245
+ )
246
+ return True
247
+
248
+ elif action == "wake_space":
249
+ # Send request to wake sleeping space
250
+ requests.get(f"https://{self.target.replace('/', '-')}.hf.space", timeout=10)
251
+ return True
252
+
253
+ except Exception as e:
254
+ print(f"❌ Remediation failed: {e}")
255
+ return False
256
+
257
+ return False
258
+
259
+ class SelfReplication:
260
+ """Spawns backup watchdogs if current instance degrades"""
261
+
262
+ def __init__(self, hf_token: str, mem0: Mem0Persistence):
263
+ self.hf_token = hf_token
264
+ self.mem0 = mem0
265
+ self.api = HfApi(token=hf_token)
266
+ self.backup_spaces = [
267
+ "ScottzillaSystems/failsafe-watchdog-beta",
268
+ "ScottzillaSystems/failsafe-watchdog-gamma"
269
+ ]
270
+
271
+ def check_own_health(self) -> bool:
272
+ """Check if this watchdog is healthy"""
273
+ # Simple checks
274
+ if not self.hf_token:
275
+ return False
276
+
277
+ # Check if we can write to memory
278
+ try:
279
+ self.mem0.add("health_check", {"type": "self_check"})
280
+ return True
281
+ except:
282
+ return False
283
+
284
+ def spawn_backup(self) -> Optional[str]:
285
+ """Spawn backup watchdog instance"""
286
+ for backup_space in self.backup_spaces:
287
+ try:
288
+ # Check if backup exists
289
+ try:
290
+ self.api.space_info(backup_space)
291
+ print(f"✅ Backup {backup_space} already exists")
292
+ return backup_space
293
+ except:
294
+ # Create backup space
295
+ self.api.create_repo(
296
+ repo_id=backup_space,
297
+ repo_type="space",
298
+ space_sdk="gradio"
299
+ )
300
+ print(f"🚀 Spawned backup watchdog: {backup_space}")
301
+ return backup_space
302
+ except Exception as e:
303
+ print(f"⚠️ Failed to spawn {backup_space}: {e}")
304
+
305
+ return None
306
+
307
+ class FailsafeWatchdog:
308
+ """Main watchdog orchestrator"""
309
+
310
+ def __init__(self):
311
+ self.config = WATCHDOG_CONFIG
312
+ self.mem0 = Mem0Persistence(self.config["mem0_api_key"])
313
+ self.monitor = WorkspaceMonitor(
314
+ self.config["target_workspace"],
315
+ self.config["hf_token"]
316
+ )
317
+ self.replication = SelfReplication(
318
+ self.config["hf_token"],
319
+ self.mem0
320
+ )
321
+ self.running = False
322
+ self.status_log: List[Dict] = []
323
+
324
+ # Restore state from Mem0
325
+ self._restore_state()
326
+
327
+ def _restore_state(self):
328
+ """Restore previous state from persistent memory"""
329
+ state = self.mem0.get_state()
330
+ if state:
331
+ print(f"📥 Restored state: {state.get('restart_count', 0)} previous restarts")
332
+
333
+ async def run(self):
334
+ """Main watchdog loop"""
335
+ self.running = True
336
+ print("🛡️ Failsafe Watchdog started")
337
+
338
+ while self.running:
339
+ try:
340
+ # Health check
341
+ check = await self.monitor.check_health()
342
+
343
+ # Log to Mem0
344
+ if check.status in ["failed", "degraded"]:
345
+ self.mem0.add(
346
+ f"Health check {check.status}: {', '.join(check.error_patterns)}",
347
+ {"type": "health_check", "status": check.status}
348
+ )
349
+
350
+ # Execute remediation if needed
351
+ if check.action_taken and self.monitor.consecutive_failures >= self.config["failure_threshold"]:
352
+ success = await self.monitor.execute_remediation(check.action_taken)
353
+
354
+ self.mem0.add(
355
+ f"Remediation {check.action_taken}: {'success' if success else 'failed'}",
356
+ {"type": "remediation", "action": check.action_taken, "success": success}
357
+ )
358
+
359
+ if success:
360
+ self.monitor.consecutive_failures = 0
361
+
362
+ # Self-replication check
363
+ if not self.replication.check_own_health():
364
+ backup = self.replication.spawn_backup()
365
+ if backup:
366
+ self.mem0.add(
367
+ f"Self-replication triggered: spawned {backup}",
368
+ {"type": "self_replication", "backup": backup}
369
+ )
370
+
371
+ # Update status log
372
+ self.status_log.append(asdict(check))
373
+ if len(self.status_log) > 50:
374
+ self.status_log = self.status_log[-50:]
375
+
376
+ # Save state
377
+ self.mem0.save_state({
378
+ "last_check": check.timestamp,
379
+ "restart_count": len([s for s in self.status_log if s.get("action_taken")]),
380
+ "consecutive_failures": self.monitor.consecutive_failures
381
+ })
382
+
383
+ await asyncio.sleep(self.config["check_interval"])
384
+
385
+ except Exception as e:
386
+ print(f"❌ Watchdog error: {e}")
387
+ await asyncio.sleep(5)
388
+
389
+ def get_dashboard_data(self) -> Dict:
390
+ """Get data for Gradio dashboard"""
391
+ recent_checks = self.status_log[-10:]
392
+
393
+ return {
394
+ "status": "running" if self.running else "stopped",
395
+ "target": self.config["target_workspace"],
396
+ "last_check": recent_checks[-1] if recent_checks else None,
397
+ "health_history": recent_checks,
398
+ "consecutive_failures": self.monitor.consecutive_failures,
399
+ "mem0_mode": self.mem0.mode,
400
+ "incidents": self.mem0.search("failed", limit=5) if hasattr(self, 'mem0') else []
401
+ }
402
+
403
+ # Gradio UI
404
+ watchdog = FailsafeWatchdog()
405
+
406
+ def start_watchdog():
407
+ if not watchdog.running:
408
+ asyncio.create_task(watchdog.run())
409
+ return "🛡️ Watchdog started"
410
+
411
+ def stop_watchdog():
412
+ watchdog.running = False
413
+ return "⏹️ Watchdog stopped"
414
+
415
+ def get_status():
416
+ return json.dumps(watchdog.get_dashboard_data(), indent=2, default=str)
417
+
418
+ def view_incidents():
419
+ incidents = watchdog.mem0.search("incident", limit=10)
420
+ return json.dumps(incidents, indent=2, default=str)
421
+
422
+ with gr.Blocks(title="Failsafe Watchdog") as demo:
423
+ gr.Markdown("# 🛡️ Failsafe Watchdog - Immortal Guardian")
424
+ gr.Markdown("Monitors Agent Zero workspace, auto-heals on failure, persists via Mem0")
425
+
426
+ with gr.Row():
427
+ start_btn = gr.Button("▶️ Start Watchdog", variant="primary")
428
+ stop_btn = gr.Button("⏹️ Stop Watchdog", variant="secondary")
429
+
430
+ with gr.Row():
431
+ status_btn = gr.Button("📊 Get Status")
432
+ incidents_btn = gr.Button("📋 View Incidents")
433
+
434
+ output = gr.Textbox(label="Output", lines=20)
435
+
436
+ start_btn.click(start_watchdog, outputs=output)
437
+ stop_btn.click(stop_watchdog, outputs=output)
438
+ status_btn.click(get_status, outputs=output)
439
+ incidents_btn.click(view_incidents, outputs=output)
440
+
441
+ # Auto-refresh status
442
+ demo.load(get_status, outputs=output, every=30)
443
+
444
+ if __name__ == "__main__":
445
+ demo.launch(server_name="0.0.0.0", server_port=7860)