petter2025 committed on
Commit
a81efd4
·
verified ·
1 Parent(s): 220196d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -124
app.py CHANGED
@@ -1,19 +1,20 @@
1
  import os
2
  import json
3
  import random
 
4
  import datetime
5
  import numpy as np
6
  import gradio as gr
7
  import requests
8
- import faiss
9
- from fastapi import FastAPI, Body, Header, HTTPException
10
- from pydantic import BaseModel
11
  from sentence_transformers import SentenceTransformer
12
- from filelock import FileLock
13
 
14
  # === Config ===
15
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
16
- API_KEY = os.getenv("API_KEY", "").strip()
 
 
 
17
 
18
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
19
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
@@ -22,8 +23,6 @@ HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
22
  VECTOR_DIM = 384
23
  INDEX_FILE = "incident_vectors.index"
24
  TEXTS_FILE = "incident_texts.json"
25
- LOCK_FILE = "faiss_save.lock"
26
-
27
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
28
 
29
  if os.path.exists(INDEX_FILE):
@@ -34,200 +33,218 @@ else:
34
  index = faiss.IndexFlatL2(VECTOR_DIM)
35
  incident_texts = []
36
 
37
-
38
- # === Safe persistence ===
39
  def save_index():
40
- with FileLock(LOCK_FILE):
41
- faiss.write_index(index, INDEX_FILE)
42
- with open(TEXTS_FILE, "w") as f:
43
- json.dump(incident_texts, f)
44
-
45
 
46
- # === Core logic ===
47
  events = []
48
 
49
-
50
  def detect_anomaly(event):
51
  """Adaptive threshold-based anomaly detection."""
52
  latency = event["latency"]
53
  error_rate = event["error_rate"]
54
 
55
- # Occasionally flag random anomaly for testing
56
- if random.random() < 0.25:
57
- return True
58
-
59
- return latency > 150 or error_rate > 0.05
60
-
61
 
62
  def call_huggingface_analysis(prompt):
63
- """Uses HF Inference API or local fallback."""
64
  if not HF_TOKEN:
65
- return "Offline mode: simulated analysis."
 
 
 
 
 
 
 
 
66
 
67
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  payload = {
69
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
70
- "prompt": prompt,
71
- "max_tokens": 200,
72
- "temperature": 0.3,
73
  }
74
- response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
75
  if response.status_code == 200:
76
  result = response.json()
77
- return result.get("choices", [{}])[0].get("text", "").strip()
 
 
 
 
78
  else:
79
- return f"Error {response.status_code}: {response.text}"
80
  except Exception as e:
81
- return f"Error generating analysis: {e}"
82
-
83
 
84
  def simulate_healing(event):
85
  actions = [
86
  "Restarted container",
87
  "Scaled up instance",
88
  "Cleared queue backlog",
89
- "No actionable step detected.",
90
  ]
91
  return random.choice(actions)
92
 
93
-
94
  def analyze_event(component, latency, error_rate):
 
95
  event = {
96
- "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
97
  "component": component,
98
  "latency": latency,
99
- "error_rate": error_rate,
100
  }
101
 
102
  is_anomaly = detect_anomaly(event)
103
  event["anomaly"] = is_anomaly
104
  event["status"] = "Anomaly" if is_anomaly else "Normal"
105
 
 
106
  prompt = (
107
  f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
108
  f"Status: {event['status']}\n\n"
109
  "Provide a one-line reliability insight or root cause analysis."
110
  )
111
 
112
- # AI Reliability analysis
113
  analysis = call_huggingface_analysis(prompt)
114
  event["analysis"] = analysis
115
 
116
- # Simulated self-healing
117
  healing_action = simulate_healing(event)
118
  event["healing_action"] = healing_action
119
 
120
- # === Vector learning & persistence ===
121
  vector_text = f"{component} {latency} {error_rate} {analysis}"
122
  vec = model.encode([vector_text])
123
  index.add(np.array(vec, dtype=np.float32))
124
  incident_texts.append(vector_text)
125
  save_index()
126
 
127
- # Similar incident lookup
128
  if len(incident_texts) > 1:
129
  D, I = index.search(vec, k=min(3, len(incident_texts)))
130
  similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
131
  if similar:
132
- event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:100]}...)."
 
 
133
  else:
134
  event["healing_action"] += " - Not enough incidents stored yet."
135
 
136
  events.append(event)
137
- return event
138
-
139
-
140
- # === FastAPI backend ===
141
- app = FastAPI(title="Agentic Reliability Framework API")
142
-
143
 
144
- class AddEventModel(BaseModel):
145
- component: str
146
- latency: float
147
- error_rate: float
148
-
149
-
150
- def verify_api_key(provided_key: str):
151
- if not API_KEY:
152
- return True # dev mode
153
- return provided_key == API_KEY
154
-
155
-
156
- @app.post("/add-event")
157
- def add_event(
158
- payload: AddEventModel = Body(...),
159
- x_api_key: str = Header(None, alias="X-API-Key"),
160
- ):
161
- """Add a telemetry event (secured via API key)."""
162
- if not verify_api_key(x_api_key):
163
- raise HTTPException(status_code=401, detail="Unauthorized: invalid API key.")
164
-
165
- try:
166
- event = analyze_event(payload.component, payload.latency, payload.error_rate)
167
- return {"status": "ok", "event": event}
168
- except Exception as e:
169
- raise HTTPException(status_code=500, detail=f"Failed to add event: {e}")
170
-
171
-
172
- # === Gradio Dashboard ===
173
  def submit_event(component, latency, error_rate):
174
- event = analyze_event(component, latency, error_rate)
 
175
 
 
176
  table = [
177
- [
178
- e["timestamp"],
179
- e["component"],
180
- e["latency"],
181
- e["error_rate"],
182
- e["status"],
183
- e["analysis"],
184
- e["healing_action"],
185
- ]
186
- for e in events[-20:]
187
  ]
188
 
189
  return (
190
- f"✅ Event Processed ({event['status']})",
191
  gr.Dataframe(
192
- headers=[
193
- "timestamp",
194
- "component",
195
- "latency",
196
- "error_rate",
197
- "status",
198
- "analysis",
199
- "healing_action",
200
- ],
201
- value=table,
202
- ),
203
  )
204
 
205
-
206
- with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
207
- gr.Markdown(
208
- "## 🧠 Agentic Reliability Framework MVP\n"
209
- "Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory"
210
- )
 
 
211
  with gr.Row():
212
- component = gr.Textbox(label="Component", value="api-service")
213
- latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
214
- error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
215
- submit = gr.Button("🚀 Submit Telemetry Event")
216
- output_text = gr.Textbox(label="Detection Output")
217
- table_output = gr.Dataframe(
218
- headers=[
219
- "timestamp",
220
- "component",
221
- "latency",
222
- "error_rate",
223
- "status",
224
- "analysis",
225
- "healing_action",
226
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  )
228
- submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
229
-
230
 
231
  if __name__ == "__main__":
232
- demo.launch(server_name="0.0.0.0", server_port=7860)
233
-
 
 
 
 
1
  import os
2
  import json
3
  import random
4
+ import time
5
  import datetime
6
  import numpy as np
7
  import gradio as gr
8
  import requests
 
 
 
9
  from sentence_transformers import SentenceTransformer
10
+ import faiss
11
 
12
  # === Config ===
13
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
14
+ if not HF_TOKEN:
15
+ print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
16
+ else:
17
+ print("✅ Hugging Face token loaded successfully.")
18
 
19
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
20
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
23
  VECTOR_DIM = 384
24
  INDEX_FILE = "incident_vectors.index"
25
  TEXTS_FILE = "incident_texts.json"
 
 
26
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
27
 
28
  if os.path.exists(INDEX_FILE):
 
33
  index = faiss.IndexFlatL2(VECTOR_DIM)
34
  incident_texts = []
35
 
 
 
36
  def save_index():
37
+ faiss.write_index(index, INDEX_FILE)
38
+ with open(TEXTS_FILE, "w") as f:
39
+ json.dump(incident_texts, f)
 
 
40
 
41
+ # === Event Memory ===
42
  events = []
43
 
 
44
  def detect_anomaly(event):
45
  """Adaptive threshold-based anomaly detection."""
46
  latency = event["latency"]
47
  error_rate = event["error_rate"]
48
 
49
+ # Remove random forcing for production - use actual thresholds only
50
+ latency_anomaly = latency > 150
51
+ error_anomaly = error_rate > 0.05
52
+
53
+ return latency_anomaly or error_anomaly
 
54
 
55
  def call_huggingface_analysis(prompt):
56
+ """Use HF Inference API or fallback simulation."""
57
  if not HF_TOKEN:
58
+ # Enhanced fallback analysis
59
+ fallback_insights = [
60
+ "High latency detected - possible resource contention or network issues",
61
+ "Error rate increase suggests recent deployment instability",
62
+ "Latency spike correlates with increased user traffic patterns",
63
+ "Intermittent failures indicate potential dependency service degradation",
64
+ "Performance degradation detected - consider scaling compute resources"
65
+ ]
66
+ return random.choice(fallback_insights)
67
 
68
  try:
69
+ # Enhanced prompt for better analysis
70
+ enhanced_prompt = f"""
71
+ As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
72
+
73
+ {prompt}
74
+
75
+ Focus on:
76
+ - Potential infrastructure or application issues
77
+ - Correlation between metrics
78
+ - Business impact assessment
79
+ - Recommended investigation areas
80
+
81
+ Provide 1-2 sentences maximum with actionable insights.
82
+ """
83
+
84
  payload = {
85
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
86
+ "prompt": enhanced_prompt,
87
+ "max_tokens": 150,
88
+ "temperature": 0.4,
89
  }
90
+ response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=15)
91
  if response.status_code == 200:
92
  result = response.json()
93
+ analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
94
+ # Clean up any extra formatting from the response
95
+ if analysis_text and len(analysis_text) > 10:
96
+ return analysis_text.split('\n')[0] # Take first line if multiple
97
+ return analysis_text
98
  else:
99
+ return f"API Error {response.status_code}: Service temporarily unavailable"
100
  except Exception as e:
101
+ return f"Analysis service error: {str(e)}"
 
102
 
103
  def simulate_healing(event):
104
  actions = [
105
  "Restarted container",
106
  "Scaled up instance",
107
  "Cleared queue backlog",
108
+ "No actionable step detected."
109
  ]
110
  return random.choice(actions)
111
 
 
112
  def analyze_event(component, latency, error_rate):
113
+ # Ensure unique timestamps with higher precision
114
  event = {
115
+ "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3],
116
  "component": component,
117
  "latency": latency,
118
+ "error_rate": error_rate
119
  }
120
 
121
  is_anomaly = detect_anomaly(event)
122
  event["anomaly"] = is_anomaly
123
  event["status"] = "Anomaly" if is_anomaly else "Normal"
124
 
125
+ # Build enhanced textual prompt
126
  prompt = (
127
  f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
128
  f"Status: {event['status']}\n\n"
129
  "Provide a one-line reliability insight or root cause analysis."
130
  )
131
 
132
+ # Analysis
133
  analysis = call_huggingface_analysis(prompt)
134
  event["analysis"] = analysis
135
 
136
+ # Healing simulation
137
  healing_action = simulate_healing(event)
138
  event["healing_action"] = healing_action
139
 
140
+ # === Vector learning ===
141
  vector_text = f"{component} {latency} {error_rate} {analysis}"
142
  vec = model.encode([vector_text])
143
  index.add(np.array(vec, dtype=np.float32))
144
  incident_texts.append(vector_text)
145
  save_index()
146
 
147
+ # Find similar incidents
148
  if len(incident_texts) > 1:
149
  D, I = index.search(vec, k=min(3, len(incident_texts)))
150
  similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
151
  if similar:
152
+ # Extract meaningful part from similar incident
153
+ similar_preview = similar[0][:100] + "..." if len(similar[0]) > 100 else similar[0]
154
+ event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar_preview})."
155
  else:
156
  event["healing_action"] += " - Not enough incidents stored yet."
157
 
158
  events.append(event)
159
+ return json.dumps(event, indent=2)
 
 
 
 
 
160
 
161
+ # === UI ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  def submit_event(component, latency, error_rate):
163
+ result = analyze_event(component, latency, error_rate)
164
+ parsed = json.loads(result)
165
 
166
+ # Display last 15 events to keep table manageable
167
  table = [
168
+ [e["timestamp"], e["component"], e["latency"], e["error_rate"],
169
+ e["status"], e["analysis"], e["healing_action"]]
170
+ for e in events[-15:]
 
 
 
 
 
 
 
171
  ]
172
 
173
  return (
174
+ f"✅ Event Processed ({parsed['status']})",
175
  gr.Dataframe(
176
+ headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
177
+ value=table
178
+ )
 
 
 
 
 
 
 
 
179
  )
180
 
181
+ with gr.Blocks(title="🧠 Agentic Reliability Framework MVP", theme="soft") as demo:
182
+ gr.Markdown("""
183
+ # 🧠 Agentic Reliability Framework MVP
184
+ **Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory**
185
+
186
+ *Monitor your services in real-time with AI-powered reliability engineering*
187
+ """)
188
+
189
  with gr.Row():
190
+ with gr.Column(scale=1):
191
+ gr.Markdown("### 📊 Telemetry Input")
192
+ component = gr.Textbox(
193
+ label="Component",
194
+ value="api-service",
195
+ info="Name of the service being monitored"
196
+ )
197
+ latency = gr.Slider(
198
+ minimum=10,
199
+ maximum=400,
200
+ value=100,
201
+ step=1,
202
+ label="Latency (ms)",
203
+ info="Alert threshold: >150ms"
204
+ )
205
+ error_rate = gr.Slider(
206
+ minimum=0,
207
+ maximum=0.2,
208
+ value=0.02,
209
+ step=0.001,
210
+ label="Error Rate",
211
+ info="Alert threshold: >0.05"
212
+ )
213
+ submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
214
+
215
+ with gr.Column(scale=2):
216
+ gr.Markdown("### 🔍 Live Analysis")
217
+ output_text = gr.Textbox(
218
+ label="Detection Output",
219
+ placeholder="Submit an event to see analysis results...",
220
+ lines=2
221
+ )
222
+ gr.Markdown("### 📈 Recent Events")
223
+ table_output = gr.Dataframe(
224
+ headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
225
+ label="Event History",
226
+ height=400,
227
+ wrap=True
228
+ )
229
+
230
+ # Add some explanation
231
+ with gr.Accordion("ℹ️ How it works", open=False):
232
+ gr.Markdown("""
233
+ - **Anomaly Detection**: Flags events with latency >150ms or error rate >5%
234
+ - **AI Analysis**: Uses Mistral-8x7B for root cause analysis via Hugging Face
235
+ - **Vector Memory**: Stores incidents in FAISS for similarity search
236
+ - **Self-Healing**: Simulates automated recovery actions based on historical patterns
237
+ """)
238
+
239
+ submit.click(
240
+ fn=submit_event,
241
+ inputs=[component, latency, error_rate],
242
+ outputs=[output_text, table_output]
243
  )
 
 
244
 
245
  if __name__ == "__main__":
246
+ demo.launch(
247
+ server_name="0.0.0.0",
248
+ server_port=7860,
249
+ share=False
250
+ )