petter2025 commited on
Commit
9fa5ff3
·
verified ·
1 Parent(s): 499356a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -126
app.py CHANGED
@@ -1,152 +1,160 @@
1
  import os
2
  import random
3
- import datetime
 
4
  import numpy as np
5
  import gradio as gr
6
- import requests
7
  from sentence_transformers import SentenceTransformer
8
  import faiss
9
-
10
- # === Hugging Face Token (auto pulled from secrets) ===
11
- HF_API_TOKEN = os.getenv("HF_API_TOKEN")
12
-
13
- # === In-memory store for events ===
14
- recent_events = []
15
-
16
- # === Vector-based post-incident memory ===
17
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
18
- dimension = 384 # embedding size
19
- index = faiss.IndexFlatL2(dimension)
20
- incident_texts = [] # metadata for recall
21
-
22
- # === Helper: store + recall similar anomalies ===
23
- def store_incident_vector(event, analysis):
24
- """Embed and store context of each anomaly."""
25
- context = f"Component: {event['component']} | Latency: {event['latency']} | ErrorRate: {event['error_rate']} | Analysis: {analysis}"
26
- embedding = embedding_model.encode(context)
27
- index.add(np.array([embedding]).astype('float32'))
28
- incident_texts.append(context)
29
-
30
- def find_similar_incidents(event):
31
- """Return top-3 similar incidents (if exist)."""
32
- if index.ntotal == 0:
33
- return []
34
- query = f"Component: {event['component']} | Latency: {event['latency']} | ErrorRate: {event['error_rate']}"
35
- q_embed = embedding_model.encode(query)
36
- D, I = index.search(np.array([q_embed]).astype('float32'), 3)
37
- results = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
38
- return results
39
-
40
- # === Hugging Face Inference API (for text analysis simulation) ===
41
- def analyze_event_with_hf(event):
 
 
 
 
 
 
 
 
 
42
  try:
43
- headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
44
- payload = {
45
- "inputs": f"Analyze system reliability for component {event['component']} with latency {event['latency']} and error rate {event['error_rate']}."
46
- }
47
  response = requests.post(
48
- "https://api-inference.huggingface.co/models/distilbert-base-uncased",
49
  headers=headers,
50
- json=payload,
51
- timeout=10
52
  )
53
  if response.status_code == 200:
54
- return response.json()
 
 
 
55
  else:
56
- return f"Error generating analysis: {response.text}"
57
  except Exception as e:
58
- return f"Error generating analysis: {str(e)}"
59
-
60
- # === Forced anomaly toggle logic ===
61
- run_counter = 0
62
- def force_anomaly():
63
- global run_counter
64
- run_counter += 1
65
- # Every 3rd run will be forced to trigger an anomaly
66
- return run_counter % 3 == 0
67
-
68
- # === Generate Telemetry Event ===
69
- def simulate_event():
70
- components = ["api-service", "data-ingestor", "model-runner", "queue-worker"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  event = {
72
- "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
73
- "component": random.choice(components),
74
- "latency": round(random.uniform(50, 350), 2),
75
- "error_rate": round(random.uniform(0.01, 0.2), 3),
76
- }
77
- return event
78
-
79
- # === Main processing logic ===
80
- def process_event():
81
- event = simulate_event()
82
-
83
- # === Adaptive thresholding + forced anomaly ===
84
- is_forced = force_anomaly()
85
- if is_forced or event["latency"] > 150 or event["error_rate"] > 0.05:
86
- status = "Anomaly"
87
- analysis = analyze_event_with_hf(event)
88
- store_incident_vector(event, str(analysis))
89
-
90
- # AI-driven "self-healing" simulation
91
- healing_action = "Restarted container" if random.random() < 0.3 else "No actionable step detected."
92
-
93
- # Check similarity with past incidents
94
- similar = find_similar_incidents(event)
95
- if similar:
96
- healing_action += f" Found {len(similar)} similar incidents (e.g., {similar[0][:80]}...)."
97
-
98
- else:
99
- status = "Normal"
100
- analysis = "-"
101
- healing_action = "-"
102
-
103
- event_record = {
104
- "timestamp": event["timestamp"],
105
- "component": event["component"],
106
- "latency": event["latency"],
107
- "error_rate": event["error_rate"],
108
- "analysis": analysis,
109
- "status": status,
110
- "healing_action": healing_action
111
  }
112
 
113
- recent_events.append(event_record)
114
- if len(recent_events) > 20:
115
- recent_events.pop(0)
116
 
117
- return (
118
- f"✅ Event Processed ({status})",
119
- gr.update(value=create_table(recent_events))
120
- )
121
 
122
- # === Display helper for Gradio ===
123
- def create_table(events):
124
- if not events:
125
- return "No events yet."
126
- headers = list(events[0].keys())
127
- table = "<table><tr>" + "".join(f"<th>{h}</th>" for h in headers) + "</tr>"
128
- for e in events:
129
- table += "<tr>" + "".join(f"<td>{e[h]}</td>" for h in headers) + "</tr>"
130
- table += "</table>"
131
- return table
132
-
133
- # === Gradio UI ===
134
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
135
- gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
136
- gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory")
137
 
138
  with gr.Row():
139
- submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary")
 
 
 
 
 
 
140
 
141
- detection_output = gr.Textbox(label="Detection Output", interactive=False)
142
- recent_table = gr.HTML(label="Recent Events (Last 20)", value="No events yet.")
143
 
144
- submit_btn.click(fn=process_event, outputs=[detection_output, recent_table])
 
145
 
146
- gr.Markdown("---")
147
- gr.Markdown("### Recent Events (Last 20)")
148
- gr.Column([recent_table])
 
 
149
 
150
- # === Launch app ===
 
 
151
  if __name__ == "__main__":
152
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
# ---- Imports (stdlib first, then third-party) ----
import os
import random
import time
import pandas as pd
import numpy as np
import gradio as gr
import torch  # NOTE(review): not referenced in the visible code — presumably required by sentence_transformers' backend; confirm before removing
from sentence_transformers import SentenceTransformer
import faiss
import requests
from dotenv import load_dotenv

# ========================
# Initialization
# ========================
# Load HF_API_TOKEN (and any other settings) from a local .env file, if present.
load_dotenv()

# Token may legitimately be empty — analyze_with_hf_api() then runs offline.
HF_API_TOKEN = (os.getenv("HF_API_TOKEN") or "").strip()
HF_INFERENCE_ENDPOINT = "https://api-inference.huggingface.co/models/distilbert-base-uncased"

# fallback in case the token isn't available: warn once at startup so the
# operator knows why inference calls will be skipped
if not HF_API_TOKEN:
    print("⚠️ Warning: No HF_API_TOKEN found using read-only mode (no inference calls).")

# Vector memory setup: one shared sentence-transformer encoder plus a flat
# (exact, L2-distance) FAISS index; 384 is all-MiniLM-L6-v2's output size.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)
incident_memory = []  # stores {vector, metadata} dicts, parallel to the FAISS index
30
+
31
+ # Helper: create embeddings
32
def embed_text(text):
    """Encode *text* with the shared embedder; returns a (1, embedding_dim) numpy array."""
    return embedder.encode([text], convert_to_numpy=True)
35
+
36
+ # ========================
37
+ # Core Functions
38
+ # ========================
39
def detect_anomaly(event):
    """Classify *event* as "Anomaly" or "Normal".

    An event is anomalous when latency exceeds 150, the error rate exceeds
    0.05, or a random 25% forcing fires (kept so anomalies show up in demos).
    """
    forced = random.random() < 0.25  # demo forcing: ~25% of events flagged regardless
    threshold_breached = event["latency"] > 150 or event["error_rate"] > 0.05
    return "Anomaly" if (forced or threshold_breached) else "Normal"
46
+
47
def analyze_with_hf_api(text):
    """Call the Hugging Face Inference API on *text* and return a short string.

    Returns an offline notice when no token is configured, the top
    classification label on success, or a readable error string on any
    failure. Never raises — callers store the result directly in the
    event record.
    """
    if not HF_API_TOKEN:
        return "⚠️ No API token — running offline simulation."
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    try:
        response = requests.post(
            HF_INFERENCE_ENDPOINT,
            headers=headers,
            json={"inputs": text},
            timeout=5
        )
        if response.status_code == 200:
            result = response.json()
            # HF classification pipelines commonly return [{"label": ...}]
            # or the nested [[{"label": ...}, ...]] form. Unwrap both
            # defensively: the previous `result[0].get(...)` raised on an
            # empty list or nested list, and that exception was swallowed
            # by the broad handler below, mislabeling a successful call
            # as "Error generating analysis".
            if isinstance(result, list) and result:
                first = result[0]
                if isinstance(first, list) and first:
                    first = first[0]
                if isinstance(first, dict):
                    return first.get("label", "No label")
            return str(result)
        else:
            return f"Error {response.status_code}: {response.text}"
    except Exception as e:
        # Network failures, timeouts, malformed JSON — degrade to a message.
        return f"Error generating analysis: {e}"
68
+
69
def simulate_healing(event):
    """Return a simulated remediation step for anomalous events, "-" otherwise."""
    if event["status"] != "Anomaly":
        return "-"
    # Canned remediation playbook — one step is picked at random per anomaly.
    remediation_steps = (
        "Restarted container",
        "Scaled up pods",
        "Cleared queue backlog",
        "Purged cache and retried",
    )
    return random.choice(remediation_steps)
80
+
81
def add_to_vector_memory(event):
    """Embed the event's context and persist it for post-incident learning.

    Appends to both the FAISS index and the parallel `incident_memory`
    list; returns the number of incidents stored so far.
    """
    context = f"Component: {event['component']} | Latency: {event['latency']} | ErrorRate: {event['error_rate']} | Analysis: {event['analysis']}"
    embedding = embed_text(context)
    index.add(embedding)
    incident_memory.append({"vector": embedding, "metadata": context})
    return len(incident_memory)
91
+
92
def find_similar_events(event, top_k=3):
    """Find up to *top_k* semantically similar past incidents.

    Returns a human-readable summary string (the caller appends it to the
    healing-action text, so this is deliberately a string, not a list).
    """
    if len(incident_memory) < 3:
        return "Not enough incidents stored yet."
    text = f"Component: {event['component']} | Latency: {event['latency']} | ErrorRate: {event['error_rate']} | Analysis: {event['analysis']}"
    query_vec = embed_text(text)
    _, indices = index.search(query_vec, top_k)
    # FAISS pads short result sets with index -1; the old `i < len(...)`
    # guard let -1 through, so incident_memory[-1] (the newest incident)
    # was silently reported as a match. Require 0 <= i to drop padding.
    results = [incident_memory[i]["metadata"] for i in indices[0] if 0 <= i < len(incident_memory)]
    return f"Found {len(results)} similar incidents (e.g., {results[0][:100]}...)." if results else "No matches found."
101
+
102
+ # ========================
103
+ # Event Handling
104
+ # ========================
105
# Rolling buffer of the most recent telemetry events (capped at 20 entries).
events = []

def process_event(component, latency, error_rate):
    """Run one telemetry event through detection, analysis, healing and memory.

    Returns a status message and a pandas DataFrame of the recent-event buffer
    (the two Gradio outputs wired to the submit button).
    """
    record = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": float(latency),
        "error_rate": float(error_rate),
    }

    # Detection → API analysis → simulated remediation, in that order:
    # each step reads fields written by the previous one.
    record["status"] = detect_anomaly(record)
    record["analysis"] = analyze_with_hf_api(f"{component} latency={latency}, error={error_rate}")
    record["healing_action"] = simulate_healing(record)

    # Persist to vector memory, then annotate with similar past incidents.
    # NOTE(review): the event is stored *before* the similarity lookup, so
    # it can match itself — confirm this is the intended demo behavior.
    add_to_vector_memory(record)
    record["healing_action"] = record["healing_action"] + " " + find_similar_events(record)

    events.append(record)
    if len(events) > 20:
        del events[0]  # drop the oldest entry to keep the buffer bounded

    return " Event Processed", pd.DataFrame(events)
129
+
130
+ # ========================
131
+ # Gradio UI
132
+ # ========================
133
# ========================
# Gradio UI
# ========================
with gr.Blocks(title="Agentic Reliability Framework MVP") as demo:
    # Header: title plus a one-line description of the pipeline.
    gr.Markdown(
        "## 🧠 Agentic Reliability Framework MVP\n"
        "Adaptive anomaly detection + AI-driven self-healing + vector memory"
    )

    # Telemetry inputs — latency/error defaults are randomized at startup
    # so each launch begins with plausible sample values.
    with gr.Row():
        component_input = gr.Dropdown(
            ["api-service", "data-ingestor", "queue-worker", "model-runner"],
            label="Component",
            value="api-service"
        )
        latency_input = gr.Number(label="Latency (ms)", value=random.uniform(50, 200))
        error_input = gr.Number(label="Error Rate", value=random.uniform(0.01, 0.15))

    submit_btn = gr.Button("🚀 Submit Telemetry Event")

    # Outputs: one-line status and the rolling event table.
    output_text = gr.Textbox(label="Detection Output")
    output_table = gr.Dataframe(
        headers=["timestamp", "component", "latency", "error_rate", "analysis", "status", "healing_action"],
        label="Recent Events (Last 20)"
    )

    submit_btn.click(
        fn=process_event,
        inputs=[component_input, latency_input, error_input],
        outputs=[output_text, output_table]
    )
155
 
156
# ========================
# Launch
# ========================
if __name__ == "__main__":
    # Bind on all interfaces at port 7860 (the conventional HF Spaces port).
    demo.launch(server_name="0.0.0.0", server_port=7860)