petter2025 committed on
Commit
ba59239
·
verified ·
1 Parent(s): 82009c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -89
app.py CHANGED
@@ -1,117 +1,197 @@
1
- import gradio as gr
2
- import time
3
  import random
4
- import requests
 
5
  import numpy as np
6
  import pandas as pd
7
- import faiss
 
 
8
  from sentence_transformers import SentenceTransformer
 
9
 
10
- # --- CONFIG ---
11
- HF_TOKEN = (open(".env", "r").read().strip() if ".env" else None) or ""
12
- HF_TOKEN = HF_TOKEN.strip() if HF_TOKEN else ""
13
- if not HF_TOKEN:
14
- print("⚠️ No Hugging Face token found. Running in fallback mode (local inference).")
15
-
16
- API_URL = "https://router.huggingface.co/hf-inference"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
19
- incident_embeddings = []
20
- incident_texts = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- recent_events = []
23
 
24
- # --- SIMULATION HELPERS ---
25
- def simulate_healing_action(component: str) -> str:
26
- actions = [
 
 
 
27
  "Restarted container",
 
28
  "Cleared queue backlog",
29
- "Rebalanced load",
30
- "No actionable step detected.",
31
- ]
32
- return random.choice(actions)
33
-
34
- def detect_anomaly(latency: float, error_rate: float) -> bool:
35
- # Simple adaptive anomaly threshold
36
- score = latency * error_rate
37
- threshold = random.uniform(5, 25)
38
- return score > threshold
39
-
40
- def embed_incident(text: str):
41
- emb = model.encode([text], normalize_embeddings=True)
42
- return np.array(emb).astype("float32")
43
-
44
- def find_similar_incidents(new_text: str, top_k=3):
45
- if not incident_embeddings:
46
- return "Not enough incidents stored yet."
47
- new_emb = embed_incident(new_text)
48
- index = faiss.IndexFlatIP(len(new_emb[0]))
49
- index.add(np.vstack(incident_embeddings))
50
- scores, ids = index.search(new_emb, top_k)
51
- similar = [
52
- f"Component: {incident_texts[i]['component']} | Latency: {incident_texts[i]['latency']} | ErrorRate: {incident_texts[i]['error_rate']} | Analysis: {incident_texts[i]['analysis'][:60]}..."
53
- for i in ids[0] if i < len(incident_texts)
54
  ]
55
- return f"Found {len(similar)} similar incidents ({'; '.join(similar)})."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # --- MAIN PROCESS ---
58
  def process_event(component, latency, error_rate):
59
- timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
60
- anomaly = detect_anomaly(latency, error_rate)
61
-
62
- # --- Analysis step ---
63
- payload = {
64
- "inputs": f"Component {component} showing latency {latency} and error rate {error_rate}.",
65
- }
66
-
67
- try:
68
- headers = {"Authorization": f"Bearer {HF_TOKEN.strip()}"}
69
- response = requests.post(f"{API_URL}/facebook/bart-large-mnli", headers=headers, json=payload)
70
- if response.status_code == 200:
71
- analysis = response.json().get("generated_text", "No analysis output.")
72
- else:
73
- analysis = f"Error {response.status_code}: {response.text}"
74
- except Exception as e:
75
- analysis = f"Error generating analysis: {str(e)}"
76
-
77
- status = "Anomaly" if anomaly else "Normal"
78
- healing_action = simulate_healing_action(component) if anomaly else "-"
79
- similar_info = find_similar_incidents(analysis)
80
-
81
  event = {
82
- "timestamp": timestamp,
83
  "component": component,
84
  "latency": latency,
85
  "error_rate": error_rate,
86
- "status": status,
87
- "analysis": analysis,
88
- "healing_action": f"{healing_action} {similar_info}" if anomaly else f"- {similar_info}",
89
  }
 
 
 
 
 
90
 
91
- recent_events.append(event)
92
- if len(recent_events) > 20:
93
- recent_events.pop(0)
 
94
 
95
- # --- Store vector memory ---
96
- incident_embeddings.append(embed_incident(analysis))
97
- incident_texts.append(event)
98
 
99
- return f"✅ Event Processed", pd.DataFrame(recent_events)
100
 
101
- # --- GRADIO UI ---
 
 
102
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
103
  gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
104
- gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory")
 
 
105
 
106
- with gr.Row():
107
- component = gr.Textbox(label="Component", value="api-service")
108
- latency = gr.Number(label="Latency (ms)", value=random.uniform(50, 200))
109
- error_rate = gr.Number(label="Error Rate", value=random.uniform(0.01, 0.2))
110
 
111
- submit = gr.Button("🚀 Submit Telemetry Event")
112
- output_text = gr.Textbox(label="Detection Output")
113
- output_table = gr.Dataframe(headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"], label="Recent Events (Last 20)")
114
 
115
- submit.click(process_event, inputs=[component, latency, error_rate], outputs=[output_text, output_table])
116
 
117
- demo.launch()
 
 
 
 
 
1
+ import os
 
2
  import random
3
+ import json
4
+ import time
5
  import numpy as np
6
  import pandas as pd
7
+ import requests
8
+ from datetime import datetime
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
  from sentence_transformers import SentenceTransformer
11
+ import gradio as gr
12
 
13
# ============================
# SAFE TOKEN LOAD
# ============================
# Token resolution order: HF_TOKEN environment variable first, then a
# local .env file.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
if not HF_TOKEN and os.path.exists(".env"):
    try:
        with open(".env", "r") as f:
            content = f.read().strip()
        # Fix: the raw file contents were previously used as the token,
        # which breaks for the conventional `HF_TOKEN=...` .env format.
        # Support both: prefer a `HF_TOKEN=` line, fall back to raw content.
        for line in content.splitlines():
            if line.startswith("HF_TOKEN="):
                HF_TOKEN = line.split("=", 1)[1].strip().strip('"').strip("'")
                break
        else:
            HF_TOKEN = content
    except Exception:
        # Best-effort: an unreadable .env simply means fallback mode.
        HF_TOKEN = ""

if HF_TOKEN:
    print("✅ Hugging Face token loaded successfully.")
else:
    print("⚠️ No Hugging Face token found. Running in fallback/local mode.")

# ============================
# GLOBAL CONFIG
# ============================
HF_API_URL = "https://router.huggingface.co/hf-inference"
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
34
+
35
# Load a lightweight sentence transformer for embedding incidents
# (all-MiniLM-L6-v2 is small and fast; weights download on first run).
model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector memory store (in-memory for now).
# Entries are {"embedding": ndarray, "description": str} dicts appended by
# record_and_search_similar(); nothing is persisted across restarts.
incident_memory = []
40
+
41
# ============================
# ANOMALY DETECTION
# ============================
def detect_anomaly(event):
    """Return True when *event* should be treated as anomalous.

    An event is anomalous when latency exceeds 150 ms or the error rate
    exceeds 5%. Additionally ~25% of events are randomly forced anomalous
    so the downstream healing path stays exercised during demos.

    Args:
        event: dict with numeric "latency" and "error_rate" keys.
    """
    force_anomaly = random.random() < 0.25  # ~25% forced anomaly rate (demo aid)
    # Single boolean expression replaces the `return True ... return False` pattern.
    return force_anomaly or event["latency"] > 150 or event["error_rate"] > 0.05
53
+
54
+
55
# ============================
# AI ANALYSIS + HEALING
# ============================
def analyze_event(event):
    """Send *event* to the HF Inference API for analysis.

    Returns:
        (analysis_text, healing_action) tuple. Falls back to a local
        message when no token is configured or the request fails; never
        raises, so event processing always continues.
    """
    prompt = (
        f"Analyze this telemetry event and suggest a healing action:\n"
        f"Component: {event['component']}\n"
        f"Latency: {event['latency']}\n"
        f"Error Rate: {event['error_rate']}\n"
        f"Detected Anomaly: {event['anomaly']}\n"
    )

    if not HF_TOKEN:
        return "Local mode: analysis unavailable (no token).", "No action taken."

    try:
        response = requests.post(
            f"{HF_API_URL}/mistralai/Mixtral-8x7B-Instruct-v0.1",
            headers=headers,
            json={"inputs": prompt},
            timeout=10,
        )
        if response.status_code == 200:
            result = response.json()
            # Fix: guard against an empty list — the previous
            # `"generated_text" in result[0]` raised IndexError on a 200
            # response with no generations (masked as a generic error string).
            if isinstance(result, list) and result and "generated_text" in result[0]:
                text = result[0]["generated_text"]
            else:
                text = str(result)
            return text, choose_healing_action(event, text)
        else:
            return f"Error {response.status_code}: {response.text}", "No actionable step detected."
    except Exception as e:
        # Best-effort by design: analysis failures must not break the pipeline.
        return f"Error generating analysis: {e}", "No actionable step detected."
92
 
 
93
 
94
# ============================
# HEALING SIMULATION
# ============================
def choose_healing_action(event, analysis_text):
    """Simulates an automated healing response."""
    # Keyword routing: return the action the analysis text hints at.
    # Order matters — earlier keywords win, matching the original precedence.
    lowered = analysis_text.lower()
    keyword_actions = (
        ("restart", "Restarted container"),
        ("scale", "Scaled service replicas"),
        ("cache", "Invalidated cache"),
    )
    for keyword, action in keyword_actions:
        if keyword in lowered:
            return action

    # No recognizable hint — pick a random simulated action.
    possible_actions = [
        "Restarted container",
        "Scaled service replicas",
        "Cleared queue backlog",
        "Invalidated cache",
        "Re-deployed model endpoint",
    ]
    return random.choice(possible_actions)
113
+
114
+
115
# ============================
# VECTOR SIMILARITY ENGINE
# ============================
def record_and_search_similar(event, analysis_text):
    """
    Store each event as a vector and retrieve similar past incidents.
    """
    # Flat text rendering of the incident — this is what gets embedded.
    fields = [
        f"Component: {event['component']}",
        f"Latency: {event['latency']}",
        f"ErrorRate: {event['error_rate']}",
        f"Analysis: {analysis_text}",
    ]
    description = " | ".join(fields)
    embedding = model.encode(description)

    similar_info = ""
    if incident_memory:
        stored = np.array([entry["embedding"] for entry in incident_memory])
        scores = cosine_similarity([embedding], stored)[0]
        # Take the three highest-scoring past incidents, keep those above 0.7.
        top = scores.argsort()[-3:][::-1]
        matches = [
            incident_memory[idx]["description"]
            for idx in top
            if scores[idx] > 0.7
        ]
        if matches:
            similar_info = f"Found {len(matches)} similar incidents (e.g., {matches[0][:150]}...)."

    # Record the new incident AFTER searching so it never matches itself.
    incident_memory.append({"embedding": embedding, "description": description})
    return similar_info
145
+
146
+
147
# ============================
# EVENT HANDLER
# ============================
# Rolling log of processed events; bounded to the last 20 entries below.
event_log = []

def process_event(component, latency, error_rate):
    """Handle one telemetry submission end-to-end.

    Detects anomalies, runs AI analysis, chooses a healing action,
    records the incident in vector memory, and returns a
    (status message, DataFrame of recent events) pair for the Gradio UI.
    """
    event = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }
    event["anomaly"] = detect_anomaly(event)
    status = "Anomaly" if event["anomaly"] else "Normal"
    analysis, healing = analyze_event(event)
    similar = record_and_search_similar(event, analysis)
    healing = f"{healing} {similar}".strip()

    event["status"] = status
    event["analysis"] = analysis
    event["healing_action"] = healing
    event_log.append(event)
    # Fix: bound the log — it previously grew without limit in a
    # long-running app while only the last 20 rows were ever displayed.
    del event_log[:-20]

    df = pd.DataFrame(event_log)
    return f"✅ Event Processed ({status})", df
 
172
 
 
173
 
174
# ============================
# GRADIO UI
# ============================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
    gr.Markdown(
        "Adaptive anomaly detection + AI-driven self-healing + vector memory"
    )

    # Telemetry inputs for a single simulated event.
    component_in = gr.Textbox(label="Component", value="api-service")
    latency_in = gr.Slider(10, 400, value=100, label="Latency (ms)")
    error_rate_in = gr.Slider(0.0, 0.2, value=0.02, label="Error Rate")

    # Action + result widgets.
    submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
    output = gr.Textbox(label="Detection Output")
    table = gr.Dataframe(label="Recent Events (Last 20)")

    submit.click(
        fn=process_event,
        inputs=[component_in, latency_in, error_rate_in],
        outputs=[output, table],
    )

# ============================
# ENTRY POINT
# ============================
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)