petter2025 commited on
Commit
e94f0ea
·
verified ·
1 Parent(s): 0b2d10e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -168
app.py CHANGED
@@ -1,205 +1,161 @@
1
  import os
2
- import random
3
  import json
 
4
  import time
 
5
  import numpy as np
6
- import pandas as pd
7
  import requests
8
- from datetime import datetime
9
- from sklearn.metrics.pairwise import cosine_similarity
10
  from sentence_transformers import SentenceTransformer
11
  import faiss
12
- import gradio as gr
13
 
14
- # ============================
15
- # SAFE TOKEN LOAD
16
- # ============================
17
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
18
- if not HF_TOKEN and os.path.exists(".env"):
19
- try:
20
- with open(".env", "r") as f:
21
- HF_TOKEN = f.read().strip()
22
- except Exception:
23
- HF_TOKEN = ""
24
-
25
- if HF_TOKEN:
26
- print("✅ Hugging Face token loaded successfully.")
27
- else:
28
  print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
 
 
29
 
30
- # ============================
31
- # CONFIG
32
- # ============================
33
- HF_API_URL = "https://router.huggingface.co/hf-inference"
34
- headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
35
- DATA_DIR = "./data"
36
- os.makedirs(DATA_DIR, exist_ok=True)
37
-
38
- # ============================
39
- # MODEL + FAISS SETUP
40
- # ============================
41
- model = SentenceTransformer("all-MiniLM-L6-v2")
42
- VECTOR_DIM = model.get_sentence_embedding_dimension()
43
- FAISS_PATH = os.path.join(DATA_DIR, "incident_memory.faiss")
44
- META_PATH = os.path.join(DATA_DIR, "incidents.json")
45
-
46
- # Load or initialize FAISS index
47
- if os.path.exists(FAISS_PATH):
48
- try:
49
- index = faiss.read_index(FAISS_PATH)
50
- with open(META_PATH, "r") as f:
51
- incident_memory = json.load(f)
52
- print(f"✅ Loaded {len(incident_memory)} past incidents from FAISS.")
53
- except Exception:
54
- print("⚠️ Failed to load FAISS index. Starting fresh.")
55
- index = faiss.IndexFlatL2(VECTOR_DIM)
56
- incident_memory = []
57
  else:
58
  index = faiss.IndexFlatL2(VECTOR_DIM)
59
- incident_memory = []
 
 
 
 
 
 
 
 
60
 
61
- # ============================
62
- # ANOMALY DETECTION
63
- # ============================
64
  def detect_anomaly(event):
65
- """Detects anomalies based on latency/error_rate thresholds, with forced random noise."""
66
- force_anomaly = random.random() < 0.25
67
- if force_anomaly or event["latency"] > 150 or event["error_rate"] > 0.05:
 
 
 
68
  return True
69
- return False
70
 
71
- # ============================
72
- # AI ANALYSIS + HEALING
73
- # ============================
74
- def analyze_event(event):
75
- prompt = (
76
- f"Analyze this telemetry event and suggest a healing action:\n"
77
- f"Component: {event['component']}\n"
78
- f"Latency: {event['latency']}\n"
79
- f"Error Rate: {event['error_rate']}\n"
80
- f"Detected Anomaly: {event['anomaly']}\n"
81
- )
82
 
 
 
83
  if not HF_TOKEN:
84
- return "Local mode: analysis unavailable (no token).", "No action taken."
85
 
86
  try:
87
- response = requests.post(
88
- f"{HF_API_URL}/mistralai/Mixtral-8x7B-Instruct-v0.1",
89
- headers=headers,
90
- json={"inputs": prompt},
91
- timeout=10,
92
- )
 
93
  if response.status_code == 200:
94
  result = response.json()
95
- text = (
96
- result[0]["generated_text"]
97
- if isinstance(result, list) and "generated_text" in result[0]
98
- else str(result)
99
- )
100
- return text, choose_healing_action(event, text)
101
  else:
102
- return f"Error {response.status_code}: {response.text}", "No actionable step detected."
103
  except Exception as e:
104
- return f"Error generating analysis: {e}", "No actionable step detected."
105
 
106
- # ============================
107
- # HEALING SIMULATION
108
- # ============================
109
- def choose_healing_action(event, analysis_text):
110
- possible_actions = [
111
  "Restarted container",
112
- "Scaled service replicas",
113
  "Cleared queue backlog",
114
- "Invalidated cache",
115
- "Re-deployed model endpoint",
116
  ]
117
- if "restart" in analysis_text.lower():
118
- return "Restarted container"
119
- elif "scale" in analysis_text.lower():
120
- return "Scaled service replicas"
121
- elif "cache" in analysis_text.lower():
122
- return "Invalidated cache"
123
- return random.choice(possible_actions)
124
-
125
- # ============================
126
- # VECTOR SIMILARITY + FAISS PERSISTENCE
127
- # ============================
128
- def record_and_search_similar(event, analysis_text):
129
- """Store each event vector in FAISS and search for similar incidents."""
130
- description = (
131
- f"Component: {event['component']} | "
132
- f"Latency: {event['latency']} | "
133
- f"ErrorRate: {event['error_rate']} | "
134
- f"Analysis: {analysis_text}"
135
- )
136
- embedding = model.encode(description).astype("float32").reshape(1, -1)
137
-
138
- similar_info = ""
139
- if len(incident_memory) > 0 and index.ntotal > 0:
140
- k = min(3, len(incident_memory))
141
- D, I = index.search(embedding, k)
142
- similar = [incident_memory[i]["description"] for i in I[0] if D[0][0] < 0.5]
143
- if similar:
144
- similar_info = f"Found {len(similar)} similar incidents (e.g., {similar[0][:120]}...)."
145
-
146
- # Store new entry
147
- incident_memory.append({"description": description})
148
- index.add(embedding)
149
 
150
- # Persist FAISS + metadata
151
- faiss.write_index(index, FAISS_PATH)
152
- with open(META_PATH, "w") as f:
153
- json.dump(incident_memory, f)
154
-
155
- return similar_info
156
-
157
- # ============================
158
- # EVENT HANDLER
159
- # ============================
160
- event_log = []
161
-
162
- def process_event(component, latency, error_rate):
163
  event = {
164
- "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
165
  "component": component,
166
  "latency": latency,
167
- "error_rate": error_rate,
168
  }
169
 
170
- event["anomaly"] = detect_anomaly(event)
171
- status = "Anomaly" if event["anomaly"] else "Normal"
172
- analysis, healing = analyze_event(event)
173
- similar = record_and_search_similar(event, analysis)
174
- healing = f"{healing} {similar}".strip()
 
 
 
 
 
175
 
176
- event["status"] = status
 
177
  event["analysis"] = analysis
178
- event["healing_action"] = healing
179
- event_log.append(event)
180
-
181
- df = pd.DataFrame(event_log[-20:])
182
- return f"✅ Event Processed ({status})", df
183
-
184
- # ============================
185
- # GRADIO UI
186
- # ============================
187
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
188
- gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
189
- gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory (FAISS persistent)")
190
-
191
- component = gr.Textbox(label="Component", value="api-service")
192
- latency = gr.Slider(10, 400, value=100, label="Latency (ms)")
193
- error_rate = gr.Slider(0.0, 0.2, value=0.02, label="Error Rate")
194
-
195
- submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
196
- output = gr.Textbox(label="Detection Output")
197
- table = gr.Dataframe(label="Recent Events (Last 20)")
198
-
199
- submit.click(process_event, [component, latency, error_rate], [output, table])
200
-
201
- # ============================
202
- # ENTRY POINT
203
- # ============================
204
- if __name__ == "__main__":
205
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import json
3
+ import random
4
  import time
5
+ import datetime
6
  import numpy as np
7
+ import gradio as gr
8
  import requests
 
 
9
  from sentence_transformers import SentenceTransformer
10
  import faiss
 
11
 
12
# === Config ===
# Hugging Face API token; when absent the app degrades to offline/simulated analysis.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
if not HF_TOKEN:
    print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
else:
    print("✅ Hugging Face token loaded successfully.")

# NOTE(review): this assumes the HF router exposes an OpenAI-style
# /v1/completions route — confirm against current HF Inference docs.
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
21
+
22
# === FAISS Setup ===
VECTOR_DIM = 384  # embedding size of all-MiniLM-L6-v2
INDEX_FILE = "incident_vectors.index"
TEXTS_FILE = "incident_texts.json"
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load persisted vector memory when present. Any load failure (missing or
# corrupt TEXTS_FILE while INDEX_FILE exists, unreadable index, bad JSON)
# falls back to a fresh empty index instead of crashing at startup — the
# previous revision of this file had the same guard.
if os.path.exists(INDEX_FILE):
    try:
        index = faiss.read_index(INDEX_FILE)
        with open(TEXTS_FILE, "r") as f:
            incident_texts = json.load(f)
        print(f"✅ Loaded {len(incident_texts)} past incidents from FAISS.")
    except Exception:
        print("⚠️ Failed to load FAISS index. Starting fresh.")
        index = faiss.IndexFlatL2(VECTOR_DIM)
        incident_texts = []
else:
    index = faiss.IndexFlatL2(VECTOR_DIM)
    incident_texts = []
35
+
36
def save_index():
    """Persist the FAISS index and its parallel incident-text list to disk."""
    faiss.write_index(index, INDEX_FILE)
    with open(TEXTS_FILE, "w") as fh:
        fh.write(json.dumps(incident_texts))
40
+
41
# === Event Memory ===
# In-process log of every processed telemetry event; not persisted across restarts.
events = []
43
 
 
 
 
44
def detect_anomaly(event):
    """Flag a telemetry event as anomalous.

    Roughly one event in four is forced anomalous (injected noise for demo
    purposes); otherwise the decision falls to fixed latency and error-rate
    thresholds.
    """
    # Demo noise: ~25% of events are declared anomalous regardless of values.
    forced = random.random() < 0.25
    if forced:
        return True

    # Threshold rule: high latency or elevated error rate.
    return event["latency"] > 150 or event["error_rate"] > 0.05
 
 
 
 
 
 
 
 
 
 
54
 
55
def call_huggingface_analysis(prompt):
    """Ask the HF Inference API for a completion analysing *prompt*.

    Returns the model's completion text, or a human-readable offline/error
    string. This function never raises, so callers can embed the result
    directly in the event record.
    """
    if not HF_TOKEN:
        return "Offline mode: simulated analysis."

    try:
        payload = {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "prompt": prompt,
            "max_tokens": 200,
            "temperature": 0.3,
        }
        response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
        if response.status_code == 200:
            result = response.json()
            # Guard against both a missing AND an empty "choices" list —
            # result.get("choices", [{}])[0] raises IndexError on [].
            choices = result.get("choices") or [{}]
            return choices[0].get("text", "").strip()
        else:
            return f"Error {response.status_code}: {response.text}"
    except Exception as e:
        return f"Error generating analysis: {e}"
75
 
76
def simulate_healing(event):
    """Pretend to heal *event* by picking one canned remediation at random.

    The event itself is ignored — this is a placeholder for a real
    remediation planner.
    """
    return random.choice((
        "Restarted container",
        "Scaled up instance",
        "Cleared queue backlog",
        "No actionable step detected.",
    ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
def analyze_event(component, latency, error_rate):
    """Run the full pipeline for one telemetry event.

    Detects anomalies, asks the LLM for an analysis, simulates a healing
    action, annotates the healing action with previously seen similar
    incidents, and stores the event in the FAISS vector memory.

    Returns the event record as a pretty-printed JSON string.
    """
    event = {
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate
    }

    is_anomaly = detect_anomaly(event)
    event["anomaly"] = is_anomaly
    event["status"] = "Anomaly" if is_anomaly else "Normal"

    # Build textual prompt for the LLM analysis.
    prompt = (
        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
        f"Status: {event['status']}\n\n"
        "Provide a one-line reliability insight or root cause analysis."
    )

    analysis = call_huggingface_analysis(prompt)
    event["analysis"] = analysis

    healing_action = simulate_healing(event)
    event["healing_action"] = healing_action

    # === Vector learning ===
    vector_text = f"{component} {latency} {error_rate} {analysis}"
    # FAISS requires float32; cast once and reuse for both search and add.
    vec = np.asarray(model.encode([vector_text]), dtype=np.float32)

    # Search BEFORE adding the new vector: searching afterwards would always
    # report the current event as its own nearest "similar incident".
    if incident_texts:
        D, I = index.search(vec, min(3, len(incident_texts)))
        similar = [incident_texts[i] for i in I[0] if 0 <= i < len(incident_texts)]
        if similar:
            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:120]}...)."
    else:
        event["healing_action"] += " - Not enough incidents stored yet."

    # Store the new incident and persist the index + metadata.
    index.add(vec)
    incident_texts.append(vector_text)
    save_index()

    events.append(event)
    return json.dumps(event, indent=2)
130
+
131
+ # === UI ===
132
def submit_event(component, latency, error_rate):
    """Gradio callback: process one telemetry event and refresh the table."""
    parsed = json.loads(analyze_event(component, latency, error_rate))

    columns = ["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"]
    table = [[entry[col] for col in columns] for entry in events[-20:]]

    status_line = f"✅ Event Processed ({parsed['status']})"
    return status_line, gr.Dataframe(headers=columns, value=table)
149
+
150
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP\nAdaptive anomaly detection + AI-driven self-healing + vector memory (FAISS persistent)")
    with gr.Row():
        component = gr.Textbox(label="Component", value="api-service")
        latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
        error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
    submit = gr.Button("🚀 Submit Telemetry Event")
    output_text = gr.Textbox(label="Detection Output")
    table_output = gr.Dataframe(headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"])
    submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])

# Guard the launch so merely importing this module does not start a server;
# the previous revision of this file used the same guard.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)