# app.py - Agentic Reliability Framework MVP
import os
import json
import random
import datetime
import numpy as np
import gradio as gr
import requests
import faiss
from fastapi import FastAPI, Body, Header, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from filelock import FileLock
# === Config ===
# HF_TOKEN: Hugging Face inference token; when empty the app runs in offline mode.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
# API_KEY: shared secret for the REST endpoint; empty disables auth (dev mode).
API_KEY = os.getenv("API_KEY", "").strip()
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
# Authorization header is only attached when a token is configured.
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
# === FAISS Setup ===
VECTOR_DIM = 384  # embedding size of all-MiniLM-L6-v2
INDEX_FILE = "incident_vectors.index"
TEXTS_FILE = "incident_texts.json"
LOCK_FILE = "faiss_save.lock"

# Sentence-transformers encoder used for incident embeddings.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load the persisted index + metadata; fall back to a fresh in-memory index
# when either file is missing or unreadable (first run, or a corrupt save).
# Requiring BOTH files keeps the index and its texts consistent with each
# other instead of crashing at import time on a partial state.
try:
    if os.path.exists(INDEX_FILE) and os.path.exists(TEXTS_FILE):
        index = faiss.read_index(INDEX_FILE)
        with open(TEXTS_FILE, "r") as f:
            incident_texts = json.load(f)
    else:
        index = faiss.IndexFlatL2(VECTOR_DIM)
        incident_texts = []
except Exception as e:
    print(f"⚠️ Failed to load index/texts: {e} — creating new in-memory index.")
    index = faiss.IndexFlatL2(VECTOR_DIM)
    incident_texts = []
# === Safe persistence ===
def save_index():
    """Persist the FAISS index and incident texts under a file lock.

    The JSON metadata is written to a temporary file and atomically renamed
    into place, so a crash mid-write cannot leave a truncated TEXTS_FILE
    sitting next to a newer index file.
    """
    with FileLock(LOCK_FILE):
        faiss.write_index(index, INDEX_FILE)
        tmp_path = TEXTS_FILE + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(incident_texts, f)
        os.replace(tmp_path, TEXTS_FILE)  # atomic on POSIX and Windows
# === Core logic ===
# In-memory history of processed events (most recent last); grows unbounded.
events = []
def detect_anomaly(event):
    """Adaptive threshold-based anomaly detection.

    Flags the event when latency or error rate exceeds a fixed threshold,
    and also randomly flags ~25% of events to exercise the healing path.
    """
    lat = event["latency"]
    err = event["error_rate"]
    # Random positive injection (test hook) before the threshold rule.
    chance = random.random()
    if chance < 0.25:
        return True
    # Fixed-threshold rule on the raw telemetry values.
    return lat > 150 or err > 0.05
def call_huggingface_analysis(prompt):
    """Uses HF Inference API or local fallback."""
    # No token configured: skip the network entirely.
    if not HF_TOKEN:
        return "Offline mode: simulated analysis."
    payload = {
        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "prompt": prompt,
        "max_tokens": 200,
        "temperature": 0.3,
    }
    try:
        resp = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
        if resp.status_code != 200:
            return f"Error {resp.status_code}: {resp.text}"
        body = resp.json()
        first_choice = body.get("choices", [{}])[0]
        return first_choice.get("text", "").strip()
    except Exception as e:
        return f"Error generating analysis: {e}"
def simulate_healing(event):
    """Return a randomly chosen simulated remediation step for *event*."""
    # Canned remediation actions the MVP can pretend to take.
    pool = (
        "Restarted container",
        "Scaled up instance",
        "Cleared queue backlog",
        "No actionable step detected.",
    )
    return random.choice(pool)
def analyze_event(component, latency, error_rate):
    """Run the full pipeline for one telemetry event.

    Detects anomalies, asks the HF model (or offline fallback) for an
    insight, picks a simulated healing action, stores the event's embedding
    in the FAISS index, and annotates the event with similar past incidents.

    Returns the enriched event dict; it is also appended to the module-level
    `events` list as a side effect.
    """
    event = {
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }
    is_anomaly = detect_anomaly(event)
    event["anomaly"] = is_anomaly
    event["status"] = "Anomaly" if is_anomaly else "Normal"
    prompt = (
        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
        f"Status: {event['status']}\n\n"
        "Provide a one-line reliability insight or root cause analysis."
    )
    # AI Reliability analysis
    analysis = call_huggingface_analysis(prompt)
    event["analysis"] = analysis
    # Simulated self-healing
    healing_action = simulate_healing(event)
    event["healing_action"] = healing_action
    # === Vector learning & persistence ===
    vector_text = f"{component} {latency} {error_rate} {analysis}"
    vec = model.encode([vector_text])
    # NOTE(review): index.add converts to float32 but index.search below
    # passes `vec` as-is — assumes model.encode already yields float32; confirm.
    index.add(np.array(vec, dtype=np.float32))
    incident_texts.append(vector_text)
    save_index()
    # Similar incident lookup
    # NOTE(review): the vector just added is included in the search, so the
    # top hit is typically this event itself — confirm that is intended.
    if len(incident_texts) > 1:
        D, I = index.search(vec, k=min(3, len(incident_texts)))
        similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
        if similar:
            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:100]}...)."
        else:
            event["healing_action"] += " - Not enough incidents stored yet."
    events.append(event)
    return event
# === FastAPI backend ===
# REST application exposing /add-event; served separately from the Gradio UI.
app = FastAPI(title="Agentic Reliability Framework API")
class AddEventModel(BaseModel):
    """Request body schema for POST /add-event (all fields required)."""

    component: str
    latency: float
    error_rate: float
def verify_api_key(provided_key: str) -> bool:
    """Check a caller-supplied API key against the configured API_KEY.

    Returns True when no API_KEY is configured (dev mode: auth disabled).
    Uses a constant-time comparison to avoid leaking key length/content
    through response-timing side channels.
    """
    import hmac  # local import: keeps the module header untouched

    if not API_KEY:
        return True  # dev mode
    if provided_key is None:
        return False  # header missing
    return hmac.compare_digest(provided_key, API_KEY)
@app.post("/add-event")
def add_event(
    payload: AddEventModel = Body(...),
    x_api_key: str = Header(None, alias="X-API-Key"),
):
    """Add a telemetry event (secured via API key).

    Returns the fully analyzed event on success; raises 401 on a bad or
    missing key and 500 when the analysis pipeline fails.
    """
    if not verify_api_key(x_api_key):
        raise HTTPException(status_code=401, detail="Unauthorized: invalid API key.")
    try:
        event = analyze_event(payload.component, payload.latency, payload.error_rate)
        return {"status": "ok", "event": event}
    except Exception as e:
        # Chain the cause so server logs keep the original traceback.
        raise HTTPException(status_code=500, detail=f"Failed to add event: {e}") from e
# === Gradio Dashboard ===
def submit_event(component, latency, error_rate):
    """Gradio callback: analyze one event and refresh the dashboard table."""
    processed = analyze_event(component, latency, error_rate)
    # Column order shared by the table rows and the Dataframe headers.
    columns = [
        "timestamp",
        "component",
        "latency",
        "error_rate",
        "status",
        "analysis",
        "healing_action",
    ]
    # Render the 20 most recent events as table rows.
    rows = [[e[col] for col in columns] for e in events[-20:]]
    status_line = f"✅ Event Processed ({processed['status']})"
    return status_line, gr.Dataframe(headers=columns, value=rows)
# Dashboard layout: telemetry inputs, a submit button, and a results table.
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    gr.Markdown(
        "## 🧠 Agentic Reliability Framework MVP\n"
        "Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory"
    )
    # Telemetry input controls.
    with gr.Row():
        component = gr.Textbox(label="Component", value="api-service")
        latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
        error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
    submit = gr.Button("🚀 Submit Telemetry Event")
    output_text = gr.Textbox(label="Detection Output")
    # Table of recent processed events; columns mirror submit_event's rows.
    table_output = gr.Dataframe(
        headers=[
            "timestamp",
            "component",
            "latency",
            "error_rate",
            "status",
            "analysis",
            "healing_action",
        ]
    )
    # Wire the button to the analysis pipeline.
    submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
if __name__ == "__main__":
    # Launch the Gradio UI bound to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
@@ -1,92 +1,68 @@
|
|
| 1 |
-
# app.py - Agentic Reliability Framework MVP
|
| 2 |
-
# Drop-in replacement: supports Gradio UI + FastAPI REST endpoints (/semantic-search, /add-event, /recent-events)
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import random
|
| 6 |
import datetime
|
| 7 |
-
import threading
|
| 8 |
import numpy as np
|
| 9 |
import gradio as gr
|
| 10 |
import requests
|
| 11 |
import faiss
|
| 12 |
-
from fastapi import FastAPI,
|
| 13 |
-
from
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
from filelock import FileLock
|
| 16 |
-
import uvicorn
|
| 17 |
-
from pydantic import BaseModel, Field
|
| 18 |
|
| 19 |
# === Config ===
|
| 20 |
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
|
|
|
|
|
|
|
| 21 |
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
|
| 22 |
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
# === Persistence / FAISS config ===
|
| 27 |
VECTOR_DIM = 384
|
| 28 |
INDEX_FILE = "incident_vectors.index"
|
| 29 |
TEXTS_FILE = "incident_texts.json"
|
| 30 |
-
LOCK_FILE = "
|
| 31 |
|
| 32 |
-
# Sentence-transformers model (small and fast)
|
| 33 |
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
except Exception as e:
|
| 43 |
-
print(f"⚠️ Failed to load index/texts: {e} — creating new in-memory index.")
|
| 44 |
-
return faiss.IndexFlatL2(VECTOR_DIM), []
|
| 45 |
|
| 46 |
-
index, incident_texts = load_faiss_index()
|
| 47 |
|
|
|
|
| 48 |
def save_index():
|
| 49 |
-
"""Persist FAISS + metadata atomically using a file lock."""
|
| 50 |
with FileLock(LOCK_FILE):
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
except Exception as e:
|
| 56 |
-
print(f"⚠️ Error saving index/texts: {e}")
|
| 57 |
-
|
| 58 |
-
# === In-memory events list ===
|
| 59 |
-
events = []
|
| 60 |
|
| 61 |
# === Core logic ===
|
|
|
|
|
|
|
|
|
|
| 62 |
def detect_anomaly(event):
|
|
|
|
| 63 |
latency = event["latency"]
|
| 64 |
error_rate = event["error_rate"]
|
| 65 |
-
|
|
|
|
| 66 |
if random.random() < 0.25:
|
| 67 |
return True
|
|
|
|
| 68 |
return latency > 150 or error_rate > 0.05
|
| 69 |
|
| 70 |
-
def local_reliability_analysis(prompt: str):
|
| 71 |
-
"""Local fallback analysis using semantic similarity and simple heuristic text reply."""
|
| 72 |
-
try:
|
| 73 |
-
embedding = model.encode([prompt])
|
| 74 |
-
# store the prompt as a data point (so local memory grows)
|
| 75 |
-
index.add(np.array(embedding, dtype=np.float32))
|
| 76 |
-
incident_texts.append(prompt)
|
| 77 |
-
save_index()
|
| 78 |
-
if len(incident_texts) > 1:
|
| 79 |
-
D, I = index.search(np.array(embedding, dtype=np.float32), k=min(3, len(incident_texts)))
|
| 80 |
-
similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
|
| 81 |
-
return f"Local insight: found {len(similar)} similar incident(s)."
|
| 82 |
-
return "Local insight: first incident stored."
|
| 83 |
-
except Exception as e:
|
| 84 |
-
return f"Local analysis error: {e}"
|
| 85 |
|
| 86 |
-
def call_huggingface_analysis(prompt
|
| 87 |
-
"""
|
| 88 |
if not HF_TOKEN:
|
| 89 |
-
return
|
| 90 |
|
| 91 |
try:
|
| 92 |
payload = {
|
|
@@ -95,168 +71,163 @@ def call_huggingface_analysis(prompt: str):
|
|
| 95 |
"max_tokens": 200,
|
| 96 |
"temperature": 0.3,
|
| 97 |
}
|
| 98 |
-
|
| 99 |
-
if
|
| 100 |
-
result =
|
| 101 |
-
|
| 102 |
-
text = ""
|
| 103 |
-
if isinstance(result, dict):
|
| 104 |
-
# common HF completion shape
|
| 105 |
-
choices = result.get("choices") or []
|
| 106 |
-
if choices:
|
| 107 |
-
text = choices[0].get("text") or choices[0].get("message", {}).get("content", "")
|
| 108 |
-
else:
|
| 109 |
-
text = result.get("generated_text") or ""
|
| 110 |
-
elif isinstance(result, list) and result:
|
| 111 |
-
text = result[0].get("text", "")
|
| 112 |
-
return (text or local_reliability_analysis(prompt)).strip()
|
| 113 |
else:
|
| 114 |
-
|
| 115 |
-
return local_reliability_analysis(prompt)
|
| 116 |
except Exception as e:
|
| 117 |
-
|
| 118 |
-
|
| 119 |
|
| 120 |
def simulate_healing(event):
|
| 121 |
actions = [
|
| 122 |
"Restarted container",
|
| 123 |
"Scaled up instance",
|
| 124 |
"Cleared queue backlog",
|
| 125 |
-
"No actionable step detected."
|
| 126 |
]
|
| 127 |
return random.choice(actions)
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
event = {
|
| 132 |
"timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 133 |
"component": component,
|
| 134 |
-
"latency":
|
| 135 |
-
"error_rate":
|
| 136 |
}
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
| 139 |
|
| 140 |
prompt = (
|
| 141 |
f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
|
| 142 |
-
f"Status: {event['status']}\n\
|
|
|
|
| 143 |
)
|
| 144 |
|
|
|
|
| 145 |
analysis = call_huggingface_analysis(prompt)
|
| 146 |
event["analysis"] = analysis
|
| 147 |
-
event["healing_action"] = simulate_healing(event)
|
| 148 |
|
| 149 |
-
#
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
except Exception as e:
|
| 169 |
-
print(f"⚠️ Error searching index: {e}")
|
| 170 |
|
| 171 |
events.append(event)
|
| 172 |
-
# keep events bounded to reasonable size
|
| 173 |
-
if len(events) > 1000:
|
| 174 |
-
events.pop(0)
|
| 175 |
return event
|
| 176 |
|
| 177 |
-
# === FastAPI app + models ===
|
| 178 |
-
app = FastAPI(title="Agentic Reliability API", version="0.3")
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
allow_credentials=True,
|
| 184 |
-
allow_methods=["*"],
|
| 185 |
-
allow_headers=["*"],
|
| 186 |
-
)
|
| 187 |
|
| 188 |
class AddEventModel(BaseModel):
|
| 189 |
-
component: str
|
| 190 |
-
latency: float
|
| 191 |
-
error_rate: float
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
@app.post("/add-event")
|
| 194 |
-
def add_event(
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 199 |
try:
|
| 200 |
event = analyze_event(payload.component, payload.latency, payload.error_rate)
|
| 201 |
return {"status": "ok", "event": event}
|
| 202 |
except Exception as e:
|
| 203 |
raise HTTPException(status_code=500, detail=f"Failed to add event: {e}")
|
| 204 |
|
| 205 |
-
@app.get("/recent-events")
|
| 206 |
-
def recent_events(n: int = Query(20, ge=1, le=200, description="Number of recent events to return")):
|
| 207 |
-
"""Return the most recent processed events (default: 20)."""
|
| 208 |
-
sliced = events[-n:]
|
| 209 |
-
return {"count": len(sliced), "events": sliced[::-1]} # newest first
|
| 210 |
-
|
| 211 |
-
@app.get("/semantic-search")
|
| 212 |
-
def semantic_search(query: str = Query(..., description="Search query for reliability memory"), k: int = 3):
|
| 213 |
-
"""Perform semantic similarity search over stored reliability incidents."""
|
| 214 |
-
if not incident_texts:
|
| 215 |
-
return {"results": [], "message": "No incidents in memory yet."}
|
| 216 |
-
try:
|
| 217 |
-
embedding = model.encode([query])
|
| 218 |
-
D, I = index.search(np.array(embedding, dtype=np.float32), k=min(k, len(incident_texts)))
|
| 219 |
-
results = []
|
| 220 |
-
for rank, idx in enumerate(I[0]):
|
| 221 |
-
if idx < len(incident_texts):
|
| 222 |
-
results.append({"text": incident_texts[idx], "distance": float(D[0][rank])})
|
| 223 |
-
return {"query": query, "results": results}
|
| 224 |
-
except Exception as e:
|
| 225 |
-
raise HTTPException(status_code=500, detail=f"Semantic search failed: {e}")
|
| 226 |
|
| 227 |
-
# === Gradio
|
| 228 |
def submit_event(component, latency, error_rate):
|
| 229 |
-
|
|
|
|
| 230 |
table = [
|
| 231 |
-
[
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
for e in events[-20:]
|
| 234 |
]
|
|
|
|
| 235 |
return (
|
| 236 |
-
f"✅ Event Processed ({
|
| 237 |
gr.Dataframe(
|
| 238 |
-
headers=[
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
)
|
| 242 |
|
|
|
|
| 243 |
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
|
| 244 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 245 |
with gr.Row():
|
| 246 |
component = gr.Textbox(label="Component", value="api-service")
|
| 247 |
latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
|
| 248 |
error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
|
| 249 |
submit = gr.Button("🚀 Submit Telemetry Event")
|
| 250 |
output_text = gr.Textbox(label="Detection Output")
|
| 251 |
-
table_output = gr.Dataframe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
|
| 253 |
|
| 254 |
-
# === Launch both servers (Gradio UI + FastAPI) in same process ===
|
| 255 |
-
def start_gradio():
|
| 256 |
-
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
| 257 |
|
| 258 |
if __name__ == "__main__":
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
t.start()
|
| 262 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import random
|
| 4 |
import datetime
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import gradio as gr
|
| 7 |
import requests
|
| 8 |
import faiss
|
| 9 |
+
from fastapi import FastAPI, Body, Header, HTTPException
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from filelock import FileLock
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# === Config ===
|
| 15 |
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
|
| 16 |
+
API_KEY = os.getenv("API_KEY", "").strip()
|
| 17 |
+
|
| 18 |
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
|
| 19 |
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
|
| 20 |
|
| 21 |
+
# === FAISS Setup ===
|
|
|
|
|
|
|
| 22 |
VECTOR_DIM = 384
|
| 23 |
INDEX_FILE = "incident_vectors.index"
|
| 24 |
TEXTS_FILE = "incident_texts.json"
|
| 25 |
+
LOCK_FILE = "faiss_save.lock"
|
| 26 |
|
|
|
|
| 27 |
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 28 |
|
| 29 |
+
if os.path.exists(INDEX_FILE):
|
| 30 |
+
index = faiss.read_index(INDEX_FILE)
|
| 31 |
+
with open(TEXTS_FILE, "r") as f:
|
| 32 |
+
incident_texts = json.load(f)
|
| 33 |
+
else:
|
| 34 |
+
index = faiss.IndexFlatL2(VECTOR_DIM)
|
| 35 |
+
incident_texts = []
|
|
|
|
|
|
|
|
|
|
| 36 |
|
|
|
|
| 37 |
|
| 38 |
+
# === Safe persistence ===
|
| 39 |
def save_index():
|
|
|
|
| 40 |
with FileLock(LOCK_FILE):
|
| 41 |
+
faiss.write_index(index, INDEX_FILE)
|
| 42 |
+
with open(TEXTS_FILE, "w") as f:
|
| 43 |
+
json.dump(incident_texts, f)
|
| 44 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# === Core logic ===
|
| 47 |
+
events = []
|
| 48 |
+
|
| 49 |
+
|
| 50 |
def detect_anomaly(event):
|
| 51 |
+
"""Adaptive threshold-based anomaly detection."""
|
| 52 |
latency = event["latency"]
|
| 53 |
error_rate = event["error_rate"]
|
| 54 |
+
|
| 55 |
+
# Occasionally flag random anomaly for testing
|
| 56 |
if random.random() < 0.25:
|
| 57 |
return True
|
| 58 |
+
|
| 59 |
return latency > 150 or error_rate > 0.05
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
def call_huggingface_analysis(prompt):
|
| 63 |
+
"""Uses HF Inference API or local fallback."""
|
| 64 |
if not HF_TOKEN:
|
| 65 |
+
return "Offline mode: simulated analysis."
|
| 66 |
|
| 67 |
try:
|
| 68 |
payload = {
|
|
|
|
| 71 |
"max_tokens": 200,
|
| 72 |
"temperature": 0.3,
|
| 73 |
}
|
| 74 |
+
response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
|
| 75 |
+
if response.status_code == 200:
|
| 76 |
+
result = response.json()
|
| 77 |
+
return result.get("choices", [{}])[0].get("text", "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
else:
|
| 79 |
+
return f"Error {response.status_code}: {response.text}"
|
|
|
|
| 80 |
except Exception as e:
|
| 81 |
+
return f"Error generating analysis: {e}"
|
| 82 |
+
|
| 83 |
|
| 84 |
def simulate_healing(event):
|
| 85 |
actions = [
|
| 86 |
"Restarted container",
|
| 87 |
"Scaled up instance",
|
| 88 |
"Cleared queue backlog",
|
| 89 |
+
"No actionable step detected.",
|
| 90 |
]
|
| 91 |
return random.choice(actions)
|
| 92 |
|
| 93 |
+
|
| 94 |
+
def analyze_event(component, latency, error_rate):
|
| 95 |
event = {
|
| 96 |
"timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 97 |
"component": component,
|
| 98 |
+
"latency": latency,
|
| 99 |
+
"error_rate": error_rate,
|
| 100 |
}
|
| 101 |
+
|
| 102 |
+
is_anomaly = detect_anomaly(event)
|
| 103 |
+
event["anomaly"] = is_anomaly
|
| 104 |
+
event["status"] = "Anomaly" if is_anomaly else "Normal"
|
| 105 |
|
| 106 |
prompt = (
|
| 107 |
f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
|
| 108 |
+
f"Status: {event['status']}\n\n"
|
| 109 |
+
"Provide a one-line reliability insight or root cause analysis."
|
| 110 |
)
|
| 111 |
|
| 112 |
+
# AI Reliability analysis
|
| 113 |
analysis = call_huggingface_analysis(prompt)
|
| 114 |
event["analysis"] = analysis
|
|
|
|
| 115 |
|
| 116 |
+
# Simulated self-healing
|
| 117 |
+
healing_action = simulate_healing(event)
|
| 118 |
+
event["healing_action"] = healing_action
|
| 119 |
+
|
| 120 |
+
# === Vector learning & persistence ===
|
| 121 |
+
vector_text = f"{component} {latency} {error_rate} {analysis}"
|
| 122 |
+
vec = model.encode([vector_text])
|
| 123 |
+
index.add(np.array(vec, dtype=np.float32))
|
| 124 |
+
incident_texts.append(vector_text)
|
| 125 |
+
save_index()
|
| 126 |
+
|
| 127 |
+
# Similar incident lookup
|
| 128 |
+
if len(incident_texts) > 1:
|
| 129 |
+
D, I = index.search(vec, k=min(3, len(incident_texts)))
|
| 130 |
+
similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
|
| 131 |
+
if similar:
|
| 132 |
+
event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:100]}...)."
|
| 133 |
+
else:
|
| 134 |
+
event["healing_action"] += " - Not enough incidents stored yet."
|
|
|
|
|
|
|
| 135 |
|
| 136 |
events.append(event)
|
|
|
|
|
|
|
|
|
|
| 137 |
return event
|
| 138 |
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+
# === FastAPI backend ===
|
| 141 |
+
app = FastAPI(title="Agentic Reliability Framework API")
|
| 142 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
class AddEventModel(BaseModel):
|
| 145 |
+
component: str
|
| 146 |
+
latency: float
|
| 147 |
+
error_rate: float
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def verify_api_key(provided_key: str):
|
| 151 |
+
if not API_KEY:
|
| 152 |
+
return True # dev mode
|
| 153 |
+
return provided_key == API_KEY
|
| 154 |
+
|
| 155 |
|
| 156 |
@app.post("/add-event")
|
| 157 |
+
def add_event(
|
| 158 |
+
payload: AddEventModel = Body(...),
|
| 159 |
+
x_api_key: str = Header(None, alias="X-API-Key"),
|
| 160 |
+
):
|
| 161 |
+
"""Add a telemetry event (secured via API key)."""
|
| 162 |
+
if not verify_api_key(x_api_key):
|
| 163 |
+
raise HTTPException(status_code=401, detail="Unauthorized: invalid API key.")
|
| 164 |
+
|
| 165 |
try:
|
| 166 |
event = analyze_event(payload.component, payload.latency, payload.error_rate)
|
| 167 |
return {"status": "ok", "event": event}
|
| 168 |
except Exception as e:
|
| 169 |
raise HTTPException(status_code=500, detail=f"Failed to add event: {e}")
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
+
# === Gradio Dashboard ===
|
| 173 |
def submit_event(component, latency, error_rate):
|
| 174 |
+
event = analyze_event(component, latency, error_rate)
|
| 175 |
+
|
| 176 |
table = [
|
| 177 |
+
[
|
| 178 |
+
e["timestamp"],
|
| 179 |
+
e["component"],
|
| 180 |
+
e["latency"],
|
| 181 |
+
e["error_rate"],
|
| 182 |
+
e["status"],
|
| 183 |
+
e["analysis"],
|
| 184 |
+
e["healing_action"],
|
| 185 |
+
]
|
| 186 |
for e in events[-20:]
|
| 187 |
]
|
| 188 |
+
|
| 189 |
return (
|
| 190 |
+
f"✅ Event Processed ({event['status']})",
|
| 191 |
gr.Dataframe(
|
| 192 |
+
headers=[
|
| 193 |
+
"timestamp",
|
| 194 |
+
"component",
|
| 195 |
+
"latency",
|
| 196 |
+
"error_rate",
|
| 197 |
+
"status",
|
| 198 |
+
"analysis",
|
| 199 |
+
"healing_action",
|
| 200 |
+
],
|
| 201 |
+
value=table,
|
| 202 |
+
),
|
| 203 |
)
|
| 204 |
|
| 205 |
+
|
| 206 |
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
|
| 207 |
+
gr.Markdown(
|
| 208 |
+
"## 🧠 Agentic Reliability Framework MVP\n"
|
| 209 |
+
"Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory"
|
| 210 |
+
)
|
| 211 |
with gr.Row():
|
| 212 |
component = gr.Textbox(label="Component", value="api-service")
|
| 213 |
latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
|
| 214 |
error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
|
| 215 |
submit = gr.Button("🚀 Submit Telemetry Event")
|
| 216 |
output_text = gr.Textbox(label="Detection Output")
|
| 217 |
+
table_output = gr.Dataframe(
|
| 218 |
+
headers=[
|
| 219 |
+
"timestamp",
|
| 220 |
+
"component",
|
| 221 |
+
"latency",
|
| 222 |
+
"error_rate",
|
| 223 |
+
"status",
|
| 224 |
+
"analysis",
|
| 225 |
+
"healing_action",
|
| 226 |
+
]
|
| 227 |
+
)
|
| 228 |
submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
|
| 229 |
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
if __name__ == "__main__":
|
| 232 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
| 233 |
+
|
|
|
|
|
|