Update app.py
Browse files
app.py
CHANGED
|
@@ -1,117 +1,197 @@
|
|
| 1 |
-
import
|
| 2 |
-
import time
|
| 3 |
import random
|
| 4 |
-
import
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
-
import
|
|
|
|
|
|
|
| 8 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
recent_events = []
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
"Restarted container",
|
|
|
|
| 28 |
"Cleared queue backlog",
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
]
|
| 32 |
-
return random.choice(actions)
|
| 33 |
-
|
| 34 |
-
def detect_anomaly(latency: float, error_rate: float) -> bool:
|
| 35 |
-
# Simple adaptive anomaly threshold
|
| 36 |
-
score = latency * error_rate
|
| 37 |
-
threshold = random.uniform(5, 25)
|
| 38 |
-
return score > threshold
|
| 39 |
-
|
| 40 |
-
def embed_incident(text: str):
|
| 41 |
-
emb = model.encode([text], normalize_embeddings=True)
|
| 42 |
-
return np.array(emb).astype("float32")
|
| 43 |
-
|
| 44 |
-
def find_similar_incidents(new_text: str, top_k=3):
|
| 45 |
-
if not incident_embeddings:
|
| 46 |
-
return "Not enough incidents stored yet."
|
| 47 |
-
new_emb = embed_incident(new_text)
|
| 48 |
-
index = faiss.IndexFlatIP(len(new_emb[0]))
|
| 49 |
-
index.add(np.vstack(incident_embeddings))
|
| 50 |
-
scores, ids = index.search(new_emb, top_k)
|
| 51 |
-
similar = [
|
| 52 |
-
f"Component: {incident_texts[i]['component']} | Latency: {incident_texts[i]['latency']} | ErrorRate: {incident_texts[i]['error_rate']} | Analysis: {incident_texts[i]['analysis'][:60]}..."
|
| 53 |
-
for i in ids[0] if i < len(incident_texts)
|
| 54 |
]
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
# --- MAIN PROCESS ---
|
| 58 |
def process_event(component, latency, error_rate):
|
| 59 |
-
timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
|
| 60 |
-
anomaly = detect_anomaly(latency, error_rate)
|
| 61 |
-
|
| 62 |
-
# --- Analysis step ---
|
| 63 |
-
payload = {
|
| 64 |
-
"inputs": f"Component {component} showing latency {latency} and error rate {error_rate}.",
|
| 65 |
-
}
|
| 66 |
-
|
| 67 |
-
try:
|
| 68 |
-
headers = {"Authorization": f"Bearer {HF_TOKEN.strip()}"}
|
| 69 |
-
response = requests.post(f"{API_URL}/facebook/bart-large-mnli", headers=headers, json=payload)
|
| 70 |
-
if response.status_code == 200:
|
| 71 |
-
analysis = response.json().get("generated_text", "No analysis output.")
|
| 72 |
-
else:
|
| 73 |
-
analysis = f"Error {response.status_code}: {response.text}"
|
| 74 |
-
except Exception as e:
|
| 75 |
-
analysis = f"Error generating analysis: {str(e)}"
|
| 76 |
-
|
| 77 |
-
status = "Anomaly" if anomaly else "Normal"
|
| 78 |
-
healing_action = simulate_healing_action(component) if anomaly else "-"
|
| 79 |
-
similar_info = find_similar_incidents(analysis)
|
| 80 |
-
|
| 81 |
event = {
|
| 82 |
-
"timestamp":
|
| 83 |
"component": component,
|
| 84 |
"latency": latency,
|
| 85 |
"error_rate": error_rate,
|
| 86 |
-
"status": status,
|
| 87 |
-
"analysis": analysis,
|
| 88 |
-
"healing_action": f"{healing_action} {similar_info}" if anomaly else f"- {similar_info}",
|
| 89 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
incident_texts.append(event)
|
| 98 |
|
| 99 |
-
return f"✅ Event Processed", pd.DataFrame(recent_events)
|
| 100 |
|
| 101 |
-
#
|
|
|
|
|
|
|
| 102 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 103 |
gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
|
| 104 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
error_rate = gr.Number(label="Error Rate", value=random.uniform(0.01, 0.2))
|
| 110 |
|
| 111 |
-
submit = gr.Button("🚀 Submit Telemetry Event")
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
-
submit.click(process_event,
|
| 116 |
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
|
|
|
| 2 |
import random
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
+
import requests
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
from sentence_transformers import SentenceTransformer
|
| 11 |
+
import gradio as gr
|
| 12 |
|
| 13 |
+
# ============================
# SAFE TOKEN LOAD
# ============================
# Read the Hugging Face API token from the environment first, then fall
# back to a plain-text .env file in the working directory.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
if not HF_TOKEN and os.path.exists(".env"):
    try:
        with open(".env", "r") as f:
            # NOTE(review): assumes .env holds the bare token, not KEY=VALUE
            # lines — confirm against how the file is provisioned.
            HF_TOKEN = f.read().strip()
    except Exception:
        HF_TOKEN = ""

if HF_TOKEN:
    print("✅ Hugging Face token loaded successfully.")
else:
    print("⚠️ No Hugging Face token found. Running in fallback/local mode.")

# ============================
# GLOBAL CONFIG
# ============================
# Base URL of the HF inference router; model paths are appended per request.
HF_API_URL = "https://router.huggingface.co/hf-inference"
# Auth header is empty in local/fallback mode so requests are still well-formed.
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

# Load a lightweight sentence transformer for embedding incidents
model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector memory store (in-memory for now) — list of
# {"embedding": ..., "description": ...} dicts appended per processed event.
incident_memory = []
|
| 40 |
+
|
| 41 |
+
# ============================
|
| 42 |
+
# ANOMALY DETECTION
|
| 43 |
+
# ============================
|
| 44 |
+
def detect_anomaly(event):
    """Return True when *event* should be treated as anomalous.

    An event is anomalous when latency exceeds 150 or the error rate
    exceeds 0.05, or when the random ~25% forced trigger fires (kept in
    deliberately so the healing path gets exercised during validation).
    """
    forced = random.random() < 0.25  # ~25% forced anomaly rate
    return forced or event["latency"] > 150 or event["error_rate"] > 0.05
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ============================
|
| 56 |
+
# AI ANALYSIS + HEALING
|
| 57 |
+
# ============================
|
| 58 |
+
def analyze_event(event):
    """Ask the HF Inference API to analyze one telemetry event.

    Builds a prompt from the event fields and posts it to a hosted
    instruct model. Returns an ``(analysis_text, healing_action)`` tuple;
    falls back to a local message when no token is configured, and to an
    error description when the request or parsing fails.
    """
    prompt = (
        f"Analyze this telemetry event and suggest a healing action:\n"
        f"Component: {event['component']}\n"
        f"Latency: {event['latency']}\n"
        f"Error Rate: {event['error_rate']}\n"
        f"Detected Anomaly: {event['anomaly']}\n"
    )

    if not HF_TOKEN:
        return "Local mode: analysis unavailable (no token).", "No action taken."

    try:
        response = requests.post(
            f"{HF_API_URL}/mistralai/Mixtral-8x7B-Instruct-v0.1",
            headers=headers,
            json={"inputs": prompt},
            timeout=10,
        )
        if response.status_code == 200:
            result = response.json()
            # The API usually returns [{"generated_text": ...}], but may
            # return other shapes. Only index into the payload when it is
            # safe: an empty list or non-dict element previously raised
            # (IndexError/TypeError) and misreported success as an error.
            if (
                isinstance(result, list)
                and result
                and isinstance(result[0], dict)
                and "generated_text" in result[0]
            ):
                text = result[0]["generated_text"]
            else:
                text = str(result)
            return text, choose_healing_action(event, text)
        else:
            return f"Error {response.status_code}: {response.text}", "No actionable step detected."
    except Exception as e:
        return f"Error generating analysis: {e}", "No actionable step detected."
|
| 92 |
|
|
|
|
| 93 |
|
| 94 |
+
# ============================
|
| 95 |
+
# HEALING SIMULATION
|
| 96 |
+
# ============================
|
| 97 |
+
def choose_healing_action(event, analysis_text):
    """Simulate an automated healing response.

    Matches keywords in the model's analysis text (in priority order);
    falls back to a random simulated action when nothing matches.
    """
    possible_actions = [
        "Restarted container",
        "Scaled service replicas",
        "Cleared queue backlog",
        "Invalidated cache",
        "Re-deployed model endpoint",
    ]
    # Keyword -> action, checked in priority order against the analysis.
    keyword_actions = (
        ("restart", "Restarted container"),
        ("scale", "Scaled service replicas"),
        ("cache", "Invalidated cache"),
    )
    lowered = analysis_text.lower()
    for keyword, action in keyword_actions:
        if keyword in lowered:
            return action
    # Nothing recognizable in the analysis: pick an arbitrary simulated action.
    return random.choice(possible_actions)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ============================
|
| 116 |
+
# VECTOR SIMILARITY ENGINE
|
| 117 |
+
# ============================
|
| 118 |
+
def record_and_search_similar(event, analysis_text):
    """Embed this incident, report similar past ones, then store it.

    Returns a short human-readable summary string, or "" when no stored
    incident scores above the 0.7 cosine-similarity cutoff (or the
    memory is empty).
    """
    description = (
        f"Component: {event['component']} | "
        f"Latency: {event['latency']} | "
        f"ErrorRate: {event['error_rate']} | "
        f"Analysis: {analysis_text}"
    )
    embedding = model.encode(description)

    similar_info = ""
    if incident_memory:
        stored = np.array([entry["embedding"] for entry in incident_memory])
        scores = cosine_similarity([embedding], stored)[0]
        # Indices of the best three matches, most similar first.
        ranked = scores.argsort()[-3:][::-1]
        matches = [
            incident_memory[i]["description"]
            for i in ranked
            if scores[i] > 0.7
        ]
        if matches:
            similar_info = f"Found {len(matches)} similar incidents (e.g., {matches[0][:150]}...)."

    # Record the new incident only after searching, so it never matches itself.
    incident_memory.append({"embedding": embedding, "description": description})
    return similar_info
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# ============================
|
| 148 |
+
# EVENT HANDLER
|
| 149 |
+
# ============================
|
| 150 |
+
# In-memory history of all processed events; the UI shows the last 20 rows.
event_log = []
|
| 151 |
|
|
|
|
| 152 |
def process_event(component, latency, error_rate):
    """Handle one telemetry submission from the UI.

    Runs anomaly detection, AI analysis, healing-action selection, and
    similar-incident lookup; appends the enriched event to the in-memory
    log and returns a status message plus a DataFrame of the last 20
    events (the shapes the Gradio outputs expect).
    """
    event = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }
    event["anomaly"] = detect_anomaly(event)
    status = "Anomaly" if event["anomaly"] else "Normal"
    analysis, healing = analyze_event(event)
    similar = record_and_search_similar(event, analysis)
    # Append the similarity summary (may be empty) to the healing action.
    healing = f"{healing} {similar}".strip()

    event["status"] = status
    event["analysis"] = analysis
    event["healing_action"] = healing
    event_log.append(event)
    # Bound the in-memory history so a long-running app does not grow
    # without limit; the UI only ever displays the last 20 rows anyway.
    if len(event_log) > 500:
        del event_log[:-500]

    df = pd.DataFrame(event_log[-20:])
    return f"✅ Event Processed ({status})", df
|
|
|
|
| 172 |
|
|
|
|
| 173 |
|
| 174 |
+
# ============================
|
| 175 |
+
# GRADIO UI
|
| 176 |
+
# ============================
|
| 177 |
# Gradio UI: telemetry inputs, a submit button, and a live event table.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
    gr.Markdown(
        "Adaptive anomaly detection + AI-driven self-healing + vector memory"
    )

    # Telemetry inputs; defaults describe a healthy-looking event.
    component = gr.Textbox(label="Component", value="api-service")
    latency = gr.Slider(10, 400, value=100, label="Latency (ms)")
    error_rate = gr.Slider(0.0, 0.2, value=0.02, label="Error Rate")

    submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
    output = gr.Textbox(label="Detection Output")
    table = gr.Dataframe(label="Recent Events (Last 20)")

    # Route the three inputs through process_event into the two outputs.
    submit.click(process_event, [component, latency, error_rate], [output, table])
|
| 192 |
|
| 193 |
+
# ============================
|
| 194 |
+
# ENTRY POINT
|
| 195 |
+
# ============================
|
| 196 |
+
# Entry point: bind on all interfaces at 7860 (the port Hugging Face
# Spaces expects a Gradio app to serve on).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|