import gradio as gr
import time
import random
import requests
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
# --- CONFIG ---
# The Hugging Face token is read from an optional ".env" file in the working
# directory; the whole (stripped) file content is used as the token.
# A missing or unreadable file must not abort start-up, so any OSError simply
# leaves the token empty. The context manager guarantees the handle is closed.
try:
    with open(".env", "r", encoding="utf-8") as env_file:
        HF_TOKEN = env_file.read().strip()
except OSError:
    HF_TOKEN = ""
if not HF_TOKEN:
    print("⚠️ No Hugging Face token found. Running in fallback mode (local inference).")
# Base URL of the Hugging Face inference router; the model path is appended
# to it when a request is made.
API_URL = "https://router.huggingface.co/hf-inference"

# Sentence-embedding model, instantiated once at import time and shared by
# every call that needs an embedding.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# In-memory state, all three start empty:
#   incident_embeddings - one embedding array per stored incident
#   incident_texts      - the event dict belonging to each embedding
#   recent_events       - the events shown in the UI table
incident_embeddings, incident_texts, recent_events = [], [], []
# --- SIMULATION HELPERS ---
def simulate_healing_action(component: str) -> str:
    """Pick a simulated remediation step at random.

    Args:
        component: Name of the affected component. It is accepted for
            interface purposes but does not influence the choice.

    Returns:
        One of the four canned action descriptions.
    """
    return random.choice(
        (
            "Restarted container",
            "Cleared queue backlog",
            "Rebalanced load",
            "No actionable step detected.",
        )
    )
def detect_anomaly(latency: float, error_rate: float) -> bool:
    """Decide whether a telemetry reading counts as an anomaly.

    The reading's severity is the product ``latency * error_rate``. It is
    compared against a cutoff drawn uniformly from [5, 25] on every call, so
    severities inside that band can be classified either way; values below 5
    are never anomalies and values above 25 always are.

    Args:
        latency: Observed latency.
        error_rate: Observed error rate.

    Returns:
        True when the severity exceeds the randomly drawn cutoff.
    """
    severity = latency * error_rate
    return severity > random.uniform(5, 25)
def embed_incident(text: str):
    """Embed *text* with the shared sentence-transformer ``model``.

    The text is encoded as a one-element batch with
    ``normalize_embeddings=True`` and the result is returned as a float32
    NumPy array.
    """
    vectors = model.encode([text], normalize_embeddings=True)
    return np.array(vectors, dtype="float32")
def find_similar_incidents(new_text: str, top_k=3):
    """Summarise the stored incidents most similar to *new_text*.

    A fresh inner-product FAISS index is built over every stored embedding
    and queried with the embedding of *new_text*. Because ``embed_incident``
    normalises its vectors, the inner product ranks by cosine similarity.

    Args:
        new_text: Text to compare against the stored incidents.
        top_k: Maximum number of neighbours to report.

    Returns:
        A one-line human-readable summary, or a fixed notice when nothing
        has been stored yet.
    """
    if not incident_embeddings:
        return "Not enough incidents stored yet."

    query = embed_incident(new_text)
    index = faiss.IndexFlatIP(query.shape[1])
    index.add(np.vstack(incident_embeddings))

    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots with id -1, which as a Python list index would silently
    # select the *last* incident and report it as a (duplicate) match.
    k = min(top_k, len(incident_embeddings))
    similar = []
    if k > 0:
        _scores, ids = index.search(query, k)
        for raw_id in ids[0]:
            idx = int(raw_id)
            # Defensive: drop any padding id (-1) or id without a stored event.
            if not 0 <= idx < len(incident_texts):
                continue
            past = incident_texts[idx]
            similar.append(
                f"Component: {past['component']} | Latency: {past['latency']} | "
                f"ErrorRate: {past['error_rate']} | Analysis: {past['analysis'][:60]}..."
            )
    return f"Found {len(similar)} similar incidents ({'; '.join(similar)})."
# --- MAIN PROCESS ---
def _extract_generated_text(body) -> str:
    """Return the ``generated_text`` field of a decoded inference response.

    Accepts a JSON object, or a list whose first element is such an object;
    any other shape (or a non-string field) yields the default message.
    """
    if isinstance(body, list) and body:
        body = body[0]
    if isinstance(body, dict):
        text = body.get("generated_text")
        if isinstance(text, str):
            return text
    return "No analysis output."


def _request_analysis(component, latency, error_rate) -> str:
    """Query the hosted model for an analysis; never raises on request failure.

    Returns the generated text on success, otherwise a string describing the
    HTTP error or the exception that occurred.
    """
    payload = {
        "inputs": f"Component {component} showing latency {latency} and error rate {error_rate}.",
    }
    token = HF_TOKEN.strip()
    # Only send credentials when a token is actually configured.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        # The timeout keeps a stalled endpoint from blocking the UI handler forever.
        response = requests.post(
            f"{API_URL}/facebook/bart-large-mnli",
            headers=headers,
            json=payload,
            timeout=30,
        )
        if response.status_code != 200:
            return f"Error {response.status_code}: {response.text}"
        return _extract_generated_text(response.json())
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a 200 response whose body is not valid JSON.
        return f"Error generating analysis: {exc}"


def process_event(component, latency, error_rate):
    """Process one telemetry event and refresh the recent-events table.

    Steps: classify the reading with ``detect_anomaly``, obtain an analysis
    string from the inference endpoint, pick a simulated healing action when
    the reading is anomalous, look up similar stored incidents, then record
    the event both in the recent-events list (trimmed to the last 20) and in
    the embedding memory.

    Args:
        component: Component name entered in the UI.
        latency: Latency value; may be None if the numeric field is empty.
        error_rate: Error-rate value; may be None if the numeric field is empty.

    Returns:
        A ``(status_message, DataFrame)`` tuple where the DataFrame holds the
        current recent events.
    """
    # An empty numeric field arrives as None and cannot be scored; report it
    # instead of failing inside the anomaly computation.
    if latency is None or error_rate is None:
        return "⚠️ Latency and error rate are required.", pd.DataFrame(recent_events)

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    anomaly = detect_anomaly(latency, error_rate)

    # --- Analysis step ---
    analysis = _request_analysis(component, latency, error_rate)

    status = "Anomaly" if anomaly else "Normal"
    healing_action = simulate_healing_action(component) if anomaly else "-"
    similar_info = find_similar_incidents(analysis)
    event = {
        "timestamp": timestamp,
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
        "status": status,
        "analysis": analysis,
        # healing_action is already "-" for normal events.
        "healing_action": f"{healing_action} {similar_info}",
    }
    recent_events.append(event)
    # Keep only the newest 20 entries, trimming the shared list in place.
    del recent_events[:-20]

    # --- Store vector memory ---
    incident_embeddings.append(embed_incident(analysis))
    incident_texts.append(event)
    return "✅ Event Processed", pd.DataFrame(recent_events)
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
    gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory")

    # Telemetry inputs, laid out side by side. The numeric fields are
    # pre-filled with random sample values chosen when the UI is built.
    with gr.Row():
        component = gr.Textbox(value="api-service", label="Component")
        latency = gr.Number(value=random.uniform(50, 200), label="Latency (ms)")
        error_rate = gr.Number(value=random.uniform(0.01, 0.2), label="Error Rate")

    submit = gr.Button("🚀 Submit Telemetry Event")

    # Outputs: a status line and the table of recent events.
    output_text = gr.Textbox(label="Detection Output")
    output_table = gr.Dataframe(
        label="Recent Events (Last 20)",
        headers=[
            "timestamp",
            "component",
            "latency",
            "error_rate",
            "status",
            "analysis",
            "healing_action",
        ],
    )

    # Clicking the button runs process_event on the three inputs and writes
    # its two return values to the outputs.
    submit.click(
        fn=process_event,
        inputs=[component, latency, error_rate],
        outputs=[output_text, output_table],
    )

demo.launch()
|