petter2025's picture
Update app.py
82009c8 verified
raw
history blame
4.31 kB
import gradio as gr
import time
import random
import requests
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
# --- CONFIG ---
# Load the Hugging Face API token from a local ".env" file, whose entire
# (stripped) content is treated as the token. A missing or unreadable file is
# not an error: the token is simply left empty and a warning is printed.
try:
    with open(".env", "r", encoding="utf-8") as env_file:
        HF_TOKEN = env_file.read().strip()
except OSError:
    HF_TOKEN = ""

if not HF_TOKEN:
    print("⚠️ No Hugging Face token found. Running in fallback mode (local inference).")

# Base URL of the Hugging Face inference router; the model id is appended to
# it when a request is made.
API_URL = "https://router.huggingface.co/hf-inference"
# Sentence-embedding model used to turn incident analyses into vectors.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Process-wide, in-memory state (lost on restart):
#   incident_embeddings - one embedding array per processed event
#   incident_texts      - the event dicts, index-aligned with the embeddings
#   recent_events       - the most recent events shown in the UI table
incident_embeddings, incident_texts, recent_events = [], [], []
# --- SIMULATION HELPERS ---
def simulate_healing_action(component: str) -> str:
    """Return a randomly chosen, simulated remediation step.

    Args:
        component: Name of the affected component. It is accepted but not
            consulted; the choice is uniform over a fixed set of steps.

    Returns:
        One of the four predefined action descriptions.
    """
    candidate_steps = (
        "Restarted container",
        "Cleared queue backlog",
        "Rebalanced load",
        "No actionable step detected.",
    )
    chosen_step = random.choice(candidate_steps)
    return chosen_step
def detect_anomaly(latency: float, error_rate: float) -> bool:
    """Decide whether a telemetry sample counts as an anomaly.

    The sample's score is ``latency * error_rate``. It is compared against a
    cutoff drawn afresh on every call from ``random.uniform(5, 25)``, so
    scores inside that band may be classified differently between calls.

    Args:
        latency: Observed latency value.
        error_rate: Observed error rate.

    Returns:
        True when the score is strictly greater than the drawn cutoff.
    """
    cutoff = random.uniform(5, 25)
    return latency * error_rate > cutoff
def embed_incident(text: str):
    """Embed ``text`` with the module-level sentence-transformer model.

    The text is encoded as a one-item batch with normalised embeddings, and
    the result is returned as a float32 NumPy array.
    """
    batch = [text]
    vectors = model.encode(batch, normalize_embeddings=True)
    as_array = np.array(vectors)
    return as_array.astype("float32")
def find_similar_incidents(new_text: str, top_k=3):
    """Describe the stored incidents most similar to ``new_text``.

    A fresh inner-product FAISS index is built over every stored embedding
    and queried with the embedding of ``new_text``.

    Args:
        new_text: Text to compare against the stored incident analyses.
        top_k: Maximum number of neighbours to report.

    Returns:
        A human-readable summary string, or a fixed notice when no incidents
        have been stored yet.
    """
    if not incident_embeddings:
        return "Not enough incidents stored yet."

    query = embed_incident(new_text)
    stored = np.vstack(incident_embeddings)

    index = faiss.IndexFlatIP(query.shape[1])
    index.add(stored)

    # Never request more neighbours than there are stored vectors: FAISS pads
    # the missing slots with id -1, which would otherwise be read as "the last
    # incident" through Python's negative indexing.
    k = min(int(top_k), stored.shape[0], len(incident_texts))
    if k < 1:
        return "Found 0 similar incidents ()."

    _scores, ids = index.search(query, k)

    similar = []
    for raw_id in ids[0]:
        i = int(raw_id)
        # Guard against padded (-1) or otherwise out-of-range ids.
        if not 0 <= i < len(incident_texts):
            continue
        incident = incident_texts[i]
        similar.append(
            f"Component: {incident['component']} | Latency: {incident['latency']} | "
            f"ErrorRate: {incident['error_rate']} | Analysis: {incident['analysis'][:60]}..."
        )
    return f"Found {len(similar)} similar incidents ({'; '.join(similar)})."
# --- MAIN PROCESS ---
def process_event(component, latency, error_rate):
    """Process one telemetry event and return the refreshed event table.

    Steps: classify the sample with ``detect_anomaly``, ask the remote
    inference endpoint for an analysis text, look up similar past incidents,
    record the event in ``recent_events`` (capped at 20 entries) and store its
    embedding in the in-memory vector memory.

    Args:
        component: Name of the component that emitted the telemetry.
        latency: Observed latency; ``None`` (an empty input field) is rejected.
        error_rate: Observed error rate; ``None`` is rejected.

    Returns:
        A ``(status message, pandas.DataFrame of recent events)`` tuple.
    """
    # Empty numeric fields arrive as None and cannot be scored.
    if latency is None or error_rate is None:
        return "⚠️ Latency and error rate are required.", pd.DataFrame(recent_events)

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    anomaly = detect_anomaly(latency, error_rate)

    # --- Analysis step ---
    payload = {
        "inputs": f"Component {component} showing latency {latency} and error rate {error_rate}.",
    }
    # Only send an Authorization header when a token is actually configured;
    # "Bearer " followed by nothing is a malformed credential.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    request_timeout_s = 30  # keep the UI callback from hanging on a stalled request
    try:
        response = requests.post(
            f"{API_URL}/facebook/bart-large-mnli",
            headers=headers,
            json=payload,
            timeout=request_timeout_s,
        )
        if response.status_code == 200:
            body = response.json()
            # The endpoint may answer with a list of result objects.
            if isinstance(body, list) and body:
                body = body[0]
            if isinstance(body, dict):
                analysis = str(body.get("generated_text", "No analysis output."))
            else:
                analysis = "No analysis output."
        else:
            analysis = f"Error {response.status_code}: {response.text}"
    except (requests.RequestException, ValueError) as e:
        # Network failures and undecodable JSON are reported in the analysis
        # field instead of aborting the event.
        analysis = f"Error generating analysis: {str(e)}"

    status = "Anomaly" if anomaly else "Normal"
    healing_action = simulate_healing_action(component) if anomaly else "-"
    # Looked up before this event is stored so it cannot match itself.
    similar_info = find_similar_incidents(analysis)

    event = {
        "timestamp": timestamp,
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
        "status": status,
        "analysis": analysis,
        "healing_action": f"{healing_action} {similar_info}",
    }
    recent_events.append(event)
    if len(recent_events) > 20:
        recent_events.pop(0)

    # --- Store vector memory ---
    incident_embeddings.append(embed_incident(analysis))
    incident_texts.append(event)

    return "✅ Event Processed", pd.DataFrame(recent_events)
# --- GRADIO UI ---
# Column order of the "recent events" table; matches the keys of the event
# dicts built in process_event.
EVENT_COLUMNS = [
    "timestamp",
    "component",
    "latency",
    "error_rate",
    "status",
    "analysis",
    "healing_action",
]

# Build the single-page UI: three telemetry inputs in a row, a submit button,
# a status line and the recent-events table.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
    gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory")

    with gr.Row():
        component = gr.Textbox(label="Component", value="api-service")
        # Default values are drawn once, when the UI is constructed.
        latency = gr.Number(label="Latency (ms)", value=random.uniform(50, 200))
        error_rate = gr.Number(label="Error Rate", value=random.uniform(0.01, 0.2))

    submit = gr.Button("🚀 Submit Telemetry Event")
    output_text = gr.Textbox(label="Detection Output")
    output_table = gr.Dataframe(
        headers=EVENT_COLUMNS,
        label="Recent Events (Last 20)",
    )

    # Wire the button to the event pipeline.
    telemetry_inputs = [component, latency, error_rate]
    result_outputs = [output_text, output_table]
    submit.click(process_event, inputs=telemetry_inputs, outputs=result_outputs)

# Start the web server as soon as the module is executed.
demo.launch()