Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Community

petter2025 committed on Nov 7, 2025

Commit

e94f0ea

verified ·

1 Parent(s): 0b2d10e

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -168

app.py CHANGED Viewed

@@ -1,205 +1,161 @@
 import os
-import random
 import json
 import time
 import numpy as np
-import pandas as pd
 import requests
-from datetime import datetime
-from sklearn.metrics.pairwise import cosine_similarity
 from sentence_transformers import SentenceTransformer
 import faiss
-import gradio as gr
-# ============================
-# SAFE TOKEN LOAD
-# ============================
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
-if not HF_TOKEN and os.path.exists(".env"):
-    try:
-        with open(".env", "r") as f:
-            HF_TOKEN = f.read().strip()
-    except Exception:
-        HF_TOKEN = ""
-if HF_TOKEN:
-    print("✅ Hugging Face token loaded successfully.")
-else:
     print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
-# ============================
-# CONFIG
-# ============================
-HF_API_URL = "https://router.huggingface.co/hf-inference"
-headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-DATA_DIR = "./data"
-os.makedirs(DATA_DIR, exist_ok=True)
-# ============================
-# MODEL + FAISS SETUP
-# ============================
-model = SentenceTransformer("all-MiniLM-L6-v2")
-VECTOR_DIM = model.get_sentence_embedding_dimension()
-FAISS_PATH = os.path.join(DATA_DIR, "incident_memory.faiss")
-META_PATH = os.path.join(DATA_DIR, "incidents.json")
-# Load or initialize FAISS index
-if os.path.exists(FAISS_PATH):
-    try:
-        index = faiss.read_index(FAISS_PATH)
-        with open(META_PATH, "r") as f:
-            incident_memory = json.load(f)
-        print(f"✅ Loaded {len(incident_memory)} past incidents from FAISS.")
-    except Exception:
-        print("⚠️ Failed to load FAISS index. Starting fresh.")
-        index = faiss.IndexFlatL2(VECTOR_DIM)
-        incident_memory = []
 else:
     index = faiss.IndexFlatL2(VECTOR_DIM)
-    incident_memory = []
-# ============================
-# ANOMALY DETECTION
-# ============================
 def detect_anomaly(event):
-    """Detects anomalies based on latency/error_rate thresholds, with forced random noise."""
-    force_anomaly = random.random() < 0.25
-    if force_anomaly or event["latency"] > 150 or event["error_rate"] > 0.05:
         return True
-    return False
-# ============================
-# AI ANALYSIS + HEALING
-# ============================
-def analyze_event(event):
-    prompt = (
-        f"Analyze this telemetry event and suggest a healing action:\n"
-        f"Component: {event['component']}\n"
-        f"Latency: {event['latency']}\n"
-        f"Error Rate: {event['error_rate']}\n"
-        f"Detected Anomaly: {event['anomaly']}\n"
-    )
     if not HF_TOKEN:
-        return "Local mode: analysis unavailable (no token).", "No action taken."
     try:
-        response = requests.post(
-            f"{HF_API_URL}/mistralai/Mixtral-8x7B-Instruct-v0.1",
-            headers=headers,
-            json={"inputs": prompt},
-            timeout=10,
-        )
         if response.status_code == 200:
             result = response.json()
-            text = (
-                result[0]["generated_text"]
-                if isinstance(result, list) and "generated_text" in result[0]
-                else str(result)
-            )
-            return text, choose_healing_action(event, text)
         else:
-            return f"Error {response.status_code}: {response.text}", "No actionable step detected."
     except Exception as e:
-        return f"Error generating analysis: {e}", "No actionable step detected."
-# ============================
-# HEALING SIMULATION
-# ============================
-def choose_healing_action(event, analysis_text):
-    possible_actions = [
         "Restarted container",
-        "Scaled service replicas",
         "Cleared queue backlog",
-        "Invalidated cache",
-        "Re-deployed model endpoint",
     ]
-    if "restart" in analysis_text.lower():
-        return "Restarted container"
-    elif "scale" in analysis_text.lower():
-        return "Scaled service replicas"
-    elif "cache" in analysis_text.lower():
-        return "Invalidated cache"
-    return random.choice(possible_actions)
-# ============================
-# VECTOR SIMILARITY + FAISS PERSISTENCE
-# ============================
-def record_and_search_similar(event, analysis_text):
-    """Store each event vector in FAISS and search for similar incidents."""
-    description = (
-        f"Component: {event['component']} | "
-        f"Latency: {event['latency']} | "
-        f"ErrorRate: {event['error_rate']} | "
-        f"Analysis: {analysis_text}"
-    )
-    embedding = model.encode(description).astype("float32").reshape(1, -1)
-    similar_info = ""
-    if len(incident_memory) > 0 and index.ntotal > 0:
-        k = min(3, len(incident_memory))
-        D, I = index.search(embedding, k)
-        similar = [incident_memory[i]["description"] for i in I[0] if D[0][0] < 0.5]
-        if similar:
-            similar_info = f"Found {len(similar)} similar incidents (e.g., {similar[0][:120]}...)."
-    # Store new entry
-    incident_memory.append({"description": description})
-    index.add(embedding)
-    # Persist FAISS + metadata
-    faiss.write_index(index, FAISS_PATH)
-    with open(META_PATH, "w") as f:
-        json.dump(incident_memory, f)
-    return similar_info
-# ============================
-# EVENT HANDLER
-# ============================
-event_log = []
-def process_event(component, latency, error_rate):
     event = {
-        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "component": component,
         "latency": latency,
-        "error_rate": error_rate,
     }
-    event["anomaly"] = detect_anomaly(event)
-    status = "Anomaly" if event["anomaly"] else "Normal"
-    analysis, healing = analyze_event(event)
-    similar = record_and_search_similar(event, analysis)
-    healing = f"{healing} {similar}".strip()
-    event["status"] = status
     event["analysis"] = analysis
-    event["healing_action"] = healing
-    event_log.append(event)
-    df = pd.DataFrame(event_log[-20:])
-    return f"✅ Event Processed ({status})", df
-# ============================
-# GRADIO UI
-# ============================
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🧠 Agentic Reliability Framework MVP")
-    gr.Markdown("Adaptive anomaly detection + AI-driven self-healing + vector memory (FAISS persistent)")
-    component = gr.Textbox(label="Component", value="api-service")
-    latency = gr.Slider(10, 400, value=100, label="Latency (ms)")
-    error_rate = gr.Slider(0.0, 0.2, value=0.02, label="Error Rate")
-    submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
-    output = gr.Textbox(label="Detection Output")
-    table = gr.Dataframe(label="Recent Events (Last 20)")
-    submit.click(process_event, [component, latency, error_rate], [output, table])
-# ============================
-# ENTRY POINT
-# ============================
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
 import json
+import random
 import time
+import datetime
 import numpy as np
+import gradio as gr
 import requests
 from sentence_transformers import SentenceTransformer
 import faiss
+# === Config ===
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
+if not HF_TOKEN:
     print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
+else:
+    print("✅ Hugging Face token loaded successfully.")
+HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
+HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+# === FAISS Setup ===
+VECTOR_DIM = 384
+INDEX_FILE = "incident_vectors.index"
+TEXTS_FILE = "incident_texts.json"
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+if os.path.exists(INDEX_FILE):
+    index = faiss.read_index(INDEX_FILE)
+    with open(TEXTS_FILE, "r") as f:
+        incident_texts = json.load(f)
 else:
     index = faiss.IndexFlatL2(VECTOR_DIM)
+    incident_texts = []
+def save_index():
+    faiss.write_index(index, INDEX_FILE)
+    with open(TEXTS_FILE, "w") as f:
+        json.dump(incident_texts, f)
+# === Event Memory ===
+events = []
 def detect_anomaly(event):
+    """Adaptive threshold-based anomaly detection."""
+    latency = event["latency"]
+    error_rate = event["error_rate"]
+    # Force random anomaly occasionally for testing
+    if random.random() < 0.25:
         return True
+    return latency > 150 or error_rate > 0.05
+def call_huggingface_analysis(prompt):
+    """Use HF Inference API or fallback simulation."""
     if not HF_TOKEN:
+        return "Offline mode: simulated analysis."
     try:
+        payload = {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "prompt": prompt,
+            "max_tokens": 200,
+            "temperature": 0.3,
+        }
+        response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
         if response.status_code == 200:
             result = response.json()
+            return result.get("choices", [{}])[0].get("text", "").strip()
         else:
+            return f"Error {response.status_code}: {response.text}"
     except Exception as e:
+        return f"Error generating analysis: {e}"
+def simulate_healing(event):
+    actions = [
         "Restarted container",
+        "Scaled up instance",
         "Cleared queue backlog",
+        "No actionable step detected."
     ]
+    return random.choice(actions)
+def analyze_event(component, latency, error_rate):
     event = {
+        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "component": component,
         "latency": latency,
+        "error_rate": error_rate
     }
+    is_anomaly = detect_anomaly(event)
+    event["anomaly"] = is_anomaly
+    event["status"] = "Anomaly" if is_anomaly else "Normal"
+    # Build textual prompt
+    prompt = (
+        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
+        f"Status: {event['status']}\n\n"
+        "Provide a one-line reliability insight or root cause analysis."
+    )
+    # Analysis
+    analysis = call_huggingface_analysis(prompt)
     event["analysis"] = analysis
+    # Healing simulation
+    healing_action = simulate_healing(event)
+    event["healing_action"] = healing_action
+    # === Vector learning ===
+    vector_text = f"{component} {latency} {error_rate} {analysis}"
+    vec = model.encode([vector_text])
+    index.add(np.array(vec, dtype=np.float32))
+    incident_texts.append(vector_text)
+    save_index()
+    # Find similar incidents
+    if len(incident_texts) > 1:
+        D, I = index.search(vec, k=min(3, len(incident_texts)))
+        similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
+        if similar:
+            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:120]}...)."
+    else:
+        event["healing_action"] += " - Not enough incidents stored yet."
+    events.append(event)
+    return json.dumps(event, indent=2)
+# === UI ===
+def submit_event(component, latency, error_rate):
+    result = analyze_event(component, latency, error_rate)
+    parsed = json.loads(result)
+    table = [
+        [e["timestamp"], e["component"], e["latency"], e["error_rate"],
+         e["status"], e["analysis"], e["healing_action"]]
+        for e in events[-20:]
+    ]
+    return (
+        f"✅ Event Processed ({parsed['status']})",
+        gr.Dataframe(
+            headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
+            value=table
+        )
+    )
+with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
+    gr.Markdown("## 🧠 Agentic Reliability Framework MVP\nAdaptive anomaly detection + AI-driven self-healing + vector memory (FAISS persistent)")
+    with gr.Row():
+        component = gr.Textbox(label="Component", value="api-service")
+        latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
+        error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
+    submit = gr.Button("🚀 Submit Telemetry Event")
+    output_text = gr.Textbox(label="Detection Output")
+    table_output = gr.Dataframe(headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"])
+    submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
+demo.launch(server_name="0.0.0.0", server_port=7860)