Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

petter2025 committed on Nov 8, 2025

Commit

414407c

verified ·

1 Parent(s): cd4a63c

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -47

app.py CHANGED Viewed

@@ -1,61 +1,75 @@
 import os
 import json
 import random
-import time
 import datetime
 import numpy as np
 import gradio as gr
 import requests
-from sentence_transformers import SentenceTransformer
 import faiss
 # === Config ===
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
-if not HF_TOKEN:
-    print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
-else:
-    print("✅ Hugging Face token loaded successfully.")
 HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
 HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-# === FAISS Setup ===
 VECTOR_DIM = 384
 INDEX_FILE = "incident_vectors.index"
 TEXTS_FILE = "incident_texts.json"
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-if os.path.exists(INDEX_FILE):
-    index = faiss.read_index(INDEX_FILE)
-    with open(TEXTS_FILE, "r") as f:
-        incident_texts = json.load(f)
-else:
-    index = faiss.IndexFlatL2(VECTOR_DIM)
-    incident_texts = []
 def save_index():
-    faiss.write_index(index, INDEX_FILE)
-    with open(TEXTS_FILE, "w") as f:
-        json.dump(incident_texts, f)
 # === Event Memory ===
 events = []
 def detect_anomaly(event):
-    """Adaptive threshold-based anomaly detection."""
     latency = event["latency"]
     error_rate = event["error_rate"]
-    # Force random anomaly occasionally for testing
     if random.random() < 0.25:
         return True
     return latency > 150 or error_rate > 0.05
 def call_huggingface_analysis(prompt):
-    """Use HF Inference API or fallback simulation."""
     if not HF_TOKEN:
-        return "Offline mode: simulated analysis."
     try:
         payload = {
@@ -67,11 +81,13 @@ def call_huggingface_analysis(prompt):
         response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
         if response.status_code == 200:
             result = response.json()
-            return result.get("choices", [{}])[0].get("text", "").strip()
         else:
-            return f"Error {response.status_code}: {response.text}"
     except Exception as e:
-        return f"Error generating analysis: {e}"
 def simulate_healing(event):
     actions = [
@@ -87,36 +103,30 @@ def analyze_event(component, latency, error_rate):
         "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "component": component,
         "latency": latency,
-        "error_rate": error_rate
     }
-    is_anomaly = detect_anomaly(event)
-    event["anomaly"] = is_anomaly
-    event["status"] = "Anomaly" if is_anomaly else "Normal"
-    # Build textual prompt
     prompt = (
         f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
         f"Status: {event['status']}\n\n"
-        "Provide a one-line reliability insight or root cause analysis."
     )
-    # Analysis
     analysis = call_huggingface_analysis(prompt)
     event["analysis"] = analysis
-    # Healing simulation
-    healing_action = simulate_healing(event)
-    event["healing_action"] = healing_action
-    # === Vector learning ===
-    vector_text = f"{component} {latency} {error_rate} {analysis}"
-    vec = model.encode([vector_text])
     index.add(np.array(vec, dtype=np.float32))
-    incident_texts.append(vector_text)
     save_index()
-    # Find similar incidents
     if len(incident_texts) > 1:
         D, I = index.search(vec, k=min(3, len(incident_texts)))
         similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
@@ -143,19 +153,21 @@ def submit_event(component, latency, error_rate):
         f"✅ Event Processed ({parsed['status']})",
         gr.Dataframe(
             headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
-            value=table
-        )
     )
 with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
-    gr.Markdown("## 🧠 Agentic Reliability Framework MVP\nAdaptive anomaly detection + AI-driven self-healing + vector memory (FAISS persistent)")
     with gr.Row():
         component = gr.Textbox(label="Component", value="api-service")
         latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
         error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
     submit = gr.Button("🚀 Submit Telemetry Event")
     output_text = gr.Textbox(label="Detection Output")
-    table_output = gr.Dataframe(headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"])
     submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
 import json
 import random
 import datetime
 import numpy as np
 import gradio as gr
 import requests
 import faiss
+from sentence_transformers import SentenceTransformer
+from filelock import FileLock
 # === Config ===
 # Hugging Face token taken from the environment; an unset variable yields ""
 # and surrounding whitespace is stripped.
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
 # Bearer-auth header for requests to HF_API_URL; empty dict when no token is set.
 HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+print("✅ Hugging Face token loaded." if HF_TOKEN else "⚠️ No HF token found, using local analysis mode.")
+# === Persistent FAISS Setup ===
 # Dimensionality passed to faiss.IndexFlatL2 when a fresh index is created.
 # NOTE(review): presumably must equal the embedding width of the model loaded
 # below (384 for all-MiniLM-L6-v2) — confirm before swapping models.
 VECTOR_DIM = 384
 # Relative paths of the persisted FAISS index and the JSON list of incident texts.
 INDEX_FILE = "incident_vectors.index"
 TEXTS_FILE = "incident_texts.json"
 # Path handed to FileLock while the two files above are being written.
+LOCK_FILE = "incident.lock"
 # Sentence-embedding model, instantiated once at import and reused by every encode() call.
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+def load_faiss_index():
+    """Return the persisted ``(index, texts)`` pair, or an empty pair.
+
+    The stored state is used only when BOTH ``INDEX_FILE`` and ``TEXTS_FILE``
+    exist; otherwise a new ``faiss.IndexFlatL2(VECTOR_DIM)`` and an empty
+    list are returned.
+    """
+    have_both = os.path.exists(INDEX_FILE) and os.path.exists(TEXTS_FILE)
+    if not have_both:
+        # Nothing (or only half of the pair) on disk: start from scratch.
+        return faiss.IndexFlatL2(VECTOR_DIM), []
+    stored_index = faiss.read_index(INDEX_FILE)
+    with open(TEXTS_FILE, "r") as fh:
+        stored_texts = json.load(fh)
+    return stored_index, stored_texts
+index, incident_texts = load_faiss_index()
 # Write the module-level `index` to INDEX_FILE and `incident_texts` to
 # TEXTS_FILE. Takes no arguments and returns None.
 def save_index():
+    """Persist FAISS + metadata safely."""
     # Both writes happen while holding the lock on LOCK_FILE.
     # NOTE(review): load_faiss_index() reads the same files without taking this
     # lock, and the two writes are not atomic — an interruption between them
     # could leave the index and the text list out of step. Confirm acceptable.
+    with FileLock(LOCK_FILE):
+        faiss.write_index(index, INDEX_FILE)
+        with open(TEXTS_FILE, "w") as f:
+            json.dump(incident_texts, f)
 # === Event Memory ===
 events = []
+# === Core Logic ===
 # Decide whether a telemetry event counts as an anomaly.
 # `event` is indexed with the keys "latency" and "error_rate"; returns a bool.
 def detect_anomaly(event):
     latency = event["latency"]
     error_rate = event["error_rate"]
+    # Occasional forced anomaly for testing
     # About one call in four reports an anomaly regardless of the readings, so
     # the result is non-deterministic for values inside the limits below.
     if random.random() < 0.25:
         return True
     # Otherwise flag the event when either reading exceeds its fixed limit
     # (latency above 150, error rate above 0.05).
     return latency > 150 or error_rate > 0.05
+def local_reliability_analysis(prompt: str):
+    """Record *prompt* in the vector memory and report how many stored incidents resemble it.
+
+    Serves as the fallback used by ``call_huggingface_analysis`` when no token
+    is configured or the remote call fails. The prompt is embedded with the
+    shared ``model``, compared against the incidents already held in ``index``
+    (at most three nearest neighbours), then appended to ``index`` and
+    ``incident_texts`` and persisted through ``save_index()``.
+
+    Args:
+        prompt: Text describing the event to analyse.
+
+    Returns:
+        A one-line ``"Local insight: ..."`` string: the neighbour count when
+        earlier incidents exist, otherwise a note that the first incident was
+        stored.
+    """
+    # Convert once; FAISS works on float32 and the same array serves search and add.
+    query = np.asarray(model.encode([prompt]), dtype=np.float32)
+    # Look up neighbours BEFORE inserting the new vector; searching afterwards
+    # makes the prompt match itself at distance 0 and inflates the count.
+    # Bound by both containers in case the index and the text list disagree.
+    known = min(index.ntotal, len(incident_texts))
+    similar = []
+    if known:
+        _, labels = index.search(query, k=min(3, known))
+        # FAISS fills unused result slots with -1; without the lower bound a
+        # -1 would silently select the last stored text.
+        similar = [incident_texts[i] for i in labels[0] if 0 <= i < len(incident_texts)]
+    index.add(query)
+    incident_texts.append(prompt)
+    save_index()
+    if known:
+        return f"Local insight: {len(similar)} similar reliability events detected."
+    return "Local insight: Initial incident stored."
 def call_huggingface_analysis(prompt):
+    """Hybrid HF/local analysis with graceful fallback."""
     if not HF_TOKEN:
+        return local_reliability_analysis(prompt)
     try:
         payload = {
         response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
         if response.status_code == 200:
             result = response.json()
+            return result.get("choices", [{}])[0].get("text", "").strip() or local_reliability_analysis(prompt)
         else:
+            print(f"⚠️ HF router error {response.status_code}: {response.text[:80]}...")
+            return local_reliability_analysis(prompt)
     except Exception as e:
+        print(f"⚠️ HF inference error: {e}")
+        return local_reliability_analysis(prompt)
 def simulate_healing(event):
     actions = [
         "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "component": component,
         "latency": latency,
+        "error_rate": error_rate,
     }
+    event["anomaly"] = detect_anomaly(event)
+    event["status"] = "Anomaly" if event["anomaly"] else "Normal"
     prompt = (
         f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
         f"Status: {event['status']}\n\n"
+        "Provide a short reliability insight or root cause."
     )
     analysis = call_huggingface_analysis(prompt)
     event["analysis"] = analysis
+    event["healing_action"] = simulate_healing(event)
+    # Vector memory persistence
+    vec_text = f"{component} {latency} {error_rate} {analysis}"
+    vec = model.encode([vec_text])
     index.add(np.array(vec, dtype=np.float32))
+    incident_texts.append(vec_text)
     save_index()
+    # Retrieve similar
     if len(incident_texts) > 1:
         D, I = index.search(vec, k=min(3, len(incident_texts)))
         similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
         f"✅ Event Processed ({parsed['status']})",
         gr.Dataframe(
             headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
+            value=table,
+        ),
     )
 # Gradio UI: one row of telemetry inputs, a submit button, and two outputs
 # (a status textbox and a table of processed events).
 with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
+    gr.Markdown("## 🧠 Agentic Reliability Framework MVP\nAdaptive anomaly detection + AI-driven self-healing + persistent FAISS memory.")
     with gr.Row():
         component = gr.Textbox(label="Component", value="api-service")
         # Slider bounds: latency 10-400 in steps of 1, error rate 0-0.2 in steps of 0.001.
         latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
         error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
     submit = gr.Button("🚀 Submit Telemetry Event")
     output_text = gr.Textbox(label="Detection Output")
+    table_output = gr.Dataframe(
+        headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"]
+    )
     # Clicking the button passes the three inputs to submit_event and routes its
     # two return values to the textbox and the table, in that order.
     submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])
 # Serve on all network interfaces at port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)