File size: 6,248 Bytes
ba59239
 
e94f0ea
 
5c55cb5
e94f0ea
ba59239
0b2d10e
414407c
 
82009c8
e94f0ea
ba59239
e94f0ea
 
 
414407c
 
 
e94f0ea
 
 
414407c
e94f0ea
 
414407c
 
 
 
 
 
 
 
 
 
e94f0ea
 
414407c
 
 
 
 
e94f0ea
 
 
ba59239
414407c
ba59239
e94f0ea
 
414407c
e94f0ea
ba59239
e94f0ea
ba59239
414407c
 
 
 
 
 
 
 
 
 
 
 
 
e94f0ea
414407c
ba59239
414407c
82009c8
ba59239
e94f0ea
 
 
 
 
 
 
ba59239
 
414407c
ba59239
414407c
 
ba59239
414407c
 
82009c8
e94f0ea
 
9fa5ff3
e94f0ea
9fa5ff3
e94f0ea
82009c8
e94f0ea
ba59239
e94f0ea
5c55cb5
e94f0ea
9fa5ff3
82009c8
414407c
5c55cb5
0b2d10e
414407c
 
e94f0ea
 
 
 
414407c
e94f0ea
d97b7c8
e94f0ea
ba59239
414407c
e94f0ea
414407c
 
 
e94f0ea
414407c
e94f0ea
 
414407c
e94f0ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414407c
 
e94f0ea
 
 
414407c
e94f0ea
 
 
 
 
 
414407c
 
 
e94f0ea
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import json
import random
import datetime
import numpy as np
import gradio as gr
import requests
import faiss
from sentence_transformers import SentenceTransformer
from filelock import FileLock

# === Config ===
# HF_TOKEN is optional: without it the app runs in local-analysis mode (see
# call_huggingface_analysis). Empty string when the env var is unset.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
# Auth header only when a token exists; otherwise requests are never sent.
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

print("✅ Hugging Face token loaded." if HF_TOKEN else "⚠️ No HF token found, using local analysis mode.")

# === Persistent FAISS Setup ===
# 384 is the embedding size of all-MiniLM-L6-v2; must match the model below.
VECTOR_DIM = 384
INDEX_FILE = "incident_vectors.index"
TEXTS_FILE = "incident_texts.json"
LOCK_FILE = "incident.lock"
# Loaded once at import time; encoding is reused by all handlers.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def load_faiss_index():
    """Load the persisted FAISS index and incident texts, or start empty.

    Returns:
        tuple: (faiss index, list of stored incident text strings). A fresh
        IndexFlatL2 and empty list are returned when either file is missing.
    """
    have_persisted = os.path.exists(INDEX_FILE) and os.path.exists(TEXTS_FILE)
    if not have_persisted:
        return faiss.IndexFlatL2(VECTOR_DIM), []
    loaded_index = faiss.read_index(INDEX_FILE)
    with open(TEXTS_FILE, "r") as fh:
        stored_texts = json.load(fh)
    return loaded_index, stored_texts

index, incident_texts = load_faiss_index()

def save_index():
    """Persist the FAISS index and incident texts to disk.

    A file lock serializes writers so concurrent Gradio handlers cannot
    interleave writes. The JSON metadata is written to a temp file and
    atomically swapped in with os.replace, so a crash mid-write can no
    longer leave a truncated TEXTS_FILE that json.load would reject on
    the next startup.
    """
    with FileLock(LOCK_FILE):
        faiss.write_index(index, INDEX_FILE)
        tmp_path = TEXTS_FILE + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(incident_texts, f)
        os.replace(tmp_path, TEXTS_FILE)  # atomic on POSIX and Windows

# === Event Memory ===
# In-process history of processed events; the UI table shows the last 20.
events = []

# === Core Logic ===
def detect_anomaly(event):
    """Decide whether a telemetry event is anomalous.

    Roughly a quarter of events are randomly flagged regardless of their
    metrics (demo noise so the UI always has anomalies to show); otherwise
    an anomaly means latency above 150 ms or an error rate above 5%.
    """
    latency_ms = event["latency"]
    err_rate = event["error_rate"]
    # Occasionally force an anomaly so the demo stays interesting.
    if random.random() < 0.25:
        return True
    return latency_ms > 150 or err_rate > 0.05

def local_reliability_analysis(prompt: str):
    """Local semantic fallback: index the prompt and count similar incidents.

    Encodes *prompt*, appends it to the global FAISS index and text store,
    persists both, then reports how many nearest stored incidents were found.
    NOTE(review): the freshly added prompt is itself part of the index, so it
    appears in its own neighbor set — presumably intentional for the demo.
    """
    query = np.array(model.encode([prompt]), dtype=np.float32)
    index.add(query)
    incident_texts.append(prompt)
    save_index()
    if len(incident_texts) <= 1:
        return "Local insight: Initial incident stored."
    top_k = min(3, len(incident_texts))
    _, neighbor_ids = index.search(query, k=top_k)
    matches = [incident_texts[i] for i in neighbor_ids[0] if i < len(incident_texts)]
    return f"Local insight: {len(matches)} similar reliability events detected."

def call_huggingface_analysis(prompt):
    """Analyze *prompt* via the HF router, falling back to local analysis.

    Falls back to local_reliability_analysis when no token is configured,
    on non-200 responses, on empty completion text, and on any request or
    parsing exception.
    """
    if not HF_TOKEN:
        return local_reliability_analysis(prompt)

    payload = {
        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "prompt": prompt,
        "max_tokens": 200,
        "temperature": 0.3,
    }
    try:
        response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ HF router error {response.status_code}: {response.text[:80]}...")
            return local_reliability_analysis(prompt)
        completion = response.json().get("choices", [{}])[0].get("text", "").strip()
        # Empty completion text also falls through to the local analyzer.
        return completion or local_reliability_analysis(prompt)
    except Exception as e:
        print(f"⚠️ HF inference error: {e}")
        return local_reliability_analysis(prompt)

def simulate_healing(event):
    """Pick a random remediation action for *event* (simulation only)."""
    candidate_actions = (
        "Restarted container",
        "Scaled up instance",
        "Cleared queue backlog",
        "No actionable step detected.",
    )
    return random.choice(candidate_actions)

def analyze_event(component, latency, error_rate):
    """Process one telemetry event end to end.

    Builds the event record, runs anomaly detection, gets an AI/local
    analysis, simulates a healing action, persists the incident into the
    shared FAISS memory, annotates the healing action with similar past
    incidents, and returns the event as pretty-printed JSON.
    """
    event = {
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }

    is_anomaly = detect_anomaly(event)
    event["anomaly"] = is_anomaly
    event["status"] = "Anomaly" if is_anomaly else "Normal"

    prompt = (
        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
        f"Status: {event['status']}\n\n"
        "Provide a short reliability insight or root cause."
    )

    analysis = call_huggingface_analysis(prompt)
    event["analysis"] = analysis
    event["healing_action"] = simulate_healing(event)

    # Persist this incident into the shared vector memory.
    vec_text = f"{component} {latency} {error_rate} {analysis}"
    vec = model.encode([vec_text])
    index.add(np.array(vec, dtype=np.float32))
    incident_texts.append(vec_text)
    save_index()

    # Annotate with similar past incidents (the new one is in the index too).
    if len(incident_texts) <= 1:
        event["healing_action"] += " - Not enough incidents stored yet."
    else:
        _, neighbor_ids = index.search(vec, k=min(3, len(incident_texts)))
        similar = [incident_texts[i] for i in neighbor_ids[0] if i < len(incident_texts)]
        if similar:
            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:120]}...)."

    events.append(event)
    return json.dumps(event, indent=2)

# === UI ===
def submit_event(component, latency, error_rate):
    """Gradio callback: process one telemetry event and refresh the table."""
    parsed = json.loads(analyze_event(component, latency, error_rate))

    # Last 20 events, newest last, one row per event.
    recent_rows = []
    for ev in events[-20:]:
        recent_rows.append([
            ev["timestamp"], ev["component"], ev["latency"], ev["error_rate"],
            ev["status"], ev["analysis"], ev["healing_action"],
        ])

    status_msg = f"✅ Event Processed ({parsed['status']})"
    refreshed_table = gr.Dataframe(
        headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
        value=recent_rows,
    )
    return (status_msg, refreshed_table)

# Gradio UI: three inputs, a submit button, a status textbox, and the
# rolling incident table wired to submit_event above.
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    gr.Markdown("## 🧠 Agentic Reliability Framework MVP\nAdaptive anomaly detection + AI-driven self-healing + persistent FAISS memory.")
    with gr.Row():
        component = gr.Textbox(label="Component", value="api-service")
        latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
        error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
    submit = gr.Button("🚀 Submit Telemetry Event")
    output_text = gr.Textbox(label="Detection Output")
    table_output = gr.Dataframe(
        headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"]
    )
    submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])

# 0.0.0.0 so the app is reachable inside a container (e.g. HF Spaces).
demo.launch(server_name="0.0.0.0", server_port=7860)