File size: 6,738 Bytes
ba59239
 
e94f0ea
 
5c55cb5
e94f0ea
ba59239
0b2d10e
1eb0dc5
 
414407c
 
82009c8
e94f0ea
ba59239
1eb0dc5
 
e94f0ea
 
 
1eb0dc5
e94f0ea
 
 
1eb0dc5
6a3df22
e94f0ea
 
1eb0dc5
 
 
 
 
 
 
414407c
e94f0ea
1eb0dc5
e94f0ea
414407c
1eb0dc5
 
 
 
ba59239
6a3df22
1eb0dc5
 
 
ba59239
1eb0dc5
e94f0ea
 
1eb0dc5
 
e94f0ea
ba59239
1eb0dc5
e94f0ea
ba59239
6a3df22
1eb0dc5
 
ba59239
1eb0dc5
82009c8
ba59239
e94f0ea
 
 
 
 
 
1eb0dc5
 
 
 
ba59239
1eb0dc5
ba59239
1eb0dc5
 
82009c8
e94f0ea
 
9fa5ff3
e94f0ea
9fa5ff3
1eb0dc5
82009c8
e94f0ea
ba59239
1eb0dc5
 
5c55cb5
e94f0ea
9fa5ff3
1eb0dc5
 
5c55cb5
1eb0dc5
 
 
 
e94f0ea
 
 
1eb0dc5
 
e94f0ea
d97b7c8
1eb0dc5
e94f0ea
ba59239
e94f0ea
1eb0dc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e94f0ea
 
6a3df22
e94f0ea
6a3df22
1eb0dc5
 
 
e94f0ea
6a3df22
1eb0dc5
 
 
 
 
 
 
 
 
 
6a3df22
 
1eb0dc5
 
 
 
 
 
 
 
6a3df22
 
 
 
 
 
 
1eb0dc5
6a3df22
1eb0dc5
 
e94f0ea
1eb0dc5
 
 
 
 
 
 
 
 
e94f0ea
 
1eb0dc5
e94f0ea
1eb0dc5
e94f0ea
1eb0dc5
 
 
 
 
 
 
 
 
 
 
e94f0ea
 
1eb0dc5
e94f0ea
1eb0dc5
 
 
 
e94f0ea
 
 
 
 
 
1eb0dc5
 
 
 
 
 
 
 
 
 
 
e94f0ea
 
6a3df22
 
1eb0dc5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import datetime
import json
import os
import random
import secrets

import faiss
import gradio as gr
import numpy as np
import requests
from fastapi import FastAPI, Body, Header, HTTPException
from filelock import FileLock
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

# === Config ===
# Secrets come from the environment; empty values degrade gracefully
# (empty HF_TOKEN -> offline analysis fallback, empty API_KEY -> auth disabled).
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
API_KEY = os.getenv("API_KEY", "").strip()

HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
# Auth header is only attached when a token is configured.
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

# === FAISS Setup ===
VECTOR_DIM = 384  # embedding width of all-MiniLM-L6-v2
INDEX_FILE = "incident_vectors.index"
TEXTS_FILE = "incident_texts.json"
LOCK_FILE = "faiss_save.lock"

# Sentence embedder used to vectorize incident summaries for the FAISS memory.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load persisted incident memory if present; otherwise start empty.
# NOTE(review): this assumes INDEX_FILE and TEXTS_FILE exist (or are absent)
# together — an index file without its texts file would raise here; confirm
# deployment always writes both (save_index does write them together).
if os.path.exists(INDEX_FILE):
    index = faiss.read_index(INDEX_FILE)
    with open(TEXTS_FILE, "r") as f:
        incident_texts = json.load(f)
else:
    index = faiss.IndexFlatL2(VECTOR_DIM)
    incident_texts = []

# === Safe persistence ===
def save_index():
    """Persist the FAISS index and its parallel incident-text list.

    Both files are written under a file lock so concurrent writers
    cannot interleave and corrupt the on-disk state.
    """
    lock = FileLock(LOCK_FILE)
    with lock:
        faiss.write_index(index, INDEX_FILE)
        with open(TEXTS_FILE, "w") as fh:
            json.dump(incident_texts, fh)


# === Core logic ===
# In-memory log of every processed event (most recent last); feeds the UI table.
events = []


def detect_anomaly(event, *, latency_threshold=150, error_threshold=0.05, test_flag_rate=0.25):
    """Threshold-based anomaly detection with a random test flag.

    Args:
        event: Telemetry dict with numeric "latency" (ms) and "error_rate" keys.
        latency_threshold: Latency (ms) above which the event is anomalous.
        error_threshold: Error rate above which the event is anomalous.
        test_flag_rate: Probability of flagging a random anomaly so the
            healing/analysis paths get exercised even on healthy telemetry.

    Returns:
        True when the event is considered anomalous, False otherwise.
    """
    latency = event["latency"]
    error_rate = event["error_rate"]

    # Random test flag is checked first (as in the original flow) so seeded
    # runs are reproducible.
    if random.random() < test_flag_rate:
        return True

    return latency > latency_threshold or error_rate > error_threshold


def call_huggingface_analysis(prompt):
    """Run the reliability-analysis prompt through the HF Inference API.

    Returns the completion text on success, a descriptive error string on
    HTTP failure or any exception, and a canned message when no HF_TOKEN
    is configured (offline mode). Never raises.
    """
    if not HF_TOKEN:
        return "Offline mode: simulated analysis."

    request_body = {
        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "prompt": prompt,
        "max_tokens": 200,
        "temperature": 0.3,
    }
    try:
        resp = requests.post(HF_API_URL, headers=HEADERS, json=request_body, timeout=10)
        if resp.status_code != 200:
            return f"Error {resp.status_code}: {resp.text}"
        choices = resp.json().get("choices", [{}])
        return choices[0].get("text", "").strip()
    except Exception as exc:
        return f"Error generating analysis: {exc}"


def simulate_healing(event):
    """Pretend to remediate *event* by picking one canned action at random.

    The event itself is currently unused; it is accepted so the signature
    can later incorporate event-aware remediation.
    """
    possible_actions = (
        "Restarted container",
        "Scaled up instance",
        "Cleared queue backlog",
        "No actionable step detected.",
    )
    return random.choice(possible_actions)


def analyze_event(component, latency, error_rate):
    """Process one telemetry event end-to-end.

    Builds the event record, runs anomaly detection, asks the LLM for a
    one-line analysis, simulates a healing action, stores the incident in
    the persistent FAISS memory, and annotates the healing action with
    similar past incidents.

    Args:
        component: Name of the component emitting the telemetry.
        latency: Observed latency in milliseconds.
        error_rate: Observed error rate (fraction, e.g. 0.05).

    Returns:
        The fully populated event dict (also appended to the global ``events``).
    """
    event = {
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }

    is_anomaly = detect_anomaly(event)
    event["anomaly"] = is_anomaly
    event["status"] = "Anomaly" if is_anomaly else "Normal"

    prompt = (
        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
        f"Status: {event['status']}\n\n"
        "Provide a one-line reliability insight or root cause analysis."
    )

    # AI reliability analysis (or offline fallback string).
    analysis = call_huggingface_analysis(prompt)
    event["analysis"] = analysis

    # Simulated self-healing.
    event["healing_action"] = simulate_healing(event)

    # === Vector learning & persistence ===
    vector_text = f"{component} {latency} {error_rate} {analysis}"
    # FAISS requires float32; convert once and reuse the same array for both
    # add() and search() (the original searched the unconverted encoding).
    vec = np.asarray(model.encode([vector_text]), dtype=np.float32)
    index.add(vec)
    incident_texts.append(vector_text)
    save_index()

    # Similar-incident lookup. The vector we just added would always be its
    # own nearest neighbor, so the current incident is excluded from matches
    # (the original reported the event itself as a "similar incident").
    current_idx = len(incident_texts) - 1
    if current_idx >= 1:
        # Fetch one extra neighbor so up to 3 genuine matches survive the
        # self-exclusion filter.
        D, I = index.search(vec, k=min(4, len(incident_texts)))
        similar = [incident_texts[i] for i in I[0] if 0 <= i < current_idx][:3]
        if similar:
            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar[0][:100]}...)."
    else:
        event["healing_action"] += " - Not enough incidents stored yet."

    events.append(event)
    return event


# === FastAPI backend ===
# REST entry point; secured endpoints check the X-API-Key header.
app = FastAPI(title="Agentic Reliability Framework API")


class AddEventModel(BaseModel):
    """Request body for POST /add-event: one telemetry sample."""

    # Name of the service/component emitting the telemetry.
    component: str
    # Observed latency in milliseconds.
    latency: float
    # Observed error rate — presumably a fraction in [0, 1]; confirm with callers.
    error_rate: float


def verify_api_key(provided_key: str) -> bool:
    """Check a client-supplied API key against the configured API_KEY.

    Returns True when no API_KEY is configured (development mode) or when
    the supplied key matches. Uses a constant-time comparison so an
    attacker cannot recover the key through response-timing differences.
    """
    if not API_KEY:
        return True  # dev mode: auth disabled when no key is configured
    if provided_key is None:
        # Missing X-API-Key header arrives as None; plain == handled this
        # implicitly, compare_digest needs an explicit guard.
        return False
    return secrets.compare_digest(provided_key, API_KEY)


@app.post("/add-event")
def add_event(
    payload: AddEventModel = Body(...),
    x_api_key: str = Header(None, alias="X-API-Key"),
):
    """Add a telemetry event (secured via API key).

    Returns ``{"status": "ok", "event": <event dict>}`` on success.
    Raises HTTP 401 on a missing/invalid key and HTTP 500 when event
    processing fails.
    """
    if not verify_api_key(x_api_key):
        raise HTTPException(status_code=401, detail="Unauthorized: invalid API key.")

    # Keep the try body minimal and chain the cause so server logs show the
    # original traceback (the original swallowed it with a bare raise).
    try:
        event = analyze_event(payload.component, payload.latency, payload.error_rate)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to add event: {e}") from e
    return {"status": "ok", "event": event}


# === Gradio Dashboard ===
def submit_event(component, latency, error_rate):
    """Gradio callback: analyze one event and refresh the recent-events table.

    Returns a one-line status string and a Dataframe of the 20 most recent
    events, column-for-column matching the dashboard's table headers.
    """
    event = analyze_event(component, latency, error_rate)

    # Single column list drives both row extraction and the table headers,
    # so they cannot drift apart.
    columns = [
        "timestamp",
        "component",
        "latency",
        "error_rate",
        "status",
        "analysis",
        "healing_action",
    ]
    rows = [[entry[col] for col in columns] for entry in events[-20:]]

    status_line = f"✅ Event Processed ({event['status']})"
    return status_line, gr.Dataframe(headers=columns, value=rows)


# Declarative dashboard layout; submit_event wires the button to the outputs.
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    gr.Markdown(
        "## 🧠 Agentic Reliability Framework MVP\n"
        "Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory"
    )
    # Telemetry input controls.
    with gr.Row():
        component = gr.Textbox(label="Component", value="api-service")
        latency = gr.Slider(10, 400, value=100, step=1, label="Latency (ms)")
        error_rate = gr.Slider(0, 0.2, value=0.02, step=0.001, label="Error Rate")
    submit = gr.Button("🚀 Submit Telemetry Event")
    # Outputs: one-line status plus a table of the most recent events.
    output_text = gr.Textbox(label="Detection Output")
    # Headers must stay in sync with the rows built in submit_event.
    table_output = gr.Dataframe(
        headers=[
            "timestamp",
            "component",
            "latency",
            "error_rate",
            "status",
            "analysis",
            "healing_action",
        ]
    )
    submit.click(fn=submit_event, inputs=[component, latency, error_rate], outputs=[output_text, table_output])


# Launch the Gradio dashboard when run directly.
# NOTE(review): the FastAPI `app` is defined above but never mounted or served
# here — confirm how /add-event is exposed in deployment (e.g. uvicorn or
# gr.mount_gradio_app).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)