| |
| import logging |
| import uuid |
| from datetime import datetime, timezone |
| from typing import Dict, Optional, List |
|
|
| from fastapi import FastAPI, HTTPException |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html |
| from fastapi.responses import RedirectResponse |
| from pydantic import BaseModel |
| import gradio as gr |
|
|
| |
| from agentic_reliability_framework.core.governance.risk_engine import RiskEngine |
| from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory |
| from agentic_reliability_framework.runtime.memory.constants import MemoryConstants |
|
|
| |
| from agentic_reliability_framework.core.governance.governance_loop import GovernanceLoop |
| from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine |
| from agentic_reliability_framework.core.governance.cost_estimator import CostEstimator |
| from agentic_reliability_framework.core.governance.intents import ( |
| DeployConfigurationIntent, |
| Environment, |
| InfrastructureIntent, |
| ) |
| from agentic_reliability_framework.core.governance.healing_intent import ( |
| HealingIntent, |
| HealingIntentSerializer, |
| RecommendedAction, |
| ) |
|
|
# Configure logging once at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Startup diagnostic: print the names of the submodules discovered inside the
# governance package so a broken/partial install is visible in the console.
import pkgutil
import agentic_reliability_framework.core.governance as governance

print(
    "Contents of governance module:",
    [module_info.name for module_info in pkgutil.iter_modules(governance.__path__)],
)
|
|
| |
# The FastAPI application that carries every REST endpoint in this module.
fastapi_app = FastAPI(title="ARF v4 API")

# CORS: cross-origin requests are accepted from the single listed frontend
# origin only, with any HTTP method and any request header.
_CORS_OPTIONS = {
    "allow_origins": ["https://arf-frontend-sandy.vercel.app"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
fastapi_app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
|
|
| |
# --- Process-wide singletons shared by the REST endpoints and the Gradio UI ---

# Risk scoring engine plus a FAISS-backed RAG graph memory.
risk_engine = RiskEngine()
faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
memory = RAGGraphMemory(faiss_index)

# Policy and cost components consumed by the governance loop.
policy_engine = PolicyEngine()
cost_estimator = CostEstimator()

# The governance loop ties policy, cost, risk and memory together.
_GOVERNANCE_COMPONENTS = {
    "policy_engine": policy_engine,
    "cost_estimator": cost_estimator,
    "risk_engine": risk_engine,
    "memory": memory,
}
governance_loop = GovernanceLoop(enable_epistemic=True, **_GOVERNANCE_COMPONENTS)

# In-memory log of demo decisions (one dict per decision, appended by the
# risk endpoints). It is never persisted and never truncated.
decision_history: List[dict] = []
|
|
| |
class EvaluateRequest(BaseModel):
    """Request body accepted by ``POST /v1/incidents/evaluate``."""

    # Name of the service the incident concerns.
    service_name: str
    # Free-form event type; forwarded into the intent provenance and context.
    event_type: str
    # Free-form severity label; forwarded into the intent provenance and context.
    severity: str
    # Optional numeric metrics keyed by metric name.
    metrics: Dict[str, float] = {}
|
|
class EvaluateResponse(BaseModel):
    """Risk-evaluation response schema.

    NOTE(review): no endpoint in this part of the file declares this model as
    its ``response_model`` - confirm whether it is still used elsewhere.
    """

    risk_score: float
    base_risk: float
    # Absent (None) when no memory-derived risk is supplied.
    memory_risk: Optional[float] = None
    weight: float
    similar_events: list = []
    confidence: float
|
|
| |
class _DemoIntent:
    """Fixed stand-in intent handed to ``risk_engine`` by the demo endpoints."""

    # Both the environment and the deployment target are the "dev" tier.
    environment = deployment_target = "dev"
    service_name = "demo"
|
|
| |
@fastapi_app.get("/")
async def root():
    """Root endpoint: return a short welcome message pointing at the docs."""
    welcome = {"message": "ARF v4 API. See /docs for documentation."}
    return welcome
|
|
@fastapi_app.get("/health")
async def health():
    """Liveness probe: return a static status and the API version string."""
    payload = {
        "status": "ok",
        "version": "4.0.0",
    }
    return payload
|
|
@fastapi_app.get("/v1/get_risk")
async def get_risk():
    """Return the current demo risk.

    Scores a fixed ``_DemoIntent`` with the shared ``risk_engine``, maps the
    score to a decision (``> 0.8`` -> ``"deny"``, ``> 0.2`` -> ``"escalate"``,
    otherwise ``"approve"``), appends a record to ``decision_history`` and
    returns the assessment.

    Returns:
        A dict with the keys ``system_risk``, ``status``, ``explanation``,
        ``contributions``, ``decision_id``, ``decision`` and ``timestamp``.
    """
    risk_value, explanation, contributions = risk_engine.calculate_risk(
        intent=_DemoIntent(),
        cost_estimate=None,
        policy_violations=[],
    )

    if risk_value > 0.8:
        decision = "deny"
    elif risk_value > 0.2:
        decision = "escalate"
    else:
        decision = "approve"

    decision_id = str(uuid.uuid4())
    risk_score = float(risk_value)
    # Capture the timestamp once so the stored history record and the
    # response describe the decision with the exact same instant.
    timestamp = datetime.now(timezone.utc).isoformat()

    decision_history.append({
        "decision_id": decision_id,
        "timestamp": timestamp,
        "risk_score": risk_score,
        "outcome": None,  # filled in later through the feedback endpoint
    })

    return {
        "system_risk": risk_score,
        "status": "critical" if risk_value > 0.8 else "normal",
        "explanation": explanation,
        "contributions": contributions,
        "decision_id": decision_id,
        "decision": decision,
        "timestamp": timestamp,
    }
|
|
@fastapi_app.get("/v1/history")
async def get_history():
    """Return the last 10 decisions (fewer if fewer have been recorded)."""
    recent_decisions = decision_history[-10:]
    return recent_decisions
|
|
@fastapi_app.post("/v1/incidents/evaluate")
async def evaluate_incident(request: EvaluateRequest):
    """Evaluate an incident through the full governance loop.

    The incident is converted into a ``DeployConfigurationIntent`` (DEV
    target, single-instance scope) and passed to ``governance_loop.run``
    together with the raw incident metadata. The resulting ``HealingIntent``
    is flattened into a JSON-friendly dict.

    Args:
        request: Incident description (service, event type, severity, metrics).

    Returns:
        A dict with the keys ``risk_score``, ``epistemic_uncertainty``,
        ``confidence_interval``, ``risk_contributions``, ``similar_incidents``,
        ``recommended_actions``, ``explanation``, ``policy_violations``,
        ``requires_escalation`` and ``_full_healing_intent``.

    Raises:
        HTTPException: status 500 carrying the error text if anything fails.
    """
    try:
        intent = DeployConfigurationIntent(
            service_name=request.service_name,
            change_scope="single_instance",
            deployment_target=Environment.DEV,
            configuration=request.metrics,
            requester="system",
            provenance={
                "source": "incident_evaluation",
                "event_type": request.event_type,
                "severity": request.severity,
            },
        )

        healing_intent: HealingIntent = governance_loop.run(
            intent=intent,
            context={
                "incident_metadata": {
                    "service_name": request.service_name,
                    "event_type": request.event_type,
                    "severity": request.severity,
                    "metrics": request.metrics,
                }
            },
        )

        distribution = healing_intent.confidence_distribution
        if distribution:
            confidence_interval = [
                distribution.get("p5", 0.0),
                distribution.get("p95", 1.0),
            ]
            epistemic_uncertainty = distribution.get("std", 0.05)
        else:
            # No distribution available: fall back to a +/-0.05 band around
            # the point estimate, clamped to the [0, 1] range.
            confidence_interval = [
                max(0.0, healing_intent.risk_score - 0.05),
                min(1.0, healing_intent.risk_score + 0.05),
            ]
            epistemic_uncertainty = 0.05

        risk_contributions = [
            {"factor": factor, "contribution": contribution}
            for factor, contribution in (healing_intent.risk_factors or {}).items()
        ]

        requires_escalation = (
            healing_intent.recommended_action == RecommendedAction.ESCALATE
        )

        return {
            "risk_score": healing_intent.risk_score,
            "epistemic_uncertainty": epistemic_uncertainty,
            "confidence_interval": confidence_interval,
            "risk_contributions": risk_contributions,
            "similar_incidents": healing_intent.similar_incidents or [],
            "recommended_actions": healing_intent.alternative_actions or [],
            "explanation": healing_intent.justification,
            "policy_violations": healing_intent.policy_violations or [],
            "requires_escalation": requires_escalation,
            # Full serialized intent, exposed for debugging/inspection.
            "_full_healing_intent": healing_intent.to_dict(include_oss_context=False),
        }

    except Exception as e:
        logger.exception("Error in evaluate_incident")
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
@fastapi_app.post("/v1/feedback")
async def record_outcome(decision_id: str, success: bool):
    """Record the outcome of a decision (success/failure).

    Looks up the first entry of ``decision_history`` whose ``decision_id``
    matches, stores ``"success"`` or ``"failure"`` on it and forwards the
    outcome to ``risk_engine.update_outcome``. A failure of that update is
    logged and otherwise ignored (best effort).

    NOTE(review): an unknown ``decision_id`` yields an ``{"error": ...}`` body
    with the default 200 status - confirm whether clients rely on that before
    switching to a 404.
    """
    matching = next(
        (entry for entry in decision_history if entry["decision_id"] == decision_id),
        None,
    )
    if matching is None:
        return {"error": "decision not found"}

    matching["outcome"] = "success" if success else "failure"

    demo_intent = _DemoIntent()
    try:
        risk_engine.update_outcome(demo_intent, success)
    except Exception:
        logger.exception("Outcome update failed")

    return {
        "status": "ok",
        "decision_id": decision_id,
        "outcome": matching["outcome"],
    }
|
|
| |
@fastapi_app.get("/v1/memory/stats")
async def get_memory_stats():
    """Return current memory graph statistics.

    NOTE(review): the guard relies on the truthiness of ``memory``; if the
    memory class defines ``__len__``/``__bool__`` an empty store would take
    the error branch - confirm against the memory implementation.
    """
    if not memory:
        return {"error": "Memory not initialized"}
    return memory.get_graph_stats()
|
|
| |
def get_risk_snapshot():
    """Compute a demo risk assessment for the Gradio UI.

    Scores a fixed ``_DemoIntent`` with ``risk_engine``, derives a decision
    (``> 0.8`` -> ``"deny"``, ``> 0.2`` -> ``"escalate"``, otherwise
    ``"approve"``) and appends a record to ``decision_history``.

    Returns:
        A dict with the keys ``risk``, ``status``, ``explanation``,
        ``contributions``, ``decision_id``, ``decision`` and ``timestamp``;
        or ``{"error": <message>}`` if anything raises (the exception is
        logged rather than propagated so the UI keeps working).
    """
    try:
        risk_value, explanation, contributions = risk_engine.calculate_risk(
            intent=_DemoIntent(),
            cost_estimate=None,
            policy_violations=[],
        )

        if risk_value > 0.8:
            decision = "deny"
        elif risk_value > 0.2:
            decision = "escalate"
        else:
            decision = "approve"

        decision_id = str(uuid.uuid4())
        risk_score = float(risk_value)
        # Capture the timestamp once so the history record and the returned
        # snapshot carry the exact same instant.
        timestamp = datetime.now(timezone.utc).isoformat()

        decision_history.append({
            "decision_id": decision_id,
            "timestamp": timestamp,
            "risk_score": risk_score,
            "outcome": None,  # set later by the success/failure buttons
        })

        return {
            "risk": risk_score,
            "status": "critical" if risk_value > 0.8 else "normal",
            "explanation": explanation,
            "contributions": contributions,
            "decision_id": decision_id,
            "decision": decision,
            "timestamp": timestamp,
        }
    except Exception as e:
        logger.exception("Failed to compute risk snapshot")
        return {"error": str(e)}
|
|
def get_health_snapshot():
    """Return a static health payload stamped with the current UTC time."""
    snapshot = {
        "status": "ok",
        "version": "4.0.0",
        "service": "ARF OSS API",
    }
    snapshot["timestamp"] = datetime.now(timezone.utc).isoformat()
    return snapshot
|
|
def get_memory_snapshot():
    """Return memory graph stats for the UI, or a placeholder when empty."""
    if memory.has_historical_data():
        snapshot = {"status": "ok", "memory_stats": memory.get_graph_stats()}
    else:
        snapshot = {"status": "empty", "memory_stats": "No historical memory yet."}
    # Stamp last, after any stats lookup has completed.
    snapshot["timestamp"] = datetime.now(timezone.utc).isoformat()
    return snapshot
|
|
def record_outcome_ui(success: bool):
    """Attach an outcome to the most recent decision (Gradio button handler).

    Marks the last entry of ``decision_history`` as ``"success"`` or
    ``"failure"`` and forwards the outcome to ``risk_engine.update_outcome``;
    a failure of that update is logged and otherwise ignored.

    Returns:
        ``{"error": "no decisions yet"}`` when the history is empty, otherwise
        a dict with ``decision_id``, ``outcome`` and ``timestamp``.
    """
    if len(decision_history) == 0:
        return {"error": "no decisions yet"}

    latest = decision_history[-1]
    if success:
        latest["outcome"] = "success"
    else:
        latest["outcome"] = "failure"

    try:
        risk_engine.update_outcome(_DemoIntent(), success)
    except Exception:
        logger.exception("Outcome update failed")

    return {
        "decision_id": latest["decision_id"],
        "outcome": latest["outcome"],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
|
|
# Gradio demo UI: JSON panels for health, risk, memory and decisions, plus
# buttons that call the snapshot helpers defined in this module.
with gr.Blocks(title="ARF v4 Demo") as demo:
    gr.Markdown("# Agentic Reliability Framework v4\n### Probabilistic Infrastructure Governance")

    # --- Output panels ---
    with gr.Row():
        health_output = gr.JSON(label="Health")
        risk_output = gr.JSON(label="Current Risk")
    with gr.Row():
        memory_output = gr.JSON(label="Memory Stats")
    with gr.Row():
        decision_output = gr.JSON(label="Recent Decisions")

    # --- Evaluate an intent and report how the action turned out ---
    with gr.Row():
        refresh_btn = gr.Button("Evaluate Intent")
        success_btn = gr.Button("Action Succeeded")
        fail_btn = gr.Button("Action Failed")
    refresh_btn.click(fn=get_risk_snapshot, outputs=risk_output)
    success_btn.click(fn=lambda: record_outcome_ui(True), outputs=decision_output)
    fail_btn.click(fn=lambda: record_outcome_ui(False), outputs=decision_output)

    # --- Refresh the read-only panels ---
    with gr.Row():
        health_btn = gr.Button("Refresh Health")
        memory_btn = gr.Button("Refresh Memory")
        history_btn = gr.Button("Show Decision History")
    health_btn.click(fn=get_health_snapshot, outputs=health_output)
    memory_btn.click(fn=get_memory_snapshot, outputs=memory_output)
    # Shows the same 10-entry window that the /v1/history endpoint returns.
    history_btn.click(fn=lambda: decision_history[-10:], outputs=decision_output)
|
|
| |
# Serve the Gradio UI under /api on the FastAPI application; ``app`` is the
# object the docs routes in this module are registered on.
app = gr.mount_gradio_app(
    fastapi_app,
    demo,
    path="/api",
)
|
|
| |
@app.get("/docs", include_in_schema=False)
async def swagger_ui():
    """Serve the Swagger UI page backed by ``/openapi.json``.

    NOTE(review): FastAPI registers its own ``/docs`` route unless
    ``docs_url=None`` is passed - confirm which handler actually serves it.
    """
    page = get_swagger_ui_html(openapi_url="/openapi.json", title="ARF API Docs")
    return page
|
|
@app.get("/redoc", include_in_schema=False)
async def redoc_ui():
    """Serve the ReDoc page backed by ``/openapi.json``."""
    page = get_redoc_html(openapi_url="/openapi.json", title="ARF API ReDoc")
    return page
|
|
@app.get("/openapi.json", include_in_schema=False)
async def openapi():
    """Return the OpenAPI schema generated by ``fastapi_app``."""
    schema = fastapi_app.openapi()
    return schema
|
|
@app.get("/api/docs", include_in_schema=False)
async def redirect_docs():
    """Redirect ``/api/docs`` to the main ``/docs`` page.

    NOTE(review): the Gradio UI is mounted at ``/api`` before this route is
    added - confirm this path is reachable and not shadowed by the mount.
    """
    target = "/docs"
    return RedirectResponse(url=target)