Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

File size: 14,782 Bytes

40981c8
b635d04
f6fe53e
b635d04
6da229d
b635d04
6da229d
189570d
462ba16
2ce3527
0f5f4d1
642b6b3
6bb0212
 
642b6b3
c98f35f
 
ff4d74f
6da229d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b635d04
 
 
129b05b
 
 
 
0f5f4d1
40981c8
6756da2
0f5f4d1
40981c8
6756da2
642b6b3
6756da2
b635d04
d2a0c5e
6c7e606
0f5f4d1
642b6b3
6bb0212
 
 
6da229d
 
 
 
 
 
 
 
 
 
 
 
 
 
f6fe53e
 
0f5f4d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40981c8
6bb0212
0f5f4d1
 
6bb0212
40981c8
642b6b3
 
7cfde2b
40981c8
0f5f4d1
 
b635d04
 
 
 
 
 
f6fe53e
 
 
 
 
 
 
0f5f4d1
 
f6fe53e
0f5f4d1
40981c8
0f5f4d1
f6fe53e
54e37af
0f5f4d1
b635d04
 
 
f6fe53e
0f5f4d1
 
54e37af
 
40981c8
0f5f4d1
 
 
f6fe53e
6da229d
0f5f4d1
 
6da229d
 
 
0f5f4d1
6da229d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6fe53e
40981c8
0f5f4d1
 
 
 
 
40981c8
0f5f4d1
 
 
 
 
 
 
 
a22ea2e
 
 
 
 
 
 
 
2ce3527
b635d04
 
0f5f4d1
 
 
 
 
 
 
 
 
 
 
 
 
 
b635d04
0f5f4d1
 
 
 
 
 
 
 
 
 
 
b635d04
0f5f4d1
 
 
b635d04
 
0f5f4d1
b635d04
 
0f5f4d1
 
 
f6fe53e
0f5f4d1
 
 
 
 
 
b635d04
0f5f4d1
b635d04
0f5f4d1
 
f6fe53e
b635d04
0f5f4d1
54e37af
b635d04
 
54e37af
b635d04
 
f6fe53e
 
 
 
 
 
0f5f4d1
 
f6fe53e
b635d04
 
f6fe53e
b635d04
 
0f5f4d1
f6fe53e
2ce3527
1da74ed
40981c8
6da229d
2ce3527
462ba16
 
2ce3527
462ba16
 
 
2ce3527
462ba16
 
2ce3527
462ba16
 
 
2ce3527
462ba16
 
 
2ce3527

# app.py – ARF v4 API (FastAPI) with a Gradio frontend mounted under /api
import logging
import uuid
from datetime import datetime, timezone
from typing import Dict, Optional, List

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html
from fastapi.responses import RedirectResponse
from pydantic import BaseModel
import gradio as gr

# ARF v4 imports
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
from agentic_reliability_framework.runtime.memory.constants import MemoryConstants

# Additional imports for governance loop and healing intent
from agentic_reliability_framework.core.governance.governance_loop import GovernanceLoop
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
from agentic_reliability_framework.core.governance.cost_estimator import CostEstimator
from agentic_reliability_framework.core.governance.intents import (
    DeployConfigurationIntent,
    Environment,
    InfrastructureIntent,
)
from agentic_reliability_framework.core.governance.healing_intent import (
    HealingIntent,
    HealingIntentSerializer,
    RecommendedAction,
)

# Root logging at INFO, plus the module logger shared by the handlers in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Startup diagnostic: print the names of the submodules found on the governance
# package's path.
import pkgutil
import agentic_reliability_framework.core.governance as governance
print(
    "Contents of governance module:",
    [module_info.name for module_info in pkgutil.iter_modules(governance.__path__)],
)

# ========================= FASTAPI APP =========================
# Single FastAPI instance; every REST route below is registered on it and the
# Gradio UI is mounted onto it at the end of the file.
fastapi_app = FastAPI(title="ARF v4 API")

# Enable CORS for the hosted frontend. Only this one origin is allowed; all
# methods and headers are accepted from it.
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://arf-frontend-sandy.vercel.app"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ========================= ARF COMPONENTS =========================
# Module-level singletons shared by all request handlers and the Gradio callbacks.
risk_engine = RiskEngine()
faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
memory = RAGGraphMemory(faiss_index)

# Create policy engine and cost estimator (use default implementations)
policy_engine = PolicyEngine()      # Will need policies loaded if any
cost_estimator = CostEstimator()    # Default estimator

# Initialize the governance loop with the shared engine and memory instances
# created above, so the loop and the demo endpoints operate on the same objects.
governance_loop = GovernanceLoop(
    policy_engine=policy_engine,
    cost_estimator=cost_estimator,
    risk_engine=risk_engine,
    memory=memory,
    enable_epistemic=True,
)

# In-memory storage for demo purposes (used by /v1/history and /v1/feedback).
# Each entry is a dict with decision_id, timestamp, risk_score and outcome.
# NOTE(review): entries are appended on every risk evaluation and never removed
# in this file, so the list grows for the lifetime of the process and is lost
# on restart.
decision_history = []

# ========================= PYDANTIC MODELS =========================
class EvaluateRequest(BaseModel):
    # Request body accepted by POST /v1/incidents/evaluate.
    service_name: str  # forwarded as the intent's service_name
    event_type: str  # recorded in the intent provenance and the governance-loop context
    severity: str  # free-form string; no set of allowed values is enforced here
    # Passed to the intent as its `configuration`.
    # NOTE(review): pydantic is documented to copy field defaults per instance,
    # so this `{}` should not be shared between requests - confirm for the
    # installed pydantic version.
    metrics: Dict[str, float] = {}

class EvaluateResponse(BaseModel):
    # Response shape with a blended risk score and its components.
    # NOTE(review): no endpoint in this file declares this model as its
    # response_model; /v1/incidents/evaluate returns a plain dict with a
    # different key set. Confirm whether this model is still needed.
    risk_score: float
    base_risk: float
    memory_risk: Optional[float] = None  # optional; defaults to None
    weight: float
    similar_events: list = []
    confidence: float

# ========================= HELPER: Demo Intent =========================
class _DemoIntent:
    """Fixed stand-in intent handed to the risk engine by the demo code paths.

    It carries only three class-level string attributes; instances add no
    state of their own.
    """

    service_name = "demo"
    deployment_target = "dev"
    environment = "dev"

# ========================= API ENDPOINTS =========================
@fastapi_app.get("/")
async def root():
    """Landing endpoint: return a short message pointing clients at /docs."""
    welcome = "ARF v4 API. See /docs for documentation."
    return {"message": welcome}

@fastapi_app.get("/health")
async def health():
    """Liveness probe: report a constant status and the API version string."""
    payload = {"status": "ok"}
    payload["version"] = "4.0.0"
    return payload

@fastapi_app.get("/v1/get_risk")
async def get_risk():
    """Score the built-in demo intent, log the decision, and return it.

    The risk engine is called with a ``_DemoIntent``, no cost estimate and no
    policy violations. The resulting score is mapped to a decision
    (> 0.8 -> "deny", > 0.2 -> "escalate", otherwise "approve") and a record
    is appended to ``decision_history`` so that /v1/feedback can later attach
    an outcome to it.

    Returns:
        dict with ``system_risk``, ``status`` ("critical" above 0.8, else
        "normal"), ``explanation``, ``contributions``, ``decision_id``,
        ``decision`` and ``timestamp``.
    """
    risk_value, explanation, contributions = risk_engine.calculate_risk(
        intent=_DemoIntent(),
        cost_estimate=None,
        policy_violations=[],
    )
    risk_score = float(risk_value)

    if risk_value > 0.8:
        decision = "deny"
    elif risk_value > 0.2:
        decision = "escalate"
    else:
        decision = "approve"

    decision_id = str(uuid.uuid4())
    # Take the timestamp once so the stored record and the response agree;
    # previously each called datetime.now() separately and could differ.
    timestamp = datetime.now(timezone.utc).isoformat()

    decision_history.append({
        "decision_id": decision_id,
        "timestamp": timestamp,
        "risk_score": risk_score,
        "outcome": None,  # will be filled when feedback is given
    })

    return {
        "system_risk": risk_score,
        "status": "critical" if risk_value > 0.8 else "normal",
        "explanation": explanation,
        "contributions": contributions,
        "decision_id": decision_id,
        "decision": decision,
        "timestamp": timestamp,
    }

@fastapi_app.get("/v1/history")
async def get_history():
    """Expose the tail of the in-memory decision log (at most ten entries)."""
    recent_decisions = decision_history[-10:]
    return recent_decisions

@fastapi_app.post("/v1/incidents/evaluate")
async def evaluate_incident(request: EvaluateRequest):
    """
    Evaluate an incident by converting it into an infrastructure intent
    and running it through the full governance loop.

    The incident is mapped onto a ``DeployConfigurationIntent`` (single
    instance, DEV target, metrics used as the configuration) and passed to
    ``governance_loop.run`` together with the raw incident metadata. The
    resulting ``HealingIntent`` is then flattened into the dict shape the
    frontend consumes.

    Args:
        request: Incident description (service, event type, severity, metrics).

    Returns:
        dict with keys ``risk_score``, ``epistemic_uncertainty``,
        ``confidence_interval`` (two-element list), ``risk_contributions``
        (list of ``{"factor", "contribution"}``), ``similar_incidents``,
        ``recommended_actions``, ``explanation``, ``policy_violations``,
        ``requires_escalation`` and ``_full_healing_intent`` (raw dump for
        debugging).

    Raises:
        HTTPException: status 500 carrying the error text when intent
            construction, the governance loop, or response assembly fails.
    """
    try:
        # Map the incident to a DeployConfigurationIntent (as an example);
        # adjust this mapping if incidents should produce other intent types.
        intent = DeployConfigurationIntent(
            service_name=request.service_name,
            change_scope="single_instance",  # default
            deployment_target=Environment.DEV,  # assume dev for now
            configuration=request.metrics,
            requester="system",
            provenance={
                "source": "incident_evaluation",
                "event_type": request.event_type,
                "severity": request.severity,
            },
        )

        # NOTE(review): this is a synchronous call inside an async handler, so
        # it occupies the event loop while it runs - confirm that is acceptable.
        healing_intent: HealingIntent = governance_loop.run(
            intent=intent,
            context={
                "incident_metadata": {
                    "service_name": request.service_name,
                    "event_type": request.event_type,
                    "severity": request.severity,
                    "metrics": request.metrics,
                }
            },
        )

        risk_score = healing_intent.risk_score
        distribution = healing_intent.confidence_distribution
        if distribution:
            confidence_interval = [distribution.get("p5", 0.0), distribution.get("p95", 1.0)]
            epistemic_uncertainty = distribution.get("std", 0.05)
        else:
            # No distribution available: fall back to a +/-0.05 band around the
            # score, clamped to [0, 1], and a fixed uncertainty of 0.05.
            confidence_interval = [max(0.0, risk_score - 0.05), min(1.0, risk_score + 0.05)]
            epistemic_uncertainty = 0.05

        # Flatten the factor -> contribution mapping into a list of records.
        risk_contributions = [
            {"factor": factor, "contribution": contribution}
            for factor, contribution in (healing_intent.risk_factors or {}).items()
        ]

        # similar_incidents is passed through unchanged.
        # NOTE(review): the frontend is said to expect incident_id, component,
        # severity, timestamp, metrics, similarity_score and outcome_success on
        # each entry - confirm HealingIntent provides that shape.
        return {
            "risk_score": risk_score,
            "epistemic_uncertainty": epistemic_uncertainty,
            "confidence_interval": confidence_interval,
            "risk_contributions": risk_contributions,
            "similar_incidents": healing_intent.similar_incidents or [],
            "recommended_actions": healing_intent.alternative_actions or [],
            "explanation": healing_intent.justification,
            "policy_violations": healing_intent.policy_violations or [],
            "requires_escalation": healing_intent.recommended_action == RecommendedAction.ESCALATE,
            # Also include raw healing intent for debugging (optional)
            "_full_healing_intent": healing_intent.to_dict(include_oss_context=False),
        }

    except Exception as exc:
        logger.exception("Error in evaluate_incident")
        # NOTE(review): the raw exception text is returned to the client;
        # consider a generic message if internal details must not leak.
        raise HTTPException(status_code=500, detail=str(exc)) from exc

@fastapi_app.post("/v1/feedback")
async def record_outcome(decision_id: str, success: bool):
    """Record the outcome of a decision (success/failure).

    Args:
        decision_id: Identifier previously returned by a risk evaluation.
        success: Whether the action taken for that decision succeeded.

    Returns:
        ``{"status": "ok", "decision_id", "outcome"}`` when the decision is
        found. For an unknown id the body is still
        ``{"error": "decision not found"}``, but it is now sent with HTTP 404
        instead of 200 so clients can detect the failure from the status code.
    """
    # Local import: JSONResponse is not among this module's top-level imports.
    from fastapi.responses import JSONResponse

    # First record with a matching id, or None if there is none.
    record = next(
        (dec for dec in decision_history if dec["decision_id"] == decision_id),
        None,
    )
    if record is None:
        return JSONResponse(status_code=404, content={"error": "decision not found"})

    record["outcome"] = "success" if success else "failure"

    # Feeding the outcome back to the risk engine is best-effort: a failure is
    # logged but does not fail the request, since the outcome is already stored.
    try:
        risk_engine.update_outcome(_DemoIntent(), success)
    except Exception:
        logger.exception("Outcome update failed")

    return {"status": "ok", "decision_id": decision_id, "outcome": record["outcome"]}

# ========================= NEW MEMORY STATS ENDPOINT =========================
@fastapi_app.get("/v1/memory/stats")
async def get_memory_stats():
    """Return current memory graph statistics.

    Returns:
        Whatever ``memory.get_graph_stats()`` produces, or
        ``{"error": "Memory not initialized"}`` when the module-level
        ``memory`` object is ``None``.
    """
    # Compare against None explicitly. A truthiness test would wrongly report
    # "not initialized" for a live but empty store if the memory class defines
    # __len__ or __bool__.
    if memory is None:
        return {"error": "Memory not initialized"}
    return memory.get_graph_stats()

# ========================= GRADIO UI =========================
def get_risk_snapshot():
    """Gradio callback: score the demo intent and log the decision.

    Does the same work as the /v1/get_risk endpoint, except that the score is
    returned under the key ``risk`` and any failure is logged and returned as
    ``{"error": <message>}`` instead of propagating, so the UI panel always
    receives a JSON-compatible value.
    """
    try:
        risk_value, explanation, contributions = risk_engine.calculate_risk(
            intent=_DemoIntent(),
            cost_estimate=None,
            policy_violations=[],
        )
        risk_score = float(risk_value)

        if risk_value > 0.8:
            decision = "deny"
        elif risk_value > 0.2:
            decision = "escalate"
        else:
            decision = "approve"

        decision_id = str(uuid.uuid4())
        # Take the timestamp once so the stored record and the returned
        # snapshot agree; previously each called datetime.now() separately.
        timestamp = datetime.now(timezone.utc).isoformat()

        decision_history.append({
            "decision_id": decision_id,
            "timestamp": timestamp,
            "risk_score": risk_score,
            "outcome": None,  # filled in later by the success/failure buttons
        })
        return {
            "risk": risk_score,
            "status": "critical" if risk_value > 0.8 else "normal",
            "explanation": explanation,
            "contributions": contributions,
            "decision_id": decision_id,
            "decision": decision,
            "timestamp": timestamp,
        }
    except Exception as e:
        logger.exception("Failed to compute risk snapshot")
        return {"error": str(e)}

def get_health_snapshot():
    """Build the health payload for the Gradio "Health" panel.

    The status, version and service name are constants; only the UTC
    ISO-8601 timestamp varies between calls.
    """
    snapshot = {"status": "ok", "version": "4.0.0", "service": "ARF OSS API"}
    snapshot["timestamp"] = datetime.now(timezone.utc).isoformat()
    return snapshot

def get_memory_snapshot():
    """Gradio callback: report memory-graph statistics, or an "empty" marker.

    When the memory store reports no historical data, a placeholder message
    is returned in place of the statistics.
    """
    if not memory.has_historical_data():
        return {
            "status": "empty",
            "memory_stats": "No historical memory yet.",
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
    return {
        "status": "ok",
        "memory_stats": memory.get_graph_stats(),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

def record_outcome_ui(success: bool):
    """Gradio callback: attach an outcome to the most recent decision.

    Returns ``{"error": "no decisions yet"}`` when the history is empty.
    Otherwise the newest record is marked "success" or "failure", the risk
    engine is informed on a best-effort basis (failures are only logged), and
    the record's id and outcome are returned with a fresh UTC timestamp.
    """
    if len(decision_history) == 0:
        return {"error": "no decisions yet"}

    latest = decision_history[-1]
    if success:
        latest["outcome"] = "success"
    else:
        latest["outcome"] = "failure"

    demo_intent = _DemoIntent()
    try:
        risk_engine.update_outcome(demo_intent, success)
    except Exception:
        logger.exception("Outcome update failed")

    return {
        "decision_id": latest["decision_id"],
        "outcome": latest["outcome"],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

# Gradio demo UI. Components are declared in display order: a header, three
# rows of JSON panels, then two rows of buttons. Each button is wired to one
# of the callbacks defined above.
with gr.Blocks(title="ARF v4 Demo") as demo:
    gr.Markdown("# Agentic Reliability Framework v4\n### Probabilistic Infrastructure Governance")
    # Output panels.
    with gr.Row():
        health_output = gr.JSON(label="Health")
        risk_output = gr.JSON(label="Current Risk")
    with gr.Row():
        memory_output = gr.JSON(label="Memory Stats")
    with gr.Row():
        # Shared by the outcome buttons and the history button below.
        decision_output = gr.JSON(label="Recent Decisions")
    # Primary actions: evaluate the demo intent, then report how it went.
    with gr.Row():
        refresh_btn = gr.Button("Evaluate Intent")
        success_btn = gr.Button("Action Succeeded")
        fail_btn = gr.Button("Action Failed")
    refresh_btn.click(fn=get_risk_snapshot, outputs=risk_output)
    # The outcome buttons always apply to the most recent decision in the history.
    success_btn.click(fn=lambda: record_outcome_ui(True), outputs=decision_output)
    fail_btn.click(fn=lambda: record_outcome_ui(False), outputs=decision_output)
    # Secondary actions: refresh the read-only panels.
    with gr.Row():
        health_btn = gr.Button("Refresh Health")
        memory_btn = gr.Button("Refresh Memory")
        history_btn = gr.Button("Show Decision History")
    health_btn.click(fn=get_health_snapshot, outputs=health_output)
    memory_btn.click(fn=get_memory_snapshot, outputs=memory_output)
    # Shows the same last-ten slice that /v1/history returns.
    history_btn.click(fn=lambda: decision_history[-10:], outputs=decision_output)

# ========================= Mount Gradio and Add Documentation Routes =========================
# Convenience redirect for clients that look for the docs under the Gradio
# prefix. It must be registered BEFORE the Gradio mount: routes are matched in
# registration order and a mount at "/api" claims every "/api/..." path, so a
# route added after the mount would never be reached.
@fastapi_app.get("/api/docs", include_in_schema=False)
async def redirect_docs():
    """Redirect /api/docs to the Swagger UI served at /docs."""
    return RedirectResponse(url="/docs")


# Mount the Gradio UI under /api; `app` is the ASGI application to serve.
app = gr.mount_gradio_app(fastapi_app, demo, path="/api")


# NOTE(review): FastAPI() above was created with default docs settings, so it
# has presumably already registered /docs, /redoc and /openapi.json. If so,
# the three handlers below are shadowed by those earlier routes. They are kept
# so the module's public names are unchanged - confirm and remove if redundant.
@app.get("/docs", include_in_schema=False)
async def swagger_ui():
    """Serve the Swagger UI page backed by /openapi.json."""
    return get_swagger_ui_html(
        openapi_url="/openapi.json",
        title="ARF API Docs"
    )


@app.get("/redoc", include_in_schema=False)
async def redoc_ui():
    """Serve the ReDoc page backed by /openapi.json."""
    return get_redoc_html(
        openapi_url="/openapi.json",
        title="ARF API ReDoc"
    )


@app.get("/openapi.json", include_in_schema=False)
async def openapi():
    """Return the OpenAPI schema generated for the FastAPI app."""
    return fastapi_app.openapi()