petter2025's picture
Update hf_demo.py
b635d04 verified
raw
history blame
7.37 kB
# hf_demo.py – ARF v4 dashboard for Hugging Face Spaces
import logging
from datetime import datetime, timezone
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import gradio as gr
# ARF v4 imports
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
from agentic_reliability_framework.runtime.memory.constants import MemoryConstants
# Configure logging at import time so every logger in the process emits INFO
# and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# FastAPI application object; the route decorators below register on it.
app = FastAPI(title="ARF v4 API with Memory")
# Enable CORS for the hosted frontend: a single allowed origin, with every
# HTTP method and every request header accepted from that origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://arf-frontend-sandy.vercel.app"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ---------------------------------------------------------------------------
# Initialize ARF components
# ---------------------------------------------------------------------------
# Shared risk engine, created once at import; _calculate_demo_risk() calls it.
risk_engine = RiskEngine()
# Create FAISS index and memory (using default dimension from constants)
faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
# Incident memory built on the index above; the API endpoints and the
# dashboard memory snapshot all read from / write to this single instance.
memory = RAGGraphMemory(faiss_index)
# ---------------------------------------------------------------------------
# API Endpoints
# ---------------------------------------------------------------------------
@app.get("/")
async def root():
    """Describe the service on the landing route.

    The ``memory_stats`` field carries the graph statistics when the shared
    memory reports historical data, and the literal string ``"empty"``
    otherwise.
    """
    if memory.has_historical_data():
        stats = memory.get_graph_stats()
    else:
        stats = "empty"
    return {
        "service": "ARF OSS API",
        "version": "4.0.0",
        "status": "operational",
        "memory_stats": stats,
    }
@app.get("/health")
async def health():
    """Liveness probe: report a fixed OK status together with the version."""
    return dict(status="ok", version="4.0.0")
@app.get("/api/v1/get_risk")
async def get_risk():
    """
    Compute a safe risk snapshot using the supported RiskEngine.calculate_risk()
    API. This avoids calling the removed get_current_risk() method.

    Returns:
        A dict with ``system_risk`` (the numeric risk), ``status``
        (``"critical"`` or ``"normal"``) and ``details`` (the full result of
        ``_calculate_demo_risk()``).

    Raises:
        HTTPException: status 500 carrying the error text when the risk
            calculation fails.
    """
    try:
        score = _calculate_demo_risk()
    except Exception as e:
        # Log the traceback before converting to an HTTP error; otherwise the
        # cause is visible only as a bare message in the response.
        logger.exception("Failed to compute risk for /api/v1/get_risk")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {
        "system_risk": score["risk"],
        # Reuse the status already derived by _calculate_demo_risk() so the
        # top-level field can never disagree with details["status"].
        "status": score["status"],
        "details": score,
    }
@app.post("/api/v1/incident")
async def store_incident(event_data: dict, analysis: dict):
    """Store an incident in the shared memory and return its identifier.

    Args:
        event_data: Incident payload, forwarded unchanged to
            ``memory.store_incident``.
        analysis: Analysis payload, forwarded unchanged alongside it.
            NOTE(review): with two dict parameters FastAPI presumably expects
            a JSON body of the form ``{"event_data": {...}, "analysis": {...}}``
            -- confirm against the frontend.

    Returns:
        ``{"incident_id": <value returned by memory.store_incident>}``.

    Raises:
        HTTPException: status 500 carrying the error text when storing fails.
    """
    try:
        incident_id = memory.store_incident(event_data, analysis)
    except Exception as e:
        # Keep the traceback in the server log; the client only gets the text.
        logger.exception("Failed to store incident")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"incident_id": incident_id}
@app.get("/api/v1/memory/similar")
async def find_similar_incidents(action: str, k: int = 5):
    """Query the shared memory for incidents resembling ``action``.

    A placeholder event with neutral metrics is built around ``action`` and
    passed to ``memory.find_similar`` together with a minimal analysis dict.

    Args:
        action: Action text used for both the placeholder event and the
            analysis dict.
        k: Forwarded to ``memory.find_similar`` as ``k`` (presumably the
            number of neighbours to return -- confirm). Not validated here.

    Returns:
        ``{"similar": [...], "count": n}``; each entry exposes selected fields
        of a returned node plus its ``similarity_score`` metadata (``0.0``
        when the metadata has no such key).
    """

    class DummyEvent:
        """Stand-in event: zeroed metrics, low severity, the queried action."""

        def __init__(self, requested_action: str):
            self.component = "user_action"
            self.latency_p99 = 0.0
            self.error_rate = 0.0
            self.throughput = 0
            self.cpu_util = 0.0
            self.memory_util = 0.0
            # NOTE(review): naive local time, unlike the UTC-aware timestamps
            # produced by the dashboard snapshot helpers in this file.
            self.timestamp = datetime.now()
            self.severity = "low"
            self.action = requested_action

    matches = memory.find_similar(DummyEvent(action), {"action": action}, k=k)
    payload = [
        {
            "incident_id": hit.incident_id,
            "component": hit.component,
            "severity": hit.severity,
            "timestamp": hit.timestamp,
            "metrics": hit.metrics,
            "agent_analysis": hit.agent_analysis,
            "similarity_score": hit.metadata.get("similarity_score", 0.0),
        }
        for hit in matches
    ]
    return {"similar": payload, "count": len(payload)}
@app.get("/api/v1/memory/stats")
async def memory_stats():
    """Expose the graph statistics reported by the shared memory, unmodified."""
    graph_stats = memory.get_graph_stats()
    return graph_stats
# ---------------------------------------------------------------------------
# Gradio dashboard
# ---------------------------------------------------------------------------
class _DemoIntent:
"""
Minimal intent object for demo-only risk snapshots.
RiskEngine.categorize_intent() will fall back to DEFAULT for this object.
"""
environment = "dev"
deployment_target = "dev"
service_name = "demo"
def _calculate_demo_risk():
    """
    Run RiskEngine.calculate_risk() for a throwaway demo intent.

    This is the supported replacement for the removed get_current_risk().
    The engine is called with no cost estimate and no policy violations.

    Returns:
        dict with ``risk`` (the engine's value converted to float), ``status``
        (``"critical"`` when the engine's value exceeds 0.8, else
        ``"normal"``), ``explanation`` and ``contributions`` (both passed
        through from the engine).
    """
    risk_value, explanation, contributions = risk_engine.calculate_risk(
        intent=_DemoIntent(),
        cost_estimate=None,
        policy_violations=[],
    )
    risk_as_float = float(risk_value)
    # The threshold check deliberately uses the engine's raw value.
    if risk_value > 0.8:
        label = "critical"
    else:
        label = "normal"
    return dict(
        risk=risk_as_float,
        status=label,
        explanation=explanation,
        contributions=contributions,
    )
def get_risk_snapshot():
    """Produce the risk payload shown on the dashboard.

    On success this is the result of ``_calculate_demo_risk()`` with a UTC
    ISO-8601 ``timestamp`` added. On failure the exception is logged and an
    ``{"status": "error", "error": ..., "timestamp": ...}`` dict is returned
    instead, so the caller never sees an exception.
    """
    try:
        result = _calculate_demo_risk()
        result["timestamp"] = datetime.now(timezone.utc).isoformat()
    except Exception as exc:
        logger.exception("Failed to compute risk snapshot")
        return dict(
            status="error",
            error=str(exc),
            timestamp=datetime.now(timezone.utc).isoformat(),
        )
    else:
        return result
def get_health_snapshot():
    """Return the static health payload shown on the dashboard.

    Returns:
        dict with ``status`` (``"ok"``), ``version``, ``service`` and a
        ``timestamp`` holding the current UTC time in ISO-8601 form.

    The former try/except wrapper was removed. The body only builds a literal
    dict and a timestamp, and the fallback branch repeated the same timestamp
    call, so it could not have recovered from the one operation able to fail.
    """
    return {
        "status": "ok",
        "version": "4.0.0",
        "service": "ARF OSS API",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
def get_memory_snapshot():
    """Produce the memory payload shown on the dashboard.

    Returns one of three dicts, each stamped with the current UTC time:
    ``status == "empty"`` with a placeholder message when the shared memory
    reports no historical data, ``status == "ok"`` with the graph statistics
    when it does, or ``status == "error"`` with the error text when either
    memory call raises (the exception is logged, not propagated).
    """
    try:
        # Guard clause: nothing stored yet, so skip the stats call.
        if not memory.has_historical_data():
            return {
                "status": "empty",
                "memory_stats": "No historical memory yet.",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        graph_stats = memory.get_graph_stats()
        return {
            "status": "ok",
            "memory_stats": graph_stats,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
    except Exception as exc:
        logger.exception("Failed to compute memory snapshot")
        return {
            "status": "error",
            "error": str(exc),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
# Build the dashboard: three JSON panels, three refresh buttons, and the event
# wiring that fills each panel from its snapshot helper.
# NOTE(review): the nesting of the rows below is reconstructed from statement
# order -- confirm the intended layout.
with gr.Blocks(title="ARF v4 Demo") as demo:
    gr.Markdown("# Agentic Reliability Framework v4")
    gr.Markdown("### Status dashboard")
    # First row: health and risk panels.
    with gr.Row():
        health_output = gr.JSON(label="Health")
        risk_output = gr.JSON(label="Current Risk")
    # Second row: memory statistics panel.
    with gr.Row():
        memory_output = gr.JSON(label="Memory Stats")
    # Third row: one refresh button per panel.
    with gr.Row():
        refresh_btn = gr.Button("Refresh Risk")
        health_btn = gr.Button("Refresh Health")
        memory_btn = gr.Button("Refresh Memory")
    # Each button re-runs its snapshot helper and writes the result to the
    # matching panel; the helpers take no inputs.
    refresh_btn.click(fn=get_risk_snapshot, outputs=risk_output)
    health_btn.click(fn=get_health_snapshot, outputs=health_output)
    memory_btn.click(fn=get_memory_snapshot, outputs=memory_output)
    # Load initial state after startup, not during import.
    demo.load(fn=get_health_snapshot, outputs=health_output)
    demo.load(fn=get_risk_snapshot, outputs=risk_output)
    demo.load(fn=get_memory_snapshot, outputs=memory_output)
# ============== MAIN ENTRY POINT ==============
if __name__ == "__main__":
    # Launch Gradio directly to keep the Space alive and avoid the startup crash.
    # Binds to all interfaces. Only the Gradio `demo` is started here; the
    # FastAPI `app` defined above is not launched by this entry point.
    demo.launch(server_name="0.0.0.0")