Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

Agentic-Reliability-Framework-API / hf_demo.py

petter2025

Update hf_demo.py

b635d04 verified about 1 month ago

raw

history blame

7.37 kB

	# hf_demo.py – ARF v4 dashboard for Hugging Face Spaces
	import logging
	from datetime import datetime, timezone

	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	import gradio as gr

	# ARF v4 imports
	from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
	from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
	from agentic_reliability_framework.runtime.memory.constants import MemoryConstants

	# Configure root logging at INFO and create this module's logger; the
	# dashboard snapshot helpers further down report failures through it.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# FastAPI application object; the route decorators below register on it.
	app = FastAPI(title="ARF v4 API with Memory")

	# Enable CORS for your frontend
	# Only the single origin listed here is allowed; every HTTP method and every
	# request header is accepted from it.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["https://arf-frontend-sandy.vercel.app"],
	allow_methods=["*"],
	allow_headers=["*"],
	)

	# ---------------------------------------------------------------------------
	# Initialize ARF components
	# ---------------------------------------------------------------------------
	# Module-level RiskEngine instance, created at import time and used by
	# _calculate_demo_risk() below.
	risk_engine = RiskEngine()

	# Create FAISS index and memory (using default dimension from constants)
	# The index is sized by MemoryConstants.VECTOR_DIM and handed to
	# RAGGraphMemory. The resulting `memory` object is shared by the API
	# endpoints and the dashboard's memory snapshot.
	faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
	memory = RAGGraphMemory(faiss_index)

	# ---------------------------------------------------------------------------
	# API Endpoints
	# ---------------------------------------------------------------------------
	@app.get("/")
	async def root():
	    """Describe the service and report the current memory statistics.

	    ``memory_stats`` carries ``memory.get_graph_stats()`` when the memory
	    reports historical data, and the literal string ``"empty"`` otherwise.
	    """
	    if memory.has_historical_data():
	        stats = memory.get_graph_stats()
	    else:
	        stats = "empty"

	    return {
	        "service": "ARF OSS API",
	        "version": "4.0.0",
	        "status": "operational",
	        "memory_stats": stats,
	    }

	@app.get("/health")
	async def health():
	    """Liveness probe: report a fixed ``ok`` status and the API version."""
	    payload = {"status": "ok", "version": "4.0.0"}
	    return payload

	@app.get("/api/v1/get_risk")
	async def get_risk():
	    """
	    Compute a safe risk snapshot using the supported RiskEngine.calculate_risk()
	    API. This avoids calling the removed get_current_risk() method.

	    Returns:
	        A dict with ``system_risk`` (the ``risk`` value from
	        ``_calculate_demo_risk()``), ``status`` (``"critical"`` when that
	        value is above 0.8, else ``"normal"``) and ``details`` (the full
	        snapshot dict).

	    Raises:
	        HTTPException: status 500 carrying the error text when the snapshot
	            cannot be computed.
	    """
	    try:
	        score = _calculate_demo_risk()
	        return {
	            "system_risk": score["risk"],
	            "status": "critical" if score["risk"] > 0.8 else "normal",
	            "details": score,
	        }
	    except Exception as e:
	        # Log the traceback before converting to an HTTP error, matching
	        # get_risk_snapshot(); chaining keeps the original cause attached.
	        logger.exception("Failed to compute risk")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.post("/api/v1/incident")
	async def store_incident(event_data: dict, analysis: dict):
	    """Store an incident in the shared memory and return its identifier.

	    Args:
	        event_data: Event payload, passed unchanged to
	            ``memory.store_incident``.
	        analysis: Analysis payload, passed unchanged to
	            ``memory.store_incident``.

	    Returns:
	        ``{"incident_id": <value returned by memory.store_incident>}``.

	    Raises:
	        HTTPException: status 500 carrying the error text when storing fails.
	    """
	    try:
	        incident_id = memory.store_incident(event_data, analysis)
	        return {"incident_id": incident_id}
	    except Exception as e:
	        # Keep the traceback in the server log; the client only receives the
	        # message text. Chaining preserves the original cause.
	        logger.exception("Failed to store incident")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.get("/api/v1/memory/similar")
	async def find_similar_incidents(action: str, k: int = 5):
	    """Look up stored incidents similar to a user action.

	    A placeholder event with zeroed metrics is built around ``action`` and
	    passed to ``memory.find_similar`` together with ``{"action": action}``.

	    Args:
	        action: Action text to search for.
	        k: Forwarded to ``memory.find_similar`` as ``k``; defaults to 5.

	    Returns:
	        ``{"similar": [...], "count": n}`` where each entry exposes the
	        node's ``incident_id``, ``component``, ``severity``, ``timestamp``,
	        ``metrics``, ``agent_analysis`` and a ``similarity_score`` read from
	        the node's metadata (0.0 when absent).

	    Raises:
	        HTTPException: status 500 carrying the error text when the lookup
	            fails, consistent with the other endpoints in this file.
	    """

	    class DummyEvent:
	        """Stand-in event: fixed component, zeroed metrics, given action."""

	        def __init__(self, action: str):
	            self.component = "user_action"
	            self.latency_p99 = 0.0
	            self.error_rate = 0.0
	            self.throughput = 0
	            self.cpu_util = 0.0
	            self.memory_util = 0.0
	            # NOTE(review): naive local time, unlike the UTC-aware timestamps
	            # used by the dashboard snapshots. Left unchanged because the
	            # memory layer may compare it with stored values - confirm
	            # before switching to an aware datetime.
	            self.timestamp = datetime.now()
	            self.severity = "low"
	            self.action = action

	    try:
	        event = DummyEvent(action)
	        analysis = {"action": action}
	        similar = memory.find_similar(event, analysis, k=k)

	        results = [
	            {
	                "incident_id": node.incident_id,
	                "component": node.component,
	                "severity": node.severity,
	                "timestamp": node.timestamp,
	                "metrics": node.metrics,
	                "agent_analysis": node.agent_analysis,
	                "similarity_score": node.metadata.get("similarity_score", 0.0),
	            }
	            for node in similar
	        ]
	    except Exception as e:
	        # Same error contract as the sibling endpoints: log the traceback,
	        # then answer with a 500 that carries the message text.
	        logger.exception("Failed to find similar incidents")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    return {"similar": results, "count": len(results)}

	@app.get("/api/v1/memory/stats")
	async def memory_stats():
	    """Return whatever ``memory.get_graph_stats()`` reports, unmodified."""
	    graph_stats = memory.get_graph_stats()
	    return graph_stats

	# ---------------------------------------------------------------------------
	# Gradio dashboard
	# ---------------------------------------------------------------------------

	class _DemoIntent:
	    """
	    Bare-bones intent used only for the demo risk snapshot.

	    It exposes three fixed class attributes and nothing else. Per the
	    original author's note, RiskEngine.categorize_intent() will fall back to
	    DEFAULT for this object.
	    """

	    # Both environment fields point at the same "dev" value.
	    environment = deployment_target = "dev"
	    service_name = "demo"

	def _calculate_demo_risk():
	    """
	    Produce the demo risk snapshot through RiskEngine.calculate_risk().

	    This is the supported API; the removed get_current_risk() method is
	    deliberately not used. The engine is called with a fresh _DemoIntent, no
	    cost estimate and an empty list of policy violations.

	    Returns a dict with ``risk`` (as float), ``status`` (``"critical"`` above
	    0.8, otherwise ``"normal"``), ``explanation`` and ``contributions``.
	    """
	    value, why, parts = risk_engine.calculate_risk(
	        intent=_DemoIntent(),
	        cost_estimate=None,
	        policy_violations=[],
	    )

	    # Build the dict step by step so the key order and the order of
	    # evaluation stay the same as a single literal would give.
	    snapshot = {"risk": float(value)}
	    if value > 0.8:
	        snapshot["status"] = "critical"
	    else:
	        snapshot["status"] = "normal"
	    snapshot["explanation"] = why
	    snapshot["contributions"] = parts
	    return snapshot

	def get_risk_snapshot():
	    """Dashboard callback: timestamped risk snapshot, or an error payload.

	    On success the dict from ``_calculate_demo_risk()`` is returned with a
	    UTC ISO-8601 ``timestamp`` added. Any exception is logged and reported
	    as ``{"status": "error", "error": ..., "timestamp": ...}`` instead of
	    being raised.
	    """
	    try:
	        result = _calculate_demo_risk()
	        result["timestamp"] = datetime.now(timezone.utc).isoformat()
	    except Exception as e:
	        logger.exception("Failed to compute risk snapshot")
	        result = {
	            "status": "error",
	            "error": str(e),
	            "timestamp": datetime.now(timezone.utc).isoformat(),
	        }
	    return result

	def get_health_snapshot():
	    """Dashboard callback: fixed health payload with a UTC timestamp.

	    Returns:
	        A dict with ``status`` (``"ok"``), ``version`` (``"4.0.0"``),
	        ``service`` (``"ARF OSS API"``) and ``timestamp`` (the current UTC
	        time in ISO-8601 form).
	    """
	    # No try/except here: the body is only a dict literal plus one
	    # datetime call, and the former error branch repeated that same call,
	    # so it could never produce a result the normal path could not.
	    return {
	        "status": "ok",
	        "version": "4.0.0",
	        "service": "ARF OSS API",
	        "timestamp": datetime.now(timezone.utc).isoformat(),
	    }

	def get_memory_snapshot():
	    """Dashboard callback: timestamped view of the memory statistics.

	    Returns a dict whose ``status`` is ``"ok"`` (with the graph statistics
	    under ``memory_stats``), ``"empty"`` (with a placeholder message) when
	    the memory reports no historical data, or ``"error"`` (with the error
	    text) when either memory call raises. Failures are logged, not raised.
	    """
	    try:
	        # Handle the empty case first so the populated case reads straight
	        # through.
	        if not memory.has_historical_data():
	            return {
	                "status": "empty",
	                "memory_stats": "No historical memory yet.",
	                "timestamp": datetime.now(timezone.utc).isoformat(),
	            }

	        graph_stats = memory.get_graph_stats()
	        return {
	            "status": "ok",
	            "memory_stats": graph_stats,
	            "timestamp": datetime.now(timezone.utc).isoformat(),
	        }
	    except Exception as e:
	        logger.exception("Failed to compute memory snapshot")
	        return {
	            "status": "error",
	            "error": str(e),
	            "timestamp": datetime.now(timezone.utc).isoformat(),
	        }

	with gr.Blocks(title="ARF v4 Demo") as demo:
	    # Page heading.
	    gr.Markdown("# Agentic Reliability Framework v4")
	    gr.Markdown("### Status dashboard")

	    # Output panels: health and risk share the first row, memory statistics
	    # get a row of their own.
	    with gr.Row():
	        health_output = gr.JSON(label="Health")
	        risk_output = gr.JSON(label="Current Risk")

	    with gr.Row():
	        memory_output = gr.JSON(label="Memory Stats")

	    # One refresh button per panel.
	    with gr.Row():
	        refresh_btn = gr.Button("Refresh Risk")
	        health_btn = gr.Button("Refresh Health")
	        memory_btn = gr.Button("Refresh Memory")

	    # Each button re-runs its snapshot helper and writes the result into the
	    # matching panel.
	    refresh_btn.click(fn=get_risk_snapshot, outputs=risk_output)
	    health_btn.click(fn=get_health_snapshot, outputs=health_output)
	    memory_btn.click(fn=get_memory_snapshot, outputs=memory_output)

	    # Load initial state after startup, not during import.
	    demo.load(fn=get_health_snapshot, outputs=health_output)
	    demo.load(fn=get_risk_snapshot, outputs=risk_output)
	    demo.load(fn=get_memory_snapshot, outputs=memory_output)

	# ============== MAIN ENTRY POINT ==============
	if __name__ == "__main__":
	    # Launch Gradio directly to keep the Space alive and avoid the startup crash.
	    # NOTE(review): only the Gradio dashboard is started here; nothing in
	    # this entry point serves the FastAPI `app` defined above - confirm
	    # that is intended.
	    demo.launch(server_name="0.0.0.0")