Spaces:
Runtime error
Runtime error
| """ | |
| Incident evaluation endpoints — backward‑compatible Bayesian reroute. | |
| This module provides two incident‑related routes: | |
| * ``POST /api/v1/report_incident`` | |
| Stores a ``ReliabilityEvent`` in an in‑memory history for auditing | |
| and debugging. | |
| * ``POST /api/v1/v1/incidents/evaluate`` **(deprecated)** | |
| Former heuristic endpoint now **rerouted to the full Bayesian risk | |
| engine**. All callers should migrate to | |
| ``POST /api/v1/intents/evaluate``, which returns richer metadata | |
| including CUDL uncertainty decomposition and decision traces. | |
| The local model duplicates (``ReliabilityEvent``, ``HealingAction``) | |
| have been removed; all types are imported from the canonical ARF core | |
| framework (``agentic_reliability_framework.core.models.event``). | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import time | |
| from typing import Optional | |
| from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request | |
| from agentic_reliability_framework.core.models.event import ( | |
| HealingAction, | |
| ReliabilityEvent, | |
| ) | |
| from app.causal_explainer import CausalExplainer | |
| from app.core.usage_tracker import UsageRecord, enforce_quota, tracker | |
| logger = logging.getLogger(__name__) | |
| router = APIRouter() | |
| # --------------------------------------------------------------------------- | |
| # In‑memory incident store (for auditing / debugging only) | |
| # --------------------------------------------------------------------------- | |
incident_history: list[dict] = []


# ---------------------------------------------------------------------------
# POST /api/v1/report_incident
# ---------------------------------------------------------------------------
async def report_incident(event: ReliabilityEvent) -> dict[str, str]:
    """
    Append a JSON-safe snapshot of *event* to ``incident_history``.

    The history is a plain module-level list, so its contents live only
    as long as the process does; nothing is written to durable storage.

    Parameters
    ----------
    event : ReliabilityEvent
        The reliability event to record. It is serialised with
        ``model_dump(mode="json")`` before being stored.

    Returns
    -------
    dict
        The fixed acknowledgement ``{"status": "recorded"}``.
    """
    # Store the serialised form rather than the model instance so the
    # history holds only JSON-compatible values.
    snapshot = event.model_dump(mode="json")
    incident_history.append(snapshot)

    acknowledgement = {"status": "recorded"}
    return acknowledgement
| # --------------------------------------------------------------------------- | |
| # POST /api/v1/v1/incidents/evaluate (deprecated) | |
| # --------------------------------------------------------------------------- | |
# Endpoint label stored on usage records and written to the deprecation log.
_INCIDENT_EVALUATE_ENDPOINT = "/v1/incidents/evaluate"

# Risk scores strictly above this value select RESTART_CONTAINER.
_RISK_ACTION_THRESHOLD = 0.5


def _build_incident_response(request: Request, event: ReliabilityEvent) -> dict:
    """Run the Bayesian evaluation for *event* and build the legacy envelope."""
    # Kept as function-scope imports, exactly as the original handler had
    # them. NOTE(review): presumably this avoids an import cycle - confirm
    # before hoisting to module level.
    from app.services.intent_adapter import to_oss_intent
    from app.services.risk_service import evaluate_intent

    # ------------------------------------------------------------------
    # Step 1 - Convert the event into an infrastructure intent
    # ------------------------------------------------------------------
    oss_intent = to_oss_intent(
        {
            "intent_type": "deploy_config",
            "environment": "prod",
            "service_name": event.component,
            "requester": "auto",
            "change_scope": "global",
            "deployment_target": "prod",
            "configuration": {},
            "provenance": {"source": "incident_evaluate"},
        }
    )

    # ------------------------------------------------------------------
    # Step 2 - Bayesian risk evaluation
    # ------------------------------------------------------------------
    result = evaluate_intent(
        engine=request.app.state.risk_engine,
        intent=oss_intent,
        cost_estimate=None,
        policy_violations=[],
    )
    risk_score = result["risk_score"]

    # ------------------------------------------------------------------
    # Step 3 - Heuristic action selection based on risk threshold
    # ------------------------------------------------------------------
    if risk_score > _RISK_ACTION_THRESHOLD:
        optimal_action = HealingAction.RESTART_CONTAINER
    else:
        optimal_action = HealingAction.NO_ACTION

    # ------------------------------------------------------------------
    # Step 4 - Causal explainer
    # ------------------------------------------------------------------
    current_state = {
        "latency": event.latency_p99,
        "error_rate": event.error_rate,
        "last_action": {"action_type": "no_action"},
    }
    proposed_action = {"action_type": optimal_action.value, "params": {}}
    causal_exp = CausalExplainer().explain_healing_intent(
        proposed_action, current_state, "latency"
    )

    # ------------------------------------------------------------------
    # Step 5 - Build response envelope
    # ------------------------------------------------------------------
    return {
        "deprecation_notice": (
            "This endpoint is deprecated. Use POST /api/v1/intents/evaluate "
            "for the full Bayesian evaluation with CUDL decomposition."
        ),
        "healing_intent": {
            "action": optimal_action.value,
            "component": event.component,
            "parameters": {},
            "justification": (
                f"Bayesian risk score: {risk_score:.3f}. "
                f"Causal: {causal_exp.explanation_text}"
            ),
            # A missing "uncertainty" key is treated as zero uncertainty.
            "confidence": 1.0 - result.get("uncertainty", 0.0),
            "risk_score": risk_score,
            "status": "oss_advisory_only",
        },
        "causal_explanation": {
            "factual_outcome": causal_exp.factual_outcome,
            "counterfactual_outcome": causal_exp.counterfactual_outcome,
            "effect": causal_exp.effect,
            "explanation_text": causal_exp.explanation_text,
            "is_model_based": causal_exp.is_model_based,
            "warnings": causal_exp.warnings,
        },
        "utility_decision": {
            "best_action": optimal_action.value,
            "expected_utility": 0.5,
            "explanation": (
                "Decision based on Bayesian risk threshold > "
                f"{_RISK_ACTION_THRESHOLD}"
            ),
        },
    }


async def _record_incident_usage(
    *,
    quota: dict,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    started: float,
    response: Optional[dict] = None,
    error: Optional[str] = None,
) -> None:
    """Hand one usage record for this request to ``tracker`` (no-op if falsy)."""
    if not tracker:
        return

    # Pass exactly one of ``response`` / ``error`` so the record receives the
    # same keyword set the original success and failure paths supplied.
    outcome = {"response": response} if error is None else {"error": error}
    record = UsageRecord(
        api_key=quota["api_key"],
        tier=quota["tier"],
        timestamp=time.time(),
        endpoint=_INCIDENT_EVALUATE_ENDPOINT,
        request_body=event.model_dump(mode="json"),
        processing_ms=(time.perf_counter() - started) * 1000,
        **outcome,
    )
    await tracker.increment_usage_async(record, background_tasks)


async def evaluate_incident(
    request: Request,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    quota: dict = Depends(enforce_quota),
) -> dict:
    """
    Evaluate an incident using the **Bayesian risk engine**.

    .. deprecated:: 0.6.0
        Use ``POST /api/v1/intents/evaluate`` instead. Responses include a
        ``deprecation_notice`` field to assist migration.

    The following steps are performed:

    1. Convert the ``ReliabilityEvent`` into a minimal intent via
       ``intent_adapter.to_oss_intent``.
    2. Call ``risk_service.evaluate_intent()`` with the engine stored on
       ``request.app.state.risk_engine``.
    3. Pick ``RESTART_CONTAINER`` when the risk score exceeds 0.5,
       otherwise ``NO_ACTION``.
    4. Run the causal explainer on the proposed action.
    5. Build the backward-compatible response envelope.

    Usage is recorded once per request. A tracker failure while recording
    a *failed* request is logged and does not replace the original error.

    Parameters
    ----------
    request : Request
        The request object; used to reach ``app.state.risk_engine``.
    event : ReliabilityEvent
        The incident event; ``component``, ``latency_p99`` and
        ``error_rate`` are read.
    background_tasks : BackgroundTasks
        Passed through to ``tracker.increment_usage_async``.
    quota : dict
        Injected by ``enforce_quota``; ``api_key`` and ``tier`` are read.

    Returns
    -------
    dict
        A dictionary with keys ``deprecation_notice``, ``healing_intent``,
        ``causal_explanation`` and ``utility_decision``.

    Raises
    ------
    HTTPException
        Re-raised unchanged if raised during evaluation; otherwise any
        other exception is converted to a 500 whose ``detail`` is the
        exception message.
    """
    # Monotonic clock for the elapsed-time measurement; wall-clock time is
    # still used for the record's ``timestamp`` field.
    started = time.perf_counter()
    api_key: str = quota["api_key"]

    try:
        response_data = _build_incident_response(request, event)
    except HTTPException:
        raise
    except Exception as exc:
        error_msg = str(exc)
        logger.exception(
            "Deprecated endpoint %s failed", _INCIDENT_EVALUATE_ENDPOINT
        )
        try:
            await _record_incident_usage(
                quota=quota,
                event=event,
                background_tasks=background_tasks,
                started=started,
                error=error_msg,
            )
        except Exception:
            # Keep the evaluation error as the one the caller sees.
            logger.exception(
                "Could not record usage for failed %s request",
                _INCIDENT_EVALUATE_ENDPOINT,
            )
        raise HTTPException(status_code=500, detail=error_msg) from exc

    # Recorded outside the evaluation ``try`` so a tracker failure here is
    # not followed by a second attempt to record the same request.
    try:
        await _record_incident_usage(
            quota=quota,
            event=event,
            background_tasks=background_tasks,
            started=started,
            response=response_data,
        )
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception(
            "Could not record usage for %s", _INCIDENT_EVALUATE_ENDPOINT
        )
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    logger.warning(
        "Deprecated endpoint /v1/incidents/evaluate called by key %s",
        api_key[:8],
    )
    return response_data