Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Runtime error

App Files Files Community

petter2025 committed 30 days ago

Commit

e93eabb

verified ·

1 Parent(s): fa2c5e7

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -88

app.py CHANGED Viewed

@@ -3,21 +3,24 @@ import asyncio
 import json
 import logging
 import traceback
-import random
 from datetime import datetime
-# Import the base engine
 from agentic_reliability_framework.runtime.engine import EnhancedReliabilityEngine
-# Import our new AI components
-from ai_event import AIEvent
 from hallucination_detective import HallucinationDetectiveAgent
 from memory_drift_diagnostician import MemoryDriftDiagnosticianAgent
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Initialize the engine (for infrastructure analysis)
 try:
     logger.info("Initializing EnhancedReliabilityEngine...")
     engine = EnhancedReliabilityEngine()
@@ -26,128 +29,160 @@ except Exception as e:
     logger.error(f"Failed to initialize engine: {e}\n{traceback.format_exc()}")
     engine = None
-# AI agents (initialize once)
-hallucination_detective = HallucinationDetectiveAgent()
 memory_drift_diagnostician = MemoryDriftDiagnosticianAgent()
-async def analyze_infrastructure(component, latency, error_rate, throughput, cpu_util, memory_util):
-    """Original infrastructure analysis."""
-    if engine is None:
-        return json.dumps({"error": "Engine failed to initialize. Check logs."}, indent=2)
     try:
-        result = await engine.process_event_enhanced(
-            component=component,
-            latency=float(latency),
-            error_rate=float(error_rate),
-            throughput=float(throughput) if throughput else 1000.0,
-            cpu_util=float(cpu_util) if cpu_util else None,
-            memory_util=float(memory_util) if memory_util else None
         )
-        return json.dumps(result, indent=2)
     except Exception as e:
-        logger.error(f"Infrastructure analysis error: {e}\n{traceback.format_exc()}")
-        return json.dumps({"error": str(e), "traceback": traceback.format_exc()}, indent=2)
-async def analyze_ai(component, prompt, model_name, model_version, confidence, perplexity, retrieval_score):
-    """AI reliability analysis."""
     try:
-        # Simulate a response (in a real app, call an actual model)
-        response = f"Mock response to: {prompt}"
         # Create AIEvent
         event = AIEvent(
             timestamp=datetime.utcnow(),
-            component=component,
             service_mesh="ai",
-            latency_p99=random.uniform(100, 500),
             error_rate=0.0,
             throughput=1,
             cpu_util=None,
             memory_util=None,
-            model_name=model_name,
-            model_version=model_version,
             prompt=prompt,
             response=response,
             response_length=len(response),
             confidence=confidence,
-            perplexity=perplexity,
-            retrieval_scores=[retrieval_score],
             user_feedback=None,
-            latency_ms=random.uniform(200, 800)
         )
         # Run agents
         hallu_result = await hallucination_detective.analyze(event)
         drift_result = await memory_drift_diagnostician.analyze(event)
-        # Combine results
         result = {
             "hallucination_detection": hallu_result,
             "memory_drift_detection": drift_result,
-            "response": response
         }
         return json.dumps(result, indent=2)
     except Exception as e:
         logger.error(f"AI analysis error: {e}\n{traceback.format_exc()}")
         return json.dumps({"error": str(e), "traceback": traceback.format_exc()}, indent=2)
-def sync_infrastructure(*args):
-    return asyncio.run(analyze_infrastructure(*args))
-def sync_ai(*args):
-    return asyncio.run(analyze_ai(*args))
-# Build the Gradio interface with tabs
-with gr.Blocks(title="ARF v4 – Reliability Lab", theme="soft") as demo:
-    gr.Markdown("# 🧠 Agentic Reliability Framework v4\n**Infrastructure & AI Reliability**")
-    with gr.Tabs():
-        with gr.TabItem("Infrastructure"):
-            gr.Markdown("Enter telemetry to analyze infrastructure incidents.")
-            with gr.Row():
-                with gr.Column():
-                    component = gr.Dropdown(
-                        choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
-                        value="api-service", label="Component"
-                    )
-                    latency = gr.Slider(10, 1000, value=100, label="Latency P99 (ms)")
-                    error_rate = gr.Slider(0, 0.5, value=0.02, step=0.001, label="Error Rate")
-                    throughput = gr.Number(value=1000, label="Throughput (req/s)")
-                    cpu_util = gr.Slider(0, 1, value=0.4, label="CPU Utilization")
-                    memory_util = gr.Slider(0, 1, value=0.3, label="Memory Utilization")
-                    infra_submit = gr.Button("Analyze Infrastructure", variant="primary")
-                with gr.Column():
-                    infra_output = gr.JSON(label="Analysis Result")
-            infra_submit.click(
-                fn=sync_infrastructure,
-                inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
-                outputs=infra_output
             )
-        with gr.TabItem("AI Reliability"):
-            gr.Markdown("Simulate an AI query to detect hallucinations and memory drift.")
-            with gr.Row():
-                with gr.Column():
-                    ai_component = gr.Dropdown(
-                        choices=["chat", "code", "summary"], label="Task Type", value="chat"
-                    )
-                    prompt = gr.Textbox(label="Prompt", value="What is the capital of France?")
-                    model_name = gr.Dropdown(["gpt-3.5", "gpt-4", "claude"], label="Model", value="gpt-4")
-                    model_version = gr.Textbox(value="v1", label="Version")
-                    confidence = gr.Slider(0, 1, value=0.95, label="Model Confidence")
-                    perplexity = gr.Slider(0, 50, value=5, label="Perplexity")
-                    retrieval_score = gr.Slider(0, 1, value=0.8, label="Retrieval Score")
-                    ai_submit = gr.Button("Analyze AI", variant="primary")
-                with gr.Column():
-                    ai_output = gr.JSON(label="Analysis Result")
-            ai_submit.click(
-                fn=sync_ai,
-                inputs=[ai_component, prompt, model_name, model_version, confidence, perplexity, retrieval_score],
-                outputs=ai_output
             )
     gr.Markdown("""
     ---
-    [📖 Tutorial](https://github.com/petter2025us/agentic-reliability-framework/blob/main/TUTORIAL.md) |
-    [🐙 GitHub](https://github.com/petter2025us/agentic-reliability-framework) |
-    [💼 Enterprise](mailto:petter2025us@outlook.com)
     """)
 if __name__ == "__main__":

 import json
 import logging
 import traceback
+import os
+import numpy as np
 from datetime import datetime
+from transformers import pipeline, set_seed
+import torch
+# Import our components
 from agentic_reliability_framework.runtime.engine import EnhancedReliabilityEngine
 from hallucination_detective import HallucinationDetectiveAgent
 from memory_drift_diagnostician import MemoryDriftDiagnosticianAgent
+from ai_event import AIEvent
+from ai_risk_engine import AIRiskEngine
+from nli_detector import NLIDetector
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Initialize infrastructure engine (optional)
 try:
     logger.info("Initializing EnhancedReliabilityEngine...")
     engine = EnhancedReliabilityEngine()
     logger.error(f"Failed to initialize engine: {e}\n{traceback.format_exc()}")
     engine = None
+# Load generative model (small autoregressive)
+gen_model_name = "microsoft/DialoGPT-small"
+try:
+    generator = pipeline('text-generation', model=gen_model_name, device=0 if torch.cuda.is_available() else -1)
+    logger.info(f"Generator {gen_model_name} loaded.")
+except Exception as e:
+    logger.error(f"Failed to load generator: {e}")
+    generator = None
+# Load NLI detector
+nli_detector = NLIDetector()
+# AI agents
+hallucination_detective = HallucinationDetectiveAgent(nli_detector=nli_detector)
 memory_drift_diagnostician = MemoryDriftDiagnosticianAgent()
+# AI risk engine
+ai_risk_engine = AIRiskEngine()
+# In‑memory storage for last event to attach feedback
+last_ai_event = None
+last_ai_category = None
+async def generate_response(prompt: str, max_length: int = 100) -> tuple:
+    """Generate response using the small autoregressive model."""
+    if generator is None:
+        return "[Model not loaded]", 0.0, "Model loading failed"
     try:
+        loop = asyncio.get_event_loop()
+        # We need to compute confidence; text-generation pipeline returns text but not logits.
+        # For simplicity, we'll set confidence based on a heuristic (e.g., generation length?).
+        # Alternatively, use a model that returns probabilities.
+        # Let's use a simple placeholder: confidence = 0.8 if generation succeeds.
+        # In practice, we'd need to access logits.
+        result = await loop.run_in_executor(
+            None,
+            lambda: generator(prompt, max_new_tokens=max_length, return_full_text=False)
         )
+        response = result[0]['generated_text']
+        # Placeholder confidence
+        confidence = 0.8
+        return response, confidence, ""
     except Exception as e:
+        logger.error(f"Generation error: {e}")
+        return "", 0.0, str(e)
+async def analyze_ai(task_type, prompt):
+    global last_ai_event, last_ai_category
     try:
+        # Generate response
+        response, confidence, error = await generate_response(prompt)
+        if error:
+            return json.dumps({"error": error}, indent=2)
         # Create AIEvent
         event = AIEvent(
             timestamp=datetime.utcnow(),
+            component="ai",
             service_mesh="ai",
+            latency_p99=0,
             error_rate=0.0,
             throughput=1,
             cpu_util=None,
             memory_util=None,
+            action_category=task_type,
+            model_name=gen_model_name,
+            model_version="latest",
             prompt=prompt,
             response=response,
             response_length=len(response),
             confidence=confidence,
+            perplexity=None,
+            retrieval_scores=None,
             user_feedback=None,
+            latency_ms=0
         )
+        last_ai_event = event
+        last_ai_category = task_type
         # Run agents
         hallu_result = await hallucination_detective.analyze(event)
         drift_result = await memory_drift_diagnostician.analyze(event)
+        # Get current risk metrics
+        risk_metrics = ai_risk_engine.risk_score(task_type)
         result = {
+            "response": response,
+            "confidence": confidence,
             "hallucination_detection": hallu_result,
             "memory_drift_detection": drift_result,
+            "risk_metrics": risk_metrics
         }
         return json.dumps(result, indent=2)
     except Exception as e:
         logger.error(f"AI analysis error: {e}\n{traceback.format_exc()}")
         return json.dumps({"error": str(e), "traceback": traceback.format_exc()}, indent=2)
+def feedback(thumbs_up: bool):
+    """Handle user feedback to update Beta priors."""
+    global last_ai_category, last_ai_event
+    if last_ai_category is None:
+        return "No previous analysis to rate."
+    ai_risk_engine.update_outcome(last_ai_category, success=thumbs_up)
+    # Optionally, also update the event with feedback
+    if last_ai_event:
+        last_ai_event.user_feedback = thumbs_up
+    return f"Feedback recorded: {'👍' if thumbs_up else '👎'} for {last_ai_category}."
+# Build the Gradio interface
+with gr.Blocks(title="ARF v4 – AI Reliability Lab", theme="soft") as demo:
+    gr.Markdown("# 🧠 ARF v4 – AI Reliability Lab\n**Detect hallucinations and drift in generative AI**")
+    with gr.Row():
+        with gr.Column():
+            task_type = gr.Dropdown(
+                choices=["chat", "code", "summary"],
+                value="chat",
+                label="Task Type"
             )
+            prompt = gr.Textbox(
+                label="Prompt",
+                value="What is the capital of France?",
+                lines=3
             )
+            analyze_btn = gr.Button("Analyze", variant="primary")
+        with gr.Column():
+            output = gr.JSON(label="Analysis Result")
+    with gr.Row():
+        feedback_btn_up = gr.Button("👍 Correct")
+        feedback_btn_down = gr.Button("👎 Incorrect")
+        feedback_msg = gr.Textbox(label="Feedback", interactive=False)
+    analyze_btn.click(
+        fn=analyze_ai,
+        inputs=[task_type, prompt],
+        outputs=output
+    )
+    feedback_btn_up.click(
+        fn=lambda: feedback(True),
+        outputs=feedback_msg
+    )
+    feedback_btn_down.click(
+        fn=lambda: feedback(False),
+        outputs=feedback_msg
+    )
     gr.Markdown("""
     ---
+    - **Model**: `microsoft/DialoGPT-small` (autoregressive, 117M params)
+    - **NLI Detector**: `typeform/distilroberta-base-mnli` (82M params)
+    - **Risk engine**: Beta conjugate priors per task category
+    - **Feedback** updates the posterior distribution
     """)
 if __name__ == "__main__":