Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

petter2025 committed on Dec 5, 2025

Commit

666a364

verified ·

1 Parent(s): d6c4007

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -3

app.py CHANGED Viewed

@@ -112,6 +112,127 @@ class Config:
 config = Config()
 HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
 # === Input Validation (FIXED: Comprehensive validation) ===
 def validate_component_id(component_id: str) -> Tuple[bool, str]:
@@ -1578,7 +1699,6 @@ class RateLimiter:
 rate_limiter = RateLimiter()
 # === Gradio UI ===
 def create_enhanced_ui():
     """
@@ -1586,6 +1706,7 @@ def create_enhanced_ui():
     FIXED: Uses native async handlers (no event loop creation)
     FIXED: Rate limiting on all endpoints
     """
     with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
@@ -1600,6 +1721,22 @@ def create_enhanced_ui():
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("### 📊 Telemetry Input")
                 component = gr.Dropdown(
                     choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
                     value="api-service",
@@ -1699,6 +1836,41 @@ def create_enhanced_ui():
                     )
             gr.Markdown("\n\n".join(policy_info))
         # FIXED: Native async handler (no event loop creation needed)
         async def submit_event_enhanced_async(
@@ -1818,17 +1990,17 @@ def create_enhanced_ui():
     return demo
 # === Main Entry Point ===
 if __name__ == "__main__":
     logger.info("=" * 80)
-    logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
     logger.info("=" * 80)
     logger.info(f"Python version: {os.sys.version}")
     logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
     logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
     logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
     logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
     logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
     logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
     logger.info("=" * 80)

 config = Config()
 HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
+# === Demo Scenarios for Hackathon Presentations ===
+DEMO_SCENARIOS = {
+    "🛍️ Black Friday Crisis": {
+        "description": "2:47 AM on Black Friday. Payment processing is failing. \$50K/minute at risk.",
+        "component": "payment-service",
+        "latency": 450,
+        "error_rate": 0.22,
+        "throughput": 8500,
+        "cpu_util": 0.95,
+        "memory_util": 0.88,
+        "story": """
+**SCENARIO: Black Friday Payment Crisis**
+🕐 **Time:** 2:47 AM EST
+💰 **Revenue at Risk:** \$50,000 per minute
+🔥 **Status:** CRITICAL
+Your payment service is buckling under Black Friday load. Database connection pool
+is exhausted. Customers are abandoning carts. Every minute of downtime costs \$50K.
+Traditional monitoring would alert you at 500ms latency - by then you've lost \$200K.
+**Watch ARF prevent this disaster...**
+        """
+    },
+    "🚨 Database Meltdown": {
+        "description": "Connection pool exhausted. Cascading failures across 5 services.",
+        "component": "database",
+        "latency": 850,
+        "error_rate": 0.35,
+        "throughput": 450,
+        "cpu_util": 0.78,
+        "memory_util": 0.98,
+        "story": """
+**SCENARIO: Database Connection Pool Exhaustion**
+🕐 **Time:** 11:23 AM
+⚠️ **Impact:** 5 services affected
+🔥 **Status:** CRITICAL
+Your primary database has hit max connections. API calls are timing out.
+Errors are cascading to dependent services. Customer support calls spiking.
+This is a textbook cascading failure scenario.
+**See how ARF identifies root cause in seconds...**
+        """
+    },
+    "⚡ Viral Traffic Spike": {
+        "description": "Viral tweet drives 10x traffic. Infrastructure straining.",
+        "component": "api-service",
+        "latency": 280,
+        "error_rate": 0.12,
+        "throughput": 15000,
+        "cpu_util": 0.88,
+        "memory_util": 0.65,
+        "story": """
+**SCENARIO: Unexpected Viral Traffic**
+🕐 **Time:** 3:15 PM
+📈 **Traffic Spike:** 10x normal load
+⚠️ **Status:** HIGH
+A celebrity just tweeted about your product. Traffic jumped from 1,500 to 15,000
+requests/sec. Your auto-scaling is struggling to keep up. Latency is climbing.
+You have maybe 15 minutes before this becomes a full outage.
+**Watch ARF predict the failure and trigger scaling...**
+        """
+    },
+    "🔥 Memory Leak Discovery": {
+        "description": "Slow memory leak detected. 18 minutes until OOM crash.",
+        "component": "cache-service",
+        "latency": 320,
+        "error_rate": 0.05,
+        "throughput": 2200,
+        "cpu_util": 0.45,
+        "memory_util": 0.94,
+        "story": """
+**SCENARIO: Memory Leak Time Bomb**
+🕐 **Time:** 9:42 PM
+💾 **Memory:** 94% (climbing 2%/hour)
+⏰ **Time to Crash:** ~18 minutes
+A memory leak has been slowly growing for 8 hours. Most monitoring tools won't
+catch this until it's too late. At current trajectory, the service crashes at 10 PM.
+That's right when your international users come online.
+**See ARF's predictive engine spot this before disaster...**
+        """
+    },
+    "✅ Normal Operations": {
+        "description": "Everything running smoothly - baseline metrics.",
+        "component": "api-service",
+        "latency": 85,
+        "error_rate": 0.008,
+        "throughput": 1200,
+        "cpu_util": 0.35,
+        "memory_util": 0.42,
+        "story": """
+**SCENARIO: Healthy System Baseline**
+🕐 **Time:** 2:30 PM
+✅ **Status:** NORMAL
+📊 **All Metrics:** Within range
+This is what good looks like. All services running smoothly.
+Use this to show how ARF distinguishes between normal operations and actual incidents.
+**Intelligent anomaly detection prevents alert fatigue...**
+        """
+    }
+}
 # === Input Validation (FIXED: Comprehensive validation) ===
 def validate_component_id(component_id: str) -> Tuple[bool, str]:
 rate_limiter = RateLimiter()
 # === Gradio UI ===
 def create_enhanced_ui():
     """
     FIXED: Uses native async handlers (no event loop creation)
     FIXED: Rate limiting on all endpoints
+    NEW: Demo scenarios for killer presentations
     """
     with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("### 📊 Telemetry Input")
+                # NEW: Demo Scenarios Dropdown
+                with gr.Row():
+                    scenario_dropdown = gr.Dropdown(
+                        choices=["Manual Entry"] + list(DEMO_SCENARIOS.keys()),
+                        value="Manual Entry",
+                        label="🎬 Demo Scenario (Quick Start)",
+                        info="Select a pre-configured scenario or enter manually"
+                    )
+                # NEW: Scenario Story Display
+                scenario_story = gr.Markdown(
+                    value="*Select a demo scenario above for a pre-configured incident, or enter values manually below.*",
+                    visible=True
+                )
                 component = gr.Dropdown(
                     choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
                     value="api-service",
                     )
             gr.Markdown("\n\n".join(policy_info))
+        # NEW: Scenario change handler
+        def on_scenario_change(scenario_name):
+            """Update input fields when demo scenario is selected"""
+            if scenario_name == "Manual Entry":
+                return {
+                    scenario_story: gr.update(value="*Enter values manually below.*"),
+                    component: gr.update(value="api-service"),
+                    latency: gr.update(value=100),
+                    error_rate: gr.update(value=0.02),
+                    throughput: gr.update(value=1000),
+                    cpu_util: gr.update(value=0.4),
+                    memory_util: gr.update(value=0.3)
+                }
+            scenario = DEMO_SCENARIOS.get(scenario_name)
+            if not scenario:
+                return {}
+            return {
+                scenario_story: gr.update(value=scenario["story"]),
+                component: gr.update(value=scenario["component"]),
+                latency: gr.update(value=scenario["latency"]),
+                error_rate: gr.update(value=scenario["error_rate"]),
+                throughput: gr.update(value=scenario["throughput"]),
+                cpu_util: gr.update(value=scenario.get("cpu_util", 0.5)),
+                memory_util: gr.update(value=scenario.get("memory_util", 0.5))
+            }
+        # Connect scenario dropdown to inputs
+        scenario_dropdown.change(
+            fn=on_scenario_change,
+            inputs=[scenario_dropdown],
+            outputs=[scenario_story, component, latency, error_rate, throughput, cpu_util, memory_util]
+        )
         # FIXED: Native async handler (no event loop creation needed)
         async def submit_event_enhanced_async(
     return demo
 # === Main Entry Point ===
 if __name__ == "__main__":
     logger.info("=" * 80)
+    logger.info("Starting Enterprise Agentic Reliability Framework (DEMO READY VERSION)")
     logger.info("=" * 80)
     logger.info(f"Python version: {os.sys.version}")
     logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
     logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
     logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
     logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
+    logger.info(f"Demo scenarios loaded: {len(DEMO_SCENARIOS)}")
     logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
     logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
     logger.info("=" * 80)