Update app.py
Browse files
app.py
CHANGED
|
@@ -112,6 +112,127 @@ class Config:
|
|
| 112 |
config = Config()
|
| 113 |
HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
# === Input Validation (FIXED: Comprehensive validation) ===
|
| 117 |
def validate_component_id(component_id: str) -> Tuple[bool, str]:
|
|
@@ -1578,7 +1699,6 @@ class RateLimiter:
|
|
| 1578 |
|
| 1579 |
rate_limiter = RateLimiter()
|
| 1580 |
|
| 1581 |
-
|
| 1582 |
# === Gradio UI ===
|
| 1583 |
def create_enhanced_ui():
|
| 1584 |
"""
|
|
@@ -1586,6 +1706,7 @@ def create_enhanced_ui():
|
|
| 1586 |
|
| 1587 |
FIXED: Uses native async handlers (no event loop creation)
|
| 1588 |
FIXED: Rate limiting on all endpoints
|
|
|
|
| 1589 |
"""
|
| 1590 |
|
| 1591 |
with gr.Blocks(title="π§ Agentic Reliability Framework", theme="soft") as demo:
|
|
@@ -1600,6 +1721,22 @@ def create_enhanced_ui():
|
|
| 1600 |
with gr.Row():
|
| 1601 |
with gr.Column(scale=1):
|
| 1602 |
gr.Markdown("### π Telemetry Input")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1603 |
component = gr.Dropdown(
|
| 1604 |
choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
|
| 1605 |
value="api-service",
|
|
@@ -1699,6 +1836,41 @@ def create_enhanced_ui():
|
|
| 1699 |
)
|
| 1700 |
|
| 1701 |
gr.Markdown("\n\n".join(policy_info))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1702 |
|
| 1703 |
# FIXED: Native async handler (no event loop creation needed)
|
| 1704 |
async def submit_event_enhanced_async(
|
|
@@ -1818,17 +1990,17 @@ def create_enhanced_ui():
|
|
| 1818 |
|
| 1819 |
return demo
|
| 1820 |
|
| 1821 |
-
|
| 1822 |
# === Main Entry Point ===
|
| 1823 |
if __name__ == "__main__":
|
| 1824 |
logger.info("=" * 80)
|
| 1825 |
-
logger.info("Starting Enterprise Agentic Reliability Framework (
|
| 1826 |
logger.info("=" * 80)
|
| 1827 |
logger.info(f"Python version: {os.sys.version}")
|
| 1828 |
logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
|
| 1829 |
logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
|
| 1830 |
logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
|
| 1831 |
logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
|
|
|
|
| 1832 |
logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
|
| 1833 |
logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
|
| 1834 |
logger.info("=" * 80)
|
|
|
|
| 112 |
config = Config()
|
| 113 |
HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
|
| 114 |
|
| 115 |
+
# === Demo Scenarios for Hackathon Presentations ===
|
| 116 |
+
DEMO_SCENARIOS = {
|
| 117 |
+
"ποΈ Black Friday Crisis": {
|
| 118 |
+
"description": "2:47 AM on Black Friday. Payment processing is failing. \$50K/minute at risk.",
|
| 119 |
+
"component": "payment-service",
|
| 120 |
+
"latency": 450,
|
| 121 |
+
"error_rate": 0.22,
|
| 122 |
+
"throughput": 8500,
|
| 123 |
+
"cpu_util": 0.95,
|
| 124 |
+
"memory_util": 0.88,
|
| 125 |
+
"story": """
|
| 126 |
+
**SCENARIO: Black Friday Payment Crisis**
|
| 127 |
+
|
| 128 |
+
π **Time:** 2:47 AM EST
|
| 129 |
+
π° **Revenue at Risk:** \$50,000 per minute
|
| 130 |
+
π₯ **Status:** CRITICAL
|
| 131 |
+
|
| 132 |
+
Your payment service is buckling under Black Friday load. Database connection pool
|
| 133 |
+
is exhausted. Customers are abandoning carts. Every minute of downtime costs \$50K.
|
| 134 |
+
|
| 135 |
+
Traditional monitoring would alert you at 500ms latency - by then you've lost \$200K.
|
| 136 |
+
|
| 137 |
+
**Watch ARF prevent this disaster...**
|
| 138 |
+
"""
|
| 139 |
+
},
|
| 140 |
+
|
| 141 |
+
"π¨ Database Meltdown": {
|
| 142 |
+
"description": "Connection pool exhausted. Cascading failures across 5 services.",
|
| 143 |
+
"component": "database",
|
| 144 |
+
"latency": 850,
|
| 145 |
+
"error_rate": 0.35,
|
| 146 |
+
"throughput": 450,
|
| 147 |
+
"cpu_util": 0.78,
|
| 148 |
+
"memory_util": 0.98,
|
| 149 |
+
"story": """
|
| 150 |
+
**SCENARIO: Database Connection Pool Exhaustion**
|
| 151 |
+
|
| 152 |
+
π **Time:** 11:23 AM
|
| 153 |
+
β οΈ **Impact:** 5 services affected
|
| 154 |
+
π₯ **Status:** CRITICAL
|
| 155 |
+
|
| 156 |
+
Your primary database has hit max connections. API calls are timing out.
|
| 157 |
+
Errors are cascading to dependent services. Customer support calls spiking.
|
| 158 |
+
|
| 159 |
+
This is a textbook cascading failure scenario.
|
| 160 |
+
|
| 161 |
+
**See how ARF identifies root cause in seconds...**
|
| 162 |
+
"""
|
| 163 |
+
},
|
| 164 |
+
|
| 165 |
+
"β‘ Viral Traffic Spike": {
|
| 166 |
+
"description": "Viral tweet drives 10x traffic. Infrastructure straining.",
|
| 167 |
+
"component": "api-service",
|
| 168 |
+
"latency": 280,
|
| 169 |
+
"error_rate": 0.12,
|
| 170 |
+
"throughput": 15000,
|
| 171 |
+
"cpu_util": 0.88,
|
| 172 |
+
"memory_util": 0.65,
|
| 173 |
+
"story": """
|
| 174 |
+
**SCENARIO: Unexpected Viral Traffic**
|
| 175 |
+
|
| 176 |
+
π **Time:** 3:15 PM
|
| 177 |
+
π **Traffic Spike:** 10x normal load
|
| 178 |
+
β οΈ **Status:** HIGH
|
| 179 |
+
|
| 180 |
+
A celebrity just tweeted about your product. Traffic jumped from 1,500 to 15,000
|
| 181 |
+
requests/sec. Your auto-scaling is struggling to keep up. Latency is climbing.
|
| 182 |
+
|
| 183 |
+
You have maybe 15 minutes before this becomes a full outage.
|
| 184 |
+
|
| 185 |
+
**Watch ARF predict the failure and trigger scaling...**
|
| 186 |
+
"""
|
| 187 |
+
},
|
| 188 |
+
|
| 189 |
+
"π₯ Memory Leak Discovery": {
|
| 190 |
+
"description": "Slow memory leak detected. 18 minutes until OOM crash.",
|
| 191 |
+
"component": "cache-service",
|
| 192 |
+
"latency": 320,
|
| 193 |
+
"error_rate": 0.05,
|
| 194 |
+
"throughput": 2200,
|
| 195 |
+
"cpu_util": 0.45,
|
| 196 |
+
"memory_util": 0.94,
|
| 197 |
+
"story": """
|
| 198 |
+
**SCENARIO: Memory Leak Time Bomb**
|
| 199 |
+
|
| 200 |
+
π **Time:** 9:42 PM
|
| 201 |
+
πΎ **Memory:** 94% (climbing 2%/hour)
|
| 202 |
+
β° **Time to Crash:** ~18 minutes
|
| 203 |
+
|
| 204 |
+
A memory leak has been slowly growing for 8 hours. Most monitoring tools won't
|
| 205 |
+
catch this until it's too late. At current trajectory, the service crashes at 10 PM.
|
| 206 |
+
|
| 207 |
+
That's right when your international users come online.
|
| 208 |
+
|
| 209 |
+
**See ARF's predictive engine spot this before disaster...**
|
| 210 |
+
"""
|
| 211 |
+
},
|
| 212 |
+
|
| 213 |
+
"β
Normal Operations": {
|
| 214 |
+
"description": "Everything running smoothly - baseline metrics.",
|
| 215 |
+
"component": "api-service",
|
| 216 |
+
"latency": 85,
|
| 217 |
+
"error_rate": 0.008,
|
| 218 |
+
"throughput": 1200,
|
| 219 |
+
"cpu_util": 0.35,
|
| 220 |
+
"memory_util": 0.42,
|
| 221 |
+
"story": """
|
| 222 |
+
**SCENARIO: Healthy System Baseline**
|
| 223 |
+
|
| 224 |
+
π **Time:** 2:30 PM
|
| 225 |
+
β
**Status:** NORMAL
|
| 226 |
+
π **All Metrics:** Within range
|
| 227 |
+
|
| 228 |
+
This is what good looks like. All services running smoothly.
|
| 229 |
+
|
| 230 |
+
Use this to show how ARF distinguishes between normal operations and actual incidents.
|
| 231 |
+
|
| 232 |
+
**Intelligent anomaly detection prevents alert fatigue...**
|
| 233 |
+
"""
|
| 234 |
+
}
|
| 235 |
+
}
|
| 236 |
|
| 237 |
# === Input Validation (FIXED: Comprehensive validation) ===
|
| 238 |
def validate_component_id(component_id: str) -> Tuple[bool, str]:
|
|
|
|
| 1699 |
|
| 1700 |
rate_limiter = RateLimiter()
|
| 1701 |
|
|
|
|
| 1702 |
# === Gradio UI ===
|
| 1703 |
def create_enhanced_ui():
|
| 1704 |
"""
|
|
|
|
| 1706 |
|
| 1707 |
FIXED: Uses native async handlers (no event loop creation)
|
| 1708 |
FIXED: Rate limiting on all endpoints
|
| 1709 |
+
NEW: Demo scenarios for killer presentations
|
| 1710 |
"""
|
| 1711 |
|
| 1712 |
with gr.Blocks(title="π§ Agentic Reliability Framework", theme="soft") as demo:
|
|
|
|
| 1721 |
with gr.Row():
|
| 1722 |
with gr.Column(scale=1):
|
| 1723 |
gr.Markdown("### π Telemetry Input")
|
| 1724 |
+
|
| 1725 |
+
# NEW: Demo Scenarios Dropdown
|
| 1726 |
+
with gr.Row():
|
| 1727 |
+
scenario_dropdown = gr.Dropdown(
|
| 1728 |
+
choices=["Manual Entry"] + list(DEMO_SCENARIOS.keys()),
|
| 1729 |
+
value="Manual Entry",
|
| 1730 |
+
label="π¬ Demo Scenario (Quick Start)",
|
| 1731 |
+
info="Select a pre-configured scenario or enter manually"
|
| 1732 |
+
)
|
| 1733 |
+
|
| 1734 |
+
# NEW: Scenario Story Display
|
| 1735 |
+
scenario_story = gr.Markdown(
|
| 1736 |
+
value="*Select a demo scenario above for a pre-configured incident, or enter values manually below.*",
|
| 1737 |
+
visible=True
|
| 1738 |
+
)
|
| 1739 |
+
|
| 1740 |
component = gr.Dropdown(
|
| 1741 |
choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
|
| 1742 |
value="api-service",
|
|
|
|
| 1836 |
)
|
| 1837 |
|
| 1838 |
gr.Markdown("\n\n".join(policy_info))
|
| 1839 |
+
|
| 1840 |
+
# NEW: Scenario change handler
|
| 1841 |
+
def on_scenario_change(scenario_name):
    """Build the Gradio updates that load a demo scenario into the input form.

    Args:
        scenario_name: Value selected in ``scenario_dropdown``; either the
            literal ``"Manual Entry"`` or a key of ``DEMO_SCENARIOS``.

    Returns:
        A dict keyed by output component. For ``"Manual Entry"`` it resets
        the story text and every telemetry input to fixed defaults. For a
        known scenario it carries that scenario's story and metrics. For an
        unknown or empty scenario it carries a single no-op update so the
        form is left as it is.

    Raises:
        KeyError: If a known scenario lacks one of the required keys
            ``story``, ``component``, ``latency``, ``error_rate`` or
            ``throughput`` (``cpu_util`` / ``memory_util`` fall back to 0.5).
    """
    if scenario_name == "Manual Entry":
        story_text = "*Enter values manually below.*"
        field_values = {
            component: "api-service",
            latency: 100,
            error_rate: 0.02,
            throughput: 1000,
            cpu_util: 0.4,
            memory_util: 0.3,
        }
    else:
        scenario = DEMO_SCENARIOS.get(scenario_name)
        if not scenario:
            # NOTE(review): the original returned ``{}`` here. Gradio appears
            # to treat only a non-empty dict as a component-keyed return, so
            # an empty dict would be checked against all seven outputs and
            # rejected - confirm against the installed Gradio version.
            # A single argument-less update changes nothing and is valid
            # either way.
            return {scenario_story: gr.update()}

        story_text = scenario["story"]
        field_values = {
            component: scenario["component"],
            latency: scenario["latency"],
            error_rate: scenario["error_rate"],
            throughput: scenario["throughput"],
            # Utilisation figures are optional in a scenario definition.
            cpu_util: scenario.get("cpu_util", 0.5),
            memory_util: scenario.get("memory_util", 0.5),
        }

    # One place turns plain values into Gradio updates for both branches.
    updates = {scenario_story: gr.update(value=story_text)}
    for target, new_value in field_values.items():
        updates[target] = gr.update(value=new_value)
    return updates
|
| 1867 |
+
|
| 1868 |
+
# Connect scenario dropdown to inputs
|
| 1869 |
+
scenario_dropdown.change(
|
| 1870 |
+
fn=on_scenario_change,
|
| 1871 |
+
inputs=[scenario_dropdown],
|
| 1872 |
+
outputs=[scenario_story, component, latency, error_rate, throughput, cpu_util, memory_util]
|
| 1873 |
+
)
|
| 1874 |
|
| 1875 |
# FIXED: Native async handler (no event loop creation needed)
|
| 1876 |
async def submit_event_enhanced_async(
|
|
|
|
| 1990 |
|
| 1991 |
return demo
|
| 1992 |
|
|
|
|
| 1993 |
# === Main Entry Point ===
|
| 1994 |
if __name__ == "__main__":
|
| 1995 |
logger.info("=" * 80)
|
| 1996 |
+
logger.info("Starting Enterprise Agentic Reliability Framework (DEMO READY VERSION)")
|
| 1997 |
logger.info("=" * 80)
|
| 1998 |
logger.info(f"Python version: {os.sys.version}")
|
| 1999 |
logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
|
| 2000 |
logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
|
| 2001 |
logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
|
| 2002 |
logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
|
| 2003 |
+
logger.info(f"Demo scenarios loaded: {len(DEMO_SCENARIOS)}")
|
| 2004 |
logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
|
| 2005 |
logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
|
| 2006 |
logger.info("=" * 80)
|