petter2025 committed on
Commit
666a364
·
verified ·
1 Parent(s): d6c4007

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -3
app.py CHANGED
@@ -112,6 +112,127 @@ class Config:
112
  config = Config()
113
  HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  # === Input Validation (FIXED: Comprehensive validation) ===
117
  def validate_component_id(component_id: str) -> Tuple[bool, str]:
@@ -1578,7 +1699,6 @@ class RateLimiter:
1578
 
1579
  rate_limiter = RateLimiter()
1580
 
1581
-
1582
  # === Gradio UI ===
1583
  def create_enhanced_ui():
1584
  """
@@ -1586,6 +1706,7 @@ def create_enhanced_ui():
1586
 
1587
  FIXED: Uses native async handlers (no event loop creation)
1588
  FIXED: Rate limiting on all endpoints
 
1589
  """
1590
 
1591
  with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
@@ -1600,6 +1721,22 @@ def create_enhanced_ui():
1600
  with gr.Row():
1601
  with gr.Column(scale=1):
1602
  gr.Markdown("### πŸ“Š Telemetry Input")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1603
  component = gr.Dropdown(
1604
  choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
1605
  value="api-service",
@@ -1699,6 +1836,41 @@ def create_enhanced_ui():
1699
  )
1700
 
1701
  gr.Markdown("\n\n".join(policy_info))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1702
 
1703
  # FIXED: Native async handler (no event loop creation needed)
1704
  async def submit_event_enhanced_async(
@@ -1818,17 +1990,17 @@ def create_enhanced_ui():
1818
 
1819
  return demo
1820
 
1821
-
1822
  # === Main Entry Point ===
1823
  if __name__ == "__main__":
1824
  logger.info("=" * 80)
1825
- logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
1826
  logger.info("=" * 80)
1827
  logger.info(f"Python version: {os.sys.version}")
1828
  logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
1829
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
1830
  logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
1831
  logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
 
1832
  logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
1833
  logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
1834
  logger.info("=" * 80)
 
112
  config = Config()
113
  HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
114
 
115
# === Demo Scenarios for Hackathon Presentations ===
# Maps the label shown in the demo-scenario dropdown to a preset telemetry
# snapshot. Every entry carries the same fields:
#   description - one-line summary of the incident
#   component   - one of the component dropdown choices
#   latency, error_rate, throughput, cpu_util, memory_util - values copied
#                 into the matching input widgets
#   story       - Markdown narrative displayed above the inputs
#
# NOTE: the dollar amounts are written as "\\$" so the runtime text stays a
# backslash followed by "$" (presumably to stop the Markdown renderer from
# treating "$" as a math delimiter - confirm). A bare "\$" in a non-raw
# literal is an invalid escape sequence and triggers a warning at compile time.
DEMO_SCENARIOS = {
    "🛍️ Black Friday Crisis": {
        "description": "2:47 AM on Black Friday. Payment processing is failing. \\$50K/minute at risk.",
        "component": "payment-service",
        "latency": 450,
        "error_rate": 0.22,
        "throughput": 8500,
        "cpu_util": 0.95,
        "memory_util": 0.88,
        "story": """
**SCENARIO: Black Friday Payment Crisis**

🕐 **Time:** 2:47 AM EST
💰 **Revenue at Risk:** \\$50,000 per minute
🔥 **Status:** CRITICAL

Your payment service is buckling under Black Friday load. Database connection pool
is exhausted. Customers are abandoning carts. Every minute of downtime costs \\$50K.

Traditional monitoring would alert you at 500ms latency - by then you've lost \\$200K.

**Watch ARF prevent this disaster...**
""",
    },

    "🚨 Database Meltdown": {
        "description": "Connection pool exhausted. Cascading failures across 5 services.",
        "component": "database",
        "latency": 850,
        "error_rate": 0.35,
        "throughput": 450,
        "cpu_util": 0.78,
        "memory_util": 0.98,
        "story": """
**SCENARIO: Database Connection Pool Exhaustion**

🕐 **Time:** 11:23 AM
⚠️ **Impact:** 5 services affected
🔥 **Status:** CRITICAL

Your primary database has hit max connections. API calls are timing out.
Errors are cascading to dependent services. Customer support calls spiking.

This is a textbook cascading failure scenario.

**See how ARF identifies root cause in seconds...**
""",
    },

    "⚡ Viral Traffic Spike": {
        "description": "Viral tweet drives 10x traffic. Infrastructure straining.",
        "component": "api-service",
        "latency": 280,
        "error_rate": 0.12,
        "throughput": 15000,
        "cpu_util": 0.88,
        "memory_util": 0.65,
        "story": """
**SCENARIO: Unexpected Viral Traffic**

🕐 **Time:** 3:15 PM
📈 **Traffic Spike:** 10x normal load
⚠️ **Status:** HIGH

A celebrity just tweeted about your product. Traffic jumped from 1,500 to 15,000
requests/sec. Your auto-scaling is struggling to keep up. Latency is climbing.

You have maybe 15 minutes before this becomes a full outage.

**Watch ARF predict the failure and trigger scaling...**
""",
    },

    "🔥 Memory Leak Discovery": {
        "description": "Slow memory leak detected. 18 minutes until OOM crash.",
        "component": "cache-service",
        "latency": 320,
        "error_rate": 0.05,
        "throughput": 2200,
        "cpu_util": 0.45,
        "memory_util": 0.94,
        "story": """
**SCENARIO: Memory Leak Time Bomb**

🕐 **Time:** 9:42 PM
💾 **Memory:** 94% (climbing 2%/hour)
⏰ **Time to Crash:** ~18 minutes

A memory leak has been slowly growing for 8 hours. Most monitoring tools won't
catch this until it's too late. At current trajectory, the service crashes at 10 PM.

That's right when your international users come online.

**See ARF's predictive engine spot this before disaster...**
""",
    },

    "✅ Normal Operations": {
        "description": "Everything running smoothly - baseline metrics.",
        "component": "api-service",
        "latency": 85,
        "error_rate": 0.008,
        "throughput": 1200,
        "cpu_util": 0.35,
        "memory_util": 0.42,
        "story": """
**SCENARIO: Healthy System Baseline**

🕐 **Time:** 2:30 PM
✅ **Status:** NORMAL
📊 **All Metrics:** Within range

This is what good looks like. All services running smoothly.

Use this to show how ARF distinguishes between normal operations and actual incidents.

**Intelligent anomaly detection prevents alert fatigue...**
""",
    },
}
236
 
237
  # === Input Validation (FIXED: Comprehensive validation) ===
238
  def validate_component_id(component_id: str) -> Tuple[bool, str]:
 
1699
 
1700
  rate_limiter = RateLimiter()
1701
 
 
1702
  # === Gradio UI ===
1703
  def create_enhanced_ui():
1704
  """
 
1706
 
1707
  FIXED: Uses native async handlers (no event loop creation)
1708
  FIXED: Rate limiting on all endpoints
1709
+ NEW: Demo scenarios for killer presentations
1710
  """
1711
 
1712
  with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
 
1721
  with gr.Row():
1722
  with gr.Column(scale=1):
1723
  gr.Markdown("### πŸ“Š Telemetry Input")
1724
+
1725
+ # NEW: Demo Scenarios Dropdown
1726
+ with gr.Row():
1727
+ scenario_dropdown = gr.Dropdown(
1728
+ choices=["Manual Entry"] + list(DEMO_SCENARIOS.keys()),
1729
+ value="Manual Entry",
1730
+ label="🎬 Demo Scenario (Quick Start)",
1731
+ info="Select a pre-configured scenario or enter manually"
1732
+ )
1733
+
1734
+ # NEW: Scenario Story Display
1735
+ scenario_story = gr.Markdown(
1736
+ value="*Select a demo scenario above for a pre-configured incident, or enter values manually below.*",
1737
+ visible=True
1738
+ )
1739
+
1740
  component = gr.Dropdown(
1741
  choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
1742
  value="api-service",
 
1836
  )
1837
 
1838
  gr.Markdown("\n\n".join(policy_info))
1839
+
1840
+ # NEW: Scenario change handler
1841
def on_scenario_change(scenario_name):
    """Build the Gradio updates that load a demo scenario into the input form.

    Args:
        scenario_name: Value selected in the demo-scenario dropdown; either
            "Manual Entry" or a key of ``DEMO_SCENARIOS``.

    Returns:
        A dict mapping output components to ``gr.update(...)`` objects.
        "Manual Entry" resets the form to its default values, a known
        scenario copies its preset values and story, and an unrecognised
        name yields a single no-op update so the form is left unchanged.

    Raises:
        KeyError: If a scenario entry lacks one of the required fields
            ("story", "component", "latency", "error_rate", "throughput").
    """
    if scenario_name == "Manual Entry":
        story_text = "*Enter values manually below.*"
        preset = {
            "component": "api-service",
            "latency": 100,
            "error_rate": 0.02,
            "throughput": 1000,
            "cpu_util": 0.4,
            "memory_util": 0.3,
        }
    else:
        scenario = DEMO_SCENARIOS.get(scenario_name)
        if not scenario:
            # NOTE(review): Gradio appears to treat only a *non-empty* dict as
            # a component -> update mapping, so returning {} with several
            # outputs would error instead of being a no-op. Return one
            # explicit no-op update instead - confirm against the installed
            # Gradio version.
            return {scenario_story: gr.update()}

        story_text = scenario["story"]
        preset = {
            "component": scenario["component"],
            "latency": scenario["latency"],
            "error_rate": scenario["error_rate"],
            "throughput": scenario["throughput"],
            # Utilisation fields are optional in a scenario; fall back to 0.5.
            "cpu_util": scenario.get("cpu_util", 0.5),
            "memory_util": scenario.get("memory_util", 0.5),
        }

    return {
        scenario_story: gr.update(value=story_text),
        component: gr.update(value=preset["component"]),
        latency: gr.update(value=preset["latency"]),
        error_rate: gr.update(value=preset["error_rate"]),
        throughput: gr.update(value=preset["throughput"]),
        cpu_util: gr.update(value=preset["cpu_util"]),
        memory_util: gr.update(value=preset["memory_util"]),
    }
1867
+
1868
+ # Connect scenario dropdown to inputs
1869
+ scenario_dropdown.change(
1870
+ fn=on_scenario_change,
1871
+ inputs=[scenario_dropdown],
1872
+ outputs=[scenario_story, component, latency, error_rate, throughput, cpu_util, memory_util]
1873
+ )
1874
 
1875
  # FIXED: Native async handler (no event loop creation needed)
1876
  async def submit_event_enhanced_async(
 
1990
 
1991
  return demo
1992
 
 
1993
  # === Main Entry Point ===
1994
  if __name__ == "__main__":
1995
  logger.info("=" * 80)
1996
+ logger.info("Starting Enterprise Agentic Reliability Framework (DEMO READY VERSION)")
1997
  logger.info("=" * 80)
1998
  logger.info(f"Python version: {os.sys.version}")
1999
  logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
2000
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
2001
  logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
2002
  logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
2003
+ logger.info(f"Demo scenarios loaded: {len(DEMO_SCENARIOS)}")
2004
  logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
2005
  logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
2006
  logger.info("=" * 80)