Spaces:
Sleeping
Sleeping
| """Alerts configuration and history panel component.""" | |
| import gradio as gr | |
| import pandas as pd | |
| from datetime import datetime | |
| from typing import Dict, Any, Tuple, List | |
| from services.alerting import AlertEngine, AlertDispatcher, Alert, AlertSeverity | |
def create_alerts_panel(
    alert_engine: AlertEngine,
    alert_dispatcher: AlertDispatcher,
) -> Dict[str, Any]:
    """
    Create the alerts panel.

    Lays out two columns — alert tables (active + history) on the left,
    threshold/webhook configuration on the right — and wires the save/test
    buttons to the engine and dispatcher.

    Args:
        alert_engine: Alert engine instance
        alert_dispatcher: Alert dispatcher instance

    Returns:
        Dictionary of Gradio components
    """
    with gr.Column():
        with gr.Row():
            # Active alerts column
            with gr.Column(scale=2):
                gr.Markdown("### Active Alerts")
                active_alerts_table = gr.Dataframe(
                    headers=["Time", "Severity", "Metric", "Value", "Threshold", "Message"],
                    datatype=["str", "str", "str", "number", "number", "str"],
                    label="Active Alerts",
                    interactive=False,
                )

                gr.Markdown("### Alert History")
                alert_history_table = gr.Dataframe(
                    headers=["Time", "Severity", "Message", "Resolved"],
                    datatype=["str", "str", "str", "str"],
                    label="Recent Alerts",
                    interactive=False,
                )

            # Configuration column
            with gr.Column(scale=1):
                gr.Markdown("### Alert Configuration")
                kv_threshold = gr.Slider(
                    label="KV Cache Alert Threshold (%)",
                    minimum=50,
                    maximum=100,
                    value=90,
                    step=5,
                )
                gpu_memory_threshold = gr.Slider(
                    label="GPU Memory Alert Threshold (%)",
                    minimum=70,
                    maximum=100,
                    value=95,
                    step=5,
                )
                ttft_multiplier = gr.Slider(
                    label="TTFT Spike Multiplier",
                    minimum=1.5,
                    maximum=5,
                    value=2,
                    step=0.5,
                )
                throughput_drop = gr.Slider(
                    label="Throughput Drop Alert (%)",
                    minimum=20,
                    maximum=80,
                    value=50,
                    step=10,
                )

                gr.Markdown("### Webhook Configuration")
                # Secrets are masked in the UI via type="password".
                slack_webhook = gr.Textbox(
                    label="Slack Webhook URL",
                    placeholder="https://hooks.slack.com/services/...",
                    type="password",
                )
                pagerduty_key = gr.Textbox(
                    label="PagerDuty Routing Key",
                    placeholder="Enter routing key...",
                    type="password",
                )

                with gr.Row():
                    save_config_btn = gr.Button("Save Configuration")
                    test_alert_btn = gr.Button("Send Test Alert", variant="secondary")

                config_status = gr.Textbox(
                    label="Status",
                    interactive=False,
                    visible=True,
                )

    # Event handlers
    def save_config(kv, gpu, ttft, tp_drop, slack, pd_key) -> str:
        """Push slider/webhook values into the engine and dispatcher; return a status string."""
        # Update alert thresholds. Each rule is mutated in place and only
        # when present, so a missing rule never raises.
        if "kv_cache_high" in alert_engine.rules:
            alert_engine.rules["kv_cache_high"].threshold = kv
        if "gpu_memory_critical" in alert_engine.rules:
            alert_engine.rules["gpu_memory_critical"].threshold = gpu
        # NOTE(review): assumes these two rules expose `multiplier` / `percent`
        # attributes (unlike the `threshold` rules above) — confirm against the
        # rule definitions in services.alerting.
        if "ttft_spike" in alert_engine.rules:
            alert_engine.rules["ttft_spike"].multiplier = ttft
        if "throughput_drop" in alert_engine.rules:
            alert_engine.rules["throughput_drop"].percent = tp_drop

        # Update webhook config; a blank textbox disables the channel (None).
        alert_dispatcher.slack_webhook = slack if slack else None
        alert_dispatcher.pagerduty_key = pd_key if pd_key else None

        return "Configuration saved successfully"

    save_config_btn.click(
        fn=save_config,
        inputs=[
            kv_threshold,
            gpu_memory_threshold,
            ttft_multiplier,
            throughput_drop,
            slack_webhook,
            pagerduty_key,
        ],
        outputs=config_status,
    )

    async def send_test() -> str:
        """Fire a test alert through the dispatcher and report the outcome."""
        success = await alert_dispatcher.send_test_alert()
        if success:
            return "Test alert sent successfully"
        return "Failed to send test alert - check webhook configuration"

    # Gradio accepts async callables as event handlers directly.
    test_alert_btn.click(
        fn=send_test,
        outputs=config_status,
    )

    return {
        "active_alerts_table": active_alerts_table,
        "alert_history_table": alert_history_table,
        "kv_threshold": kv_threshold,
        "gpu_memory_threshold": gpu_memory_threshold,
        "ttft_multiplier": ttft_multiplier,
        "throughput_drop": throughput_drop,
        "slack_webhook": slack_webhook,
        "pagerduty_key": pagerduty_key,
        "config_status": config_status,
    }
def update_alerts_panel(
    alert_engine: AlertEngine,
    db=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Refresh the alerts panel tables.

    Args:
        alert_engine: Alert engine instance
        db: Optional database for history

    Returns:
        Tuple of (active_alerts_df, history_df)
    """
    active_columns = ["Time", "Severity", "Metric", "Value", "Threshold", "Message"]
    history_columns = ["Time", "Severity", "Message", "Resolved"]

    # One display row per currently-firing alert.
    active_rows = [
        {
            "Time": alert.timestamp.strftime("%H:%M:%S"),
            "Severity": _format_severity(alert.severity),
            "Metric": alert.metric,
            "Value": round(alert.value, 2),
            "Threshold": round(alert.threshold, 2),
            "Message": alert.message,
        }
        for alert in alert_engine.get_active_alerts()
    ]

    # History comes from persistent storage; skipped entirely without a db.
    history_rows = []
    if db:
        history_rows = [
            {
                "Time": record.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                "Severity": _format_severity_str(record.severity),
                "Message": record.message,
                "Resolved": "Yes" if record.resolved_at else "No",
            }
            for record in db.get_recent_alerts(limit=20)
        ]

    # Empty frames still carry headers so the Gradio tables render columns.
    active_df = (
        pd.DataFrame(active_rows) if active_rows else pd.DataFrame(columns=active_columns)
    )
    history_df = (
        pd.DataFrame(history_rows) if history_rows else pd.DataFrame(columns=history_columns)
    )
    return active_df, history_df
def _format_severity(severity: AlertSeverity) -> str:
    """Map an AlertSeverity member to its display label ("UNKNOWN" if unrecognized)."""
    if severity == AlertSeverity.INFO:
        return "INFO"
    if severity == AlertSeverity.WARNING:
        return "WARNING"
    if severity == AlertSeverity.CRITICAL:
        return "CRITICAL"
    return "UNKNOWN"
| def _format_severity_str(severity: str) -> str: | |
| """Format severity string for display.""" | |
| return severity.upper() | |
def get_alert_badge_html(alerts: List[Alert]) -> str:
    """
    Generate HTML badge for active alerts.

    Args:
        alerts: List of active alerts

    Returns:
        HTML string for badge
    """
    # No alerts: show a green all-clear label instead of count badges.
    if not alerts:
        return '<span style="color: #2e7d32;">No Active Alerts</span>'

    n_critical = 0
    n_warning = 0
    for alert in alerts:
        if alert.severity == AlertSeverity.CRITICAL:
            n_critical += 1
        elif alert.severity == AlertSeverity.WARNING:
            n_warning += 1

    # INFO-only alert lists produce an empty string (no badge rendered).
    parts = []
    if n_critical > 0:
        parts.append(
            f'<span style="background: #c62828; color: white; padding: 2px 8px; '
            f'border-radius: 12px; margin-right: 5px;">{n_critical} Critical</span>'
        )
    if n_warning > 0:
        parts.append(
            f'<span style="background: #ff9800; color: white; padding: 2px 8px; '
            f'border-radius: 12px;">{n_warning} Warning</span>'
        )
    return "".join(parts)