# Initial commit: LLM Inference Dashboard (author: jkottu, commit aefabf0)
"""Alerts configuration and history panel component."""
import gradio as gr
import pandas as pd
from datetime import datetime
from typing import Dict, Any, Tuple, List
from services.alerting import AlertEngine, AlertDispatcher, Alert, AlertSeverity
def create_alerts_panel(
    alert_engine: AlertEngine,
    alert_dispatcher: AlertDispatcher,
) -> Dict[str, Any]:
    """
    Create the alerts panel.

    Layout: a wide left column holding the active-alerts and alert-history
    tables, and a narrower right column with threshold sliders, webhook
    fields, and save/test buttons. Saving mutates ``alert_engine.rules`` and
    the dispatcher's webhook attributes in place; nothing is returned to the
    caller except the component handles.

    Args:
        alert_engine: Alert engine instance
        alert_dispatcher: Alert dispatcher instance

    Returns:
        Dictionary of Gradio components
    """
    with gr.Column():
        with gr.Row():
            # Active alerts column
            with gr.Column(scale=2):
                gr.Markdown("### Active Alerts")
                # Column order/datatypes must match the DataFrames produced
                # by update_alerts_panel() below.
                active_alerts_table = gr.Dataframe(
                    headers=["Time", "Severity", "Metric", "Value", "Threshold", "Message"],
                    datatype=["str", "str", "str", "number", "number", "str"],
                    label="Active Alerts",
                    interactive=False,
                )
                gr.Markdown("### Alert History")
                alert_history_table = gr.Dataframe(
                    headers=["Time", "Severity", "Message", "Resolved"],
                    datatype=["str", "str", "str", "str"],
                    label="Recent Alerts",
                    interactive=False,
                )
            # Configuration column
            with gr.Column(scale=1):
                gr.Markdown("### Alert Configuration")
                kv_threshold = gr.Slider(
                    label="KV Cache Alert Threshold (%)",
                    minimum=50,
                    maximum=100,
                    value=90,
                    step=5,
                )
                gpu_memory_threshold = gr.Slider(
                    label="GPU Memory Alert Threshold (%)",
                    minimum=70,
                    maximum=100,
                    value=95,
                    step=5,
                )
                ttft_multiplier = gr.Slider(
                    label="TTFT Spike Multiplier",
                    minimum=1.5,
                    maximum=5,
                    value=2,
                    step=0.5,
                )
                throughput_drop = gr.Slider(
                    label="Throughput Drop Alert (%)",
                    minimum=20,
                    maximum=80,
                    value=50,
                    step=10,
                )
                gr.Markdown("### Webhook Configuration")
                # type="password" keeps secrets masked in the UI.
                slack_webhook = gr.Textbox(
                    label="Slack Webhook URL",
                    placeholder="https://hooks.slack.com/services/...",
                    type="password",
                )
                pagerduty_key = gr.Textbox(
                    label="PagerDuty Routing Key",
                    placeholder="Enter routing key...",
                    type="password",
                )
                with gr.Row():
                    save_config_btn = gr.Button("Save Configuration")
                    test_alert_btn = gr.Button("Send Test Alert", variant="secondary")
                config_status = gr.Textbox(
                    label="Status",
                    interactive=False,
                    visible=True,
                )

    # Event handlers
    def save_config(kv: float, gpu: float, ttft: float, tp_drop: float,
                    slack: str, pd_key: str) -> str:
        """Write slider/webhook values into the engine and dispatcher."""
        # Update alert thresholds. Each rule name is guarded so a missing or
        # renamed rule is silently skipped instead of raising KeyError.
        if "kv_cache_high" in alert_engine.rules:
            alert_engine.rules["kv_cache_high"].threshold = kv
        if "gpu_memory_critical" in alert_engine.rules:
            alert_engine.rules["gpu_memory_critical"].threshold = gpu
        if "ttft_spike" in alert_engine.rules:
            alert_engine.rules["ttft_spike"].multiplier = ttft
        if "throughput_drop" in alert_engine.rules:
            alert_engine.rules["throughput_drop"].percent = tp_drop
        # Update webhook config -- empty textbox values are normalized to
        # None so the dispatcher can treat "unset" uniformly.
        alert_dispatcher.slack_webhook = slack if slack else None
        alert_dispatcher.pagerduty_key = pd_key if pd_key else None
        return "Configuration saved successfully"

    save_config_btn.click(
        fn=save_config,
        inputs=[
            kv_threshold,
            gpu_memory_threshold,
            ttft_multiplier,
            throughput_drop,
            slack_webhook,
            pagerduty_key,
        ],
        outputs=config_status,
    )

    async def send_test() -> str:
        """Fire a test alert through the dispatcher (Gradio awaits async handlers)."""
        success = await alert_dispatcher.send_test_alert()
        if success:
            return "Test alert sent successfully"
        return "Failed to send test alert - check webhook configuration"

    test_alert_btn.click(
        fn=send_test,
        outputs=config_status,
    )

    # Handles returned so the app can wire refresh callbacks elsewhere.
    return {
        "active_alerts_table": active_alerts_table,
        "alert_history_table": alert_history_table,
        "kv_threshold": kv_threshold,
        "gpu_memory_threshold": gpu_memory_threshold,
        "ttft_multiplier": ttft_multiplier,
        "throughput_drop": throughput_drop,
        "slack_webhook": slack_webhook,
        "pagerduty_key": pagerduty_key,
        "config_status": config_status,
    }
def update_alerts_panel(
    alert_engine: AlertEngine,
    db=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Refresh the alerts panel tables from current engine/database state.

    Args:
        alert_engine: Alert engine instance
        db: Optional database for history

    Returns:
        Tuple of (active_alerts_df, history_df)
    """
    active_columns = ["Time", "Severity", "Metric", "Value", "Threshold", "Message"]
    history_columns = ["Time", "Severity", "Message", "Resolved"]

    # One display row per currently-firing alert.
    active_rows = [
        {
            "Time": alert.timestamp.strftime("%H:%M:%S"),
            "Severity": _format_severity(alert.severity),
            "Metric": alert.metric,
            "Value": round(alert.value, 2),
            "Threshold": round(alert.threshold, 2),
            "Message": alert.message,
        }
        for alert in alert_engine.get_active_alerts()
    ]

    # Recent history rows come from the database when one is available.
    history_rows = []
    if db:
        history_rows = [
            {
                "Time": record.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                "Severity": _format_severity_str(record.severity),
                "Message": record.message,
                "Resolved": "Yes" if record.resolved_at else "No",
            }
            for record in db.get_recent_alerts(limit=20)
        ]

    # Empty frames still carry the expected headers for the Gradio tables.
    if active_rows:
        active_df = pd.DataFrame(active_rows)
    else:
        active_df = pd.DataFrame(columns=active_columns)
    if history_rows:
        history_df = pd.DataFrame(history_rows)
    else:
        history_df = pd.DataFrame(columns=history_columns)

    return active_df, history_df
def _format_severity(severity: AlertSeverity) -> str:
    """Return the display label for an AlertSeverity enum member."""
    if severity == AlertSeverity.INFO:
        return "INFO"
    if severity == AlertSeverity.WARNING:
        return "WARNING"
    if severity == AlertSeverity.CRITICAL:
        return "CRITICAL"
    # Anything this panel does not recognize is rendered generically.
    return "UNKNOWN"
def _format_severity_str(severity: str) -> str:
"""Format severity string for display."""
return severity.upper()
def get_alert_badge_html(alerts: List[Alert]) -> str:
    """
    Generate HTML badge for active alerts.

    Args:
        alerts: List of active alerts

    Returns:
        HTML string for badge
    """
    if not alerts:
        return '<span style="color: #2e7d32;">No Active Alerts</span>'

    # Count only the two severities that get their own badge; INFO alerts
    # are intentionally not badged.
    n_critical = sum(a.severity == AlertSeverity.CRITICAL for a in alerts)
    n_warning = sum(a.severity == AlertSeverity.WARNING for a in alerts)

    parts = []
    if n_critical:
        parts.append(
            f'<span style="background: #c62828; color: white; padding: 2px 8px; '
            f'border-radius: 12px; margin-right: 5px;">{n_critical} Critical</span>'
        )
    if n_warning:
        parts.append(
            f'<span style="background: #ff9800; color: white; padding: 2px 8px; '
            f'border-radius: 12px;">{n_warning} Warning</span>'
        )
    return "".join(parts)