# Initial commit: LLM Inference Dashboard (author: jkottu, commit aefabf0)
"""Alerts configuration and history panel component."""
import gradio as gr
import pandas as pd
from datetime import datetime
from typing import Dict, Any, Tuple, List
from services.alerting import AlertEngine, AlertDispatcher, Alert, AlertSeverity
def create_alerts_panel(
    alert_engine: AlertEngine,
    alert_dispatcher: AlertDispatcher,
) -> Dict[str, Any]:
    """
    Create the alerts panel.

    Layout: a wide left column holding the active-alerts and alert-history
    tables, and a narrower right column with threshold sliders, webhook
    fields, and save/test buttons. Saving mutates ``alert_engine.rules`` and
    the dispatcher's webhook attributes in place; nothing is returned to the
    caller except the component handles.

    Args:
        alert_engine: Alert engine instance
        alert_dispatcher: Alert dispatcher instance

    Returns:
        Dictionary of Gradio components
    """
    with gr.Column():
        with gr.Row():
            # Active alerts column
            with gr.Column(scale=2):
                gr.Markdown("### Active Alerts")
                # Column order/datatypes must match the DataFrames produced
                # by update_alerts_panel() below.
                active_alerts_table = gr.Dataframe(
                    headers=["Time", "Severity", "Metric", "Value", "Threshold", "Message"],
                    datatype=["str", "str", "str", "number", "number", "str"],
                    label="Active Alerts",
                    interactive=False,
                )
                gr.Markdown("### Alert History")
                alert_history_table = gr.Dataframe(
                    headers=["Time", "Severity", "Message", "Resolved"],
                    datatype=["str", "str", "str", "str"],
                    label="Recent Alerts",
                    interactive=False,
                )
            # Configuration column
            with gr.Column(scale=1):
                gr.Markdown("### Alert Configuration")
                kv_threshold = gr.Slider(
                    label="KV Cache Alert Threshold (%)",
                    minimum=50,
                    maximum=100,
                    value=90,
                    step=5,
                )
                gpu_memory_threshold = gr.Slider(
                    label="GPU Memory Alert Threshold (%)",
                    minimum=70,
                    maximum=100,
                    value=95,
                    step=5,
                )
                ttft_multiplier = gr.Slider(
                    label="TTFT Spike Multiplier",
                    minimum=1.5,
                    maximum=5,
                    value=2,
                    step=0.5,
                )
                throughput_drop = gr.Slider(
                    label="Throughput Drop Alert (%)",
                    minimum=20,
                    maximum=80,
                    value=50,
                    step=10,
                )
                gr.Markdown("### Webhook Configuration")
                # type="password" keeps secrets masked in the UI.
                slack_webhook = gr.Textbox(
                    label="Slack Webhook URL",
                    placeholder="https://hooks.slack.com/services/...",
                    type="password",
                )
                pagerduty_key = gr.Textbox(
                    label="PagerDuty Routing Key",
                    placeholder="Enter routing key...",
                    type="password",
                )
                with gr.Row():
                    save_config_btn = gr.Button("Save Configuration")
                    test_alert_btn = gr.Button("Send Test Alert", variant="secondary")
                config_status = gr.Textbox(
                    label="Status",
                    interactive=False,
                    visible=True,
                )

    # Event handlers
    def save_config(kv: float, gpu: float, ttft: float, tp_drop: float,
                    slack: str, pd_key: str) -> str:
        """Write slider/webhook values into the engine and dispatcher."""
        # Update alert thresholds. Each rule name is guarded so a missing or
        # renamed rule is silently skipped instead of raising KeyError.
        if "kv_cache_high" in alert_engine.rules:
            alert_engine.rules["kv_cache_high"].threshold = kv
        if "gpu_memory_critical" in alert_engine.rules:
            alert_engine.rules["gpu_memory_critical"].threshold = gpu
        if "ttft_spike" in alert_engine.rules:
            alert_engine.rules["ttft_spike"].multiplier = ttft
        if "throughput_drop" in alert_engine.rules:
            alert_engine.rules["throughput_drop"].percent = tp_drop
        # Update webhook config -- empty textbox values are normalized to
        # None so the dispatcher can treat "unset" uniformly.
        alert_dispatcher.slack_webhook = slack if slack else None
        alert_dispatcher.pagerduty_key = pd_key if pd_key else None
        return "Configuration saved successfully"

    save_config_btn.click(
        fn=save_config,
        inputs=[
            kv_threshold,
            gpu_memory_threshold,
            ttft_multiplier,
            throughput_drop,
            slack_webhook,
            pagerduty_key,
        ],
        outputs=config_status,
    )

    async def send_test() -> str:
        """Fire a test alert through the dispatcher (Gradio awaits async handlers)."""
        success = await alert_dispatcher.send_test_alert()
        if success:
            return "Test alert sent successfully"
        return "Failed to send test alert - check webhook configuration"

    test_alert_btn.click(
        fn=send_test,
        outputs=config_status,
    )

    # Handles returned so the app can wire refresh callbacks elsewhere.
    return {
        "active_alerts_table": active_alerts_table,
        "alert_history_table": alert_history_table,
        "kv_threshold": kv_threshold,
        "gpu_memory_threshold": gpu_memory_threshold,
        "ttft_multiplier": ttft_multiplier,
        "throughput_drop": throughput_drop,
        "slack_webhook": slack_webhook,
        "pagerduty_key": pagerduty_key,
        "config_status": config_status,
    }
def update_alerts_panel(
    alert_engine: AlertEngine,
    db=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Refresh the alerts panel tables from current engine/database state.

    Args:
        alert_engine: Alert engine instance
        db: Optional database for history

    Returns:
        Tuple of (active_alerts_df, history_df)
    """
    active_columns = ["Time", "Severity", "Metric", "Value", "Threshold", "Message"]
    history_columns = ["Time", "Severity", "Message", "Resolved"]

    # One display row per currently-firing alert.
    active_rows = [
        {
            "Time": alert.timestamp.strftime("%H:%M:%S"),
            "Severity": _format_severity(alert.severity),
            "Metric": alert.metric,
            "Value": round(alert.value, 2),
            "Threshold": round(alert.threshold, 2),
            "Message": alert.message,
        }
        for alert in alert_engine.get_active_alerts()
    ]

    # Recent history rows come from the database when one is available.
    history_rows = []
    if db:
        history_rows = [
            {
                "Time": record.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                "Severity": _format_severity_str(record.severity),
                "Message": record.message,
                "Resolved": "Yes" if record.resolved_at else "No",
            }
            for record in db.get_recent_alerts(limit=20)
        ]

    # Empty frames still carry the expected headers for the Gradio tables.
    if active_rows:
        active_df = pd.DataFrame(active_rows)
    else:
        active_df = pd.DataFrame(columns=active_columns)
    if history_rows:
        history_df = pd.DataFrame(history_rows)
    else:
        history_df = pd.DataFrame(columns=history_columns)

    return active_df, history_df
def _format_severity(severity: AlertSeverity) -> str:
    """Return the display label for an AlertSeverity enum member."""
    if severity == AlertSeverity.INFO:
        return "INFO"
    if severity == AlertSeverity.WARNING:
        return "WARNING"
    if severity == AlertSeverity.CRITICAL:
        return "CRITICAL"
    # Anything this panel does not recognize is rendered generically.
    return "UNKNOWN"
def _format_severity_str(severity: str) -> str:
"""Format severity string for display."""
return severity.upper()
def get_alert_badge_html(alerts: List[Alert]) -> str:
    """
    Generate HTML badge for active alerts.

    Args:
        alerts: List of active alerts

    Returns:
        HTML string for badge
    """
    if not alerts:
        return '<span style="color: #2e7d32;">No Active Alerts</span>'

    # Count only the two severities that get their own badge; INFO alerts
    # are intentionally not badged.
    n_critical = sum(a.severity == AlertSeverity.CRITICAL for a in alerts)
    n_warning = sum(a.severity == AlertSeverity.WARNING for a in alerts)

    parts = []
    if n_critical:
        parts.append(
            f'<span style="background: #c62828; color: white; padding: 2px 8px; '
            f'border-radius: 12px; margin-right: 5px;">{n_critical} Critical</span>'
        )
    if n_warning:
        parts.append(
            f'<span style="background: #ff9800; color: white; padding: 2px 8px; '
            f'border-radius: 12px;">{n_warning} Warning</span>'
        )
    return "".join(parts)