Spaces:

yoon-gu
/

drift-detection-ui

Sleeping

Yoon-gu Hwang

월별 데이터 드리프트 감지 및 분석 대시보드 추가

1ee2788 2 months ago

14.8 kB

	import sqlite3
	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from datetime import datetime, timedelta
	import os
	import subprocess
	import numpy as np
	from frouros.detectors.data_drift import KSTest
	from scipy.stats import wasserstein_distance

	# Initialize database if it doesn't exist
	if not os.path.exists('drift_detection.db'):
	print("Database not found. Creating new database...")
	subprocess.run(['python', 'create_database.py'], check=True)
	print("Database created successfully!")

	def get_korean_holidays_2025():
	"""Get Korean holidays for Jan-Aug 2025 (including substitute holidays)"""
	holidays = [
	(datetime(2025, 1, 1), datetime(2025, 1, 1), "신정"),
	(datetime(2025, 1, 28), datetime(2025, 1, 30), "설날 연휴"),
	(datetime(2025, 3, 1), datetime(2025, 3, 1), "삼일절"),
	(datetime(2025, 3, 3), datetime(2025, 3, 3), "삼일절 대체공휴일"),
	(datetime(2025, 5, 5), datetime(2025, 5, 5), "어린이날·부처님오신날"),
	(datetime(2025, 5, 6), datetime(2025, 5, 6), "대체공휴일"),
	(datetime(2025, 6, 6), datetime(2025, 6, 6), "현충일"),
	(datetime(2025, 8, 15), datetime(2025, 8, 15), "광복절"),
	]
	return holidays

	def get_weekends(start_date, end_date):
	"""Get all weekends (Saturday-Sunday) between start and end date"""
	weekends = []
	current = start_date
	while current <= end_date:
	if current.weekday() == 5: # Saturday
	weekend_start = current
	weekend_end = current + timedelta(days=1) # Sunday
	weekends.append((weekend_start, weekend_end))
	current += timedelta(days=1)
	return weekends

	def load_drift_data():
	"""Load all drift_record data"""
	conn = sqlite3.connect('drift_detection.db')

	query = """
	SELECT
	dr.model_id,
	mi.model_name,
	dr.precision,
	dr.recall,
	dr.sample_numbers,
	dr.js_value,
	dr.wd_value,
	dr.prediction_date
	FROM drift_record dr
	LEFT JOIN model_info mi ON dr.model_id = mi.model_id
	ORDER BY dr.prediction_date, dr.model_id
	"""

	df = pd.read_sql_query(query, conn)
	conn.close()

	# Round numeric columns for better display
	numeric_cols = ['precision', 'recall', 'js_value', 'wd_value']
	for col in numeric_cols:
	if col in df.columns:
	df[col] = df[col].round(4)

	return df

	def load_model_info():
	"""Load model_info data"""
	conn = sqlite3.connect('drift_detection.db')

	query = """
	SELECT
	model_id,
	model_name,
	release_date,
	prediction_period
	FROM model_info
	ORDER BY model_id
	"""

	df = pd.read_sql_query(query, conn)
	conn.close()

	return df

	def split_data_by_month(df):
	"""Split dataframe by month"""
	df = df.copy()
	df['prediction_date'] = pd.to_datetime(df['prediction_date'])
	df['month'] = df['prediction_date'].dt.to_period('M')
	return df

	def detect_drift_ks_test(reference_data, current_data):
	"""Detect drift using Kolmogorov-Smirnov test"""
	detector = KSTest()
	detector.fit(X=reference_data)
	result, _ = detector.compare(X=current_data)
	return {
	'p_value': result.p_value,
	'statistic': result.statistic,
	'drift_detected': result.p_value < 0.05
	}

	def calculate_monthly_drift(df, metric='precision'):
	"""Calculate drift for each month compared to January (baseline)"""
	df_with_month = split_data_by_month(df)

	months = sorted(df_with_month['month'].unique())
	if len(months) < 2:
	return pd.DataFrame()

	# Use January as baseline
	baseline_month = months[0]
	baseline_data = df_with_month[df_with_month['month'] == baseline_month][metric].values

	drift_results = []
	for month in months[1:]:
	current_data = df_with_month[df_with_month['month'] == month][metric].values

	if len(current_data) > 0 and len(baseline_data) > 0:
	# KS Test
	ks_result = detect_drift_ks_test(baseline_data, current_data)

	# Wasserstein Distance
	wd = wasserstein_distance(baseline_data, current_data)

	drift_results.append({
	'month': str(month),
	'month_name': month.strftime('%Y-%m'),
	'ks_statistic': ks_result['statistic'],
	'p_value': ks_result['p_value'],
	'drift_detected': ks_result['drift_detected'],
	'wasserstein_distance': wd,
	'sample_size': len(current_data)
	})

	return pd.DataFrame(drift_results)

	def create_metric_chart(df, metric='precision'):
	"""Create Plotly line chart for selected metric over time by model"""
	if df.empty:
	return px.line(title="No data available")

	# Convert prediction_date to datetime
	df = df.copy()
	df['prediction_date'] = pd.to_datetime(df['prediction_date'])

	# Metric display names
	metric_names = {
	'precision': 'Precision',
	'recall': 'Recall',
	'js_value': 'JS Divergence',
	'wd_value': 'Wasserstein Distance'
	}

	metric_display = metric_names.get(metric, metric.capitalize())

	# Create line chart
	fig = px.line(
	df,
	x='prediction_date',
	y=metric,
	color='model_name',
	labels={
	'prediction_date': 'Date',
	metric: metric_display,
	'model_name': 'Model'
	},
	markers=True
	)

	# Get date range from data
	start_date = df['prediction_date'].min()
	end_date = df['prediction_date'].max()

	# Add weekend shading (light gray)
	weekends = get_weekends(start_date, end_date)
	for weekend_start, weekend_end in weekends:
	fig.add_vrect(
	x0=weekend_start,
	x1=weekend_end,
	fillcolor="gray",
	opacity=0.2,
	layer="below",
	line_width=0,
	)

	# Add Korean holiday shading (light red)
	holidays = get_korean_holidays_2025()
	for holiday_start, holiday_end, holiday_name in holidays:
	if holiday_start <= end_date and holiday_end >= start_date:
	fig.add_vrect(
	x0=holiday_start,
	x1=holiday_end + timedelta(days=1), # Add 1 day to include the end date
	fillcolor="red",
	opacity=0.25,
	layer="below",
	line_width=0,
	annotation_text=holiday_name,
	annotation_position="top left",
	annotation=dict(font_size=10, font_color="darkred")
	)

	fig.update_layout(
	hovermode='x unified',
	xaxis_title='Date',
	yaxis_title=metric_display,
	legend_title='Model',
	height=450,
	margin=dict(t=20, b=50, l=50, r=20)
	)

	return fig

	def create_drift_markers_chart(df, metric='precision'):
	"""Create time series chart with drift markers"""
	df_with_month = split_data_by_month(df)
	drift_df = calculate_monthly_drift(df, metric)

	# Create base chart
	fig = create_metric_chart(df, metric)

	# Add drift markers for each month with drift
	if not drift_df.empty:
	for _, row in drift_df[drift_df['drift_detected']].iterrows():
	month_str = row['month']
	# Add vertical line at month boundary
	month_date = pd.Period(month_str).to_timestamp()
	fig.add_vline(
	x=month_date,
	line_dash="dash",
	line_color="red",
	line_width=2,
	annotation_text=f"Drift Detected<br>{row['month_name']}",
	annotation_position="top",
	annotation=dict(font_size=9, font_color="red")
	)

	return fig

	def create_monthly_drift_chart(df, metric='precision'):
	"""Create bar chart of monthly drift scores"""
	drift_df = calculate_monthly_drift(df, metric)

	if drift_df.empty:
	return go.Figure().add_annotation(
	text="Not enough data for drift detection",
	xref="paper", yref="paper",
	x=0.5, y=0.5, showarrow=False
	)

	fig = go.Figure()

	# KS Statistic bars
	fig.add_trace(go.Bar(
	x=drift_df['month_name'],
	y=drift_df['ks_statistic'],
	name='KS Statistic',
	marker_color=['red' if d else 'blue' for d in drift_df['drift_detected']],
	text=[f"p={p:.4f}" for p in drift_df['p_value']],
	textposition='outside'
	))

	# Wasserstein Distance (secondary y-axis)
	fig.add_trace(go.Scatter(
	x=drift_df['month_name'],
	y=drift_df['wasserstein_distance'],
	name='Wasserstein Distance',
	yaxis='y2',
	mode='lines+markers',
	line=dict(color='orange', width=2),
	marker=dict(size=8)
	))

	fig.update_layout(
	title=f'Monthly Drift Detection for {metric.capitalize()}',
	xaxis_title='Month',
	yaxis_title='KS Statistic',
	yaxis2=dict(
	title='Wasserstein Distance',
	overlaying='y',
	side='right'
	),
	height=500,
	hovermode='x unified',
	showlegend=True
	)

	return fig

	def create_drift_heatmap(df):
	"""Create heatmap showing drift across all metrics and months"""
	metrics = ['precision', 'recall', 'js_value', 'wd_value']
	metric_names = ['Precision', 'Recall', 'JS Divergence', 'WD Value']

	all_drift_data = {}
	all_months = set()

	for metric in metrics:
	drift_df = calculate_monthly_drift(df, metric)
	if not drift_df.empty:
	all_drift_data[metric] = drift_df
	all_months.update(drift_df['month_name'].values)

	if not all_drift_data:
	return go.Figure().add_annotation(
	text="Not enough data for drift heatmap",
	xref="paper", yref="paper",
	x=0.5, y=0.5, showarrow=False
	)

	months = sorted(list(all_months))
	z_data = []
	hover_text = []

	for metric in metrics:
	if metric in all_drift_data:
	drift_df = all_drift_data[metric]
	row_z = []
	row_hover = []
	for month in months:
	month_data = drift_df[drift_df['month_name'] == month]
	if not month_data.empty:
	row = month_data.iloc[0]
	# Use p-value as color intensity (lower p-value = more drift = darker color)
	row_z.append(1 - row['p_value']) # Invert so drift shows as high value
	row_hover.append(
	f"KS: {row['ks_statistic']:.4f}<br>" +
	f"p-value: {row['p_value']:.4f}<br>" +
	f"WD: {row['wasserstein_distance']:.4f}<br>" +
	f"Drift: {'Yes' if row['drift_detected'] else 'No'}"
	)
	else:
	row_z.append(0)
	row_hover.append("No data")
	z_data.append(row_z)
	hover_text.append(row_hover)
	else:
	z_data.append([0] * len(months))
	hover_text.append(["No data"] * len(months))

	fig = go.Figure(data=go.Heatmap(
	z=z_data,
	x=months,
	y=metric_names,
	colorscale='RdYlGn_r', # Red for drift, Green for no drift
	text=hover_text,
	hovertemplate='%{y}<br>%{x}<br>%{text}<extra></extra>',
	colorbar=dict(title="Drift<br>Intensity")
	))

	fig.update_layout(
	title='Drift Detection Heatmap (All Metrics)',
	xaxis_title='Month',
	yaxis_title='Metric',
	height=400
	)

	return fig

	def update_chart(metric):
	"""Update chart based on selected metric"""
	df = load_drift_data()
	chart = create_metric_chart(df, metric)
	return chart

	def update_all_drift_visualizations(metric):
	"""Update all drift-related visualizations"""
	df = load_drift_data()
	drift_markers_chart = create_drift_markers_chart(df, metric)
	monthly_drift_chart = create_monthly_drift_chart(df, metric)
	drift_heatmap = create_drift_heatmap(df)
	return drift_markers_chart, monthly_drift_chart, drift_heatmap

	# Create Gradio interface
	with gr.Blocks(title="Drift Detection Dashboard", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# Drift Detection Dashboard")
	gr.Markdown("모델별 메트릭 시계열 및 월별 데이터 드리프트 분석")

	with gr.Row():
	metric_dropdown = gr.Dropdown(
	choices=[
	("Precision", "precision"),
	("Recall", "recall"),
	("JS Divergence", "js_value"),
	("Wasserstein Distance", "wd_value")
	],
	value="precision",
	label="Metric to Analyze",
	scale=1
	)

	with gr.Tabs():
	with gr.Tab("📈 Time Series + Drift Markers"):
	gr.Markdown("### 시계열 차트 (드리프트 발생 지점 표시)")
	drift_markers_plot = gr.Plot()

	with gr.Tab("📊 Monthly Drift Scores"):
	gr.Markdown("### 월별 드리프트 점수 (1월 대비)")
	monthly_drift_plot = gr.Plot()

	with gr.Tab("🔥 Drift Heatmap"):
	gr.Markdown("### 전체 메트릭 드리프트 히트맵")
	heatmap_plot = gr.Plot()

	with gr.Tab("📋 Data Tables"):
	gr.Markdown("### 원본 데이터")
	with gr.Row():
	with gr.Column(scale=2):
	dataframe_output = gr.Dataframe(
	value=load_drift_data(),
	interactive=False,
	wrap=True,
	label="Drift Records"
	)
	with gr.Column(scale=1):
	model_info_output = gr.Dataframe(
	value=load_model_info(),
	interactive=False,
	wrap=True,
	label="Model Info"
	)

	# Event handlers
	metric_dropdown.change(
	fn=update_all_drift_visualizations,
	inputs=[metric_dropdown],
	outputs=[drift_markers_plot, monthly_drift_plot, heatmap_plot]
	)

	# Load initial data
	def load_initial_data():
	df = load_drift_data()
	drift_markers = create_drift_markers_chart(df, 'precision')
	monthly_drift = create_monthly_drift_chart(df, 'precision')
	heatmap = create_drift_heatmap(df)
	return drift_markers, monthly_drift, heatmap

	demo.load(
	fn=load_initial_data,
	outputs=[drift_markers_plot, monthly_drift_plot, heatmap_plot]
	)

	if __name__ == "__main__":
	demo.launch()