LLM-PII-Detection-Leaderboard

Sleeping

LLM-PII-Detection-Leaderboard / pii_leaderboard.py

Luis Kalckstein

Improving layout

eeb055e unverified 9 months ago

25.7 kB

	import gradio as gr
	import pandas as pd
	import tempfile
	import os
	from data_loader import (
	load_data,
	PII_CATEGORIES,
	HEADER_CONTENT,
	METHODOLOGY,
	COLORS,
	MODEL_TYPES
	)

	def get_rank_badge(rank):
	    """Build the HTML badge shown in the leaderboard's rank column.

	    Ranks 1, 2 and 3 are rendered as a coloured "1st"/"2nd"/"3rd" pill whose
	    gradient and text colour are taken from ``COLORS``. Any other rank is
	    rendered as the bare value in a muted, undecorated box.

	    Args:
	        rank: Position to display. It is used as a lookup key for the podium
	            styles and, for every other rank, interpolated into the markup.

	    Returns:
	        str: An HTML ``<div>`` snippet.
	    """
	    # (label, background gradient, text colour) per podium position.
	    podium_styles = {
	        1: ("1st", f"linear-gradient(145deg, {COLORS['digital_pollen']}, {COLORS['digital_pollen']})", COLORS['warm_black']),
	        2: ("2nd", f"linear-gradient(145deg, {COLORS['soft_grey']}, {COLORS['warm_grey']})", COLORS['white']),
	        3: ("3rd", f"linear-gradient(145deg, {COLORS['code_coral']}, {COLORS['code_coral_dm']})", COLORS['white']),
	    }

	    podium = podium_styles.get(rank)
	    if podium is None:
	        # Outside the podium: plain number, no pill.
	        return f"""
	<div style="
	display: inline-flex;
	align-items: center;
	justify-content: center;
	min-width: 28px;
	color: var(--text-secondary);
	font-weight: 500;
	">
	{rank}
	</div>
	"""

	    label, gradient, text_color = podium
	    return f"""
	<div style="
	display: inline-flex;
	align-items: center;
	justify-content: center;
	min-width: 48px;
	padding: 4px 12px;
	background: {gradient};
	color: {text_color};
	border-radius: 6px;
	font-weight: 600;
	font-size: 0.9em;
	box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
	">
	{label}
	</div>
	"""

	def get_type_badge(model_type):
	    """Build the HTML pill that labels a model's type.

	    Args:
	        model_type: Text shown inside the badge. The exact value
	            ``'Proprietary'`` selects ``COLORS['disc_pink']`` as background;
	            every other value gets ``COLORS['data_green']``.

	    Returns:
	        str: An HTML ``<div>`` snippet containing ``model_type``.
	    """
	    if model_type == 'Proprietary':
	        badge_color = COLORS['disc_pink']
	    else:
	        badge_color = COLORS['data_green']

	    badge_html = f"""
	<div style="
	display: inline-flex;
	align-items: center;
	padding: 4px 8px;
	background: {badge_color};
	color: white;
	border-radius: 4px;
	font-size: 0.85em;
	font-weight: 500;
	">
	{model_type}
	</div>
	"""
	    return badge_html

	def get_score_bar(score, is_inverse=False):
	    """Generate HTML for a horizontal score bar followed by the numeric value.

	    Args:
	        score: Metric value. NaN/None (per ``pd.isna``) and the empty string
	            are rendered as 0; anything else is converted with ``float()``.
	            The bar is drawn as ``score * 100`` percent wide.
	        is_inverse: When True the gradient runs green -> coral instead of
	            coral -> green. Used for the over-detection rate, where lower
	            is better.

	    Returns:
	        str: An HTML snippet with the bar and the score formatted to three
	        decimals.

	    Raises:
	        ValueError, TypeError: If ``score`` is neither missing nor
	            convertible by ``float()``.
	    """
	    if pd.isna(score) or score == '':
	        score = 0
	    else:
	        score = float(score)

	    # Clamp only the drawn width: a value outside [0, 1] would otherwise
	    # produce a negative or >100% CSS width. The printed number below is
	    # left untouched so unexpected data stays visible.
	    width = min(max(score * 100, 0), 100)

	    # For over-detection rate, use inverse coloring (lower is better)
	    if is_inverse:
	        gradient = f"linear-gradient(90deg, {COLORS['data_green']}, {COLORS['code_coral']})"
	    else:
	        gradient = f"linear-gradient(90deg, {COLORS['code_coral']}, {COLORS['data_green']})"

	    return f"""
	<div style="display: flex; align-items: center; gap: 12px; width: 100%;">
	<div style="
	flex-grow: 1;
	height: 8px;
	background: rgba(239, 235, 231, 0.1);
	border-radius: 4px;
	overflow: hidden;
	max-width: 200px;
	">
	<div style="
	width: {width}%;
	height: 100%;
	background: {gradient};
	border-radius: 4px;
	transition: width 0.3s ease;
	"></div>
	</div>
	<span style="
	font-family: 'SF Mono', monospace;
	font-weight: 600;
	color: var(--text-primary);
	min-width: 60px;
	">{score:.3f}</span>
	</div>
	"""

	def create_pii_leaderboard():
	    """Create the main PII detection leaderboard interface.

	    Instantiates the Gradio components for the page (header, filter
	    controls, leaderboard table, methodology text, per-model performance
	    card and the card CSS) and wires their ``change`` events. Nothing is
	    returned; ``create_app`` calls this inside a ``gr.Blocks`` context.

	    Missing metric values are expected to arrive from ``load_data()`` as
	    either the empty string or NaN; both are treated as "no data".
	    """

	    def load_leaderboard_data():
	        """Load and prepare the leaderboard data (thin wrapper over ``load_data``)."""
	        return load_data()

	    def is_missing(value):
	        """Return True for the placeholders used for absent metrics (NaN/None or '')."""
	        return pd.isna(value) or value == ''

	    def generate_html_table(filtered_df, document_type, sort_by):
	        """Render the leaderboard rows as a styled HTML table.

	        Args:
	            filtered_df: Rows to show, already filtered and sorted; the rank
	                column is simply the 1-based row position.
	            document_type: ``"All"`` or a document type name. For a specific
	                type the accuracy column shows ``"<type> Accuracy"``, falling
	                back to ``"Overall Accuracy"`` when that column is absent.
	            sort_by: Accepted for interface compatibility; not used here.

	        Returns:
	            str: The complete ``<div><table>...</table></div>`` markup.
	        """
	        parts = ["""
	<div class="v2-table-container">
	<table class="v2-styled-table">
	<thead>
	<tr>
	<th style="width: 80px;">Rank</th>
	<th>Model</th>
	<th style="width: 120px;">Type</th>
	<th>Vendor</th>
	<th style="width: 200px;">Overall Accuracy</th>
	<th style="width: 150px;">Precision</th>
	<th style="width: 150px;">Recall</th>
	<th style="width: 150px;">F1 Score</th>
	<th style="width: 160px;">Over-detection Rate</th>
	<th>Cost/Doc ($)</th>
	<th>Time (s)</th>
	</tr>
	</thead>
	<tbody>
	"""]

	        # Generate table rows
	        for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
	            parts.append(f"""
	<tr>
	<td>{get_rank_badge(rank)}</td>
	<td class="model-name">{row['Model']}</td>
	<td>{get_type_badge(row['Model Type'])}</td>
	<td>{row['Vendor']}</td>
	""")

	            # "All" shows the overall accuracy; a specific document type
	            # shows its own accuracy column when the row has one.
	            overall_accuracy = row.get('Overall Accuracy', '')
	            if document_type != "All":
	                accuracy = row.get(f'{document_type} Accuracy', overall_accuracy)
	            else:
	                accuracy = overall_accuracy

	            # (value, lower_is_better) in column order.
	            score_cells = (
	                (accuracy, False),
	                (row.get('Precision', ''), False),
	                (row.get('Recall', ''), False),
	                (row.get('F1 Score', ''), False),
	                (row.get('Over-redaction Rate', ''), True),
	            )
	            for value, lower_is_better in score_cells:
	                # NaN is treated like '' so an absent metric shows "-"
	                # instead of a misleading 0.000 bar.
	                if is_missing(value):
	                    parts.append('<td class="numeric-cell">-</td>')
	                else:
	                    parts.append(f'<td class="score-cell">{get_score_bar(value, is_inverse=lower_is_better)}</td>')

	            # Format cost and time
	            cost = row.get('Cost per Document ($)', '')
	            seconds = row.get('Processing Time (s)', '')
	            cost_display = '-' if is_missing(cost) else f'${float(cost):.3f}'
	            time_display = '-' if is_missing(seconds) else f'{float(seconds):.1f}'

	            parts.append(f"""
	<td class="numeric-cell">{cost_display}</td>
	<td class="numeric-cell">{time_display}</td>
	</tr>
	""")

	        parts.append("""
	</tbody>
	</table>
	</div>
	""")

	        return "".join(parts)

	    def filter_and_sort_data(document_type, model_type_filter, sort_by, sort_order):
	        """Filter and sort the leaderboard data, then render it as HTML.

	        Args:
	            document_type: ``"All"`` or a document type; a specific type keeps
	                only rows that have a value in ``"<type> Accuracy"``.
	            model_type_filter: ``"All"``, ``"Open Source"`` or ``"Proprietary"``.
	            sort_by: Column to sort on. ``"Overall Accuracy"`` is redirected to
	                the document-type accuracy column when a type is selected.
	            sort_order: ``"Ascending"`` or ``"Descending"``. The direction is
	                flipped for ``"Over-redaction Rate"`` (lower is better).

	        Returns:
	            str: HTML produced by ``generate_html_table``.
	        """
	        filtered_df = load_leaderboard_data().copy()

	        # Document type filtering
	        if document_type != "All":
	            # Only show models that have data for this document type
	            doc_col = f'{document_type} Accuracy'
	            if doc_col in filtered_df.columns:
	                has_score = filtered_df[doc_col].notna() & (filtered_df[doc_col] != '')
	                filtered_df = filtered_df[has_score]

	        # Model type filtering ("All" or an unknown value keeps every row)
	        if model_type_filter in ("Open Source", "Proprietary"):
	            filtered_df = filtered_df[filtered_df['Model Type'] == model_type_filter]

	        # Sorting
	        sort_column = sort_by
	        if document_type != "All" and sort_by == 'Overall Accuracy':
	            sort_column = f'{document_type} Accuracy'

	        if sort_column in filtered_df.columns:
	            ascending = (sort_order == "Ascending")
	            # For over-detection rate, flip the logic (lower is better)
	            if sort_by == "Over-redaction Rate":
	                ascending = not ascending
	            # Sort on a numeric view of the column: '' placeholders (and any
	            # numeric strings) would otherwise raise TypeError or sort
	            # lexically. The displayed values themselves are left unchanged.
	            filtered_df = filtered_df.sort_values(
	                by=sort_column,
	                ascending=ascending,
	                na_position='last',
	                key=lambda column: pd.to_numeric(column, errors='coerce'),
	            )

	        return generate_html_table(filtered_df, document_type, sort_by)

	    def generate_performance_card(model_name):
	        """Generate HTML for the model performance card.

	        Args:
	            model_name: Value of the ``Model`` column to look up. A falsy
	                value yields a "please select" placeholder.

	        Returns:
	            str: Card markup, or a short placeholder ``<div>`` when no model
	            is selected or the model is not in the data.
	        """
	        if not model_name:
	            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
	Please select a model to generate its performance card
	</div>"""

	        df = load_leaderboard_data()
	        model_data = df[df['Model'] == model_name]

	        if model_data.empty:
	            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
	Model not found in the database
	</div>"""

	        row = model_data.iloc[0]

	        # Overall rank: position among models that have a numeric overall
	        # accuracy. '' and other non-numeric entries coerce to NaN and are
	        # dropped, so they never receive a rank.
	        ranking = df.copy()
	        ranking['Overall Accuracy'] = pd.to_numeric(ranking['Overall Accuracy'], errors='coerce')
	        ranking = (
	            ranking.dropna(subset=['Overall Accuracy'])
	            .sort_values('Overall Accuracy', ascending=False)
	            .reset_index(drop=True)
	        )
	        ranked_rows = ranking[ranking['Model'] == model_name]
	        rank_display = 'N/A' if ranked_rows.empty else f"#{ranked_rows.index[0] + 1}"

	        # Format values
	        def format_value(val, decimals=3, prefix='', suffix=''):
	            """Format a metric with fixed decimals, or 'N/A' when missing."""
	            if is_missing(val):
	                return 'N/A'
	            return f"{prefix}{float(val):.{decimals}f}{suffix}"

	        # Calculate performance stars
	        def get_performance_stars(value, max_val=1.0):
	            """Map value/max_val to 1-5 stars; missing values give no stars."""
	            if is_missing(value):
	                return ''
	            score = float(value) / max_val
	            for threshold, star_count in ((0.9, 5), (0.8, 4), (0.7, 3), (0.6, 2)):
	                if score >= threshold:
	                    return '⭐' * star_count
	            return '⭐'

	        # Create HTML
	        card_parts = [f"""
	<div class="performance-card">
	<div class="card-header">
	<h1 class="card-model-name">{model_name}</h1>
	<div class="card-stars">
	{get_performance_stars(row['Overall Accuracy'])}
	</div>
	</div>

	<div class="metrics-grid" style="margin-bottom: 24px;">
	<div class="metric-item">
	<div class="metric-icon" style="color: var(--accent-primary);">🏆</div>
	<div class="metric-label">Overall Rank</div>
	<div class="metric-value">{rank_display}</div>
	</div>

	<div class="metric-item">
	<div class="metric-icon" style="color: var(--accent-primary);">🎯</div>
	<div class="metric-label">Overall Accuracy</div>
	<div class="metric-value">{format_value(row['Overall Accuracy'])}</div>
	</div>

	<div class="metric-item">
	<div class="metric-icon" style="color: var(--accent-secondary);">📊</div>
	<div class="metric-label">Precision</div>
	<div class="metric-value">{format_value(row['Precision'])}</div>
	</div>

	<div class="metric-item">
	<div class="metric-icon" style="color: var(--accent-tertiary);">🔍</div>
	<div class="metric-label">Recall</div>
	<div class="metric-value">{format_value(row['Recall'])}</div>
	</div>

	<div class="metric-item">
	<div class="metric-icon" style="color: var(--accent-quaternary);">💰</div>
	<div class="metric-label">Cost/Doc</div>
	<div class="metric-value">{format_value(row['Cost per Document ($)'], 3, '$')}</div>
	</div>

	<div class="metric-item">
	<div class="metric-icon" style="color: var(--text-primary);">⚡</div>
	<div class="metric-label">Processing Time</div>
	<div class="metric-value">{format_value(row['Processing Time (s)'], 1, '', 's')}</div>
	</div>
	</div>

	<div class="domains-section" style="margin-top: 24px;">
	<h3 class="domains-title">📄 Document Type Performance</h3>
	<div class="domains-grid">
	"""]

	        # Add document type scores
	        doc_types = [
	            ('🏥', 'Healthcare'),
	            ('💰', 'Financial'),
	            ('🏛️', 'Government'),
	            ('⚖️', 'Legal'),
	            ('👤', 'Personal')
	        ]

	        for doc_icon, doc_type in doc_types:
	            accuracy_value = row.get(f'{doc_type} Accuracy', '')

	            if is_missing(accuracy_value):
	                score_display = "N/A"
	                score_color = "var(--text-muted)"
	            else:
	                score_display = f"{float(accuracy_value):.3f}"
	                score_color = "var(--accent-primary)"

	            card_parts.append(f"""
	<div class="domain-item">
	<div class="domain-name">{doc_icon}</div>
	<div style="font-size: 0.7rem; color: var(--text-secondary); margin-bottom: 2px;">{doc_type}</div>
	<div class="domain-score" style="color: {score_color};">{score_display}</div>
	</div>
	""")

	        card_parts.append("""
	</div>
	</div>

	<div class="card-footer">
	<div class="card-url">
	<strong>LLM PII Detection Leaderboard</strong>
	</div>
	</div>
	</div>
	""")

	        return "".join(card_parts)

	    # Load initial data
	    initial_df = load_leaderboard_data()
	    initial_table = filter_and_sort_data("All", "All", "Overall Accuracy", "Descending")

	    # Display header
	    gr.HTML(HEADER_CONTENT)

	    # Main leaderboard section with all filters
	    # NOTE(review): the wrapper <div>s below are opened in one gr.HTML call
	    # and closed in a later one; confirm Gradio renders them as a single
	    # container rather than as independent fragments.
	    gr.HTML("""
	<div class="dark-container" style="margin-bottom: 32px;">
	<div class="section-header">
	<span class="section-icon" style="color: var(--accent-primary);">📈</span>
	<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Inter', sans-serif; font-weight: 700;">
	PII Detection Performance Leaderboard
	</h3>
	</div>
	<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Inter', sans-serif;">
	Filter by document type, model access, and sort by any metric to explore performance
	</p>

	<!-- Document Type Filter -->
	<div style="margin-bottom: 24px;">
	<h4 style="color: var(--text-primary); margin-bottom: 12px; font-size: 1rem;">📄 Document Type</h4>
	""")

	    document_type_filter = gr.Radio(
	        choices=["All", "Healthcare", "Financial", "Government", "Legal", "Personal"],
	        value="All",
	        label="",
	        interactive=True,
	        elem_classes=["document-type-radio"]
	    )

	    gr.HTML("""
	</div>

	<!-- Other Filters -->
	<div style="margin-bottom: 24px;">
	<h4 style="color: var(--text-primary); margin-bottom: 12px; font-size: 1rem;">🔍 Filters & Sorting</h4>
	</div>
	""")

	    with gr.Row():
	        with gr.Column(scale=1):
	            model_type_filter = gr.Radio(
	                choices=["All", "Open Source", "Proprietary"],
	                value="All",
	                label="🔓 Model Access",
	                elem_classes=["compact-radio"]
	            )

	        with gr.Column(scale=1):
	            sort_by = gr.Dropdown(
	                choices=["Overall Accuracy", "Precision", "Recall", "F1 Score", "Over-redaction Rate", "Cost per Document ($)", "Processing Time (s)"],
	                value="Overall Accuracy",
	                label="📊 Sort By",
	                elem_classes=["dropdown"]
	            )

	        with gr.Column(scale=1):
	            sort_order = gr.Radio(
	                choices=["Descending", "Ascending"],
	                value="Descending",
	                label="🔄 Sort Order",
	                elem_classes=["compact-radio"]
	            )

	    gr.HTML("""
	<!-- Leaderboard Table -->
	<div style="margin-top: 24px;">
	<div class="dataframe-container">
	""")

	    leaderboard_table = gr.HTML(initial_table)

	    gr.HTML("""
	</div>
	</div>
	</div>""")

	    # Methodology section
	    gr.HTML(f"""
	<div class="dark-container" style="margin-top: 32px;">
	{METHODOLOGY}
	</div>
	""")

	    # Performance Card Section
	    gr.HTML("""
	<div class="dark-container" style="margin-top: 32px;">
	<div class="section-header">
	<span class="section-icon" style="color: var(--accent-primary);">🎯</span>
	<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Inter', sans-serif; font-weight: 700;">
	Model Performance Cards
	</h3>
	</div>
	<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Inter', sans-serif; text-align: center;">
	Dive deep into individual model performance across all metrics and document types
	</p>

	""")

	    # The first model in the data (if any) is preselected in the dropdown
	    # and used for the initially rendered card.
	    model_names = initial_df['Model'].tolist()
	    initial_model = model_names[0] if model_names else None

	    card_model_selector = gr.Dropdown(
	        choices=model_names,
	        value=initial_model,
	        label="🤖 Select Model",
	        info="Choose a model to view its performance card",
	        elem_classes=["dropdown"]
	    )

	    gr.HTML("""
	</div>
	</div>

	<div style="width: 100%;">
	""")

	    # Card display area
	    initial_card_html = generate_performance_card(initial_model) if initial_model else ""
	    card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")

	    gr.HTML("""
	</div>
	</div>
	</div>""")

	    # Add performance card CSS
	    # (doubled braces are f-string escapes for the literal CSS braces)
	    gr.HTML(f"""
	<style>
	.performance-card {{
	background: linear-gradient(145deg, rgba(26, 20, 20, 0.98) 0%, rgba(222, 157, 204, 0.05) 100%);
	border: 2px solid var(--accent-primary);
	border-radius: 24px;
	padding: 32px;
	max-width: 700px;
	margin: 0 auto;
	position: relative;
	overflow: hidden;
	box-shadow:
	0 20px 40px rgba(0, 0, 0, 0.5),
	0 0 80px rgba(222, 157, 204, 0.2),
	inset 0 0 120px rgba(222, 157, 204, 0.05);
	}}

	.card-header {{
	text-align: center;
	margin-bottom: 24px;
	position: relative;
	z-index: 1;
	}}

	.card-model-name {{
	font-size: 2rem;
	font-weight: 800;
	background: linear-gradient(135deg, var(--accent-primary) 0%, var(--accent-secondary) 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 8px;
	text-shadow: 0 0 40px var(--glow-primary);
	line-height: 1.2;
	}}

	.card-stars {{
	font-size: 1.2rem;
	margin: 8px 0;
	}}

	.metrics-grid {{
	display: grid;
	grid-template-columns: repeat(2, 1fr);
	gap: 16px;
	margin: 24px 0;
	}}

	.metric-item {{
	display: flex;
	flex-direction: column;
	align-items: center;
	padding: 16px;
	background: rgba(239, 235, 231, 0.05);
	border-radius: 12px;
	border: 1px solid var(--border-subtle);
	transition: all 0.3s ease;
	}}

	.metric-item:hover {{
	transform: translateY(-2px);
	border-color: var(--accent-primary);
	box-shadow: 0 8px 16px rgba(222, 157, 204, 0.3);
	}}

	.metric-icon {{
	font-size: 1.5rem;
	margin-bottom: 8px;
	}}

	.metric-label {{
	font-size: 0.85rem;
	color: var(--text-secondary);
	margin-bottom: 4px;
	text-align: center;
	}}

	.metric-value {{
	font-size: 1.1rem;
	font-weight: 700;
	color: var(--text-primary);
	text-align: center;
	}}

	.domains-section {{
	margin-top: 24px;
	}}

	.domains-title {{
	color: var(--text-primary);
	font-size: 1.2rem;
	margin-bottom: 16px;
	text-align: center;
	}}

	.domains-grid {{
	display: grid;
	grid-template-columns: repeat(5, 1fr);
	gap: 12px;
	}}

	.domain-item {{
	display: flex;
	flex-direction: column;
	align-items: center;
	padding: 12px;
	background: rgba(239, 235, 231, 0.03);
	border-radius: 8px;
	border: 1px solid var(--border-subtle);
	transition: all 0.3s ease;
	}}

	.domain-item:hover {{
	border-color: var(--accent-primary);
	transform: scale(1.02);
	}}

	.domain-name {{
	font-size: 1.2rem;
	margin-bottom: 4px;
	}}

	.domain-score {{
	font-size: 0.9rem;
	font-weight: 600;
	}}

	.card-footer {{
	text-align: center;
	margin-top: 24px;
	padding-top: 16px;
	border-top: 1px solid var(--border-subtle);
	}}

	.card-url {{
	color: var(--text-secondary);
	font-size: 0.9rem;
	}}

	/* Additional styling for radio buttons and specific components */
	.document-type-radio .wrap {{
	display: flex !important;
	gap: 12px !important;
	flex-wrap: wrap !important;
	justify-content: center !important;
	}}

	.document-type-radio .wrap > label {{
	flex: 1 !important;
	min-width: 140px !important;
	max-width: 180px !important;
	padding: 12px 16px !important;
	background: var(--bg-card) !important;
	border: 2px solid var(--border-default) !important;
	border-radius: 12px !important;
	cursor: pointer !important;
	transition: all 0.3s ease !important;
	text-align: center !important;
	font-weight: 500 !important;
	}}

	.document-type-radio .wrap > label:hover {{
	border-color: var(--accent-primary) !important;
	transform: translateY(-2px) !important;
	}}

	.document-type-radio .wrap > label:has(input[type="radio"]:checked) {{
	background: transparent !important;
	border-color: var(--accent-primary) !important;
	color: var(--text-primary) !important;
	font-weight: 600 !important;
	box-shadow: 0 8px 16px var(--glow-primary) !important;
	}}

	.document-type-radio input[type="radio"] {{
	display: none !important;
	}}

	.compact-radio .wrap > label {{
	padding: 8px 12px !important;
	font-size: 0.85rem !important;
	min-width: auto !important;
	max-width: 120px !important;
	}}
	</style>
	""")

	    # Update functions
	    def update_table(*args):
	        """Re-render the table from the current filter/sort control values."""
	        return filter_and_sort_data(*args)

	    def update_card(model_name):
	        """Re-render the performance card for the selected model."""
	        return generate_performance_card(model_name)

	    # Connect update functions to components: a change to any one control
	    # re-renders the table using the current values of all four.
	    filter_inputs = [document_type_filter, model_type_filter, sort_by, sort_order]

	    for input_component in filter_inputs:
	        input_component.change(
	            fn=update_table,
	            inputs=filter_inputs,
	            outputs=[leaderboard_table]
	        )

	    # Update card when model selection changes
	    card_model_selector.change(
	        fn=update_card,
	        inputs=[card_model_selector],
	        outputs=[card_display]
	    )

	def create_app():
	    """Build the Gradio application.

	    Opens a ``gr.Blocks`` container with the default theme and the
	    leaderboard page title, populates it via ``create_pii_leaderboard()``,
	    and returns the container.

	    Returns:
	        The ``gr.Blocks`` instance holding the leaderboard UI.
	    """
	    default_theme = gr.themes.Default()
	    with gr.Blocks(theme=default_theme, title="🔒 LLM PII Detection Leaderboard") as leaderboard_app:
	        # Components created in this call attach to the open Blocks context.
	        create_pii_leaderboard()

	    return leaderboard_app

	# Script entry point: build the app and launch it only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    demo = create_app()
	    demo.launch()