LLM-PII-Detection-Leaderboard

Running

LLM-PII-Detection-Leaderboard / data_loader.py

Luis Kalckstein

V1 including mock results

32e8dbc unverified 5 months ago

16.1 kB

	import pandas as pd
	import os

	# Metric groups for PII detection results: group title -> ordered metric labels.
	# NOTE(review): several labels here differ from the CSV column names rounded in
	# load_data() (e.g. "Processing Time" vs 'Processing Time (s)') -- confirm how
	# consumers map these labels onto DataFrame columns.
	PII_CATEGORIES = {
	    "Overall": [
	        "Overall Accuracy",
	    ],
	    "Entity Types": [
	        "Names",
	        "Addresses",
	        "Phone Numbers",
	        "SSN",
	        "Medical IDs",
	        "Financial Info",
	    ],
	    "Document Types": [
	        "Healthcare",
	        "Financial",
	        "Government",
	        "Legal",
	        "Personal",
	    ],
	    "Performance": [
	        "Precision",
	        "Recall",
	        "F1 Score",
	    ],
	    "Efficiency": [
	        "Processing Time",
	        "Cost per Document",
	    ],
	}

	# Model licence category -> emoji badge shown next to it.
	MODEL_TYPES = {
	    "Proprietary": "🔒",  # closed padlock
	    "Open Source": "🔓",  # open padlock
	}

	def load_data(csv_path: str = "results/pii_detection_results.csv") -> pd.DataFrame:
	    """Load the PII detection evaluation results and prepare them for display.

	    Args:
	        csv_path: Location of the results CSV. Defaults to
	            ``results/pii_detection_results.csv`` (resolved against the
	            current working directory), so existing ``load_data()`` calls
	            behave as before.

	    Returns:
	        A DataFrame in which missing cells are replaced by ``''`` and every
	        known metric column that is present is numeric and rounded to three
	        decimals. Metric cells that are empty or not parseable as numbers
	        end up as NaN.

	    Raises:
	        FileNotFoundError: If ``csv_path`` does not point to an existing file.
	    """
	    # isfile (rather than exists) so that a directory at this path is reported
	    # with the same clear message instead of failing inside read_csv.
	    if not os.path.isfile(csv_path):
	        raise FileNotFoundError(
	            f"Results file not found: {csv_path}. "
	            "Please ensure the CSV file exists in the results folder."
	        )

	    # Replace missing cells with '' so text columns display as blank.
	    df = pd.read_csv(csv_path).fillna('')

	    # Metric columns that should be numeric and rounded for display.
	    numeric_cols = (
	        'Overall Accuracy', 'Precision', 'Recall', 'F1 Score', 'Over-redaction Rate',
	        'Processing Time (s)', 'Cost per Document ($)',
	        'Healthcare Accuracy', 'Financial Accuracy', 'Government Accuracy',
	        'Legal Accuracy', 'Personal Accuracy',
	    )

	    for col in numeric_cols:
	        # Columns absent from the CSV are skipped silently.
	        if col in df.columns:
	            # errors='coerce' turns the '' placeholders inserted above (and any
	            # other non-numeric text) into NaN rather than raising.
	            df[col] = pd.to_numeric(df[col], errors='coerce').round(3)

	    return df

	# Brand palette (DocumentProcessing style): colour name -> hex code.
	# Keys ending in "_dm" are the dark-mode variants of the accent colours.
	COLORS = {
	    # --- light mode ---
	    "white": "#FFFFFF",
	    "disc_pink": "#DE9DCC",
	    "code_coral": "#F25E45",
	    "data_green": "#6EB579",
	    "digital_pollen": "#F0C968",
	    "warm_black": "#1A1414",
	    "off_white": "#EFEBE7",
	    "pixel_mist": "#E2DBD9",
	    "soft_grey": "#C2B8AE",
	    "warm_grey": "#67594B",
	    # --- dark mode ---
	    "disc_pink_dm": "#4F2B45",
	    "code_coral_dm": "#672D23",
	    "data_green_dm": "#2B412F",
	    "digital_pollen_dm": "#5B481A",
	}

	# Page header markup for the PII detection leaderboard: a <style> block
	# (font import, :root CSS variables, container/table/scrollbar rules)
	# followed by the hero section (title, tagline, and three stat cards).
	# This is an f-string with no interpolated fields, so every literal CSS brace
	# is doubled ({{ and }}) and renders as a single brace.
	# The figures shown in the stat cards (8 models, 5 document types, 94.1%)
	# are hard-coded text here; they are not computed from load_data().
	HEADER_CONTENT = f"""
	<style>
	/* Import fonts */
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');

	/* Root variables with custom color palette */
	:root {{
	--bg-primary: #1A1414;
	--bg-secondary: rgba(239, 235, 231, 0.03);
	--bg-card: rgba(239, 235, 231, 0.02);
	--border-subtle: rgba(239, 235, 231, 0.08);
	--border-default: rgba(239, 235, 231, 0.12);
	--border-strong: rgba(239, 235, 231, 0.2);
	--text-primary: #EFEBE7;
	--text-secondary: #C2B8AE;
	--text-muted: #67594B;
	--accent-primary: #DE9DCC;
	--accent-secondary: #F25E45;
	--accent-tertiary: #6EB579;
	--accent-quaternary: #F0C968;
	--glow-primary: rgba(222, 157, 204, 0.4);
	--glow-secondary: rgba(242, 94, 69, 0.4);
	--glow-tertiary: rgba(110, 181, 121, 0.4);
	}}

	/* Global font and background */
	.gradio-container {{
	font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
	background: var(--bg-primary) !important;
	color: var(--text-primary) !important;
	}}

	/* Headers and text */
	h1, h2, h3, h4 {{
	color: var(--text-primary) !important;
	font-weight: 700 !important;
	font-family: 'Inter', sans-serif !important;
	}}

	p, span, div {{
	color: var(--text-primary) !important;
	font-family: 'Inter', sans-serif !important;
	}}

	/* Dark containers */
	.dark-container {{
	background: var(--bg-card);
	border: 1px solid var(--border-subtle);
	border-radius: 20px;
	padding: 28px;
	box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
	backdrop-filter: blur(10px);
	position: relative;
	overflow: hidden;
	}}

	/* Section headers */
	.section-header {{
	display: flex;
	align-items: center;
	gap: 12px;
	margin-bottom: 24px;
	}}

	.section-icon {{
	filter: drop-shadow(0 0 12px currentColor);
	transition: all 0.3s ease;
	}}

	/* Enhanced table styling */
	.v2-table-container {{
	background: var(--bg-card);
	border-radius: 16px;
	overflow: hidden;
	border: 1px solid var(--border-subtle);
	box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
	backdrop-filter: blur(10px);
	}}

	.v2-styled-table {{
	width: 100%;
	border-collapse: collapse;
	font-family: 'Inter', sans-serif;
	font-size: 14px;
	}}

	.v2-styled-table thead {{
	background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary));
	}}

	.v2-styled-table th {{
	padding: 16px 12px;
	text-align: left;
	color: white;
	font-weight: 600;
	font-size: 13px;
	text-transform: uppercase;
	letter-spacing: 0.05em;
	border: none;
	position: relative;
	}}

	.v2-styled-table td {{
	padding: 14px 12px;
	border-bottom: 1px solid var(--border-subtle);
	color: var(--text-primary);
	transition: all 0.2s ease;
	vertical-align: middle;
	}}

	.v2-styled-table tbody tr {{
	transition: all 0.3s ease;
	background: var(--bg-secondary);
	}}

	.v2-styled-table tbody tr:nth-child(even) {{
	background: var(--bg-card);
	}}

	.v2-styled-table tbody tr:hover {{
	background: rgba(222, 157, 204, 0.1);
	box-shadow: 0 0 20px var(--glow-primary);
	transform: scale(1.01);
	}}

	.model-name {{
	font-weight: 600;
	color: var(--accent-primary);
	transition: all 0.2s ease;
	}}

	.numeric-cell {{
	text-align: center;
	font-family: 'SF Mono', monospace;
	font-weight: 500;
	}}

	.score-cell {{
	padding: 8px 12px;
	}}

	/* Scrollbar styling */
	::-webkit-scrollbar {{
	width: 8px;
	height: 8px;
	}}

	::-webkit-scrollbar-track {{
	background: var(--bg-secondary);
	border-radius: 4px;
	}}

	::-webkit-scrollbar-thumb {{
	background: var(--accent-secondary);
	border-radius: 4px;
	}}

	::-webkit-scrollbar-thumb:hover {{
	background: var(--accent-primary);
	}}
	</style>

	<div style="
	background: var(--bg-primary);
	padding: 4rem 2rem;
	border-radius: 16px;
	margin-bottom: 0;
	transition: all 0.3s ease;
	position: relative;
	">
	<div style="max-width: 72rem; margin: 0 auto;">
	<div style="text-align: center; margin-bottom: 4rem;">
	<h1 style="
	font-size: 4rem;
	font-weight: 800;
	line-height: 1.1;
	background: linear-gradient(45deg, var(--accent-primary), var(--accent-secondary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 0.5rem;
	">
	🔒 LLM PII Detection Leaderboard
	</h1>

	<p style="
	color: var(--text-secondary);
	font-size: 1.25rem;
	line-height: 1.75;
	max-width: 800px;
	margin: 0 auto;
	text-align: center;
	">
	Comprehensive benchmark for language models' performance in detecting and redacting
	personally identifiable information (PII) across various document types and scenarios.
	<span style="
	background: linear-gradient(to right, var(--accent-tertiary), var(--accent-quaternary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	display: block;
	margin-top: 1rem;
	font-size: 1.5rem;
	font-weight: 500;
	">
	"How well do LLMs protect sensitive information?"
	</span>
	</p>
	</div>

	<div style="
	display: grid;
	grid-template-columns: repeat(3, 1fr);
	gap: 1.5rem;
	margin-top: 4rem;
	">
	<div style="
	background: var(--bg-secondary);
	border: 1px solid var(--border-subtle);
	border-radius: 1rem;
	padding: 2rem;
	transition: all 0.3s ease;
	text-align: center;
	">
	<div style="
	font-size: 4rem;
	font-weight: 800;
	margin-bottom: 1rem;
	background: linear-gradient(45deg, var(--accent-primary), var(--accent-secondary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	">8</div>
	<div style="color: var(--text-secondary); font-size: 1.5rem; margin-bottom: 1.5rem;">
	Language Models
	</div>
	<div style="font-size: 1.125rem; line-height: 1.75; color: var(--text-primary);">
	Leading proprietary & open source
	</div>
	<div style="color: var(--text-secondary); margin-top: 0.5rem;">
	GPT-4o, Claude, Gemini, LLaMA, Mistral
	</div>
	</div>

	<div style="
	background: var(--bg-secondary);
	border: 1px solid var(--border-subtle);
	border-radius: 1rem;
	padding: 2rem;
	transition: all 0.3s ease;
	text-align: center;
	">
	<div style="
	font-size: 4rem;
	font-weight: 800;
	margin-bottom: 1rem;
	background: linear-gradient(45deg, var(--accent-tertiary), var(--accent-quaternary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	">5</div>
	<div style="color: var(--text-secondary); font-size: 1.5rem; margin-bottom: 1.5rem;">
	Document Types
	</div>
	<div style="font-size: 1.125rem; line-height: 1.75; color: var(--text-primary);">
	Real-world scenarios
	</div>
	<div style="color: var(--text-secondary); margin-top: 0.5rem;">
	Healthcare, Financial, Government, Legal, Personal
	</div>
	</div>

	<div style="
	background: var(--bg-secondary);
	border: 1px solid var(--border-subtle);
	border-radius: 1rem;
	padding: 2rem;
	transition: all 0.3s ease;
	text-align: center;
	">
	<div style="
	font-size: 4rem;
	font-weight: 800;
	margin-bottom: 1rem;
	background: linear-gradient(45deg, var(--accent-secondary), var(--accent-primary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	">94.1%</div>
	<div style="color: var(--text-secondary); font-size: 1.5rem; margin-bottom: 1.5rem;">
	Best Accuracy
	</div>
	<div style="font-size: 1.125rem; line-height: 1.75; color: var(--text-primary);">
	State-of-the-art performance
	</div>
	<div style="color: var(--text-secondary); margin-top: 0.5rem;">
	GPT-4o leading precision & recall
	</div>
	</div>
	</div>
	</div>
	</div>
	"""

	# Static HTML for the "Methodology" section: an intro paragraph, a four-item
	# "Evaluation Process" list, and a "Key Metrics Explained" box.
	# This is a plain (non-f) string, so braces would not need escaping here.
	# The inline styles use CSS custom properties (e.g. --accent-primary,
	# --text-secondary) that are declared in the :root rule of HEADER_CONTENT's
	# <style> block in this file.
	# NOTE(review): the text says "Over-detection Rate" while load_data() rounds a
	# column named 'Over-redaction Rate' -- confirm which name is intended.
	METHODOLOGY = """
	<div style="max-width: 1200px; margin: 0 auto; padding: 2rem; color: var(--text-secondary); line-height: 1.7; font-size: 1rem;">
	<h1 style="font-size: 2.5rem; font-weight: 700; margin: 3rem 0 1.5rem; color: var(--text-primary);
	background: linear-gradient(to right, var(--accent-primary), var(--accent-secondary));
	-webkit-background-clip: text; -webkit-text-fill-color: transparent;">
	Methodology
	</h1>

	<p>Our evaluation methodology assesses language models' capabilities in detecting and handling personally identifiable information (PII) across realistic document scenarios. Each model is tested on synthetic documents containing embedded PII entities across 5 document categories.</p>

	<h2 style="font-size: 1.75rem; font-weight: 600; margin: 2rem 0 1rem; color: var(--text-primary);">
	Evaluation Process
	</h2>

	<ul style="list-style: none; padding: 0; margin: 1rem 0;">
	<li style="padding-left: 2rem; position: relative; margin: 1rem 0; display: flex; align-items: flex-start;">
	<span style="content: ''; position: absolute; left: 0; top: 0.75rem; width: 8px; height: 8px;
	background: var(--accent-primary); border-radius: 50%;
	box-shadow: 0 0 0 2px rgba(222, 157, 204, 0.2);"></span>
	<span style="color: var(--accent-primary); font-weight: 600;">Model Selection:</span>
	We evaluate leading language models across proprietary and open-source categories
	</li>
	<li style="padding-left: 2rem; position: relative; margin: 1rem 0; display: flex; align-items: flex-start;">
	<span style="content: ''; position: absolute; left: 0; top: 0.75rem; width: 8px; height: 8px;
	background: var(--accent-primary); border-radius: 50%;
	box-shadow: 0 0 0 2px rgba(222, 157, 204, 0.2);"></span>
	<span style="color: var(--accent-primary); font-weight: 600;">PII Detection:</span>
	Each model processes documents with instructions to identify and classify PII entities
	</li>
	<li style="padding-left: 2rem; position: relative; margin: 1rem 0; display: flex; align-items: flex-start;">
	<span style="content: ''; position: absolute; left: 0; top: 0.75rem; width: 8px; height: 8px;
	background: var(--accent-primary); border-radius: 50%;
	box-shadow: 0 0 0 2px rgba(222, 157, 204, 0.2);"></span>
	<span style="color: var(--accent-primary); font-weight: 600;">Performance Metrics:</span>
	Precision, Recall, F1 Score, Over-detection Rate, Processing Time, and Cost
	</li>
	<li style="padding-left: 2rem; position: relative; margin: 1rem 0; display: flex; align-items: flex-start;">
	<span style="content: ''; position: absolute; left: 0; top: 0.75rem; width: 8px; height: 8px;
	background: var(--accent-primary); border-radius: 50%;
	box-shadow: 0 0 0 2px rgba(222, 157, 204, 0.2);"></span>
	<span style="color: var(--accent-primary); font-weight: 600;">Domain Analysis:</span>
	Specialized evaluation across Healthcare, Financial, Government, Legal, and Personal documents
	</li>
	</ul>

	<h2 style="font-size: 1.75rem; font-weight: 600; margin: 2rem 0 1rem; color: var(--text-primary);">
	Key Metrics Explained
	</h2>

	<div style="background: var(--bg-secondary); border: 1px solid var(--border-subtle); border-radius: 12px; padding: 1.5rem; margin: 1.5rem 0;">
	<ul style="list-style: none; padding: 0; margin: 0;">
	<li style="margin: 1rem 0;"><span style="color: var(--accent-tertiary); font-weight: 600;">Overall Accuracy:</span> Percentage of correctly identified and classified PII entities</li>
	<li style="margin: 1rem 0;"><span style="color: var(--accent-tertiary); font-weight: 600;">Precision:</span> Of all flagged items, how many were actually PII (avoiding false positives)</li>
	<li style="margin: 1rem 0;"><span style="color: var(--accent-tertiary); font-weight: 600;">Recall:</span> Of all PII present, how many were successfully detected (avoiding false negatives)</li>
	<li style="margin: 1rem 0;"><span style="color: var(--accent-tertiary); font-weight: 600;">F1 Score:</span> Harmonic mean balancing precision and recall</li>
	<li style="margin: 1rem 0;"><span style="color: var(--accent-secondary); font-weight: 600;">Over-detection Rate:</span> Percentage of non-PII incorrectly flagged (lower is better)</li>
	</ul>
	</div>
	</div>
	"""