import os

import pandas as pd

# PII detection categories: display group name -> metric/column labels in that group.
PII_CATEGORIES = {
    "Overall": ["Overall Accuracy"],
    "Entity Types": ["Names", "Addresses", "Phone Numbers", "SSN", "Medical IDs", "Financial Info"],
    "Document Types": ["Healthcare", "Financial", "Government", "Legal", "Personal"],
    "Performance": ["Precision", "Recall", "F1 Score"],
    "Efficiency": ["Processing Time", "Cost per Document"],
}

# Model type definitions: model type label -> icon shown next to it.
MODEL_TYPES = {
    "Proprietary": "🔒",
    "Open Source": "🔓",
}

# Default location of the evaluation results, relative to the current working directory.
DEFAULT_CSV_PATH = "results/pii_detection_results.csv"


def load_data(csv_path: str = DEFAULT_CSV_PATH) -> pd.DataFrame:
    """Load and preprocess the PII detection evaluation data from a CSV file.

    Missing values are first replaced with empty strings. Every known metric
    column that is present in the file is then converted to a numeric dtype
    (values that cannot be parsed, including the empty strings just filled in,
    become NaN) and rounded to 3 decimal places for display. As a result,
    missing cells end up as ``''`` in non-metric columns and as NaN in metric
    columns.

    Args:
        csv_path: Path of the results CSV. Defaults to
            ``results/pii_detection_results.csv``, resolved against the
            current working directory.

    Returns:
        The cleaned results as a DataFrame.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(
            f"Results file not found: {csv_path}. "
            "Please ensure the CSV file exists in the results folder."
        )

    df = pd.read_csv(csv_path)

    # Clean and prepare data.
    df = df.fillna('')

    # Metric columns that are rounded for better display; columns absent from
    # the CSV are skipped.
    numeric_cols = [
        'Overall Accuracy', 'Precision', 'Recall', 'F1 Score',
        'Over-redaction Rate', 'Processing Time (s)', 'Cost per Document ($)',
        'Healthcare Accuracy', 'Financial Accuracy', 'Government Accuracy',
        'Legal Accuracy', 'Personal Accuracy',
    ]
    for col in numeric_cols:
        if col in df.columns:
            # errors='coerce' maps unparseable cells (e.g. '') to NaN instead of raising.
            df[col] = pd.to_numeric(df[col], errors='coerce').round(3)

    return df


# Nutrient brand color palette
COLORS = {
    # Light mode colors
    "white": "#FFFFFF",
    "black": "#000000",
    "disc_pink": "#DE9DCC",
    "code_coral": "#F25E45",
    "data_green": "#6EB579",
    "digital_pollen": "#F0C968",  # Primary yellow accent
    "warm_black": "#1A1414",
    "off_white": "#EFEBE7",
    "pixel_mist": "#E2DBD9",
    "soft_grey": "#C2B8AE",
    "warm_grey": "#67594B",
    # Dark mode colors
    "disc_pink_dm": "#4F2B45",
    "code_coral_dm": "#672D23",
    "data_green_dm": "#2B412F",
    "digital_pollen_dm": "#5B481A",
}

# Header content with PII detection focus
HEADER_CONTENT = f"""

🔒 LLM PII Detection Leaderboard

Comprehensive benchmark for language models' performance in detecting and redacting personally identifiable information (PII) across various document types and scenarios. "How well do LLMs protect sensitive information?"

4
Language Models
Cutting-edge Nutrient models
GPT-5-mini, GPT-5-nano, GPT-4.1-mini, GPT-4.1-nano
5
Document Types
Real-world scenarios
Healthcare, Financial, Government, Legal, Personal
98.0%
Best F1 Score
State-of-the-art performance
Nutrient & GPT-5-mini leading F1 performance
"""

# Methodology section adapted for PII detection
METHODOLOGY = """

Methodology

Our evaluation methodology assesses language models' capabilities in detecting and handling personally identifiable information (PII) across realistic document scenarios. Each model is tested on synthetic documents containing embedded PII entities across 5 document categories.

Evaluation Process

Key Metrics Explained

Powered by Nutrient

"""