# Hugging Face Space app.py — PII Detection leaderboard (upstream commit 6694cbb, author: muhyzater)
# Entity Counts
import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
def get_leaderboard_data():
    """Build the PII Detection leaderboard as a pandas DataFrame.

    The numbers are hard-coded evaluation results (45 ranked submissions from
    14 teams). Columns: Rank, Team, Best Overall Score, and F1/Precision/Recall
    for the Exact, Partial, and IoU50 match criteria.

    Returns:
        pd.DataFrame: one row per ranked submission, 12 columns.
    """
    data = {
        'Rank': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
        'Team': [
            'Premise (submission 3)',
            'Premise (submission 2)',
            'صقور الأرض (submission 9)',
            'Premise',
            'Sebaweeh (submission 11)',
            'صقور الأرض (submission 8)',
            'صقور الأرض (submission 5)',
            'TheConsultants (submission 3)',
            'Dynamic (submission 4)',
            'صقور الأرض (submission 4)',
            'صقور الأرض (submission 3)',
            'Dynamic (submission 3)',
            'The LADS (submission 5)',
            'Sebaweeh (submission 10)',
            'Dynamic (submission 2)',
            'Prophytech-AI (submission 2)',  # New with 0.5341
            'صقور الأرض (submission 1)',
            'Sebaweeh (submission 9)',
            'ByFi (submission 3)',
            'Gang of Four',  # New with 0.5153
            'The LADS (submission 4)',
            'صقور الأرض (submission 2)',
            'Nutoq',  # New with 0.5072
            'Sebaweeh (submission 8)',
            'Dynamic',
            'ByFi',
            'ByFi (submission 2)',
            'TheConsultants',
            'Sebaweeh (submission 7)',
            'Prophytech-AI',
            'The LADS (submission 3)',
            'TheConsultants (submission 2)',
            'SaRA (submission 2)',
            'The LADS (submission 2)',
            'Sebaweeh (submission 6)',
            'Sebaweeh (submission 4)',
            'Why Not',
            'Sebaweeh (submission 5)',
            'The LADS',
            'AEye',
            'Sebaweeh (submission 3)',
            'NICE',
            'SaRA (submission 1)',
            'Sebaweeh (submission 2)',
            'Sebaweeh (submission 1)'
        ],
        # Main Score (Best Overall Score from the evaluation results)
        'Best Overall Score': [0.6015, 0.5996, 0.5973, 0.5973, 0.5782, 0.5726, 0.5705, 0.5575, 0.5522, 0.5506, 0.5394, 0.5411, 0.5359, 0.5358, 0.5344, 0.5341, 0.5333, 0.5225, 0.5165, 0.5153, 0.5103, 0.5089, 0.5072, 0.5053, 0.5040, 0.5012, 0.4996, 0.4986, 0.4945, 0.4938, 0.4892, 0.4817, 0.4406, 0.4145, 0.4095, 0.3938, 0.3845, 0.3519, 0.3346, 0.3180, 0.2846, 0.2667, 0.2633, 0.2630, 0.2457],
        # Exact Match Metrics (Macro)
        'Exact F1': [0.0142, 0.0143, 0.0154, 0.0143, 0.0298, 0.0244, 0.0188, 0.0298, 0.0298, 0.0244, 0.0188, 0.0256, 0.0106, 0.0241, 0.0239, 0.0239, 0.0237, 0.0185, 0.0169, 0.0133, 0.0101, 0.0171, 0.0094, 0.0098, 0.0179, 0.0161, 0.0161, 0.0145, 0.0104, 0.0181, 0.0089, 0.0132, 0.0113, 0.0079, 0.0096, 0.0075, 0.0088, 0.0076, 0.0077, 0.0081, 0.0053, 0.0039, 0.0058, 0.0053, 0.0021],
        'Exact Precision': [0.015, 0.015, 0.016, 0.015, 0.029, 0.029, 0.023, 0.029, 0.029, 0.029, 0.023, 0.029, 0.011, 0.029, 0.029, 0.029, 0.029, 0.023, 0.022, 0.016, 0.013, 0.021, 0.012, 0.014, 0.020, 0.021, 0.021, 0.018, 0.016, 0.020, 0.012, 0.018, 0.015, 0.011, 0.011, 0.009, 0.009, 0.006, 0.015, 0.013, 0.004, 0.003, 0.005, 0.004, 0.001],
        'Exact Recall': [0.013, 0.013, 0.015, 0.014, 0.021, 0.021, 0.016, 0.021, 0.021, 0.021, 0.016, 0.020, 0.010, 0.021, 0.020, 0.020, 0.020, 0.016, 0.014, 0.011, 0.008, 0.015, 0.008, 0.007, 0.016, 0.013, 0.013, 0.012, 0.008, 0.017, 0.007, 0.010, 0.009, 0.006, 0.009, 0.007, 0.008, 0.010, 0.005, 0.006, 0.011, 0.005, 0.008, 0.010, 0.007],
        # Partial Match Metrics (Macro)
        'Partial F1': [0.6015, 0.5996, 0.5973, 0.5973, 0.5782, 0.5726, 0.5705, 0.5575, 0.5522, 0.5506, 0.5394, 0.5411, 0.5359, 0.5358, 0.5344, 0.5341, 0.5333, 0.5225, 0.5165, 0.5153, 0.5103, 0.5089, 0.5072, 0.5053, 0.5040, 0.5012, 0.4996, 0.4986, 0.4945, 0.4938, 0.4892, 0.4817, 0.4406, 0.4145, 0.4095, 0.3938, 0.3845, 0.3519, 0.3346, 0.3180, 0.2846, 0.2667, 0.2633, 0.2630, 0.2457],
        'Partial Precision': [0.647, 0.642, 0.634, 0.637, 0.659, 0.457, 0.655, 0.659, 0.659, 0.657, 0.646, 0.647, 0.445, 0.636, 0.647, 0.647, 0.644, 0.630, 0.655, 0.622, 0.669, 0.610, 0.669, 0.740, 0.560, 0.662, 0.659, 0.634, 0.740, 0.536, 0.669, 0.670, 0.596, 0.590, 0.456, 0.458, 0.398, 0.280, 0.649, 0.494, 0.190, 0.231, 0.204, 0.179, 0.143],
        'Partial Recall': [0.562, 0.562, 0.565, 0.562, 0.461, 0.495, 0.491, 0.461, 0.461, 0.488, 0.463, 0.461, 0.408, 0.463, 0.455, 0.455, 0.455, 0.410, 0.413, 0.440, 0.419, 0.436, 0.408, 0.384, 0.458, 0.403, 0.402, 0.411, 0.371, 0.457, 0.385, 0.376, 0.350, 0.319, 0.372, 0.346, 0.372, 0.474, 0.225, 0.234, 0.569, 0.316, 0.370, 0.495, 0.854],
        # IoU 50% Metrics (Macro)
        'IoU50 F1': [0.2518, 0.2557, 0.2571, 0.2543, 0.2584, 0.1867, 0.1867, 0.2684, 0.2584, 0.2461, 0.2414, 0.2474, 0.2220, 0.2431, 0.2439, 0.2439, 0.2434, 0.2142, 0.2162, 0.2141, 0.2070, 0.2289, 0.2252, 0.1759, 0.2088, 0.2170, 0.2165, 0.2118, 0.1717, 0.1992, 0.2100, 0.2071, 0.1807, 0.1676, 0.1539, 0.1490, 0.1444, 0.1409, 0.1244, 0.1058, 0.1099, 0.0646, 0.0733, 0.1012, 0.0871],
        'IoU50 Precision': [0.271, 0.274, 0.273, 0.271, 0.298, 0.189, 0.187, 0.298, 0.298, 0.291, 0.289, 0.298, 0.159, 0.289, 0.295, 0.295, 0.294, 0.264, 0.280, 0.258, 0.276, 0.275, 0.297, 0.258, 0.232, 0.287, 0.286, 0.269, 0.257, 0.216, 0.287, 0.288, 0.244, 0.239, 0.171, 0.173, 0.149, 0.112, 0.241, 0.164, 0.073, 0.056, 0.057, 0.069, 0.051],
        'IoU50 Recall': [0.235, 0.240, 0.243, 0.239, 0.218, 0.194, 0.192, 0.218, 0.218, 0.213, 0.207, 0.218, 0.146, 0.210, 0.208, 0.208, 0.208, 0.180, 0.176, 0.183, 0.166, 0.196, 0.181, 0.134, 0.190, 0.175, 0.174, 0.174, 0.129, 0.185, 0.165, 0.162, 0.143, 0.129, 0.140, 0.131, 0.140, 0.190, 0.084, 0.078, 0.220, 0.077, 0.103, 0.190, 0.303],
    }
    # Sanity check: every column must have exactly one value per ranked entry.
    # BUG FIX: the expected length was hard-coded to 42, which no longer
    # matches the data (45 entries) — derive it from the Rank column instead.
    expected = len(data['Rank'])
    for key, values in data.items():
        if len(values) != expected:
            print(f"ERROR: {key} has {len(values)} values, expected {expected}")
        else:
            print(f"✓ {key}: {len(values)} values")
    # Debug output to verify the assembled frame.
    df = pd.DataFrame(data)
    print(f"DataFrame shape: {df.shape}")
    print(f"Number of teams: {len(df)}")
    # BUG FIX: the old debug line claimed 'Sebaweeh (submission 9)' sat at
    # rank 3, but row index 2 actually holds 'صقور الأرض (submission 9)'.
    print(f"Top-ranked team: {df.iloc[0]['Team']}")
    print(f"Teams: {df['Team'].tolist()}")
    return df
def format_leaderboard(df):
    """Return a display-ready copy of *df* with string-formatted numbers.

    Score columns are rendered with four decimal places; entity-count
    columns get thousands separators. Columns absent from *df* are
    skipped, and the input frame is never modified.
    """
    # Known score columns (some may not exist in the current data).
    score_cols = ('Best Overall Score', 'Exact F1', 'Exact Precision', 'Exact Recall',
                  'Partial F1', 'Partial Precision', 'Partial Recall',
                  'IoU50 F1', 'IoU50 Precision', 'IoU50 Recall',
                  'Value F1', 'Value Precision', 'Value Recall')
    # Known entity-count columns (likewise optional).
    count_cols = ('GT Entities', 'Pred Entities', 'TP Exact', 'TP Partial',
                  'TP IoU50', 'TP Value')

    out = df.copy()  # never mutate the caller's frame
    for name in score_cols:
        if name in out.columns:
            out[name] = out[name].apply(lambda v: f"{v:.4f}")
    for name in count_cols:
        if name in out.columns:
            out[name] = out[name].apply(lambda v: f"{v:,}")
    return out
def update_leaderboard():
    """Fetch the latest leaderboard data and return it formatted for display."""
    formatted = format_leaderboard(get_leaderboard_data())
    print(f"Formatted DataFrame shape: {formatted.shape}")
    return formatted
# Custom CSS for styling the Gradio app: table colors, zebra striping, and
# highlight rules for the top three rows.
# NOTE(review): the nth-child(2..4) highlight rules assume the table's first
# rendered child is the header row (so nth-child(2) is rank 1) — confirm
# against the DOM that gr.DataFrame actually emits.
css = """
.gradio-container {
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.leaderboard-title {
text-align: center;
color: #2c3e50;
margin-bottom: 20px;
}
.dataframe {
font-size: 14px;
}
.dataframe th {
background-color: #3498db !important;
color: white !important;
font-weight: bold;
text-align: center;
}
.dataframe td {
text-align: center;
padding: 8px;
}
.dataframe tr:nth-child(even) {
background-color: #f8f9fa;
}
.dataframe tr:nth-child(odd) {
background-color: white;
}
.dataframe tr:hover {
background-color: #e3f2fd;
}
.refresh-btn {
background-color: #27ae60 !important;
color: white !important;
}
/* Highlight the new world record and ultimate champion */
.dataframe tr:nth-child(2) {
background-color: #ffd700 !important;
border-left: 10px solid #ff1744;
font-weight: bold;
font-size: 18px;
box-shadow: 0 6px 15px rgba(255, 23, 68, 0.6);
animation: champion-glow 2s ease-in-out infinite alternate;
}
@keyframes champion-glow {
from {
box-shadow: 0 6px 15px rgba(255, 23, 68, 0.6);
background-color: #ffd700;
}
to {
box-shadow: 0 8px 20px rgba(255, 23, 68, 0.9);
background-color: #ffed4a;
}
}
.dataframe tr:nth-child(3) {
background-color: #fff8e1 !important;
border-left: 6px solid #ff6b35;
font-weight: bold;
}
.dataframe tr:nth-child(4) {
background-color: #fff3cd !important;
border-left: 6px solid #ffc107;
font-weight: bold;
}
"""
# Create the Gradio interface
def create_leaderboard():
    """Build and return the Gradio Blocks UI for the PII detection leaderboard.

    Returns:
        gr.Blocks: the assembled (not yet launched) interface.
    """
    with gr.Blocks(css=css, title="PII Detection Leaderboard") as demo:
        # Title banner; the timestamp is baked in at build time.
        gr.Markdown(
            """
# 🏆 PII Detection Model Leaderboard
A comprehensive ranking of PII detection teams based on exact, partial, and label-based matching performance.
Last updated: {}
""".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
            elem_classes="leaderboard-title"
        )
        with gr.Row():
            with gr.Column():
                # Snapshot of the formatted leaderboard, taken at build time.
                initial_data = update_leaderboard()
                print(f"Final initial data for display: {initial_data.shape}")
                print(f"Final teams count: {len(initial_data)}")
                leaderboard_table = gr.DataFrame(
                    value=initial_data,
                    # BUG FIX: headers/datatype previously hard-coded 22 columns
                    # (including Value/GT/TP fields that the data never contains)
                    # while the frame only has 12. Derive both from the actual
                    # frame so they always agree with the data.
                    headers=list(initial_data.columns),
                    # Rank is numeric; Team and every formatted score are strings.
                    datatype=["number", "str"] + ["str"] * (len(initial_data.columns) - 2),
                    interactive=False,
                    wrap=True
                )
        # Statistics section
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📊 Statistics")

                def get_stats():
                    # Recompute from source data so the count tracks the frame.
                    df = get_leaderboard_data()
                    return f"""Total Teams: {len(df)}"""

                stats_text = gr.Textbox(
                    value=get_stats(),
                    label="Quick Stats",
                    lines=6,
                    interactive=False
                )
        # Info section (static explanatory text).
        gr.Markdown(
            """
### ℹ️ About This PII Detection Leaderboard
This leaderboard ranks PII (Personally Identifiable Information) detection teams based on comprehensive benchmarks:
**Main Metrics:**
- **Best Overall Score**: Primary ranking metric (highest of all F1 scores)
- **Exact F1/Precision/Recall**: Perfect position and label match
- **Partial F1/Precision/Recall**: Overlapping entities with correct detection
- **IoU50 F1/Precision/Recall**: 50%+ IoU overlap with correct detection
- **Value F1/Precision/Recall**: Exact value match regardless of position
- **GT/Pred Entities**: Ground truth vs predicted entity counts
- **TP (True Positives)**: Successful detections for each match type
**Evaluation Types:**
- **Exact Match**: Most strict - requires perfect boundary and label alignment
- **Partial Match**: Allows overlapping boundaries but requires correct label
- **IoU50 Match**: Requires 50%+ overlap with correct detection
- **Value Match**: Exact value match regardless of position
"""
        )
        # NOTE: the previous revision defined a refresh_data() handler here
        # that was never wired to any component; the dead code was removed.
    return demo
# Launch the app when run as a script (not on import).
if __name__ == "__main__":
    demo = create_leaderboard()
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces — required on Hugging Face Spaces
        server_port=7860,  # the port HF Spaces expects the app to listen on
        share=False  # no public tunnel; Spaces provides its own URL
    )