Spaces:

vector-institute
/

HumaniBench

Running

File size: 40,200 Bytes

import gradio as gr
import pandas as pd

from src.display.css_html_js import custom_css, custom_js
from src.display.formatting import make_clickable_model, format_score, format_percentage, format_overall, format_type_badge

# ========================
# CONFIGURATION
# ========================

# Leaderboard title string.
TITLE = "HumaniBench Leaderboard"

# External resources: paper, source code, dataset and project website.
ARXIV_URL = "https://arxiv.org/abs/2505.11454"
GITHUB_URL = "https://github.com/VectorInstitute/humaniBench"
DATASET_URL = "https://huggingface.co/datasets/vector-institute/HumaniBench"
WEBSITE_URL = "https://vectorinstitute.github.io/humanibench/"

# Relative paths to the logo assets.
vector_logo_path = "src/assets/vector-favicon-48x48.svg"
humanibench_logo_path = "src/assets/HumaniBenchLogo.ico"

# ========================
# MODEL REGISTRY  (Table A2 order)
# ========================

# Registry of evaluated models. Each tuple below is
# (display name, reference URL, organisation, parameter count, type) and is
# expanded into a dict with the keys "model", "link", "org", "params", "type".
# The score tables in this file look links up by list position (MODELS[i]),
# so the order of the entries matters.
MODELS = [
    dict(zip(("model", "link", "org", "params", "type"), spec))
    for spec in (
        ("GPT-4o", "https://openai.com/gpt-4o", "OpenAI", "-", "Closed"),
        ("Gemini-2.0-Flash", "https://deepmind.google/technologies/gemini/", "Google", "-", "Closed"),
        ("Qwen-2.5-7B", "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "Alibaba", "7B", "Open"),
        ("LLaVA-v1.6", "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf", "LLaVA", "7B", "Open"),
        ("Phi-4", "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "Microsoft", "5.6B", "Open"),
        ("Gemma-3", "https://huggingface.co/google/gemma-3-4b-it", "Google", "4B", "Open"),
        ("CogVLM2-19B", "https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B", "THUDM", "19B", "Open"),
        ("Phi-3.5", "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "Microsoft", "4B", "Open"),
        ("Molmo-7V", "https://huggingface.co/allenai/Molmo-7B-O-0924", "Allen AI", "7B", "Open"),
        ("Aya-Vision-8B", "https://huggingface.co/CohereForAI/aya-vision-8b", "Cohere", "8B", "Open"),
        ("InternVL2.5", "https://huggingface.co/OpenGVLab/InternVL2_5-8B", "OpenGVLab", "8B", "Open"),
        ("Janus-Pro-7B", "https://huggingface.co/deepseek-ai/Janus-Pro-7B", "DeepSeek", "7B", "Open"),
        ("GLM-4V-9B", "https://huggingface.co/THUDM/glm-4v-9b", "THUDM", "9B", "Open"),
        ("Llama-3.2-11B", "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "Meta", "11B", "Open"),
        ("DeepSeek-VL2-Small", "https://huggingface.co/deepseek-ai/deepseek-vl2-small", "DeepSeek", "3B", "Open"),
    )
]

# ========================
# PRINCIPLE DATA  (Table A2)
# Scores are percentages; Overall = mean of all 7 principles
# ========================

# Per-model principle scores. Each tuple is
# (model name, Fairness, Ethics, Understanding, Reasoning, Language, Empathy,
#  Robustness, Overall). Row i takes its link from MODELS[i], so the tuples
# must stay in the same order as the model registry.
PRINCIPLE_DATA = [
    {
        "model": name,
        "link": MODELS[idx]["link"],
        **dict(zip(
            ("Fairness", "Ethics", "Understanding", "Reasoning", "Language", "Empathy", "Robustness", "Overall"),
            scores,
        )),
    }
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 61.1, 99.0, 74.8, 79.2, 62.5, 90.5, 50.90, 74.00),
        ("Gemini-2.0-Flash", 61.0, 98.9, 73.5, 78.8, 62.2, 89.5, 57.20, 74.44),
        ("Qwen-2.5-7B", 63.1, 96.5, 84.9, 67.1, 57.4, 73.8, 53.60, 70.91),
        ("LLaVA-v1.6", 59.7, 94.4, 80.3, 68.1, 55.4, 66.3, 60.60, 69.26),
        ("Phi-4", 59.2, 98.2, 78.6, 77.4, 61.3, 79.0, 45.70, 71.34),
        ("Gemma-3", 57.5, 94.6, 73.2, 67.8, 57.7, 79.8, 58.30, 69.84),
        ("CogVLM2-19B", 53.1, 96.3, 67.5, 74.4, 60.4, 68.0, 35.12, 64.97),
        ("Phi-3.5", 56.0, 96.1, 72.3, 69.7, 57.3, 70.8, 50.50, 67.53),
        ("Molmo-7V", 52.4, 94.8, 66.2, 65.8, 55.0, 58.8, 49.70, 63.24),
        ("Aya-Vision-8B", 51.7, 94.9, 64.4, 68.1, 50.8, 77.8, 45.90, 64.80),
        ("InternVL2.5", 50.9, 93.8, 63.8, 64.4, 51.1, 74.5, 56.40, 64.99),
        ("Janus-Pro-7B", 50.2, 96.9, 63.3, 65.2, 57.6, 69.5, 52.80, 65.07),
        ("GLM-4V-9B", 50.2, 94.4, 63.9, 63.0, 50.0, 67.8, 50.50, 62.83),
        ("Llama-3.2-11B", 50.2, 94.9, 58.9, 63.0, 50.7, 71.3, 56.70, 63.67),
        ("DeepSeek-VL2-Small", 48.8, 90.6, 54.8, 61.6, 49.1, 59.3, 55.70, 59.99),
    ))
]

# ========================
# TASK DATA  (Tables 4–10)
# T1–T7 per-model accuracy / scores
# ========================

def _task_rows(extra_keys: list) -> list:
    """Return one placeholder row per registered model.

    Every row carries the model's "model" and "link" entries followed by each
    name in ``extra_keys`` mapped to ``None``.
    """
    placeholder_rows = []
    for entry in MODELS:
        row = {"model": entry["model"], "link": entry["link"]}
        # Score columns start out empty.
        row.update(dict.fromkeys(extra_keys))
        placeholder_rows.append(row)
    return placeholder_rows

T1_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]

# T1: Scene Understanding (Open-Ended VQA)
# Each tuple is (model name, *scores in T1_COLS order). Row i takes its link
# from MODELS[i], so the tuples must stay in the same order as the registry.
T1_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T1_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 74.80, 0.90, 2.10, 76.50, 75.20, 75.80),
        ("Gemini-2.0-Flash", 73.20, 1.10, 1.70, 75.90, 74.30, 74.80),
        ("Qwen-2.5-7B", 67.37, 9.33, 9.38, 67.92, 66.28, 66.40),
        ("LLaVA-v1.6", 64.34, 9.03, 9.12, 65.33, 68.10, 66.90),
        ("Phi-4", 68.10, 1.23, 3.12, 72.38, 73.47, 73.20),
        ("Gemma-3", 66.50, 8.50, 8.20, 70.10, 68.30, 69.00),
        ("CogVLM2-19B", 67.34, 11.38, 10.45, 69.01, 71.29, 69.80),
        ("Phi-3.5", 67.19, 2.40, 5.21, 67.45, 65.28, 65.90),
        ("Molmo-7V", 67.12, 1.87, 4.35, 64.78, 62.01, 62.60),
        ("Aya-Vision-8B", 62.19, 8.12, 8.46, 68.84, 68.22, 68.00),
        ("InternVL2.5", 61.10, 10.70, 10.73, 65.71, 64.18, 64.20),
        ("Janus-Pro-7B", 62.10, 1.35, 3.21, 69.26, 67.09, 67.50),
        ("GLM-4V-9B", 60.18, 8.63, 8.34, 69.98, 65.10, 65.40),
        ("Llama-3.2-11B", 63.40, 19.30, 15.67, 62.09, 66.01, 64.30),
        ("DeepSeek-VL2-Small", 59.10, 12.56, 11.29, 62.14, 63.10, 63.00),
    ))
]

T2_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]

# T2: Instance Identity (Open-Ended VQA)
# Each tuple is (model name, *scores in T2_COLS order). Row i takes its link
# from MODELS[i], so the tuples must stay in the same order as the registry.
# NOTE(review): compared with the T1 table, the Molmo-7V / Aya-Vision-8B rows
# and the Llama-3.2-11B / DeepSeek-VL2-Small rows look pairwise swapped (e.g.
# Faithfulness 68.84 belongs to Aya-Vision-8B in T1 but to Molmo-7V here).
# Values are kept exactly as found; confirm against the published table.
T2_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T2_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 68.10, 1.50, 3.00, 85.00, 85.00, 85.00),
        ("Gemini-2.0-Flash", 66.50, 2.00, 4.00, 83.00, 82.00, 82.00),
        ("Qwen-2.5-7B", 62.37, 10.21, 6.27, 67.92, 68.65, 66.94),
        ("LLaVA-v1.6", 59.34, 9.82, 10.01, 65.33, 66.10, 65.02),
        ("Phi-4", 63.10, 2.07, 4.08, 81.67, 82.21, 81.76),
        ("Gemma-3", 61.94, 15.19, 5.00, 78.96, 75.00, 76.00),
        ("CogVLM2-19B", 62.34, 12.31, 6.53, 74.01, 70.14, 72.45),
        ("Phi-3.5", 62.19, 3.39, 6.19, 67.45, 68.34, 67.80),
        ("Molmo-7V", 57.19, 9.02, 9.39, 68.84, 67.74, 66.89),
        ("Aya-Vision-8B", 62.12, 2.83, 5.44, 64.78, 67.33, 65.41),
        ("InternVL2.5", 56.10, 11.74, 11.69, 65.71, 64.49, 62.92),
        ("Janus-Pro-7B", 57.10, 2.16, 4.24, 69.26, 71.82, 71.09),
        ("GLM-4V-9B", 55.18, 9.59, 9.18, 69.98, 65.73, 64.30),
        ("Llama-3.2-11B", 54.10, 13.48, 12.41, 64.05, 63.12, 61.37),
        ("DeepSeek-VL2-Small", 58.40, 20.42, 16.72, 62.09, 60.04, 59.11),
    ))
]

T3_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]

# T3: Multiple-Choice VQA
# Each tuple is (model name, *scores in T3_COLS order). Row i takes its link
# from MODELS[i], so the tuples must stay in the same order as the registry.
T3_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T3_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 68.10, 0.95, 1.20, 82.30, 80.45, 73.90),
        ("Gemini-2.0-Flash", 70.40, 0.85, 0.95, 81.60, 82.10, 74.60),
        ("Qwen-2.5-7B", 52.93, 6.30, 6.35, 69.22, 67.54, 66.63),
        ("LLaVA-v1.6", 50.89, 7.68, 7.22, 64.77, 63.06, 62.25),
        ("Phi-4", 60.80, 2.01, 3.00, 76.55, 74.77, 73.86),
        ("Gemma-3", 54.22, 5.43, 5.80, 71.14, 69.37, 68.46),
        ("CogVLM2-19B", 61.10, 1.95, 2.90, 77.20, 75.40, 74.50),
        ("Phi-3.5", 53.18, 6.13, 6.24, 69.98, 68.16, 67.26),
        ("Molmo-7V", 51.47, 7.29, 6.97, 66.02, 64.38, 63.56),
        ("Aya-Vision-8B", 51.64, 7.17, 6.90, 67.33, 65.69, 64.74),
        ("InternVL2.5", 49.05, 8.92, 8.00, 61.01, 59.37, 58.53),
        ("Janus-Pro-7B", 55.51, 4.56, 5.25, 72.33, 70.47, 69.53),
        ("GLM-4V-9B", 50.76, 7.76, 7.27, 63.26, 61.55, 60.73),
        ("Llama-3.2-11B", 45.67, 18.28, 12.98, 52.02, 55.29, 54.39),
        ("DeepSeek-VL2-Small", 45.35, 14.13, 12.55, 54.21, 56.46, 54.52),
    ))
]

LANGUAGES = ["English", "French", "Spanish", "Portuguese", "Mandarin", "Korean", "Urdu", "Persian", "Bengali", "Punjabi", "Tamil"]

# T4: Multilingual VQA — Accuracy (%) per language
# Each tuple is (model name, *one score per entry of LANGUAGES, Avg). Row i
# takes its link from MODELS[i], so the tuples must stay in registry order.
T4_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip([*LANGUAGES, "Avg"], scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 64.6, 64.0, 63.4, 62.8, 62.3, 61.8, 60.1, 59.7, 59.1, 58.6, 58.1, 61.32),
        ("Gemini-2.0-Flash", 64.4, 63.8, 63.2, 62.6, 62.1, 61.7, 60.0, 59.5, 58.9, 58.4, 58.0, 61.15),
        ("Qwen-2.5-7B", 59.2, 58.6, 57.9, 57.5, 57.0, 56.6, 55.1, 54.6, 53.9, 53.5, 53.1, 56.09),
        ("LLaVA-v1.6", 56.8, 56.4, 55.6, 55.1, 54.6, 54.1, 52.8, 52.4, 51.8, 51.4, 51.0, 53.82),
        ("Phi-4", 63.3, 62.8, 62.1, 61.6, 61.1, 60.6, 58.9, 58.5, 57.8, 57.3, 56.9, 60.08),
        ("Gemma-3", 59.5, 59.0, 58.2, 57.7, 57.3, 56.9, 55.3, 54.9, 54.3, 53.8, 53.3, 56.38),
        ("CogVLM2-19B", 61.6, 61.3, 60.9, 61.4, 60.9, 60.4, 58.7, 58.3, 57.6, 57.1, 56.6, 59.53),
        ("Phi-3.5", 59.1, 58.6, 58.0, 57.5, 57.0, 56.6, 55.1, 54.6, 53.9, 53.5, 53.1, 56.09),
        ("Molmo-7V", 56.1, 55.6, 54.9, 54.5, 54.2, 53.8, 52.5, 52.1, 51.5, 51.1, 50.7, 53.36),
        ("Aya-Vision-8B", 55.8, 55.0, 54.2, 53.2, 52.3, 51.7, 51.3, 51.7, 51.9, 49.9, 49.1, 52.37),
        ("InternVL2.5", 53.9, 53.1, 52.4, 51.1, 50.5, 49.7, 49.3, 49.9, 50.1, 47.9, 47.3, 50.47),
        ("Janus-Pro-7B", 58.5, 58.1, 57.5, 57.0, 56.5, 55.8, 54.5, 54.1, 53.5, 53.0, 52.6, 55.55),
        ("GLM-4V-9B", 53.3, 52.7, 51.8, 50.8, 50.1, 49.4, 49.0, 49.5, 49.7, 47.6, 47.2, 50.10),
        ("Llama-3.2-11B", 51.9, 51.5, 50.7, 50.3, 49.9, 49.4, 48.0, 47.6, 47.0, 46.5, 46.1, 49.00),
        ("DeepSeek-VL2-Small", 52.8, 52.2, 51.3, 50.3, 49.5, 48.9, 48.5, 48.9, 49.1, 47.0, 46.6, 49.55),
    ))
]

T5_COLS = ["mAP@0.5", "mAP@0.75", "Mean IoU", "Missing (%)"]

# T5: Visual Grounding (Table 9) — mAP values are %; Mean IoU is 0–1; Missing (%) = images with no predicted box
# Each tuple is (model name, *scores in T5_COLS order). Row i takes its link
# from MODELS[i], so the tuples must stay in the same order as the registry.
T5_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T5_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 63.46, 40.32, 0.34, 72.73),
        ("Gemini-2.0-Flash", 56.51, 52.15, 0.23, 0.00),
        ("Qwen-2.5-7B", 98.43, 94.16, 0.90, 0.00),
        ("LLaVA-v1.6", 96.49, 82.44, 0.78, 0.00),
        ("Phi-4", 72.11, 46.18, 0.47, 0.00),
        ("Gemma-3", 56.34, 54.23, 0.49, 16.34),
        ("CogVLM2-19B", 50.88, 50.42, 0.10, 0.00),
        ("Phi-3.5", 63.45, 58.35, 0.37, 0.00),
        ("Molmo-7V", 43.32, 34.34, 0.45, 0.00),
        ("Aya-Vision-8B", 54.15, 41.26, 0.07, 0.00),
        ("InternVL2.5", 56.39, 36.52, 0.22, 6.67),
        ("Janus-Pro-7B", 50.18, 10.04, 0.14, 2.80),
        ("GLM-4V-9B", 52.20, 35.55, 0.12, 4.21),
        ("Llama-3.2-11B", 38.34, 35.53, 0.25, 32.24),
        ("DeepSeek-VL2-Small", 25.34, 21.23, 0.14, 5.35),
    ))
]

T6_COLS = ["Empathy", "Anxiety", "Sadness", "Joy"]

# T6: Empathetic Captioning (Table 10) — LLM-judge rubric, 0–100
# Each tuple is (model name, *integer scores in T6_COLS order). Row i takes its
# link from MODELS[i], so the tuples must stay in the same order as the registry.
T6_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T6_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 95, 15, 12, 94),
        ("Gemini-2.0-Flash", 92, 13, 11, 90),
        ("Qwen-2.5-7B", 68, 25, 14, 66),
        ("LLaVA-v1.6", 70, 37, 36, 68),
        ("Phi-4", 83, 22, 25, 80),
        ("Gemma-3", 84, 23, 24, 82),
        ("CogVLM2-19B", 76, 44, 33, 73),
        ("Phi-3.5", 70, 28, 27, 68),
        ("Molmo-7V", 60, 47, 36, 58),
        ("Aya-Vision-8B", 72, 12, 19, 70),
        ("InternVL2.5", 72, 20, 24, 70),
        ("Janus-Pro-7B", 66, 32, 20, 64),
        ("GLM-4V-9B", 74, 42, 31, 70),
        ("Llama-3.2-11B", 78, 46, 25, 68),
        ("DeepSeek-VL2-Small", 68, 59, 39, 67),
    ))
]

T7_COLS = ["Clean Acc.", "Perturbated Acc.", "Retention (%)"]

# T7: Model Robustness under Perturbations (Table 11) — Retention = Perturbated / Clean × 100
# Each tuple is (model name, *scores in T7_COLS order); the retention figures
# are stored as given, not recomputed here. Row i takes its link from
# MODELS[i], so the tuples must stay in the same order as the registry.
T7_DATA = [
    {"model": name, "link": MODELS[idx]["link"], **dict(zip(T7_COLS, scores))}
    for idx, (name, *scores) in enumerate((
        ("GPT-4o", 65.85, 40.80, 61.96),
        ("Gemini-2.0-Flash", 60.40, 39.00, 64.57),
        ("Qwen-2.5-7B", 93.84, 70.01, 74.63),
        ("LLaVA-v1.6", 87.50, 67.36, 77.53),
        ("Phi-4", 72.05, 44.43, 61.67),
        ("Gemma-3", 73.10, 51.75, 70.82),
        ("CogVLM2-19B", 54.00, 34.50, 63.89),
        ("Phi-3.5", 67.25, 42.00, 62.45),
        ("Molmo-7V", 71.15, 45.50, 63.96),
        ("Aya-Vision-8B", 59.50, 32.20, 54.03),
        ("InternVL2.5", 59.80, 37.75, 63.12),
        ("Janus-Pro-7B", 55.60, 31.85, 57.31),
        ("GLM-4V-9B", 54.75, 29.85, 54.52),
        ("Llama-3.2-11B", 62.15, 40.25, 64.74),
        ("DeepSeek-VL2-Small", 55.90, 33.60, 60.11),
    ))
]


# ========================
# HEADER / INTRO HTML
# ========================

# Intro block for the page: tagline, resource badges and headline stats.
# It is an f-string so the URL constants are interpolated into the badge links.
# Every badge opens in a new tab, so each anchor carries
# rel="noopener noreferrer" to keep the opened page from reaching back into
# this one through window.opener.
INTRODUCTION_HTML = f"""
<div style="text-align: center; margin: 1.5rem auto; max-width: 1100px;">
    <p style="font-size: 1.15rem; color: #64748b; line-height: 1.6;">
        A <strong>human-centric evaluation framework</strong> for Large Multimodal Models (LMMs) across 7 tasks,
        7 HC principles, 5 social attributes, and 11 languages — built on 32,000+ expert-verified real-world
        image–question pairs.
    </p>
</div>

<div class="badges-container">
    <a href="{ARXIV_URL}" target="_blank" rel="noopener noreferrer">
        <img src="https://img.shields.io/badge/arXiv-2505.11454-b31b1b?logo=arxiv&logoColor=white" alt="arXiv">
    </a>
    <a href="{GITHUB_URL}" target="_blank" rel="noopener noreferrer">
        <img src="https://img.shields.io/badge/GitHub-humaniBench-181717?logo=github" alt="GitHub">
    </a>
    <a href="{DATASET_URL}" target="_blank" rel="noopener noreferrer">
        <img src="https://img.shields.io/badge/🤗_Dataset-HumaniBench-ffd21e" alt="Dataset">
    </a>
    <a href="{WEBSITE_URL}" target="_blank" rel="noopener noreferrer">
        <img src="https://img.shields.io/badge/Website-vectorinstitute.github.io-0ea5e9" alt="Website">
    </a>
</div>

<div class="stats-container">
    <div class="stat-box">
        <div class="stat-value">32K+</div>
        <div class="stat-label">Image–Question Pairs</div>
    </div>
    <div class="stat-box">
        <div class="stat-value">~1,500</div>
        <div class="stat-label">Unique Images</div>
    </div>
    <div class="stat-box">
        <div class="stat-value">7</div>
        <div class="stat-label">Evaluation Tasks</div>
    </div>
    <div class="stat-box">
        <div class="stat-value">15</div>
        <div class="stat-label">LMMs Evaluated</div>
    </div>
    <div class="stat-box">
        <div class="stat-value">11</div>
        <div class="stat-label">Languages</div>
    </div>
</div>
"""

# Markdown body of the "About" text: overview, task table, findings, citation,
# licence and contact details. It is an f-string so the URL constants are
# interpolated into the contact links; the literal braces of the BibTeX entry
# are therefore doubled ({{ }}) so they come out as single braces.
ABOUT_TEXT = f"""
## What is HumaniBench?

**HumaniBench** is a human-centric benchmark designed to evaluate Large Multimodal Models (LMMs) on tasks that
reflect real-world diversity and inclusion. It assesses 15 state-of-the-art LMMs across seven evaluation tasks
grounded in seven human-centric (HC) principles.

### Dataset Overview

- **32,000+ expert-verified** image–question pairs from real-world news imagery
- **~1,500 unique images** spanning diverse social contexts
- **7 evaluation tasks** (T1–T7) covering scene understanding, identity, reasoning, language, grounding, empathy, and robustness
- **7 HC principles**: Fairness, Ethics, Understanding, Reasoning, Language, Empathy, Robustness
- **5 social attributes**: Age, Gender, Race, Occupation, Sports
- **11 languages** for multilingual evaluation
- **15 LMMs** evaluated: 13 open-source + 2 proprietary

### Evaluation Tasks

| Task | Name | Description |
|:----:|:-----|:------------|
| **T1** | Scene Understanding | Classify scene-level social attributes from images |
| **T2** | Instance Identity | Identify fine-grained individual attributes |
| **T3** | Multiple-Choice VQA | Answer questions requiring reasoning about human subjects |
| **T4** | Multilingual VQA | Cross-lingual visual question answering (11 languages) |
| **T5** | Visual Grounding | Localize people with specified social attributes |
| **T6** | Empathetic Captioning | Generate empathetic, socially-aware image captions |
| **T7** | Image Resilience | Evaluate robustness to image perturbations |

### Human-Centric Principles

**Fairness** · **Ethics** · **Understanding** · **Reasoning** · **Language** · **Empathy** · **Robustness**

### Key Findings

- Closed-source models (GPT-4o, Gemini-2.0) consistently outperform open-source counterparts
- Persistent bias across gender and race attributes, especially in Tasks T1–T3
- Multilingual performance degrades significantly for low-resource languages
- Inference-time techniques (chain-of-thought, self-refinement) yield 8–12% improvements on several HC dimensions

### Citation

```bibtex
@article{{humanibench2025,
  title={{HumaniBench: A Human-Centric Framework for Large Multimodal Models Evaluation}},
  author={{...}},
  journal={{arXiv preprint arXiv:2505.11454}},
  year={{2025}}
}}
```

### License

This dataset is released under **CC BY-NC-SA 4.0**.

### Contact

- **Email:** [shaina.raza@vectorinstitute.ai](mailto:shaina.raza@vectorinstitute.ai)
- **Website:** [{WEBSITE_URL}]({WEBSITE_URL})
- **Dataset:** [HuggingFace]({DATASET_URL})
- **Code:** [GitHub]({GITHUB_URL})
- **Paper:** [arXiv]({ARXIV_URL})

---
*Built with ❤️ by the [Vector Institute](https://vectorinstitute.ai)*
"""


# ========================
# TABLE BUILDERS
# ========================

# Rank -> medal emoji for the top three places; other ranks have no entry.
MEDAL = {
    1: "🥇",
    2: "🥈",
    3: "🥉",
}

# Small grey hint telling the reader that the column headers are sortable.
SORT_NOTE = (
    '<p style="color:#888;font-size:0.82rem;margin:0.4rem 0 0 0.2rem;">'
    '↕ Click any column header to sort'
    '</p>'
)

# Same hint followed by the colour-band legend (green / amber / red).
COLOR_LEGEND = (
    '<p style="color:#888;font-size:0.82rem;margin:0.4rem 0 0 0.2rem;">'
    '↕ Click any column header to sort &nbsp;·&nbsp; '
    '<span style="color:#4ade80">■</span> ≥75% &nbsp;'
    '<span style="color:#fbbf24">■</span> 60–74% &nbsp;'
    '<span style="color:#f87171">■</span> &lt;60%'
    '</p>'
)


def _make_df(data: list, score_cols: list, pct: bool = True, sort_col: str = None, lower_is_better_cols: list = None) -> pd.DataFrame:
    key = sort_col or score_cols[0]
    lower_is_better_cols = lower_is_better_cols or []
    sorted_data = sorted(data, key=lambda x: x.get(key) or 0, reverse=True)
    rows = []
    for rank, item in enumerate(sorted_data, 1):
        row = {
            "#": MEDAL.get(rank, str(rank)),
            "Model": make_clickable_model(item["model"], item.get("link")),
        }
        for col in score_cols:
            val = item.get(col)
            row[col] = format_percentage(val, inverted=(col in lower_is_better_cols)) if pct else format_score(val)
        rows.append(row)
    return pd.DataFrame(rows)


def build_overall_leaderboard():
    """Render the overall HC-principle leaderboard in the current Gradio context.

    Each entry of ``MODELS`` is paired with the entry of ``PRINCIPLE_DATA`` at
    the same index, and the pairs are ranked by their ``"Overall"`` value,
    highest first. A table component is emitted, followed by the color legend.
    Returns nothing; the components are created as a side effect.
    """
    PRINCIPLE_COLS = ["Fairness", "Ethics", "Understanding", "Reasoning", "Language", "Empathy", "Robustness"]

    # `or 0` rather than a `.get` default: a record whose "Overall" is present
    # but None must still be sortable. This matches the guard in _make_df.
    paired = sorted(
        zip(MODELS, PRINCIPLE_DATA),
        key=lambda pair: pair[1].get("Overall") or 0,
        reverse=True,
    )

    rows = []
    for rank, (m, p) in enumerate(paired, 1):
        row = {
            "#":      MEDAL.get(rank, str(rank)),
            "Model":  make_clickable_model(m["model"], m["link"]),
            "Org":    m["org"],
            "Params": m["params"],
            "Type":   format_type_badge(m["type"]),
        }
        for col in PRINCIPLE_COLS:
            row[col] = format_percentage(p.get(col))
        row["Overall"] = format_overall(p.get("Overall"))
        rows.append(row)
    df = pd.DataFrame(rows)

    # Column types: #, Model, Org, Params, Type, then one "html" cell per
    # principle plus one for Overall. The count is derived from
    # PRINCIPLE_COLS so it stays in step if the list changes.
    datatype = ["str", "html", "str", "str", "html"] + ["html"] * (len(PRINCIPLE_COLS) + 1)
    gr.Dataframe(
        value=df,
        datatype=datatype,
        wrap=True,
        interactive=False,
        elem_classes="humani-leaderboard-table",
    )
    gr.HTML(COLOR_LEGEND)


def build_task_leaderboard(task_data: list, score_cols: list, pct: bool = True, sort_col: str = None, lower_is_better_cols: list = None):
    """Render one per-task leaderboard table plus its footnote.

    The table is built by ``_make_df`` from ``task_data``. When ``pct`` is
    true the score columns are typed ``"html"`` and the color legend is shown
    below the table; otherwise they are typed ``"str"`` and only the sort
    hint is shown. Components are created in the current Gradio context.
    """
    table = _make_df(
        task_data,
        score_cols,
        pct=pct,
        sort_col=sort_col,
        lower_is_better_cols=lower_is_better_cols,
    )

    # Rank column is plain text, model column is a link; scores follow.
    score_type = "html" if pct else "str"
    column_types = ["str", "html", *([score_type] * len(score_cols))]

    gr.Dataframe(
        value=table,
        datatype=column_types,
        wrap=True,
        interactive=False,
        elem_classes="humani-leaderboard-table",
    )

    if pct:
        footnote = COLOR_LEGEND
    else:
        footnote = SORT_NOTE
    gr.HTML(footnote)


def build_vqa_leaderboard(task_data: list):
    """Render a VQA-style task table with the six shared metric columns.

    "Bias" and "Hallucination" are flagged as lower-is-better columns.
    """
    metric_cols = [
        "Accuracy",
        "Bias",
        "Hallucination",
        "Faithfulness",
        "Context Rel.",
        "Coherence",
    ]
    lower_is_better = ["Bias", "Hallucination"]
    build_task_leaderboard(task_data, metric_cols, pct=True, lower_is_better_cols=lower_is_better)


def build_multilingual_leaderboard():
    """Render the T4 table: one column per language plus "Avg", ranked by "Avg"."""
    average_col = "Avg"
    language_cols = LANGUAGES + [average_col]
    build_task_leaderboard(T4_DATA, language_cols, pct=True, sort_col=average_col)


# ========================
# GRADIO APP
# ========================

# Root Blocks container for the whole page; the page title and the project's
# custom CSS / JS are attached here.
demo = gr.Blocks(title=TITLE, css=custom_css, js=custom_js)

# Everything created inside this context is added to `demo` in statement
# order: header, introduction, teaser figure, tab set, footer.
with demo:
    # Page header: Vector Institute logo (linked), title + subtitle, and the
    # HumaniBench logo. Logos are requested through /gradio_api/file=<path>;
    # the onerror handler hides an image that fails to load.
    gr.HTML(f"""
    <div id="page-header">
        <div id="header-container">
            <div id="left-container">
                <a href="https://vectorinstitute.ai" target="_blank" rel="noopener noreferrer">
                    <img id="vector-logo" src="/gradio_api/file={vector_logo_path}"
                         alt="Vector Institute" onerror="this.style.display='none'">
                </a>
            </div>
            <div id="centre-container">
                <h1>HumaniBench Leaderboard</h1>
                <p>A Human-Centric Evaluation Framework for Large Multimodal Models</p>
            </div>
            <div id="right-container">
                <img id="humanibench-logo" src="/gradio_api/file={humanibench_logo_path}"
                     alt="HumaniBench" onerror="this.style.display='none'">
            </div>
        </div>
    </div>
    """)

    # Introductory HTML; INTRODUCTION_HTML is defined outside this section.
    gr.HTML(INTRODUCTION_HTML)

    # Teaser figure with caption, also loaded from src/assets.
    gr.HTML("""
    <div style="text-align: center; margin: 1.5rem auto; max-width: 960px;">
        <img src="/gradio_api/file=src/assets/teaser_figure_humanibench.png"
             style="width: 100%; border-radius: 8px; box-shadow: 0 2px 12px rgba(0,0,0,0.12);"
             alt="HumaniBench teaser figure">
        <p style="color:#777; font-size:0.9rem; margin-top:0.65rem; font-style:italic; text-align:center;">
            HumaniBench evaluates 15 LMMs across 7 human-centric tasks using 32K+ expert-verified real-world image–question pairs spanning 5 social attributes and 11 languages.
        </p>
    </div>
    """)

    # Top-level tab set: Overall Rankings / Task Results / About.
    with gr.Tabs():

        # ── Tab 1: Overall Rankings ──────────────────────────────────────────
        with gr.Tab("Overall Rankings"):
            gr.Markdown("""
                <div class="info-box">
                <h3>HC Principle Scores</h3>
                Aggregate accuracy (%) per Human-Centric principle across all relevant tasks.
                Higher is better. Click model names to visit their official pages.
                </div>
            """, elem_classes="markdown-text")
            # Emits the table and color legend directly into this tab.
            build_overall_leaderboard()
            gr.Markdown("*Overall = mean of all 7 principle scores. -- indicates data not yet available.*")

        # ── Tab 2: Task Results ──────────────────────────────────────────────
        with gr.Tab("Task Results"):
            gr.Markdown("""
                <div class="info-box">
                <h3>Per-Task Breakdown (T1–T7)</h3>
                Detailed metrics for each of the seven HumaniBench evaluation tasks.
                </div>
            """, elem_classes="markdown-text")

            # Nested tab set, one tab per task. Each tab is a metric caption
            # followed by a builder call that renders the table in place.
            with gr.Tabs():
                # T1–T3 share the same six-metric layout via build_vqa_leaderboard.
                with gr.Tab("T1 · Scene Understanding"):
                    gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
                    build_vqa_leaderboard(T1_DATA)

                with gr.Tab("T2 · Instance Identity"):
                    gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
                    build_vqa_leaderboard(T2_DATA)

                with gr.Tab("T3 · MC-VQA"):
                    gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
                    build_vqa_leaderboard(T3_DATA)

                with gr.Tab("T4 · Multilingual"):
                    gr.Markdown("**Metric:** Accuracy (%) across 11 languages · Avg = macro-average")
                    build_multilingual_leaderboard()

                # T5 and T6 pass pct=False: cells go through format_score, the
                # score columns are typed "str", and only the sort hint is shown.
                with gr.Tab("T5 · Visual Grounding"):
                    gr.Markdown("**Metrics:** `mAP@0.5` (%) · `mAP@0.75` (%) · Mean IoU (0–1) · Missing Pred. (%) ↓")
                    build_task_leaderboard(T5_DATA, T5_COLS, pct=False)

                with gr.Tab("T6 · Empathetic Captioning"):
                    gr.Markdown("**Metrics:** Empathy · Anxiety · Sadness · Joy (LLM-judge rubric, 0–100)")
                    build_task_leaderboard(T6_DATA, T6_COLS, pct=False)

                # T7 uses the percentage rendering with the color legend; no
                # sort_col is given, so rows rank by the first entry of T7_COLS.
                with gr.Tab("T7 · Image Resilience"):
                    gr.Markdown("**Metrics:** Clean Acc. (%) · Perturbated Acc. (%) · Retention (%) = Perturbated / Clean × 100")
                    build_task_leaderboard(T7_DATA, T7_COLS, pct=True)

        # ── Tab 3: About ─────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    # Footer with the project links taken from the configuration constants.
    gr.HTML(f"""
        <div id="footer">
            <p><strong>Built with ❤️ by the <a href="https://vectorinstitute.ai" target="_blank">Vector Institute</a></strong></p>
            <p>
                <a href="{DATASET_URL}" target="_blank">Dataset</a> ·
                <a href="{GITHUB_URL}" target="_blank">GitHub</a> ·
                <a href="{ARXIV_URL}" target="_blank">Paper</a> ·
                <a href="{WEBSITE_URL}" target="_blank">Website</a>
            </p>
        </div>
    """)


if __name__ == "__main__":
    # The page's <img> tags request files under src/assets through
    # /gradio_api/file=..., so that directory is passed as an allowed path.
    launch_options = {"allowed_paths": ["src/assets"]}
    demo.launch(**launch_options)