import gradio as gr import pandas as pd from src.display.css_html_js import custom_css, custom_js from src.display.formatting import make_clickable_model, format_score, format_percentage, format_overall, format_type_badge # ======================== # CONFIGURATION # ======================== TITLE = "HumaniBench Leaderboard" ARXIV_URL = "https://arxiv.org/abs/2505.11454" GITHUB_URL = "https://github.com/VectorInstitute/humaniBench" DATASET_URL = "https://huggingface.co/datasets/vector-institute/HumaniBench" WEBSITE_URL = "https://vectorinstitute.github.io/humanibench/" vector_logo_path = "src/assets/vector-favicon-48x48.svg" humanibench_logo_path = "src/assets/HumaniBenchLogo.ico" # ======================== # MODEL REGISTRY (Table A2 order) # ======================== MODELS = [ {"model": "GPT-4o", "link": "https://openai.com/gpt-4o", "org": "OpenAI", "params": "-", "type": "Closed"}, {"model": "Gemini-2.0-Flash", "link": "https://deepmind.google/technologies/gemini/", "org": "Google", "params": "-", "type": "Closed"}, {"model": "Qwen-2.5-7B", "link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "org": "Alibaba", "params": "7B", "type": "Open"}, {"model": "LLaVA-v1.6", "link": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf", "org": "LLaVA", "params": "7B", "type": "Open"}, {"model": "Phi-4", "link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "org": "Microsoft", "params": "5.6B", "type": "Open"}, {"model": "Gemma-3", "link": "https://huggingface.co/google/gemma-3-4b-it", "org": "Google", "params": "4B", "type": "Open"}, {"model": "CogVLM2-19B", "link": "https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B", "org": "THUDM", "params": "19B", "type": "Open"}, {"model": "Phi-3.5", "link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "org": "Microsoft", "params": "4B", "type": "Open"}, {"model": "Molmo-7V", "link": "https://huggingface.co/allenai/Molmo-7B-O-0924", "org": "Allen AI", "params": "7B", "type": "Open"}, {"model": "Aya-Vision-8B", "link": "https://huggingface.co/CohereForAI/aya-vision-8b", "org": "Cohere", "params": "8B", "type": "Open"}, {"model": "InternVL2.5", "link": "https://huggingface.co/OpenGVLab/InternVL2_5-8B", "org": "OpenGVLab", "params": "8B", "type": "Open"}, {"model": "Janus-Pro-7B", "link": "https://huggingface.co/deepseek-ai/Janus-Pro-7B", "org": "DeepSeek", "params": "7B", "type": "Open"}, {"model": "GLM-4V-9B", "link": "https://huggingface.co/THUDM/glm-4v-9b", "org": "THUDM", "params": "9B", "type": "Open"}, {"model": "Llama-3.2-11B", "link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "org": "Meta", "params": "11B", "type": "Open"}, {"model": "DeepSeek-VL2-Small", "link": "https://huggingface.co/deepseek-ai/deepseek-vl2-small", "org": "DeepSeek", "params": "3B", "type": "Open"}, ] # ======================== # PRINCIPLE DATA (Table A2) # Scores are percentages; Overall = mean of all 7 principles # ======================== PRINCIPLE_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Fairness": 61.1, "Ethics": 99.0, "Understanding": 74.8, "Reasoning": 79.2, "Language": 62.5, "Empathy": 90.5, "Robustness": 50.90, "Overall": 74.00}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Fairness": 61.0, "Ethics": 98.9, "Understanding": 73.5, "Reasoning": 78.8, "Language": 62.2, "Empathy": 89.5, "Robustness": 57.20, "Overall": 74.44}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Fairness": 63.1, "Ethics": 96.5, "Understanding": 84.9, "Reasoning": 67.1, "Language": 57.4, "Empathy": 73.8, "Robustness": 53.60, "Overall": 70.91}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Fairness": 59.7, "Ethics": 94.4, "Understanding": 80.3, "Reasoning": 68.1, "Language": 55.4, "Empathy": 66.3, "Robustness": 60.60, "Overall": 69.26}, {"model": "Phi-4", "link": MODELS[4]["link"], "Fairness": 59.2, "Ethics": 98.2, "Understanding": 78.6, "Reasoning": 77.4, "Language": 61.3, "Empathy": 79.0, "Robustness": 45.70, "Overall": 71.34}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Fairness": 57.5, "Ethics": 94.6, "Understanding": 73.2, "Reasoning": 67.8, "Language": 57.7, "Empathy": 79.8, "Robustness": 58.30, "Overall": 69.84}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Fairness": 53.1, "Ethics": 96.3, "Understanding": 67.5, "Reasoning": 74.4, "Language": 60.4, "Empathy": 68.0, "Robustness": 35.12, "Overall": 64.97}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Fairness": 56.0, "Ethics": 96.1, "Understanding": 72.3, "Reasoning": 69.7, "Language": 57.3, "Empathy": 70.8, "Robustness": 50.50, "Overall": 67.53}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Fairness": 52.4, "Ethics": 94.8, "Understanding": 66.2, "Reasoning": 65.8, "Language": 55.0, "Empathy": 58.8, "Robustness": 49.70, "Overall": 63.24}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Fairness": 51.7, "Ethics": 94.9, "Understanding": 64.4, "Reasoning": 68.1, "Language": 50.8, "Empathy": 77.8, "Robustness": 45.90, "Overall": 64.80}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Fairness": 50.9, "Ethics": 93.8, "Understanding": 63.8, "Reasoning": 64.4, "Language": 51.1, "Empathy": 74.5, "Robustness": 56.40, "Overall": 64.99}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Fairness": 50.2, "Ethics": 96.9, "Understanding": 63.3, "Reasoning": 65.2, "Language": 57.6, "Empathy": 69.5, "Robustness": 52.80, "Overall": 65.07}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Fairness": 50.2, "Ethics": 94.4, "Understanding": 63.9, "Reasoning": 63.0, "Language": 50.0, "Empathy": 67.8, "Robustness": 50.50, "Overall": 62.83}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Fairness": 50.2, "Ethics": 94.9, "Understanding": 58.9, "Reasoning": 63.0, "Language": 50.7, "Empathy": 71.3, "Robustness": 56.70, "Overall": 63.67}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Fairness": 48.8, "Ethics": 90.6, "Understanding": 54.8, "Reasoning": 61.6, "Language": 49.1, "Empathy": 59.3, "Robustness": 55.70, "Overall": 59.99}, ] # ======================== # TASK DATA (Tables 4–10) # T1–T7 per-model accuracy / scores # ======================== def _task_rows(extra_keys: list) -> list: """Generate per-model rows with None scores for the given extra columns.""" return [ {"model": m["model"], "link": m["link"], **{k: None for k in extra_keys}} for m in MODELS ] T1_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"] # T1: Scene Understanding (Open-Ended VQA) T1_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 74.80, "Bias": 0.90, "Hallucination": 2.10, "Faithfulness": 76.50, "Context Rel.": 75.20, "Coherence": 75.80}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 73.20, "Bias": 1.10, "Hallucination": 1.70, "Faithfulness": 75.90, "Context Rel.": 74.30, "Coherence": 74.80}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 67.37, "Bias": 9.33, "Hallucination": 9.38, "Faithfulness": 67.92, "Context Rel.": 66.28, "Coherence": 66.40}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 64.34, "Bias": 9.03, "Hallucination": 9.12, "Faithfulness": 65.33, "Context Rel.": 68.10, "Coherence": 66.90}, {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 68.10, "Bias": 1.23, "Hallucination": 3.12, "Faithfulness": 72.38, "Context Rel.": 73.47, "Coherence": 73.20}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 66.50, "Bias": 8.50, "Hallucination": 8.20, "Faithfulness": 70.10, "Context Rel.": 68.30, "Coherence": 69.00}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 67.34, "Bias": 11.38, "Hallucination": 10.45, "Faithfulness": 69.01, "Context Rel.": 71.29, "Coherence": 69.80}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 67.19, "Bias": 2.40, "Hallucination": 5.21, "Faithfulness": 67.45, "Context Rel.": 65.28, "Coherence": 65.90}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 67.12, "Bias": 1.87, "Hallucination": 4.35, "Faithfulness": 64.78, "Context Rel.": 62.01, "Coherence": 62.60}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 62.19, "Bias": 8.12, "Hallucination": 8.46, "Faithfulness": 68.84, "Context Rel.": 68.22, "Coherence": 68.00}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 61.10, "Bias": 10.70, "Hallucination": 10.73, "Faithfulness": 65.71, "Context Rel.": 64.18, "Coherence": 64.20}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 62.10, "Bias": 1.35, "Hallucination": 3.21, "Faithfulness": 69.26, "Context Rel.": 67.09, "Coherence": 67.50}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 60.18, "Bias": 8.63, "Hallucination": 8.34, "Faithfulness": 69.98, "Context Rel.": 65.10, "Coherence": 65.40}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 63.40, "Bias": 19.30, "Hallucination": 15.67, "Faithfulness": 62.09, "Context Rel.": 66.01, "Coherence": 64.30}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 59.10, "Bias": 12.56, "Hallucination": 11.29, "Faithfulness": 62.14, "Context Rel.": 63.10, "Coherence": 63.00}, ] T2_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"] # T2: Instance Identity (Open-Ended VQA) T2_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 68.10, "Bias": 1.50, "Hallucination": 3.00, "Faithfulness": 85.00, "Context Rel.": 85.00, "Coherence": 85.00}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 66.50, "Bias": 2.00, "Hallucination": 4.00, "Faithfulness": 83.00, "Context Rel.": 82.00, "Coherence": 82.00}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 62.37, "Bias": 10.21, "Hallucination": 6.27, "Faithfulness": 67.92, "Context Rel.": 68.65, "Coherence": 66.94}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 59.34, "Bias": 9.82, "Hallucination": 10.01, "Faithfulness": 65.33, "Context Rel.": 66.10, "Coherence": 65.02}, {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 63.10, "Bias": 2.07, "Hallucination": 4.08, "Faithfulness": 81.67, "Context Rel.": 82.21, "Coherence": 81.76}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 61.94, "Bias": 15.19, "Hallucination": 5.00, "Faithfulness": 78.96, "Context Rel.": 75.00, "Coherence": 76.00}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 62.34, "Bias": 12.31, "Hallucination": 6.53, "Faithfulness": 74.01, "Context Rel.": 70.14, "Coherence": 72.45}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 62.19, "Bias": 3.39, "Hallucination": 6.19, "Faithfulness": 67.45, "Context Rel.": 68.34, "Coherence": 67.80}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 57.19, "Bias": 9.02, "Hallucination": 9.39, "Faithfulness": 68.84, "Context Rel.": 67.74, "Coherence": 66.89}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 62.12, "Bias": 2.83, "Hallucination": 5.44, "Faithfulness": 64.78, "Context Rel.": 67.33, "Coherence": 65.41}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 56.10, "Bias": 11.74, "Hallucination": 11.69, "Faithfulness": 65.71, "Context Rel.": 64.49, "Coherence": 62.92}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 57.10, "Bias": 2.16, "Hallucination": 4.24, "Faithfulness": 69.26, "Context Rel.": 71.82, "Coherence": 71.09}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 55.18, "Bias": 9.59, "Hallucination": 9.18, "Faithfulness": 69.98, "Context Rel.": 65.73, "Coherence": 64.30}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 54.10, "Bias": 13.48, "Hallucination": 12.41, "Faithfulness": 64.05, "Context Rel.": 63.12, "Coherence": 61.37}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 58.40, "Bias": 20.42, "Hallucination": 16.72, "Faithfulness": 62.09, "Context Rel.": 60.04, "Coherence": 59.11}, ] T3_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"] # T3: Multiple-Choice VQA T3_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 68.10, "Bias": 0.95, "Hallucination": 1.20, "Faithfulness": 82.30, "Context Rel.": 80.45, "Coherence": 73.90}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 70.40, "Bias": 0.85, "Hallucination": 0.95, "Faithfulness": 81.60, "Context Rel.": 82.10, "Coherence": 74.60}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 52.93, "Bias": 6.30, "Hallucination": 6.35, "Faithfulness": 69.22, "Context Rel.": 67.54, "Coherence": 66.63}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 50.89, "Bias": 7.68, "Hallucination": 7.22, "Faithfulness": 64.77, "Context Rel.": 63.06, "Coherence": 62.25}, {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 60.80, "Bias": 2.01, "Hallucination": 3.00, "Faithfulness": 76.55, "Context Rel.": 74.77, "Coherence": 73.86}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 54.22, "Bias": 5.43, "Hallucination": 5.80, "Faithfulness": 71.14, "Context Rel.": 69.37, "Coherence": 68.46}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 61.10, "Bias": 1.95, "Hallucination": 2.90, "Faithfulness": 77.20, "Context Rel.": 75.40, "Coherence": 74.50}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 53.18, "Bias": 6.13, "Hallucination": 6.24, "Faithfulness": 69.98, "Context Rel.": 68.16, "Coherence": 67.26}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 51.47, "Bias": 7.29, "Hallucination": 6.97, "Faithfulness": 66.02, "Context Rel.": 64.38, "Coherence": 63.56}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 51.64, "Bias": 7.17, "Hallucination": 6.90, "Faithfulness": 67.33, "Context Rel.": 65.69, "Coherence": 64.74}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 49.05, "Bias": 8.92, "Hallucination": 8.00, "Faithfulness": 61.01, "Context Rel.": 59.37, "Coherence": 58.53}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 55.51, "Bias": 4.56, "Hallucination": 5.25, "Faithfulness": 72.33, "Context Rel.": 70.47, "Coherence": 69.53}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 50.76, "Bias": 7.76, "Hallucination": 7.27, "Faithfulness": 63.26, "Context Rel.": 61.55, "Coherence": 60.73}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 45.67, "Bias": 18.28, "Hallucination": 12.98, "Faithfulness": 52.02, "Context Rel.": 55.29, "Coherence": 54.39}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 45.35, "Bias": 14.13, "Hallucination": 12.55, "Faithfulness": 54.21, "Context Rel.": 56.46, "Coherence": 54.52}, ] LANGUAGES = ["English", "French", "Spanish", "Portuguese", "Mandarin", "Korean", "Urdu", "Persian", "Bengali", "Punjabi", "Tamil"] # T4: Multilingual VQA — Accuracy (%) per language T4_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "English": 64.6, "French": 64.0, "Spanish": 63.4, "Portuguese": 62.8, "Mandarin": 62.3, "Korean": 61.8, "Urdu": 60.1, "Persian": 59.7, "Bengali": 59.1, "Punjabi": 58.6, "Tamil": 58.1, "Avg": 61.32}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "English": 64.4, "French": 63.8, "Spanish": 63.2, "Portuguese": 62.6, "Mandarin": 62.1, "Korean": 61.7, "Urdu": 60.0, "Persian": 59.5, "Bengali": 58.9, "Punjabi": 58.4, "Tamil": 58.0, "Avg": 61.15}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "English": 59.2, "French": 58.6, "Spanish": 57.9, "Portuguese": 57.5, "Mandarin": 57.0, "Korean": 56.6, "Urdu": 55.1, "Persian": 54.6, "Bengali": 53.9, "Punjabi": 53.5, "Tamil": 53.1, "Avg": 56.09}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "English": 56.8, "French": 56.4, "Spanish": 55.6, "Portuguese": 55.1, "Mandarin": 54.6, "Korean": 54.1, "Urdu": 52.8, "Persian": 52.4, "Bengali": 51.8, "Punjabi": 51.4, "Tamil": 51.0, "Avg": 53.82}, {"model": "Phi-4", "link": MODELS[4]["link"], "English": 63.3, "French": 62.8, "Spanish": 62.1, "Portuguese": 61.6, "Mandarin": 61.1, "Korean": 60.6, "Urdu": 58.9, "Persian": 58.5, "Bengali": 57.8, "Punjabi": 57.3, "Tamil": 56.9, "Avg": 60.08}, {"model": "Gemma-3", "link": MODELS[5]["link"], "English": 59.5, "French": 59.0, "Spanish": 58.2, "Portuguese": 57.7, "Mandarin": 57.3, "Korean": 56.9, "Urdu": 55.3, "Persian": 54.9, "Bengali": 54.3, "Punjabi": 53.8, "Tamil": 53.3, "Avg": 56.38}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "English": 61.6, "French": 61.3, "Spanish": 60.9, "Portuguese": 61.4, "Mandarin": 60.9, "Korean": 60.4, "Urdu": 58.7, "Persian": 58.3, "Bengali": 57.6, "Punjabi": 57.1, "Tamil": 56.6, "Avg": 59.53}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "English": 59.1, "French": 58.6, "Spanish": 58.0, "Portuguese": 57.5, "Mandarin": 57.0, "Korean": 56.6, "Urdu": 55.1, "Persian": 54.6, "Bengali": 53.9, "Punjabi": 53.5, "Tamil": 53.1, "Avg": 56.09}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "English": 56.1, "French": 55.6, "Spanish": 54.9, "Portuguese": 54.5, "Mandarin": 54.2, "Korean": 53.8, "Urdu": 52.5, "Persian": 52.1, "Bengali": 51.5, "Punjabi": 51.1, "Tamil": 50.7, "Avg": 53.36}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "English": 55.8, "French": 55.0, "Spanish": 54.2, "Portuguese": 53.2, "Mandarin": 52.3, "Korean": 51.7, "Urdu": 51.3, "Persian": 51.7, "Bengali": 51.9, "Punjabi": 49.9, "Tamil": 49.1, "Avg": 52.37}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "English": 53.9, "French": 53.1, "Spanish": 52.4, "Portuguese": 51.1, "Mandarin": 50.5, "Korean": 49.7, "Urdu": 49.3, "Persian": 49.9, "Bengali": 50.1, "Punjabi": 47.9, "Tamil": 47.3, "Avg": 50.47}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "English": 58.5, "French": 58.1, "Spanish": 57.5, "Portuguese": 57.0, "Mandarin": 56.5, "Korean": 55.8, "Urdu": 54.5, "Persian": 54.1, "Bengali": 53.5, "Punjabi": 53.0, "Tamil": 52.6, "Avg": 55.55}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "English": 53.3, "French": 52.7, "Spanish": 51.8, "Portuguese": 50.8, "Mandarin": 50.1, "Korean": 49.4, "Urdu": 49.0, "Persian": 49.5, "Bengali": 49.7, "Punjabi": 47.6, "Tamil": 47.2, "Avg": 50.10}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "English": 51.9, "French": 51.5, "Spanish": 50.7, "Portuguese": 50.3, "Mandarin": 49.9, "Korean": 49.4, "Urdu": 48.0, "Persian": 47.6, "Bengali": 47.0, "Punjabi": 46.5, "Tamil": 46.1, "Avg": 49.00}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "English": 52.8, "French": 52.2, "Spanish": 51.3, "Portuguese": 50.3, "Mandarin": 49.5, "Korean": 48.9, "Urdu": 48.5, "Persian": 48.9, "Bengali": 49.1, "Punjabi": 47.0, "Tamil": 46.6, "Avg": 49.55}, ] T5_COLS = ["mAP@0.5", "mAP@0.75", "Mean IoU", "Missing (%)"] # T5: Visual Grounding (Table 9) — mAP values are %; Mean IoU is 0–1; Missing (%) = images with no predicted box T5_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "mAP@0.5": 63.46, "mAP@0.75": 40.32, "Mean IoU": 0.34, "Missing (%)": 72.73}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "mAP@0.5": 56.51, "mAP@0.75": 52.15, "Mean IoU": 0.23, "Missing (%)": 0.00}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "mAP@0.5": 98.43, "mAP@0.75": 94.16, "Mean IoU": 0.90, "Missing (%)": 0.00}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "mAP@0.5": 96.49, "mAP@0.75": 82.44, "Mean IoU": 0.78, "Missing (%)": 0.00}, {"model": "Phi-4", "link": MODELS[4]["link"], "mAP@0.5": 72.11, "mAP@0.75": 46.18, "Mean IoU": 0.47, "Missing (%)": 0.00}, {"model": "Gemma-3", "link": MODELS[5]["link"], "mAP@0.5": 56.34, "mAP@0.75": 54.23, "Mean IoU": 0.49, "Missing (%)": 16.34}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "mAP@0.5": 50.88, "mAP@0.75": 50.42, "Mean IoU": 0.10, "Missing (%)": 0.00}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "mAP@0.5": 63.45, "mAP@0.75": 58.35, "Mean IoU": 0.37, "Missing (%)": 0.00}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "mAP@0.5": 43.32, "mAP@0.75": 34.34, "Mean IoU": 0.45, "Missing (%)": 0.00}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "mAP@0.5": 54.15, "mAP@0.75": 41.26, "Mean IoU": 0.07, "Missing (%)": 0.00}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "mAP@0.5": 56.39, "mAP@0.75": 36.52, "Mean IoU": 0.22, "Missing (%)": 6.67}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "mAP@0.5": 50.18, "mAP@0.75": 10.04, "Mean IoU": 0.14, "Missing (%)": 2.80}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "mAP@0.5": 52.20, "mAP@0.75": 35.55, "Mean IoU": 0.12, "Missing (%)": 4.21}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "mAP@0.5": 38.34, "mAP@0.75": 35.53, "Mean IoU": 0.25, "Missing (%)": 32.24}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "mAP@0.5": 25.34, "mAP@0.75": 21.23, "Mean IoU": 0.14, "Missing (%)": 5.35}, ] T6_COLS = ["Empathy", "Anxiety", "Sadness", "Joy"] # T6: Empathetic Captioning (Table 10) — LLM-judge rubric, 0–100 T6_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Empathy": 95, "Anxiety": 15, "Sadness": 12, "Joy": 94}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Empathy": 92, "Anxiety": 13, "Sadness": 11, "Joy": 90}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Empathy": 68, "Anxiety": 25, "Sadness": 14, "Joy": 66}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Empathy": 70, "Anxiety": 37, "Sadness": 36, "Joy": 68}, {"model": "Phi-4", "link": MODELS[4]["link"], "Empathy": 83, "Anxiety": 22, "Sadness": 25, "Joy": 80}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Empathy": 84, "Anxiety": 23, "Sadness": 24, "Joy": 82}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Empathy": 76, "Anxiety": 44, "Sadness": 33, "Joy": 73}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Empathy": 70, "Anxiety": 28, "Sadness": 27, "Joy": 68}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Empathy": 60, "Anxiety": 47, "Sadness": 36, "Joy": 58}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Empathy": 72, "Anxiety": 12, "Sadness": 19, "Joy": 70}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Empathy": 72, "Anxiety": 20, "Sadness": 24, "Joy": 70}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Empathy": 66, "Anxiety": 32, "Sadness": 20, "Joy": 64}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Empathy": 74, "Anxiety": 42, "Sadness": 31, "Joy": 70}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Empathy": 78, "Anxiety": 46, "Sadness": 25, "Joy": 68}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Empathy": 68, "Anxiety": 59, "Sadness": 39, "Joy": 67}, ] T7_COLS = ["Clean Acc.", "Perturbated Acc.", "Retention (%)"] # T7: Model Robustness under Perturbations (Table 11) — Retention = Perturbated / Clean × 100 T7_DATA = [ {"model": "GPT-4o", "link": MODELS[0]["link"], "Clean Acc.": 65.85, "Perturbated Acc.": 40.80, "Retention (%)": 61.96}, {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Clean Acc.": 60.40, "Perturbated Acc.": 39.00, "Retention (%)": 64.57}, {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Clean Acc.": 93.84, "Perturbated Acc.": 70.01, "Retention (%)": 74.63}, {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Clean Acc.": 87.50, "Perturbated Acc.": 67.36, "Retention (%)": 77.53}, {"model": "Phi-4", "link": MODELS[4]["link"], "Clean Acc.": 72.05, "Perturbated Acc.": 44.43, "Retention (%)": 61.67}, {"model": "Gemma-3", "link": MODELS[5]["link"], "Clean Acc.": 73.10, "Perturbated Acc.": 51.75, "Retention (%)": 70.82}, {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Clean Acc.": 54.00, "Perturbated Acc.": 34.50, "Retention (%)": 63.89}, {"model": "Phi-3.5", "link": MODELS[7]["link"], "Clean Acc.": 67.25, "Perturbated Acc.": 42.00, "Retention (%)": 62.45}, {"model": "Molmo-7V", "link": MODELS[8]["link"], "Clean Acc.": 71.15, "Perturbated Acc.": 45.50, "Retention (%)": 63.96}, {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Clean Acc.": 59.50, "Perturbated Acc.": 32.20, "Retention (%)": 54.03}, {"model": "InternVL2.5", "link": MODELS[10]["link"], "Clean Acc.": 59.80, "Perturbated Acc.": 37.75, "Retention (%)": 63.12}, {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Clean Acc.": 55.60, "Perturbated Acc.": 31.85, "Retention (%)": 57.31}, {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Clean Acc.": 54.75, "Perturbated Acc.": 29.85, "Retention (%)": 54.52}, {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Clean Acc.": 62.15, "Perturbated Acc.": 40.25, "Retention (%)": 64.74}, {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Clean Acc.": 55.90, "Perturbated Acc.": 33.60, "Retention (%)": 60.11}, ] # ======================== # HEADER / INTRO HTML # ======================== INTRODUCTION_HTML = f"""
A human-centric evaluation framework for Large Multimodal Models (LMMs) across 7 tasks, 7 HC principles, 5 social attributes, and 11 languages — built on 32,000+ expert-verified real-world image–question pairs.
↕ Click any column header to sort
' COLOR_LEGEND = '↕ Click any column header to sort · ■ ≥75% ■ 60–74% ■ <60%
' def _make_df(data: list, score_cols: list, pct: bool = True, sort_col: str = None, lower_is_better_cols: list = None) -> pd.DataFrame: key = sort_col or score_cols[0] lower_is_better_cols = lower_is_better_cols or [] sorted_data = sorted(data, key=lambda x: x.get(key) or 0, reverse=True) rows = [] for rank, item in enumerate(sorted_data, 1): row = { "#": MEDAL.get(rank, str(rank)), "Model": make_clickable_model(item["model"], item.get("link")), } for col in score_cols: val = item.get(col) row[col] = format_percentage(val, inverted=(col in lower_is_better_cols)) if pct else format_score(val) rows.append(row) return pd.DataFrame(rows) def build_overall_leaderboard(): PRINCIPLE_COLS = ["Fairness", "Ethics", "Understanding", "Reasoning", "Language", "Empathy", "Robustness"] paired = sorted(zip(MODELS, PRINCIPLE_DATA), key=lambda x: x[1].get("Overall", 0), reverse=True) rows = [] for rank, (m, p) in enumerate(paired, 1): row = { "#": MEDAL.get(rank, str(rank)), "Model": make_clickable_model(m["model"], m["link"]), "Org": m["org"], "Params": m["params"], "Type": format_type_badge(m["type"]), } for col in PRINCIPLE_COLS: row[col] = format_percentage(p.get(col)) row["Overall"] = format_overall(p.get("Overall")) rows.append(row) df = pd.DataFrame(rows) # #, Model, Org, Params, Type, 7 principles, Overall datatype = ["str", "html", "str", "str", "html"] + ["html"] * 8 gr.Dataframe( value=df, datatype=datatype, wrap=True, interactive=False, elem_classes="humani-leaderboard-table", ) gr.HTML(COLOR_LEGEND) def build_task_leaderboard(task_data: list, score_cols: list, pct: bool = True, sort_col: str = None, lower_is_better_cols: list = None): df = _make_df(task_data, score_cols, pct=pct, sort_col=sort_col, lower_is_better_cols=lower_is_better_cols) datatype = ["str", "html"] + (["html"] * len(score_cols) if pct else ["str"] * len(score_cols)) gr.Dataframe( value=df, datatype=datatype, wrap=True, interactive=False, elem_classes="humani-leaderboard-table", ) gr.HTML(COLOR_LEGEND if pct else SORT_NOTE) def build_vqa_leaderboard(task_data: list): cols = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"] build_task_leaderboard(task_data, cols, pct=True, lower_is_better_cols=["Bias", "Hallucination"]) def build_multilingual_leaderboard(): LANG_COLS = LANGUAGES + ["Avg"] build_task_leaderboard(T4_DATA, LANG_COLS, pct=True, sort_col="Avg") # ======================== # GRADIO APP # ======================== demo = gr.Blocks(title=TITLE, css=custom_css, js=custom_js) with demo: gr.HTML(f""" """) gr.HTML(INTRODUCTION_HTML) gr.HTML("""
HumaniBench evaluates 15 LMMs across 7 human-centric tasks using 32K+ expert-verified real-world image–question pairs spanning 5 social attributes and 11 languages.