Kevin Xie committed on
Commit ·
3070e58
1
Parent(s): 9aca690
Upload main processing scripts for the leaderboard
Browse files- scripts/helpers/CONSTANTS.py +124 -0
- scripts/helpers/__init__.py +3 -0
- scripts/helpers/excel_processor.py +358 -0
- scripts/helpers/leaderboards.py +15 -0
- scripts/helpers/reorganize_indices.py +234 -0
- scripts/main.py +151 -0
scripts/helpers/CONSTANTS.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Maps the raw dataset identifiers exactly as they appear in the source Excel
# sheets to the shorter display names used on the leaderboard.
TASK_MAPPING = {
    "1-1.ADE-ADE identification": "ADE-Identification",
    "1-2.ADE-ADE relation": "ADE-Extraction",
    "1-3.ADE-Drug dosage": "ADE-Drug dosage",
    "3-2.BARR2-resolution": "BARR2",
    "5.BrainMRI-AIS": "BrainMRI-AIS",
    "6.Brateca.hospitalization": "Brateca-Hospitalization",
    "6.Brateca.mortality": "Brateca-Mortality",
    "7.Cantemist.CODING": "Cantemist-Coding",
    "7.Cantemist.NER": "Cantemis-NER",
    "7.Cantemist.Norm": "Cantemis-Norm",
    "8.CARES.area": "CARES-Area",
    "8.CARES.icd10_block": "CARES ICD10 Block",
    "8.CARES.icd10_chapter": "CARES-ICD10 Chapter",
    "8.CARES.icd10_sub_block": "CARES-ICD10 Subblock",
    "9.CHIP-CDEE": "CHIP-CDEE",
    "12.C-EMRS": "C-EMRS",
    "17-1.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-CM": "CodiEsp-ICD-10-CM",
    "17-2.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-PCS": "CodiEsp-ICD-10-PCS",
    "19.ClinicalNotes-UPMC": "ClinicalNotes-UPMC",
    "20.clinical records from the Mexican Social Security Institute": "PPTS",
    "21.CLINpt": "CLINpt-NER",
    "22.CLIP": "CLIP",
    "23.cMedQA": "cMedQA",
    "26.DialMed": "DialMed",
    "27.DiSMed": "DiSMed-NER",
    "28.MIE": "MIE",
    "29.EHRQA.primary_department": "EHRQA-Primary department",
    "29.EHRQA.qa": "EHRQA-QA",
    "29.EHRQA.sub_department": "EHRQA-Sub department",
    "31.Ex4CDS": "Ex4CDS",
    "33.GOUT-CC.consensus": "GOUT-CC-Consensus",
    "35.n2c2 2006 - De-identification": "n2c2 2006-De-identification",
    "37.i2b2-2009-Medication-Extraction-Challenge": "Medication extraction",
    "38-1.i2b2-2010-Relations-Challenge-concept": "n2c2 2010-Concept",
    "38-2.i2b2-2010-Relations-Challenge-assertion": "n2c2 2010-Assertion",
    "38-3.i2b2-2010-Relations-Challenge-relation": "n2c2 2010-Relation",
    "41.n2c2 2014 - De-identification": "n2c2 2014-De-identification",
    "43.IMCS-V2-NER": "IMCS-V2-NER",
    "46.Japanese Case Reports": "JP-STS",
    "48.meddocan": "meddocan",
    "51.MEDIQA_2019_Task2_RQE": "MEDIQA 2019-RQE",
    "55.MedNLI": "MedNLI",
    "57.MedSTS": "MedSTS",
    "62.mtsamples": "MTS",
    "63.MTSamples-temporal annotation": "MTS-Temporal",
    "65.n2c2-2018-Track2-Adverse-Drug-Events-and-Medication-Extraction": "n2c2 2018-ADE&medication",
    "66-1.NorSynthClinical-entity": "NorSynthClinical-NER",
    "66-2.NorSynthClinical-relation": "NorSynthClinical-RE",
    "68.NUBES": "NUBES",
    "76-1.MTS-Dialog-MEDIQA-2023-chat-task-A": "MEDIQA 2023-chat-A",
    "76-2.MTS-Dialog-MEDIQA-2023-sum-task-A": "MEDIQA 2023-sum-A",
    "76-3.MTS-Dialog-MEDIQA-2023-sum-task-B": "MEDIQA 2023-sum-B",
    "80.RuMedDaNet": "RuMedDaNet",
    "81.CHIP-CDN": "CBLUE-CDN",
    "82.CHIP-CTC": "CHIP-CTC",
    "83.CHIP-MDCFNPC": "CHIP-MDCFNPC",
    "84.MedDG": "MedDG",
    "85.IMCS-V2-SR": "IMCS-V2-SR",
    "86.IMCS-V2-MRG": "IMCS-V2-MRG",
    "87.IMCS-V2-DAC": "IMCS-V2-DAC",
    "90-1.n2c2 2014 - Heart Disease Challenge - Diabete": "n2c2 2014-Diabetes",
    "90-2.n2c2 2014 - Heart Disease Challenge - CAD": "n2c2 2014-CAD",
    "90-3.n2c2 2014 - Heart Disease Challenge - Hyperlipidemia": "n2c2 2014-Hyperlipidemia",
    "90-4.n2c2 2014 - Heart Disease Challenge - Hypertension": "n2c2 2014-Hypertension",
    "90-8.n2c2 2014 - Heart Disease Challenge - Medication": "n2c2 2014-Medication",
    "91-1.CAS.label": "CAS-label",
    "91-2.CAS.evidence": "CAS-evidence",
    "93.RuMedNLI": "RuMedNLI",
    "94.RuDReC": "RuDReC-NER",
    "95.NorSynthClinical-PHI": "NorSynthClinical-PHI",
    "96.RuCCoN.NER": "RuCCoN",
    "97.CLISTER": "CLISTER",
    "98.BRONCO150.NER_status": "BRONCO150-NER&Status",
    "99.CARDIO:DE": "CARDIO-DE",
    "100.GraSSCo_PHI": "GraSSCo PHI",
    "101.IFMIR.IncidentType": "IFMIR-Incident type",
    "101.IFMIR.NER": "IFMIR-NER",
    "101.IFMIR.NER_factuality": "IFMIR - NER&factuality",
    "102.iCorpus": "iCorpus",
    "103.icliniq-10k": "icliniq-10k",
    "104.HealthCareMagic-100k": "HealthCareMagic-100k",
    "105.MIMIC-IV CDM": "MIMIC-IV CDM",
    "106.MIMIC-III Outcome.LoS": "MIMIC-III Outcome.LoS",
    "106.MIMIC-III Outcome.Mortality": "MIMIC-III Outcome.Mortality",
    "107.MIMIC-IV BHC": "MIMIC-IV BHC",
    "108.MIMIC-IV DiReCT.Dis": "MIMIC-IV DiReCT.Dis",
    "108.MIMIC-IV DiReCT.PDD": "MIMIC-IV DiReCT.PDD"
}


# Collapses the sheet's one-letter data-access codes into the two categories
# shown on the leaderboard: everything except Open Access is "Regulated".
DATA_ACCESS_MAP = {
    "OA": "Open Access",
    "R": "Regulated",
    "P": "Regulated",
    "N": "Regulated"
}

# Mappings
# Model domain passes through unchanged (kept as a mapping for symmetry with
# the other lookup tables).
DOMAIN_MAPPING = {
    "General": "General",
    "Medical": "Medical"
}

# In the filter, there should only be two categories (Open, Private)
# In the table itself, there should be more categories
LICENSE_MAPPING = {
    "Baichuan-M1-14B": "Open Source",
    "MIT": "Open Source",
    "Gemma": "Open Source",
    "Llama-3.1": "Open Source",
    "Llama-3.3": "Open Source",
    "Apache 2.0": "Open Source",
    "PhysioNet Credentialed Health Data License 1.5.0": "Open Source",
    "Llama-3": "Open Source",
    "MRL": "Open Source",
    "Qwen": "Open Source",
    "Nexusflow Research License": "Open Source",
    "Proprietary": "Proprietary",
    "Llama-4": "Open Source",
    "Health AI Developer Foundations terms of use": "Open Source",
    "llama2": "Open Source"
}
|
| 124 |
+
|
scripts/helpers/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .CONSTANTS import *
|
| 2 |
+
from .excel_processor import ExcelProcessor
|
| 3 |
+
from .reorganize_indices import reorganize_indices
|
scripts/helpers/excel_processor.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
from .CONSTANTS import *
|
| 4 |
+
|
| 5 |
+
class ExcelProcessor:
    """Turns the benchmark Excel workbook into leaderboard / task-info JSON files.

    The workbook is expected to contain:
      * a "Models (Simplified)" sheet with one row per model,
      * one result sheet per task group (classification / extraction / generation),
      * a "Task-all" sheet with per-task metadata.
    """

    def __init__(self, excel_path, invalid_models=None):
        """Initialize the ExcelProcessor with an Excel file.

        Args:
            excel_path (str): Path to the Excel file containing model and task data.
            invalid_models (list, optional): Model names to exclude from the
                leaderboard. None means no exclusions.
        """
        self.sheet_path = excel_path
        self.excel_data = self.load_excel()
        self.model_sheet = self.load_sheet("Models (Simplified)")
        self.invalid_models = invalid_models

        print("You have excluded the following models: ", self.invalid_models)

        # All models that should appear on the leaderboard.
        self.valid_models = self.get_valid_models(self.invalid_models)

    def load_excel(self):
        """Load the Excel file into a pandas ExcelFile object.

        Returns:
            pd.ExcelFile: The loaded Excel file object.
        """
        return pd.ExcelFile(self.sheet_path)

    def load_sheet(self, sheet_name):
        """Load a specific sheet from the Excel file.

        Args:
            sheet_name (str): Name of the sheet to load.

        Returns:
            pd.DataFrame: The loaded sheet as a pandas DataFrame.
        """
        return self.excel_data.parse(sheet_name)

    def get_valid_models(self, invalid_models=None):
        """Get all valid models from the Models sheet, excluding invalid ones.

        Args:
            invalid_models (list, optional): Model names to exclude.
                None means exclude nothing.

        Returns:
            list: Model names to include in evaluation, in sheet order.
        """
        # Bug fix: the original evaluated ``name not in None`` (TypeError)
        # when no exclusion list was supplied; treat None as an empty set.
        excluded = set(invalid_models) if invalid_models else set()
        return [name for name in self.model_sheet["Name"] if name not in excluded]

    def get_valid_columns(self, sheet_name):
        """Get all non-empty columns from a specified sheet.

        Args:
            sheet_name (str): Name of the sheet to analyze.

        Returns:
            list: Stripped column names, excluding pandas' auto-generated
            "Unnamed: N" placeholders for blank header cells.
        """
        return [column.strip()
                for column in self.load_sheet(sheet_name).columns
                if column.split(' ')[0] != "Unnamed:"]

    def get_model_information(self,
                              sheet_name="Models (Simplified)",
                              name_column="Name",
                              domain_column="Domain",
                              license_column="License",
                              size_column="Size (B)",
                              ):
        """Extract model information from the Models sheet.

        Args:
            sheet_name (str, optional): Sheet containing model info.
                Defaults to "Models (Simplified)".
            name_column (str, optional): Column with model names. Defaults to "Name".
            domain_column (str, optional): Column with model domains. Defaults to "Domain".
            license_column (str, optional): Column with license info. Defaults to "License".
            size_column (str, optional): Column with model sizes. Defaults to "Size (B)".

        Returns:
            tuple: Seven dicts, each keyed by the model's leaderboard index:
                - model_name_info: Model names
                - domain_info: Domains mapped through DOMAIN_MAPPING
                - license_info: License names (abbreviated where needed)
                - accessibility_info: Open/proprietary flag via LICENSE_MAPPING
                - displayed_size_info: Raw size values for display
                - hidden_size_info: Bucketed size ranges used by the size filter
                - T_info: Position marker glyphs for the leaderboard
        """
        model_sheet = self.load_sheet(sheet_name)

        # One dict per output column, keyed by the model's leaderboard index.
        T_info = {}
        model_name_info = {}
        domain_info = {}
        license_info = {}
        accessibility_info = {}
        displayed_size_info = {}  # shown on leaderboard
        hidden_size_info = {}     # hidden column used for filtering

        def map_size(param_size):
            """Map a parameter size to one of the predefined filter ranges.

            Args:
                param_size: The parameter size value ("/" and "Unknown" are
                    sentinel strings; anything else must be numeric).

            Returns:
                str: Size range category.
            """
            if param_size == "/":
                return "None"
            if param_size == "Unknown":
                return "Unknown"
            # Generalization: float() instead of int() so fractional sizes
            # such as "1.5" are handled; integral inputs behave as before.
            size = float(param_size)
            if size < 5:
                return "0-5"
            elif size < 10:
                return "5-10"
            elif size < 40:
                return "10-40"
            elif size < 80:
                return "40-80"
            else:
                return ">80"

        i = 0
        for name, domain, license_name, size in zip(model_sheet[name_column],
                                                    model_sheet[domain_column],
                                                    model_sheet[license_column],
                                                    model_sheet[size_column]):
            # Skip models that were excluded from evaluation.
            if name not in self.valid_models:
                print("Invalid model: ", name)
                continue

            T_info[f"{i}"] = "\ud83d\udd36"
            model_name_info[f"{i}"] = name
            domain_info[f"{i}"] = DOMAIN_MAPPING[domain]

            if license_name == "PhysioNet Credentialed Health Data License 1.5.0":
                license_info[f"{i}"] = "PhysioNet 1.5.0"  # Abbreviate license name to fit on leaderboard
            else:
                license_info[f"{i}"] = license_name

            accessibility_info[f"{i}"] = LICENSE_MAPPING[license_name]
            displayed_size_info[f"{i}"] = size
            hidden_size_info[f"{i}"] = map_size(size)

            i += 1

        return model_name_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info

    def get_sheet_information(self, sheets_list, task_names_list, task_types_list):
        """Extract task performance information from specified sheets.

        Args:
            sheets_list (list): Sheet names to process.
            task_names_list (list): Task-name column for each sheet.
            task_types_list (list): Task type ('cls', 'ext' or 'gen') per sheet.

        Returns:
            dict: Mapping of task names to model performance data.
                Format: {task_name: {model_index: performance_score}}
        """
        # Models whose column header in the result sheets differs from the
        # display name on the Models sheet.
        column_aliases = {
            "gpt-35-turbo-0125": "gpt-35-turbo",
            "gpt-4o-0806": "gpt-4o",
            "gemini-2.0-flash-001": "gemini-2.0-flash",
            "gemini-1.5-pro-002": "gemini-1.5-pro",
            "gpt-oss-20b": "gpt-oss-20b-high",
            "gpt-oss-120b": "gpt-oss-120b-high",
        }

        task_info = {}

        for idx, sheet in enumerate(sheets_list):
            # Get the task type (tt) and the matching task-name column.
            tt = task_types_list[idx]
            model_sheet = self.load_sheet(sheet)
            task_name = task_names_list[idx]

            for row, t in enumerate(model_sheet['Task Type']):
                # Row 0 is a header row; "-" marks the end of the data.
                if row == 0:
                    continue
                if t == "-":
                    break

                # Raw task name (e.g. "1-1.ADE-ADE identification"). The task
                # is the same for every model column, so look it up once per
                # row instead of once per model. Bug fix: check for the
                # "Average score" row BEFORE mapping -- the original mapped
                # first, which would raise KeyError on that row.
                raw_task = model_sheet[task_name][row]
                if raw_task == "Average score":
                    continue
                task = TASK_MAPPING[raw_task]

                if task not in task_info:
                    task_info[task] = {}

                task_counter = 0
                for model in self.valid_models:
                    column_name = column_aliases.get(model.strip(), model.strip())

                    # Extraction/generation sheets repeat the model columns,
                    # so pandas suffixes the second occurrence with ".1".
                    if tt in ('ext', 'gen'):
                        column_name = column_name + '.1'

                    # Cell looks like "85.2 ..."; keep the leading number.
                    score = model_sheet[column_name.strip()][row].split(" ")[0]
                    task_info[task][f"{task_counter}"] = round(float(score), 2)
                    task_counter += 1

        return task_info

    def add_average_performance(self, task_info):
        """Calculate average performance across all tasks for each model.

        Args:
            task_info (dict): Task performance data.
                Format: {task_name: {model_index: performance_score}}

        Returns:
            dict: Model indices mapped to average performance scores
            (stringified, rounded to 2 decimals). Empty dict for empty input.
        """
        # Robustness fix: the original left ``n`` undefined for empty input.
        if not task_info:
            return {}

        # Every task dict has one entry per model.
        n = len(next(iter(task_info.values())))

        average_performance_info = {}
        for i in range(n):
            scores = [float(per_model[str(i)]) for per_model in task_info.values()]
            average_performance_info[f"{i}"] = str(round(sum(scores) / len(scores), 2))

        return average_performance_info

    def create_leaderboards(
            self,
            sheet_names_list=None,
            task_names_list=None,
            task_types_list=None,
            output_path=None):
        """Create a leaderboard JSON file from Excel data.

        Args:
            sheet_names_list (list, optional): Sheet names to process.
            task_names_list (list, optional): Task-name columns per sheet.
                Defaults to ["Task-Classification", "Task-Extraction", "Task-Generation"].
            task_types_list (list, optional): Task types per sheet.
                Defaults to ["cls", "ext", "gen"].
            output_path (str, optional): Path where the JSON file is saved.

        Note:
            Creates one leaderboard per call (CoT, Direct, or Few-Shot).
            The output JSON contains model information, task performance, and metadata.
        """
        # Idiom fix: avoid mutable default arguments; the effective defaults
        # are unchanged.
        if task_names_list is None:
            task_names_list = ["Task-Classification", "Task-Extraction", "Task-Generation"]
        if task_types_list is None:
            task_types_list = ["cls", "ext", "gen"]

        data = {}

        model_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info = self.get_model_information()

        task_info = self.get_sheet_information(sheet_names_list, task_names_list, task_types_list)
        average_performance_info = self.add_average_performance(task_info)

        data["T"] = T_info
        data["Model"] = model_info
        data["Model: Domain"] = domain_info
        data["Model: License"] = license_info
        data["Model: Accessibility"] = accessibility_info
        data["Size (B)"] = displayed_size_info
        data["Model: Size Range"] = hidden_size_info
        data["Average Performance"] = average_performance_info

        for task in task_info:
            data[task] = task_info[task]

        with open(output_path, 'w') as file:
            json.dump(data, file, indent=4)

    def create_task_information(self, output_path: str):
        """Create a JSON file containing detailed task information.

        Args:
            output_path (str): Path where the task information JSON is saved.

        Note:
            Extracts task metadata from the "Task-all" sheet including language,
            task type, clinical context, data access requirements, applications,
            and clinical stage information.
        """
        task_sheet = self.load_sheet("Task-all")

        info = {}

        # One metadata record per row of the "Task name" column.
        for idx, task in enumerate(task_sheet["Task name"]):
            if task not in info:
                info[task] = {}

            language = task_sheet["Language"][idx]
            task_type = task_sheet["Task Type - fine grained"][idx]
            clinical_context = task_sheet["Clinical context"][idx]
            # The sheet header embeds newlines; keep the exact key.
            data_access = task_sheet["Data Access\nOpen Access (OA) / \nRegulated (R) / \nPhysionet (P) / \nn2c2 (N)"][idx]
            application = task_sheet['Clinical Application'][idx]
            clinical_stage = task_sheet['Clinical Stage'][idx]

            info[task]["Language"] = language.strip()
            info[task]["Task Type"] = task_type.strip()
            info[task]["Clinical Context"] = clinical_context.strip()
            info[task]["Data Access"] = DATA_ACCESS_MAP[data_access.strip()]
            info[task]['Applications'] = application.strip()
            info[task]['Clinical Stage'] = clinical_stage.strip()

        with open(output_path, 'w') as file:
            json.dump(info, file, indent=4)
|
scripts/helpers/leaderboards.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class LeaderboardProcessor:
    """Incrementally updates previously generated leaderboard JSON files."""

    def __init__(self, output_path):
        """Remember where updated leaderboard files should be written.

        Args:
            output_path: Destination path for the updated leaderboard JSON.
        """
        self.output_path = output_path

    def update_leaderboards(self, old_leaderboard_json, new_models):
        """Merge results for newly evaluated models into an existing leaderboard.

        Args:
            old_leaderboard_json: JSON file including the previous leaderboard data.
            new_models: List[str] of new model names to add to the leaderboard.

        Note:
            Not implemented yet; currently a no-op placeholder.
        """
        pass
|
scripts/helpers/reorganize_indices.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from collections import OrderedDict
|
| 5 |
+
from typing import Dict, List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
def extract_model_size(model_name: str) -> float:
    """
    Parse the parameter count, in billions, out of a model name.

    Recognizes markers such as "1.5B", "7B", "70b". Full DeepSeek-R1 (which
    carries no size marker) is treated as a very large model; anything else
    without a marker falls back to a medium-size default.
    """
    # A number (optionally fractional) immediately followed by B/b.
    marker = re.search(r'(\d+(?:\.\d+)?)[Bb]', model_name)
    if marker is not None:
        return float(marker.group(1))

    # No explicit size: the non-distilled DeepSeek-R1 is effectively huge.
    is_full_r1 = 'DeepSeek-R1' in model_name and 'Distill' not in model_name
    return 999.0 if is_full_r1 else 50.0
|
| 23 |
+
|
| 24 |
+
def get_size_based_order(models: Dict[str, str]) -> List[Tuple[str, str, float]]:
    """
    Order models by parameter size, keeping their original indices.

    Returns a list of (original_index, model_name, size) tuples sorted by
    size ascending, with the model name breaking ties.
    """
    entries = [(idx, name, extract_model_size(name)) for idx, name in models.items()]
    entries.sort(key=lambda entry: (entry[2], entry[1]))
    return entries
|
| 36 |
+
|
| 37 |
+
def create_size_based_mapping(leaderboard_json_path: str) -> Dict[str, str]:
    """
    Build an old-index -> new-index mapping for one leaderboard file, ordering
    the first eight models by parameter size and leaving the rest in place.
    """
    try:
        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        if 'Model' not in data:
            raise ValueError(f"No 'Model' section found in {leaderboard_json_path}")

        models = data['Model']

        # Only the first eight entries (the primary model set) get reordered.
        head = dict(list(models.items())[:8])
        ordered = get_size_based_order(head)

        mapping = {}
        for new_idx, (old_idx, model_name, size) in enumerate(ordered):
            mapping[old_idx] = str(new_idx)
            print(f" {model_name} ({size}B): {old_idx} → {new_idx}")

        # Every index from 8 up to the real maximum maps to itself --
        # computed dynamically instead of assuming a fixed model count.
        max_index = max(int(k) for k in models.keys())
        print(f" Total models: {len(models)}, max index: {max_index}")
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)

        return mapping

    except Exception as e:
        print(f"Error creating mapping for {leaderboard_json_path}: {e}")
        raise
|
| 74 |
+
|
| 75 |
+
def reorganize_indices(leaderboard_json_path: str, custom_mapping: Optional[Dict[str, str]] = None):
    """
    Reorganize the indices of a leaderboard JSON file based on model size.

    Args:
        leaderboard_json_path: Path to the leaderboard JSON file
        custom_mapping: Optional custom mapping dict. If None, will auto-generate based on model sizes.
    """
    try:
        print(f"\nProcessing {leaderboard_json_path}...")

        # Use the caller's mapping if given, otherwise derive one from sizes.
        if custom_mapping is not None:
            mapping = custom_mapping
            print(" Using provided custom mapping...")
        else:
            print(" Creating size-based mapping...")
            mapping = create_size_based_mapping(leaderboard_json_path)

        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        # Rebuild every section with remapped keys, emitted in numeric order
        # so the physical ordering in the file matches the new indices.
        new_data = OrderedDict()
        for section_name, section_data in data.items():
            remapped = {int(mapping.get(old_idx, old_idx)): value
                        for old_idx, value in section_data.items()}
            new_data[section_name] = OrderedDict(
                (str(idx), remapped[idx]) for idx in sorted(remapped))

        with open(leaderboard_json_path, 'w') as f:
            json.dump(new_data, f, indent=4, ensure_ascii=False)

        print(f" ✅ Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

        # Re-read the file and show the first eight models for verification.
        with open(leaderboard_json_path, 'r') as f:
            example_data = json.load(f)

        print(f'\n New model order (first 8) from {leaderboard_json_path}:')
        model_section = example_data['Model']
        for position, key in enumerate(list(model_section.keys())[:8]):
            model_name = model_section[key]
            size = extract_model_size(model_name)
            print(f' Position {position} (Index {key}): {model_name} ({size}B)')

    except Exception as e:
        print(f" ❌ Error processing {leaderboard_json_path}: {e}")
        raise
|
| 138 |
+
|
| 139 |
+
def reorganize_all_leaderboards(leaderboard_dir: str = "leaderboards"):
    """
    Reorganize all leaderboard files in the specified directory.

    Looks for the three known leaderboard JSON files (CoT, Zero-Shot,
    Few-Shot) under *leaderboard_dir* and reorganizes each one that
    exists; files that are missing are reported and skipped.
    """
    variants = ("CoT", "Zero-Shot", "Few-Shot")
    leaderboard_files = [
        f"{leaderboard_dir}/{variant}_leaderboard.json" for variant in variants
    ]

    print("🔄 Starting reorganization of all leaderboards based on model size...")

    for file_path in leaderboard_files:
        if not os.path.exists(file_path):
            # Missing file: warn and move on rather than failing the batch.
            print(f" ⚠️ Warning: {file_path} not found, skipping...")
            continue
        reorganize_indices(file_path)

    print("\n✅ All leaderboards have been reorganized!")
| 158 |
+
|
| 159 |
+
# Legacy function for backward compatibility (but with dynamic range)
|
| 160 |
+
def reorganize_indices_legacy(leaderboard_json_path: str):
    """
    Legacy function that uses the old hard-coded mapping style but with dynamic range.
    This is kept for backward compatibility but now properly handles all 99 models.

    Args:
        leaderboard_json_path: Path to a leaderboard JSON file, rewritten in
            place. Every section is re-keyed through the size-based mapping
            below and its entries are physically re-ordered by the new index.
    """
    # Mapping from old indices to new indices for the first eight models,
    # ordered by model size (smallest first).
    mapping = {
        '0': '7',  # DeepSeek-R1-Distill-Llama-70B (70B) goes to 7 (end)
        '1': '0',  # DeepSeek-R1-Distill-Qwen-1.5B (1.5B) goes to 0 (start)
        '2': '6',  # DeepSeek-R1 (large model) goes to 6
        '3': '1',  # DeepSeek-R1-Distill-Qwen-7B (7B) goes to 1
        '4': '3',  # DeepSeek-R1-Distill-Qwen-14B (14B) goes to 3
        '5': '2',  # DeepSeek-R1-Distill-Llama-8B (8B) goes to 2
        '6': '5',  # Baichuan-M2-32B (32B) goes to 5
        '7': '4',  # Baichuan-M1-14B-Instruct (14B) goes to 4
    }

    # Read the file once; the parsed data is reused for both the
    # dynamic-range detection and the actual reorganization (the original
    # implementation re-read the same file for each step).
    with open(leaderboard_json_path, 'r') as f:
        data = json.load(f)

    if 'Model' in data:
        max_index = max(int(k) for k in data['Model'].keys())
        print(f" Found {len(data['Model'])} models (indices 0-{max_index})")

        # For indices 8 and beyond, they stay the same
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)
    else:
        print(" Warning: No 'Model' section found, using default range")
        # Fallback to 99 models (0-98)
        for i in range(8, 99):
            mapping[str(i)] = str(i)

    print(f"\nProcessing {leaderboard_json_path}...")

    # Build a new structure whose sections are physically ordered by the
    # remapped integer index, so json.dump writes entries in the new order.
    new_data = OrderedDict()
    for section_name, section_data in data.items():
        # First, collect all the remapped data keyed by the new index.
        remapped = {}
        for old_idx, value in section_data.items():
            new_idx = mapping.get(old_idx, old_idx)
            remapped[int(new_idx)] = value

        # Sort by new index and insert in order - this ensures the
        # physical ordering of keys in the written JSON.
        new_section = OrderedDict(
            (str(key), remapped[key]) for key in sorted(remapped)
        )
        new_data[section_name] = new_section

    # Write the reorganized data with proper physical ordering
    with open(leaderboard_json_path, 'w') as f:
        json.dump(new_data, f, indent=4, ensure_ascii=False)

    print(f" Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

    # Print the new order for verification. Guarded so a file without a
    # 'Model' section no longer crashes here (the original raised KeyError
    # despite having already warned about the missing section above).
    model_section = new_data.get('Model')
    if model_section is None:
        return

    print(f'\nNew model order (first 8) from {leaderboard_json_path}:')
    # Since we're using OrderedDict and sorted insertion, the first 8
    # entries should be indices 0-7.
    for i, key in enumerate(list(model_section.keys())[:8]):
        print(f' Position {i} (Index {key}): {model_section[key]}')
|
scripts/main.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from helpers.excel_processor import ExcelProcessor
|
| 2 |
+
from helpers.reorganize_indices import reorganize_indices
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def update_ranks(leaderboard_paths=None):
    """
    Recompute the rank column ('T') of each leaderboard JSON in place.

    Models are ranked by their 'Average Performance' score in descending
    order; rank 1 is the best score. Ties keep the original JSON key
    iteration order (Python's sort is stable).

    Args:
        leaderboard_paths: Optional list of leaderboard JSON file paths to
            update. Defaults to the repo-relative paths used by the
            leaderboard configs in this script (the previous hard-coded
            absolute /Users/... paths only worked on one machine and were
            inconsistent with the relative output paths written by
            create_all_leaderboards).
    """
    if leaderboard_paths is None:
        leaderboard_paths = [
            "leaderboards/CoT_leaderboard.json",
            "leaderboards/Few-Shot_leaderboard.json",
            "leaderboards/Zero-Shot_leaderboard.json",
        ]

    for leaderboard_path in leaderboard_paths:
        with open(leaderboard_path, 'r') as f:
            data = json.load(f)

        # (original index, score) pairs sorted by score, best first.
        scored = sorted(
            data['Average Performance'].items(),
            key=lambda item: float(item[1]),
            reverse=True,
        )

        # 'T' is the leftmost rank column; rank starts from 1.
        for rank, (original_idx, _score) in enumerate(scored, start=1):
            data['T'][original_idx] = rank

        with open(leaderboard_path, 'w') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def create_leaderboards(
    excel_path: str,
    output_path: str,
    sheet_names_list: list,
    invalid_models=None,
    task_info_path: str = 'task_information.json'
):
    """
    Function that updates a singular leaderboard (JSON).

    Args:
        excel_path: Path to the excel file
        output_path: Path to the output file
        sheet_names_list: List of sheet names to create leaderboards from
        invalid_models: List of models to exclude from the leaderboards
        task_info_path: Where to write the task-information JSON
            (previously hard-coded to 'task_information.json'; the default
            preserves the old behavior).
    """
    excel_processor = ExcelProcessor(excel_path, invalid_models)

    # Create leaderboards (JSON)
    excel_processor.create_leaderboards(sheet_names_list=sheet_names_list, output_path=output_path)

    # Reorganize the leaderboard indices (fixes "inices" typo in old comment)
    reorganize_indices(output_path)

    # Create task information JSON
    excel_processor.create_task_information(task_info_path)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def create_all_leaderboards(
    excel_path: str,
    leaderboard_configs: list,
    invalid_models=None
):
    """
    Build every configured leaderboard by delegating to create_leaderboards.

    Args:
        excel_path: Path to the excel file
        leaderboard_configs: List of leaderboard configs; each is a dict
            with 'name', 'output_path' and 'sheet_names' keys
        invalid_models: List of models to exclude from the leaderboards
    """
    for cfg in leaderboard_configs:
        name = cfg['name']
        print(f"Creating {name} leaderboard...")

        create_leaderboards(
            excel_path,
            cfg['output_path'],
            cfg['sheet_names'],
            invalid_models=invalid_models,
        )

        print(f"{name} leaderboard created successfully!")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
    banner = "***" * 50
    print(banner)
    print("Starting script...")

    # # ######################################################### #
    # # ######################################################### #

    # HOW TO UPDATE LEADERBOARDS
    # 1. Download the new excel sheet and/or update the path to the excel sheet
    # 2. Specify which models to exclude from the leaderboard in "invalid_models" list
    # 3. Run scripts/main.py
    # 4. Done! All leaderboards and task information have been updated.
    # 5. Push to GitHub and deploy to Hugging Face Spaces.

    # # ######################################################### #
    # # ######################################################### #

    # excel_path --> path to the Google Sheet version you want to use
    # (Clinical Benchmark and LLM)
    excel_path = "/Users/kevinxie/Desktop/projects/BRIDGE-Leaderboard-INTERNAL/Clinical Benchmark and LLM.xlsx"

    # One config per prompting strategy; each maps a set of Excel sheets
    # onto its output leaderboard JSON file.
    leaderboard_configs = [
        {
            'name': name,
            'output_path': f'leaderboards/{name}_leaderboard.json',
            'sheet_names': sheets,
        }
        for name, sheets in (
            ('Zero-Shot', ["B-CLF", "B-EXT", "B-GEN"]),
            ('Few-Shot', ["B-CLF-5shot", "B-EXT-5shot", "B-GEN-5shot"]),
            ('CoT', ["B-CLF-CoT", "B-EXT-CoT", "B-GEN-CoT"]),
        )
    ]

    # Models excluded from every leaderboard.
    invalid_models = [
        "gemma-3-27b-pt",
        "gemma-3-12b-pt",
        "gemma-3-12b-pt-ylab-4-1-1",
        "gemma-3-12b-pt-ylab-8-1-1",
        "gemma-3-12b-pt-ylab-16-1-1",
    ]

    # Create all leaderboards with a single function call
    create_all_leaderboards(excel_path, leaderboard_configs, invalid_models)

    print(banner)
    print("Leaderboards created successfully!")

    # Update the ranks of the leaderboards (leftmost column)
    update_ranks()

    print(banner)
    print("Ranks updated successfully!")
    print(banner)
    print("Complete!")
|
| 151 |
+
|