Kevin Xie committed on
Commit
3070e58
·
1 Parent(s): 9aca690

Upload main processing scripts for the leaderboard

Browse files
scripts/helpers/CONSTANTS.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Maps the raw dataset/task identifiers used in the source Excel workbook to
# the shorter display names shown on the leaderboard.
# NOTE(review): a few display names look like typos ("Cantemis-NER" /
# "Cantemis-Norm" vs "Cantemist", "CARES ICD10 Block" vs the hyphenated
# "CARES-ICD10 ..." siblings) — confirm these are the intended labels before
# changing them, since the front end may key on them.
TASK_MAPPING = {
    "1-1.ADE-ADE identification": "ADE-Identification",
    "1-2.ADE-ADE relation": "ADE-Extraction",
    "1-3.ADE-Drug dosage": "ADE-Drug dosage",
    "3-2.BARR2-resolution": "BARR2",
    "5.BrainMRI-AIS": "BrainMRI-AIS",
    "6.Brateca.hospitalization": "Brateca-Hospitalization",
    "6.Brateca.mortality": "Brateca-Mortality",
    "7.Cantemist.CODING": "Cantemist-Coding",
    "7.Cantemist.NER": "Cantemis-NER",
    "7.Cantemist.Norm": "Cantemis-Norm",
    "8.CARES.area": "CARES-Area",
    "8.CARES.icd10_block": "CARES ICD10 Block",
    "8.CARES.icd10_chapter": "CARES-ICD10 Chapter",
    "8.CARES.icd10_sub_block": "CARES-ICD10 Subblock",
    "9.CHIP-CDEE": "CHIP-CDEE",
    "12.C-EMRS": "C-EMRS",
    "17-1.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-CM": "CodiEsp-ICD-10-CM",
    "17-2.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-PCS": "CodiEsp-ICD-10-PCS",
    "19.ClinicalNotes-UPMC": "ClinicalNotes-UPMC",
    "20.clinical records from the Mexican Social Security Institute": "PPTS",
    "21.CLINpt": "CLINpt-NER",
    "22.CLIP": "CLIP",
    "23.cMedQA": "cMedQA",
    "26.DialMed": "DialMed",
    "27.DiSMed": "DiSMed-NER",
    "28.MIE": "MIE",
    "29.EHRQA.primary_department": "EHRQA-Primary department",
    "29.EHRQA.qa": "EHRQA-QA",
    "29.EHRQA.sub_department": "EHRQA-Sub department",
    "31.Ex4CDS": "Ex4CDS",
    "33.GOUT-CC.consensus": "GOUT-CC-Consensus",
    "35.n2c2 2006 - De-identification": "n2c2 2006-De-identification",
    "37.i2b2-2009-Medication-Extraction-Challenge": "Medication extraction",
    "38-1.i2b2-2010-Relations-Challenge-concept": "n2c2 2010-Concept",
    "38-2.i2b2-2010-Relations-Challenge-assertion": "n2c2 2010-Assertion",
    "38-3.i2b2-2010-Relations-Challenge-relation": "n2c2 2010-Relation",
    "41.n2c2 2014 - De-identification": "n2c2 2014-De-identification",
    "43.IMCS-V2-NER": "IMCS-V2-NER",
    "46.Japanese Case Reports": "JP-STS",
    "48.meddocan": "meddocan",
    "51.MEDIQA_2019_Task2_RQE": "MEDIQA 2019-RQE",
    "55.MedNLI": "MedNLI",
    "57.MedSTS": "MedSTS",
    "62.mtsamples": "MTS",
    "63.MTSamples-temporal annotation": "MTS-Temporal",
    "65.n2c2-2018-Track2-Adverse-Drug-Events-and-Medication-Extraction": "n2c2 2018-ADE&medication",
    "66-1.NorSynthClinical-entity": "NorSynthClinical-NER",
    "66-2.NorSynthClinical-relation": "NorSynthClinical-RE",
    "68.NUBES": "NUBES",
    "76-1.MTS-Dialog-MEDIQA-2023-chat-task-A": "MEDIQA 2023-chat-A",
    "76-2.MTS-Dialog-MEDIQA-2023-sum-task-A": "MEDIQA 2023-sum-A",
    "76-3.MTS-Dialog-MEDIQA-2023-sum-task-B": "MEDIQA 2023-sum-B",
    "80.RuMedDaNet": "RuMedDaNet",
    "81.CHIP-CDN": "CBLUE-CDN",
    "82.CHIP-CTC": "CHIP-CTC",
    "83.CHIP-MDCFNPC": "CHIP-MDCFNPC",
    "84.MedDG": "MedDG",
    "85.IMCS-V2-SR": "IMCS-V2-SR",
    "86.IMCS-V2-MRG": "IMCS-V2-MRG",
    "87.IMCS-V2-DAC": "IMCS-V2-DAC",
    "90-1.n2c2 2014 - Heart Disease Challenge - Diabete": "n2c2 2014-Diabetes",
    "90-2.n2c2 2014 - Heart Disease Challenge - CAD": "n2c2 2014-CAD",
    "90-3.n2c2 2014 - Heart Disease Challenge - Hyperlipidemia": "n2c2 2014-Hyperlipidemia",
    "90-4.n2c2 2014 - Heart Disease Challenge - Hypertension": "n2c2 2014-Hypertension",
    "90-8.n2c2 2014 - Heart Disease Challenge - Medication": "n2c2 2014-Medication",
    "91-1.CAS.label": "CAS-label",
    "91-2.CAS.evidence": "CAS-evidence",
    "93.RuMedNLI": "RuMedNLI",
    "94.RuDReC": "RuDReC-NER",
    "95.NorSynthClinical-PHI": "NorSynthClinical-PHI",
    "96.RuCCoN.NER": "RuCCoN",
    "97.CLISTER": "CLISTER",
    "98.BRONCO150.NER_status": "BRONCO150-NER&Status",
    "99.CARDIO:DE": "CARDIO-DE",
    "100.GraSSCo_PHI": "GraSSCo PHI",
    "101.IFMIR.IncidentType": "IFMIR-Incident type",
    "101.IFMIR.NER": "IFMIR-NER",
    "101.IFMIR.NER_factuality": "IFMIR - NER&factuality",
    "102.iCorpus": "iCorpus",
    "103.icliniq-10k": "icliniq-10k",
    "104.HealthCareMagic-100k": "HealthCareMagic-100k",
    "105.MIMIC-IV CDM": "MIMIC-IV CDM",
    "106.MIMIC-III Outcome.LoS": "MIMIC-III Outcome.LoS",
    "106.MIMIC-III Outcome.Mortality": "MIMIC-III Outcome.Mortality",
    "107.MIMIC-IV BHC": "MIMIC-IV BHC",
    "108.MIMIC-IV DiReCT.Dis": "MIMIC-IV DiReCT.Dis",
    "108.MIMIC-IV DiReCT.PDD": "MIMIC-IV DiReCT.PDD"
}
90
+
91
+
# Collapses the fine-grained data-access codes used in the "Task-all" sheet
# (Open Access / Regulated / PhysioNet / n2c2) into the two categories
# surfaced on the leaderboard.
DATA_ACCESS_MAP = {
    "OA": "Open Access",
    "R": "Regulated",
    "P": "Regulated",  # PhysioNet-gated data is displayed as "Regulated"
    "N": "Regulated"   # n2c2-gated data is displayed as "Regulated"
}
98
+
# Mappings
# Identity mapping for the model "Domain" column; kept as a dict so that
# unexpected domain values fail loudly (KeyError) instead of leaking through.
DOMAIN_MAPPING = {
    "General": "General",
    "Medical": "Medical"
}
104
+
# Maps each raw license string from the Models sheet to the accessibility
# category used by the leaderboard filter. The filter exposes only two
# categories ("Open Source", "Proprietary"); the table itself still shows the
# full license names.
# NOTE(review): "PhysioNet Credentialed Health Data License 1.5.0" and
# "Health AI Developer Foundations terms of use" are categorized as
# "Open Source" here — confirm that is the intended bucketing.
LICENSE_MAPPING = {
    "Baichuan-M1-14B": "Open Source",
    "MIT": "Open Source",
    "Gemma": "Open Source",
    "Llama-3.1": "Open Source",
    "Llama-3.3": "Open Source",
    "Apache 2.0": "Open Source",
    "PhysioNet Credentialed Health Data License 1.5.0": "Open Source",
    "Llama-3": "Open Source",
    "MRL": "Open Source",
    "Qwen": "Open Source",
    "Nexusflow Research License": "Open Source",
    "Proprietary": "Proprietary",
    "Llama-4": "Open Source",
    "Health AI Developer Foundations terms of use": "Open Source",
    "llama2": "Open Source"
}
124
+
scripts/helpers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .CONSTANTS import *
2
+ from .excel_processor import ExcelProcessor
3
+ from .reorganize_indices import reorganize_indices
scripts/helpers/excel_processor.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from .CONSTANTS import *
4
+
class ExcelProcessor:
    """Converts the benchmark Excel workbook into the JSON files consumed by
    the leaderboard front end.

    The workbook is expected to contain a "Models (Simplified)" sheet with
    model metadata, one result sheet per task group (classification /
    extraction / generation), and a "Task-all" sheet with task metadata.
    """

    def __init__(self, excel_path, invalid_models=None):
        """Initialize the ExcelProcessor with an Excel file.

        Args:
            excel_path (str): Path to the Excel file containing model and task data.
            invalid_models (list, optional): Model names to exclude from the
                leaderboard. Defaults to None (nothing excluded).
        """
        self.sheet_path = excel_path
        self.excel_data = self.load_excel()
        self.model_sheet = self.load_sheet("Models (Simplified)")
        # Fix: normalize None to an empty list so membership tests in
        # get_valid_models() never raise "'NoneType' is not iterable".
        self.invalid_models = invalid_models if invalid_models is not None else []

        print("You have excluded the following models: ", self.invalid_models)

        # Get all of the valid models (exclude invalid models)
        self.valid_models = self.get_valid_models(self.invalid_models)

    def load_excel(self):
        """Load the Excel file into a pandas ExcelFile object.

        Returns:
            pd.ExcelFile: The loaded Excel file object.
        """
        return pd.ExcelFile(self.sheet_path)

    def load_sheet(self, sheet_name):
        """Load a specific sheet from the Excel file.

        Args:
            sheet_name (str): Name of the sheet to load.

        Returns:
            pd.DataFrame: The loaded sheet as a pandas DataFrame.
        """
        return self.excel_data.parse(sheet_name)

    def get_valid_models(self, invalid_models=None):
        """Get all valid models from the Models sheet, excluding invalid ones.

        Args:
            invalid_models (list, optional): Model names to exclude.
                Defaults to None (nothing excluded).

        Returns:
            list: Valid model names, in sheet order.
        """
        # Fix: tolerate invalid_models=None; use a set for O(1) membership.
        excluded = set(invalid_models) if invalid_models else set()
        return [name for name in self.model_sheet["Name"] if name not in excluded]

    def get_valid_columns(self, sheet_name):
        """Get all non-empty columns from a specified sheet.

        Args:
            sheet_name (str): Name of the sheet to analyze.

        Returns:
            list: Valid column names stripped of surrounding whitespace
                (pandas "Unnamed: N" placeholder columns are skipped).
        """
        valid_columns = []
        for column in self.load_sheet(sheet_name).columns:
            # pandas names empty header cells "Unnamed: <n>"; skip those.
            if column.split(' ')[0] != "Unnamed:":
                valid_columns.append(column.strip())
        return valid_columns

    def get_model_information(self,
                              sheet_name="Models (Simplified)",
                              name_column="Name",
                              domain_column="Domain",
                              license_column="License",
                              size_column="Size (B)",
                              ):
        """Extract model information from the Models sheet.

        Args:
            sheet_name (str, optional): Name of the sheet containing model info.
                Defaults to "Models (Simplified)".
            name_column (str, optional): Column containing model names.
            domain_column (str, optional): Column containing model domains.
            license_column (str, optional): Column containing license info.
            size_column (str, optional): Column containing model sizes.

        Returns:
            tuple: 7 dicts, each keyed by the model's string index:
                - model_name_info: model names
                - domain_info: domains mapped through DOMAIN_MAPPING
                - license_info: license names (abbreviated if too long)
                - accessibility_info: accessibility via LICENSE_MAPPING
                - displayed_size_info: raw size values for display
                - hidden_size_info: size-range buckets for filtering
                - T_info: placeholder markers for the rank column
        """
        model_sheet = self.load_sheet(sheet_name)

        # Everything to be returned.
        T_info = {}
        model_name_info = {}
        domain_info = {}
        license_info = {}
        accessibility_info = {}
        displayed_size_info = {}  # shown on leaderboard
        hidden_size_info = {}     # hidden column used by the size filter

        def map_size(param_size):
            """Map a parameter size to a predefined range bucket.

            Args:
                param_size: The parameter size value ("/" and "Unknown" are
                    sentinel values from the sheet).

            Returns:
                str: Size range category.
            """
            if param_size == "/":
                return "None"
            if param_size == "Unknown":
                return "Unknown"
            # Fix: float() instead of int() so fractional sizes such as
            # "1.5" (1.5B models) don't raise ValueError.
            size = float(param_size)
            if size < 5:
                return "0-5"
            elif size < 10:
                return "5-10"
            elif size < 40:
                return "10-40"
            elif size < 80:
                return "40-80"
            else:
                return ">80"

        i = 0
        for name, domain, license_name, size in zip(model_sheet[name_column],
                                                    model_sheet[domain_column],
                                                    model_sheet[license_column],
                                                    model_sheet[size_column]):

            # If it is a valid model (used in evaluation)
            if name in self.valid_models:
                # Placeholder marker; replaced with the numeric rank later
                # (see update_ranks in scripts/main.py).
                T_info[f"{i}"] = "\ud83d\udd36"
                model_name_info[f"{i}"] = name

                domain_info[f"{i}"] = DOMAIN_MAPPING[domain]

                if license_name == "PhysioNet Credentialed Health Data License 1.5.0":
                    license_info[f"{i}"] = "PhysioNet 1.5.0"  # Abbreviate license name to fit on leaderboard
                else:
                    license_info[f"{i}"] = license_name

                accessibility_info[f"{i}"] = LICENSE_MAPPING[license_name]
                displayed_size_info[f"{i}"] = size
                hidden_size_info[f"{i}"] = map_size(size)

                i += 1

            else:
                print("Invalid model: ", name)

        return model_name_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info

    def get_sheet_information(self, sheets_list, task_names_list, task_types_list):
        """Extract task performance information from specified sheets.

        Args:
            sheets_list (list): Sheet names to process.
            task_names_list (list): Task-name column for each sheet.
            task_types_list (list): Task type ('cls', 'ext', 'gen') for each sheet.

        Returns:
            dict: {task_name: {model_index: performance_score}}
        """
        task_info = {}

        for idx, sheet in enumerate(sheets_list):
            # Get the task type (tt)
            tt = task_types_list[idx]

            model_sheet = self.load_sheet(sheet)

            # Name of the column holding task names (i.e. 1.1-ADE Identification)
            task_name = task_names_list[idx]

            for i, t in enumerate(model_sheet['Task Type']):
                # The first row is a header/summary row; skip it.
                if i == 0:
                    continue

                # "-" marks the end of the task rows in the sheet.
                if t == "-":
                    break

                row = i
                task_counter = 0

                for model in self.valid_models:
                    column_name = model.strip()

                    # Result columns use shortened model names; map canonical
                    # model names to their column headers.
                    if column_name == "gpt-35-turbo-0125":
                        column_name = "gpt-35-turbo"
                    elif column_name == "gpt-4o-0806":
                        column_name = "gpt-4o"
                    elif column_name == "gemini-2.0-flash-001":
                        column_name = "gemini-2.0-flash"
                    elif column_name == "gemini-1.5-pro-002":
                        column_name = "gemini-1.5-pro"

                    if column_name == "gpt-oss-20b":
                        column_name = "gpt-oss-20b-high"
                    elif column_name == "gpt-oss-120b":
                        column_name = "gpt-oss-120b-high"

                    # Extraction and generation sheets carry their scores in a
                    # duplicated column that pandas suffixes with ".1".
                    if tt in ('ext', 'gen'):
                        column_name = column_name + '.1'

                    # Raw task name for this row (i.e. 1.1-ADE Identification)
                    task = model_sheet[task_name][row]

                    # Fix: check the sentinel BEFORE mapping — "Average score"
                    # is not a TASK_MAPPING key, so the old order would raise
                    # KeyError before the check could ever fire.
                    if task == "Average score":
                        break

                    # Update task name to its leaderboard display name
                    task = TASK_MAPPING[task]

                    if task not in task_info:
                        task_info[task] = {}

                    # Cells look like "<score> ..." — keep the leading number only.
                    task_info[task][f"{task_counter}"] = round(float(model_sheet[column_name.strip()][row].split(" ")[0]), 2)
                    task_counter += 1

        return task_info

    def add_average_performance(self, task_info):
        """Calculate average performance across all tasks for each model.

        Args:
            task_info (dict): {task_name: {model_index: performance_score}}

        Returns:
            dict: {model_index: average_score (str, rounded to 2 decimals)}
        """
        # Fix: return an empty dict for empty input instead of crashing with
        # an UnboundLocalError on `n`.
        if not task_info:
            return {}

        # All tasks share the same model count; read it off the first one.
        n = len(next(iter(task_info.values())))

        average_performance_info = {}
        for i in range(n):
            scores = [float(per_model[str(i)]) for per_model in task_info.values()]
            average_performance_info[f"{i}"] = str(round(sum(scores) / len(scores), 2))

        return average_performance_info

    def create_leaderboards(
            self,
            sheet_names_list=None,
            task_names_list=["Task-Classification", "Task-Extraction", "Task-Generation"],
            task_types_list=["cls", "ext", "gen"],
            output_path=None):
        """Create a leaderboard JSON file from Excel data.

        Args:
            sheet_names_list (list, optional): Sheet names to process.
            task_names_list (list, optional): Task-name column per sheet.
                (Mutable default is safe here: the list is only read.)
            task_types_list (list, optional): Task type per sheet.
            output_path (str, optional): Path where the JSON file is saved.

        Note:
            Creates one leaderboard per call (CoT, Direct, or Few-Shot).
            The output JSON contains model information, task performance,
            and metadata.
        """
        data = {}

        model_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info = self.get_model_information()

        task_info = self.get_sheet_information(sheet_names_list, task_names_list, task_types_list)
        average_performance_info = self.add_average_performance(task_info)

        data["T"] = T_info
        data["Model"] = model_info
        data["Model: Domain"] = domain_info
        data["Model: License"] = license_info
        data["Model: Accessibility"] = accessibility_info
        data["Size (B)"] = displayed_size_info
        data["Model: Size Range"] = hidden_size_info
        data["Average Performance"] = average_performance_info

        # One column per task, after the metadata columns.
        for task in task_info:
            data[task] = task_info[task]

        with open(output_path, 'w') as file:
            json.dump(data, file, indent=4)

    def create_task_information(self, output_path: str):
        """Create a JSON file containing detailed task information.

        Args:
            output_path (str): Path where the task information JSON is saved.

        Note:
            Extracts task metadata from the "Task-all" sheet including
            language, task type, clinical context, data access requirements,
            applications, and clinical stage information.
        """
        task_sheet = self.load_sheet("Task-all")

        info = {}

        # Iterate through the "Task name" column, which holds all task names.
        for idx, task in enumerate(task_sheet["Task name"]):
            if task not in info:
                info[task] = {}

            # Collect all of the attributes for this task's row.
            language = task_sheet["Language"][idx]
            task_type = task_sheet["Task Type - fine grained"][idx]
            clinical_context = task_sheet["Clinical context"][idx]
            data_access = task_sheet["Data Access\nOpen Access (OA) / \nRegulated (R) / \nPhysionet (P) / \nn2c2 (N)"][idx]
            application = task_sheet['Clinical Application'][idx]
            clinical_stage = task_sheet['Clinical Stage'][idx]

            info[task]["Language"] = language.strip()
            info[task]["Task Type"] = task_type.strip()
            info[task]["Clinical Context"] = clinical_context.strip()
            info[task]["Data Access"] = DATA_ACCESS_MAP[data_access.strip()]
            info[task]['Applications'] = application.strip()
            info[task]['Clinical Stage'] = clinical_stage.strip()

        with open(output_path, 'w') as file:
            json.dump(info, file, indent=4)
scripts/helpers/leaderboards.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+
4
+
class LeaderboardProcessor:
    """Incremental leaderboard updater (not yet implemented).

    Holds the destination path for an updated leaderboard JSON; the merge
    logic itself is still a stub.
    """

    def __init__(self, output_path):
        # Destination for the updated leaderboard JSON.
        self.output_path = output_path

    def update_leaderboards(self, old_leaderboard_json, new_models):
        """Merge newly evaluated models into an existing leaderboard.

        Args:
            old_leaderboard_json: JSON file including the previous leaderboard data.
            new_models (list[str]): Names of new models to update the leaderboard with.

        Note:
            Stub — no behavior yet; returns None.
        """
        pass
scripts/helpers/reorganize_indices.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ from collections import OrderedDict
5
+ from typing import Dict, List, Tuple, Optional
6
+
def extract_model_size(model_name: str) -> float:
    """Parse the parameter count (in billions) out of *model_name*.

    Recognizes suffixes like "1.5B", "7B", "70b". Base DeepSeek-R1 (which
    carries no size suffix) is treated as a very large model (999.0); any
    other name without a recognizable size falls back to 50.0 (mid-sized).
    """
    if (match := re.search(r'(\d+(?:\.\d+)?)[Bb]', model_name)) is not None:
        return float(match.group(1))

    # Base DeepSeek-R1 has no explicit size; rank it at the large end.
    if 'DeepSeek-R1' in model_name and 'Distill' not in model_name:
        return 999.0

    # Unknown size: assume mid-sized.
    return 50.0
23
+
def get_size_based_order(models: Dict[str, str]) -> List[Tuple[str, str, float]]:
    """Return models ordered by parsed parameter size.

    Args:
        models: Mapping of original index -> model name.

    Returns:
        List of (original_index, model_name, size) tuples, sorted by size
        ascending, with model name as the tie-breaker.
    """
    entries = [(idx, name, extract_model_size(name)) for idx, name in models.items()]
    entries.sort(key=lambda entry: (entry[2], entry[1]))
    return entries
36
+
def create_size_based_mapping(leaderboard_json_path: str) -> Dict[str, str]:
    """
    Create a mapping from current indices to size-based indices for a specific leaderboard.

    Only the first 8 models are reordered (smallest first); every index from
    8 up to the maximum present maps to itself. Raises on any read/parse
    failure after logging it.
    """
    try:
        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        if 'Model' not in data:
            raise ValueError(f"No 'Model' section found in {leaderboard_json_path}")

        models = data['Model']

        # Get the first 8 models (the main ones we want to reorder)
        first_8_models = {k: v for k, v in list(models.items())[:8]}

        # Get size-based ordering
        sorted_models = get_size_based_order(first_8_models)

        # Create mapping from old index to new index
        mapping = {}
        for new_idx, (old_idx, model_name, size) in enumerate(sorted_models):
            mapping[old_idx] = str(new_idx)
            print(f" {model_name} ({size}B): {old_idx} → {new_idx}")

        # For indices 8 and beyond, they stay the same
        # Now properly handle all models (up to 99 instead of hard-coded 73)
        max_index = max(int(k) for k in models.keys())
        print(f" Total models: {len(models)}, max index: {max_index}")
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)

        return mapping

    except Exception as e:
        # Log and re-raise so callers see the original failure.
        print(f"Error creating mapping for {leaderboard_json_path}: {e}")
        raise
74
+
def reorganize_indices(leaderboard_json_path: str, custom_mapping: Optional[Dict[str, str]] = None):
    """
    Reorganize the indices of a leaderboard JSON file based on model size.

    Rewrites the file IN PLACE: every section's keys are remapped through the
    index mapping and re-emitted in ascending numeric order, so both the key
    values and the physical ordering in the file change.

    Args:
        leaderboard_json_path: Path to the leaderboard JSON file
        custom_mapping: Optional custom mapping dict. If None, will auto-generate based on model sizes.
    """
    try:
        print(f"\nProcessing {leaderboard_json_path}...")

        # Create mapping based on model sizes if not provided
        if custom_mapping is None:
            print(" Creating size-based mapping...")
            mapping = create_size_based_mapping(leaderboard_json_path)
        else:
            mapping = custom_mapping
            print(" Using provided custom mapping...")

        # Load the data
        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        # Create new data structure with proper ordering
        new_data = OrderedDict()

        # Process each section
        for section_name, section_data in data.items():
            new_section = OrderedDict()

            # First, collect all the remapped data with their new indices
            temp_dict = {}
            for old_idx, value in section_data.items():
                # Indices absent from the mapping keep their original key.
                new_idx = mapping.get(old_idx, old_idx)
                temp_dict[int(new_idx)] = value

            # Sort by new index and add to ordered dict - this ensures physical ordering
            for key in sorted(temp_dict.keys()):
                new_section[str(key)] = temp_dict[key]

            new_data[section_name] = new_section

        # Write the reorganized data with proper physical ordering
        with open(leaderboard_json_path, 'w') as f:
            json.dump(new_data, f, indent=4, ensure_ascii=False)

        print(f" ✅ Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

        # Print the new order for verification
        with open(leaderboard_json_path, 'r') as f:
            example_data = json.load(f)

        print(f'\n New model order (first 8) from {leaderboard_json_path}:')
        model_section = example_data['Model']
        model_keys = list(model_section.keys())[:8]
        for i, key in enumerate(model_keys):
            model_name = model_section[key]
            size = extract_model_size(model_name)
            print(f' Position {i} (Index {key}): {model_name} ({size}B)')

    except Exception as e:
        # Log and re-raise so callers see the original failure.
        print(f" ❌ Error processing {leaderboard_json_path}: {e}")
        raise
138
+
def reorganize_all_leaderboards(leaderboard_dir: str = "leaderboards"):
    """Reorganize every standard leaderboard JSON under *leaderboard_dir*.

    Processes the CoT, Zero-Shot, and Few-Shot leaderboards; files that do
    not exist are skipped with a warning.
    """
    print("🔄 Starting reorganization of all leaderboards based on model size...")

    for variant in ("CoT", "Zero-Shot", "Few-Shot"):
        file_path = f"{leaderboard_dir}/{variant}_leaderboard.json"
        if not os.path.exists(file_path):
            print(f" ⚠️ Warning: {file_path} not found, skipping...")
            continue
        reorganize_indices(file_path)

    print("\n✅ All leaderboards have been reorganized!")
158
+
# Legacy function for backward compatibility (but with dynamic range)
def reorganize_indices_legacy(leaderboard_json_path: str):
    """
    Legacy function that uses the old hard-coded mapping style but with dynamic range.
    This is kept for backward compatibility but now properly handles all 99 models.

    Rewrites the leaderboard JSON in place using a fixed mapping for the
    first 8 model indices; all higher indices map to themselves.
    """
    # Create the mapping from old indices to new indices (ordered by model size)
    mapping = {
        '0': '7',  # DeepSeek-R1-Distill-Llama-70B (70B) goes to 7 (end)
        '1': '0',  # DeepSeek-R1-Distill-Qwen-1.5B (1.5B) goes to 0 (start)
        '2': '6',  # DeepSeek-R1 (large model) goes to 6
        '3': '1',  # DeepSeek-R1-Distill-Qwen-7B (7B) goes to 1
        '4': '3',  # DeepSeek-R1-Distill-Qwen-14B (14B) goes to 3
        '5': '2',  # DeepSeek-R1-Distill-Llama-8B (8B) goes to 2
        '6': '5',  # Baichuan-M2-32B (32B) goes to 5
        '7': '4',  # Baichuan-M1-14B-Instruct (14B) goes to 4
    }

    # Dynamically determine the range based on actual data
    with open(leaderboard_json_path, 'r') as f:
        data = json.load(f)

    if 'Model' in data:
        max_index = max(int(k) for k in data['Model'].keys())
        print(f" Found {len(data['Model'])} models (indices 0-{max_index})")

        # For indices 8 and beyond, they stay the same
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)
    else:
        print(" Warning: No 'Model' section found, using default range")
        # Fallback to 99 models (0-98)
        for i in range(8, 99):
            mapping[str(i)] = str(i)

    # Process each JSON file
    print(f"\nProcessing {leaderboard_json_path}...")

    # Re-read the file; the mapping pass above already loaded it once.
    with open(leaderboard_json_path, 'r') as f:
        data = json.load(f)

    # Create new data structure with proper ordering
    new_data = OrderedDict()

    # Process each section
    for section_name, section_data in data.items():
        new_section = OrderedDict()

        # First, collect all the remapped data with their new indices
        temp_dict = {}
        for old_idx, value in section_data.items():
            # Indices absent from the mapping keep their original key.
            new_idx = mapping.get(old_idx, old_idx)
            temp_dict[int(new_idx)] = value

        # Sort by new index and add to ordered dict - this ensures physical ordering
        for key in sorted(temp_dict.keys()):
            new_section[str(key)] = temp_dict[key]

        new_data[section_name] = new_section

    # Write the reorganized data with proper physical ordering
    with open(leaderboard_json_path, 'w') as f:
        json.dump(new_data, f, indent=4, ensure_ascii=False)

    print(f" Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

    # Print the new order for verification
    with open(leaderboard_json_path, 'r') as f:
        example_data = json.load(f)

    print(f'\nNew model order (first 8) from {leaderboard_json_path}:')
    model_section = example_data['Model']
    # Since we're using OrderedDict and sorted insertion, the first 8 entries should be indices 0-7
    model_keys = list(model_section.keys())[:8]
    for i, key in enumerate(model_keys):
        print(f' Position {i} (Index {key}): {model_section[key]}')
scripts/main.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from helpers.excel_processor import ExcelProcessor
2
+ from helpers.reorganize_indices import reorganize_indices
3
+ import json
4
+
def update_ranks(leaderboard_paths=None):
    """Recompute the rank column ('T') of each leaderboard JSON in place.

    Models are ranked 1..N by descending 'Average Performance'; rank 1 is
    the best-performing model. Each file is rewritten with the updated 'T'
    values.

    Args:
        leaderboard_paths (list[str], optional): Paths of leaderboard JSON
            files to update. Defaults to the three standard leaderboards
            (kept for backward compatibility; note these are machine-specific
            absolute paths).
    """
    if leaderboard_paths is None:
        # Fix/generalization: the paths are now a parameter; the previous
        # hard-coded list is kept only as the default.
        leaderboard_paths = [
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/CoT_leaderboard.json",
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/Few-Shot_leaderboard.json",
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/Zero-Shot_leaderboard.json"
        ]

    for leaderboard_path in leaderboard_paths:
        with open(leaderboard_path, 'r') as f:
            data = json.load(f)

        avg_performance_dict = data['Average Performance']

        # (index, score) pairs, best score first; sorted() is stable so ties
        # keep their original relative order, matching the old list.sort().
        ranked = sorted(avg_performance_dict.items(),
                        key=lambda kv: float(kv[1]),
                        reverse=True)

        for rank, (original_idx, _score) in enumerate(ranked, start=1):
            data['T'][original_idx] = rank  # Rank starts from 1

        with open(leaderboard_path, 'w') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
33
+
34
+
def create_leaderboards(
        excel_path: str,
        output_path: str,
        sheet_names_list: list,
        invalid_models=None
        ):

    """
    Build one leaderboard JSON from the Excel workbook.

    Args:
        excel_path: Path to the excel file
        output_path: Path to the output file
        sheet_names_list: List of sheet names to create leaderboards from
        invalid_models: List of models to exclude from the leaderboards
    """
    processor = ExcelProcessor(excel_path, invalid_models)

    # Emit the leaderboard JSON, then rewrite its indices so models are
    # ordered by size.
    processor.create_leaderboards(sheet_names_list=sheet_names_list, output_path=output_path)
    reorganize_indices(output_path)

    # Also regenerate the task metadata JSON.
    processor.create_task_information('task_information.json')
61
+
62
+
def create_all_leaderboards(
        excel_path: str,
        leaderboard_configs: list,
        invalid_models=None
        ):

    """
    Build every configured leaderboard by invoking create_leaderboards once
    per config entry.

    Args:
        excel_path: Path to the excel file
        leaderboard_configs: List of leaderboard configs
        invalid_models: List of models to exclude from the leaderboards
    """
    for config in leaderboard_configs:
        name = config['name']
        print(f"Creating {name} leaderboard...")

        create_leaderboards(
            excel_path,
            config['output_path'],
            config['sheet_names'],
            invalid_models=invalid_models
        )

        print(f"{name} leaderboard created successfully!")
89
+
90
+
if __name__ == "__main__":
    print("***" * 50)
    print("Starting script...")

    # # ######################################################### #
    # # ######################################################### #

    # HOW TO UPDATE LEADERBOARDS
    # 1. Download the new excel sheet and/or update the path to the excel sheet
    # 2. Specify which models to exclude from the leaderboard in "invalid_models" list
    # 3. Run scripts/main.py
    # 4. Done! All leaderboards and task information have been updated.
    # 5. Push to GitHub and deploy to Hugging Face Spaces.

    # # ######################################################### #
    # # ######################################################### #

    # excel_path --> path to the Google Sheet version you want to use (Clinical Benchmark and LLM)
    # NOTE(review): machine-specific absolute path — update before running on
    # another machine.
    excel_path = "/Users/kevinxie/Desktop/projects/BRIDGE-Leaderboard-INTERNAL/Clinical Benchmark and LLM.xlsx"

    # Configuration for all leaderboards: one entry per prompting strategy,
    # each with its output JSON and the three result sheets (CLF/EXT/GEN).
    leaderboard_configs = [
        {
            'name': 'Zero-Shot',
            'output_path': 'leaderboards/Zero-Shot_leaderboard.json',
            'sheet_names': ["B-CLF", "B-EXT", "B-GEN"]
        },
        {
            'name': 'Few-Shot',
            'output_path': 'leaderboards/Few-Shot_leaderboard.json',
            'sheet_names': ["B-CLF-5shot", "B-EXT-5shot", "B-GEN-5shot"]
        },
        {
            'name': 'CoT',
            'output_path': 'leaderboards/CoT_leaderboard.json',
            'sheet_names': ["B-CLF-CoT", "B-EXT-CoT", "B-GEN-CoT"]
        }
    ]

    # Models excluded from every leaderboard.
    invalid_models = [
        "gemma-3-27b-pt",
        "gemma-3-12b-pt",
        "gemma-3-12b-pt-ylab-4-1-1",
        "gemma-3-12b-pt-ylab-8-1-1",
        "gemma-3-12b-pt-ylab-16-1-1"
    ]

    # Create all leaderboards with a single function call
    create_all_leaderboards(excel_path, leaderboard_configs, invalid_models)

    print("***" * 50)
    print("Leaderboards created successfully!")

    # Update the ranks of the leaderboards (leftmost column)
    update_ranks()

    print("***" * 50)
    print("Ranks updated successfully!")
    print("***" * 50)
    print("Complete!")
151
+