Kevin Xie committed on
Commit
3070e58
·
1 Parent(s): 9aca690

Upload main processing scripts for the leaderboard

Browse files
scripts/helpers/CONSTANTS.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Maps the raw dataset/task identifiers used in the source Excel workbook to
# the shorter display names shown on the leaderboard.
# NOTE(review): a few display names look like typos ("Cantemis-NER" /
# "Cantemis-Norm" vs "Cantemist", "CARES ICD10 Block" vs the hyphenated
# "CARES-ICD10 ..." siblings) — confirm these are the intended labels before
# changing them, since the front end may key on them.
TASK_MAPPING = {
    "1-1.ADE-ADE identification": "ADE-Identification",
    "1-2.ADE-ADE relation": "ADE-Extraction",
    "1-3.ADE-Drug dosage": "ADE-Drug dosage",
    "3-2.BARR2-resolution": "BARR2",
    "5.BrainMRI-AIS": "BrainMRI-AIS",
    "6.Brateca.hospitalization": "Brateca-Hospitalization",
    "6.Brateca.mortality": "Brateca-Mortality",
    "7.Cantemist.CODING": "Cantemist-Coding",
    "7.Cantemist.NER": "Cantemis-NER",
    "7.Cantemist.Norm": "Cantemis-Norm",
    "8.CARES.area": "CARES-Area",
    "8.CARES.icd10_block": "CARES ICD10 Block",
    "8.CARES.icd10_chapter": "CARES-ICD10 Chapter",
    "8.CARES.icd10_sub_block": "CARES-ICD10 Subblock",
    "9.CHIP-CDEE": "CHIP-CDEE",
    "12.C-EMRS": "C-EMRS",
    "17-1.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-CM": "CodiEsp-ICD-10-CM",
    "17-2.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-PCS": "CodiEsp-ICD-10-PCS",
    "19.ClinicalNotes-UPMC": "ClinicalNotes-UPMC",
    "20.clinical records from the Mexican Social Security Institute": "PPTS",
    "21.CLINpt": "CLINpt-NER",
    "22.CLIP": "CLIP",
    "23.cMedQA": "cMedQA",
    "26.DialMed": "DialMed",
    "27.DiSMed": "DiSMed-NER",
    "28.MIE": "MIE",
    "29.EHRQA.primary_department": "EHRQA-Primary department",
    "29.EHRQA.qa": "EHRQA-QA",
    "29.EHRQA.sub_department": "EHRQA-Sub department",
    "31.Ex4CDS": "Ex4CDS",
    "33.GOUT-CC.consensus": "GOUT-CC-Consensus",
    "35.n2c2 2006 - De-identification": "n2c2 2006-De-identification",
    "37.i2b2-2009-Medication-Extraction-Challenge": "Medication extraction",
    "38-1.i2b2-2010-Relations-Challenge-concept": "n2c2 2010-Concept",
    "38-2.i2b2-2010-Relations-Challenge-assertion": "n2c2 2010-Assertion",
    "38-3.i2b2-2010-Relations-Challenge-relation": "n2c2 2010-Relation",
    "41.n2c2 2014 - De-identification": "n2c2 2014-De-identification",
    "43.IMCS-V2-NER": "IMCS-V2-NER",
    "46.Japanese Case Reports": "JP-STS",
    "48.meddocan": "meddocan",
    "51.MEDIQA_2019_Task2_RQE": "MEDIQA 2019-RQE",
    "55.MedNLI": "MedNLI",
    "57.MedSTS": "MedSTS",
    "62.mtsamples": "MTS",
    "63.MTSamples-temporal annotation": "MTS-Temporal",
    "65.n2c2-2018-Track2-Adverse-Drug-Events-and-Medication-Extraction": "n2c2 2018-ADE&medication",
    "66-1.NorSynthClinical-entity": "NorSynthClinical-NER",
    "66-2.NorSynthClinical-relation": "NorSynthClinical-RE",
    "68.NUBES": "NUBES",
    "76-1.MTS-Dialog-MEDIQA-2023-chat-task-A": "MEDIQA 2023-chat-A",
    "76-2.MTS-Dialog-MEDIQA-2023-sum-task-A": "MEDIQA 2023-sum-A",
    "76-3.MTS-Dialog-MEDIQA-2023-sum-task-B": "MEDIQA 2023-sum-B",
    "80.RuMedDaNet": "RuMedDaNet",
    "81.CHIP-CDN": "CBLUE-CDN",
    "82.CHIP-CTC": "CHIP-CTC",
    "83.CHIP-MDCFNPC": "CHIP-MDCFNPC",
    "84.MedDG": "MedDG",
    "85.IMCS-V2-SR": "IMCS-V2-SR",
    "86.IMCS-V2-MRG": "IMCS-V2-MRG",
    "87.IMCS-V2-DAC": "IMCS-V2-DAC",
    "90-1.n2c2 2014 - Heart Disease Challenge - Diabete": "n2c2 2014-Diabetes",
    "90-2.n2c2 2014 - Heart Disease Challenge - CAD": "n2c2 2014-CAD",
    "90-3.n2c2 2014 - Heart Disease Challenge - Hyperlipidemia": "n2c2 2014-Hyperlipidemia",
    "90-4.n2c2 2014 - Heart Disease Challenge - Hypertension": "n2c2 2014-Hypertension",
    "90-8.n2c2 2014 - Heart Disease Challenge - Medication": "n2c2 2014-Medication",
    "91-1.CAS.label": "CAS-label",
    "91-2.CAS.evidence": "CAS-evidence",
    "93.RuMedNLI": "RuMedNLI",
    "94.RuDReC": "RuDReC-NER",
    "95.NorSynthClinical-PHI": "NorSynthClinical-PHI",
    "96.RuCCoN.NER": "RuCCoN",
    "97.CLISTER": "CLISTER",
    "98.BRONCO150.NER_status": "BRONCO150-NER&Status",
    "99.CARDIO:DE": "CARDIO-DE",
    "100.GraSSCo_PHI": "GraSSCo PHI",
    "101.IFMIR.IncidentType": "IFMIR-Incident type",
    "101.IFMIR.NER": "IFMIR-NER",
    "101.IFMIR.NER_factuality": "IFMIR - NER&factuality",
    "102.iCorpus": "iCorpus",
    "103.icliniq-10k": "icliniq-10k",
    "104.HealthCareMagic-100k": "HealthCareMagic-100k",
    "105.MIMIC-IV CDM": "MIMIC-IV CDM",
    "106.MIMIC-III Outcome.LoS": "MIMIC-III Outcome.LoS",
    "106.MIMIC-III Outcome.Mortality": "MIMIC-III Outcome.Mortality",
    "107.MIMIC-IV BHC": "MIMIC-IV BHC",
    "108.MIMIC-IV DiReCT.Dis": "MIMIC-IV DiReCT.Dis",
    "108.MIMIC-IV DiReCT.PDD": "MIMIC-IV DiReCT.PDD"
}
90
+
91
+
# Collapses the fine-grained data-access codes used in the "Task-all" sheet
# (Open Access / Regulated / PhysioNet / n2c2) into the two categories
# surfaced on the leaderboard.
DATA_ACCESS_MAP = {
    "OA": "Open Access",
    "R": "Regulated",
    "P": "Regulated",  # PhysioNet-gated data is displayed as "Regulated"
    "N": "Regulated"   # n2c2-gated data is displayed as "Regulated"
}
98
+
# Mappings
# Identity mapping for the model "Domain" column; kept as a dict so that
# unexpected domain values fail loudly (KeyError) instead of leaking through.
DOMAIN_MAPPING = {
    "General": "General",
    "Medical": "Medical"
}
104
+
# Maps each raw license string from the Models sheet to the accessibility
# category used by the leaderboard filter. The filter exposes only two
# categories ("Open Source", "Proprietary"); the table itself still shows the
# full license names.
# NOTE(review): "PhysioNet Credentialed Health Data License 1.5.0" and
# "Health AI Developer Foundations terms of use" are categorized as
# "Open Source" here — confirm that is the intended bucketing.
LICENSE_MAPPING = {
    "Baichuan-M1-14B": "Open Source",
    "MIT": "Open Source",
    "Gemma": "Open Source",
    "Llama-3.1": "Open Source",
    "Llama-3.3": "Open Source",
    "Apache 2.0": "Open Source",
    "PhysioNet Credentialed Health Data License 1.5.0": "Open Source",
    "Llama-3": "Open Source",
    "MRL": "Open Source",
    "Qwen": "Open Source",
    "Nexusflow Research License": "Open Source",
    "Proprietary": "Proprietary",
    "Llama-4": "Open Source",
    "Health AI Developer Foundations terms of use": "Open Source",
    "llama2": "Open Source"
}
124
+
scripts/helpers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .CONSTANTS import *
2
+ from .excel_processor import ExcelProcessor
3
+ from .reorganize_indices import reorganize_indices
scripts/helpers/excel_processor.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from .CONSTANTS import *
4
+
class ExcelProcessor:
    """Converts the benchmark Excel workbook into the JSON files consumed by
    the leaderboard front end.

    The workbook is expected to contain a "Models (Simplified)" sheet with
    model metadata, one result sheet per task group (classification /
    extraction / generation), and a "Task-all" sheet with task metadata.
    """

    def __init__(self, excel_path, invalid_models=None):
        """Initialize the ExcelProcessor with an Excel file.

        Args:
            excel_path (str): Path to the Excel file containing model and task data.
            invalid_models (list, optional): Model names to exclude from the
                leaderboard. Defaults to None (nothing excluded).
        """
        self.sheet_path = excel_path
        self.excel_data = self.load_excel()
        self.model_sheet = self.load_sheet("Models (Simplified)")
        # Fix: normalize None to an empty list so membership tests in
        # get_valid_models() never raise "'NoneType' is not iterable".
        self.invalid_models = invalid_models if invalid_models is not None else []

        print("You have excluded the following models: ", self.invalid_models)

        # Get all of the valid models (exclude invalid models)
        self.valid_models = self.get_valid_models(self.invalid_models)

    def load_excel(self):
        """Load the Excel file into a pandas ExcelFile object.

        Returns:
            pd.ExcelFile: The loaded Excel file object.
        """
        return pd.ExcelFile(self.sheet_path)

    def load_sheet(self, sheet_name):
        """Load a specific sheet from the Excel file.

        Args:
            sheet_name (str): Name of the sheet to load.

        Returns:
            pd.DataFrame: The loaded sheet as a pandas DataFrame.
        """
        return self.excel_data.parse(sheet_name)

    def get_valid_models(self, invalid_models=None):
        """Get all valid models from the Models sheet, excluding invalid ones.

        Args:
            invalid_models (list, optional): Model names to exclude.
                Defaults to None (nothing excluded).

        Returns:
            list: Valid model names, in sheet order.
        """
        # Fix: tolerate invalid_models=None; use a set for O(1) membership.
        excluded = set(invalid_models) if invalid_models else set()
        return [name for name in self.model_sheet["Name"] if name not in excluded]

    def get_valid_columns(self, sheet_name):
        """Get all non-empty columns from a specified sheet.

        Args:
            sheet_name (str): Name of the sheet to analyze.

        Returns:
            list: Valid column names stripped of surrounding whitespace
                (pandas "Unnamed: N" placeholder columns are skipped).
        """
        valid_columns = []
        for column in self.load_sheet(sheet_name).columns:
            # pandas names empty header cells "Unnamed: <n>"; skip those.
            if column.split(' ')[0] != "Unnamed:":
                valid_columns.append(column.strip())
        return valid_columns

    def get_model_information(self,
                              sheet_name="Models (Simplified)",
                              name_column="Name",
                              domain_column="Domain",
                              license_column="License",
                              size_column="Size (B)",
                              ):
        """Extract model information from the Models sheet.

        Args:
            sheet_name (str, optional): Name of the sheet containing model info.
                Defaults to "Models (Simplified)".
            name_column (str, optional): Column containing model names.
            domain_column (str, optional): Column containing model domains.
            license_column (str, optional): Column containing license info.
            size_column (str, optional): Column containing model sizes.

        Returns:
            tuple: 7 dicts, each keyed by the model's string index:
                - model_name_info: model names
                - domain_info: domains mapped through DOMAIN_MAPPING
                - license_info: license names (abbreviated if too long)
                - accessibility_info: accessibility via LICENSE_MAPPING
                - displayed_size_info: raw size values for display
                - hidden_size_info: size-range buckets for filtering
                - T_info: placeholder markers for the rank column
        """
        model_sheet = self.load_sheet(sheet_name)

        # Everything to be returned.
        T_info = {}
        model_name_info = {}
        domain_info = {}
        license_info = {}
        accessibility_info = {}
        displayed_size_info = {}  # shown on leaderboard
        hidden_size_info = {}     # hidden column used by the size filter

        def map_size(param_size):
            """Map a parameter size to a predefined range bucket.

            Args:
                param_size: The parameter size value ("/" and "Unknown" are
                    sentinel values from the sheet).

            Returns:
                str: Size range category.
            """
            if param_size == "/":
                return "None"
            if param_size == "Unknown":
                return "Unknown"
            # Fix: float() instead of int() so fractional sizes such as
            # "1.5" (1.5B models) don't raise ValueError.
            size = float(param_size)
            if size < 5:
                return "0-5"
            elif size < 10:
                return "5-10"
            elif size < 40:
                return "10-40"
            elif size < 80:
                return "40-80"
            else:
                return ">80"

        i = 0
        for name, domain, license_name, size in zip(model_sheet[name_column],
                                                    model_sheet[domain_column],
                                                    model_sheet[license_column],
                                                    model_sheet[size_column]):

            # If it is a valid model (used in evaluation)
            if name in self.valid_models:
                # Placeholder marker; replaced with the numeric rank later
                # (see update_ranks in scripts/main.py).
                T_info[f"{i}"] = "\ud83d\udd36"
                model_name_info[f"{i}"] = name

                domain_info[f"{i}"] = DOMAIN_MAPPING[domain]

                if license_name == "PhysioNet Credentialed Health Data License 1.5.0":
                    license_info[f"{i}"] = "PhysioNet 1.5.0"  # Abbreviate license name to fit on leaderboard
                else:
                    license_info[f"{i}"] = license_name

                accessibility_info[f"{i}"] = LICENSE_MAPPING[license_name]
                displayed_size_info[f"{i}"] = size
                hidden_size_info[f"{i}"] = map_size(size)

                i += 1

            else:
                print("Invalid model: ", name)

        return model_name_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info

    def get_sheet_information(self, sheets_list, task_names_list, task_types_list):
        """Extract task performance information from specified sheets.

        Args:
            sheets_list (list): Sheet names to process.
            task_names_list (list): Task-name column for each sheet.
            task_types_list (list): Task type ('cls', 'ext', 'gen') for each sheet.

        Returns:
            dict: {task_name: {model_index: performance_score}}
        """
        task_info = {}

        for idx, sheet in enumerate(sheets_list):
            # Get the task type (tt)
            tt = task_types_list[idx]

            model_sheet = self.load_sheet(sheet)

            # Name of the column holding task names (i.e. 1.1-ADE Identification)
            task_name = task_names_list[idx]

            for i, t in enumerate(model_sheet['Task Type']):
                # The first row is a header/summary row; skip it.
                if i == 0:
                    continue

                # "-" marks the end of the task rows in the sheet.
                if t == "-":
                    break

                row = i
                task_counter = 0

                for model in self.valid_models:
                    column_name = model.strip()

                    # Result columns use shortened model names; map canonical
                    # model names to their column headers.
                    if column_name == "gpt-35-turbo-0125":
                        column_name = "gpt-35-turbo"
                    elif column_name == "gpt-4o-0806":
                        column_name = "gpt-4o"
                    elif column_name == "gemini-2.0-flash-001":
                        column_name = "gemini-2.0-flash"
                    elif column_name == "gemini-1.5-pro-002":
                        column_name = "gemini-1.5-pro"

                    if column_name == "gpt-oss-20b":
                        column_name = "gpt-oss-20b-high"
                    elif column_name == "gpt-oss-120b":
                        column_name = "gpt-oss-120b-high"

                    # Extraction and generation sheets carry their scores in a
                    # duplicated column that pandas suffixes with ".1".
                    if tt in ('ext', 'gen'):
                        column_name = column_name + '.1'

                    # Raw task name for this row (i.e. 1.1-ADE Identification)
                    task = model_sheet[task_name][row]

                    # Fix: check the sentinel BEFORE mapping — "Average score"
                    # is not a TASK_MAPPING key, so the old order would raise
                    # KeyError before the check could ever fire.
                    if task == "Average score":
                        break

                    # Update task name to its leaderboard display name
                    task = TASK_MAPPING[task]

                    if task not in task_info:
                        task_info[task] = {}

                    # Cells look like "<score> ..." — keep the leading number only.
                    task_info[task][f"{task_counter}"] = round(float(model_sheet[column_name.strip()][row].split(" ")[0]), 2)
                    task_counter += 1

        return task_info

    def add_average_performance(self, task_info):
        """Calculate average performance across all tasks for each model.

        Args:
            task_info (dict): {task_name: {model_index: performance_score}}

        Returns:
            dict: {model_index: average_score (str, rounded to 2 decimals)}
        """
        # Fix: return an empty dict for empty input instead of crashing with
        # an UnboundLocalError on `n`.
        if not task_info:
            return {}

        # All tasks share the same model count; read it off the first one.
        n = len(next(iter(task_info.values())))

        average_performance_info = {}
        for i in range(n):
            scores = [float(per_model[str(i)]) for per_model in task_info.values()]
            average_performance_info[f"{i}"] = str(round(sum(scores) / len(scores), 2))

        return average_performance_info

    def create_leaderboards(
            self,
            sheet_names_list=None,
            task_names_list=["Task-Classification", "Task-Extraction", "Task-Generation"],
            task_types_list=["cls", "ext", "gen"],
            output_path=None):
        """Create a leaderboard JSON file from Excel data.

        Args:
            sheet_names_list (list, optional): Sheet names to process.
            task_names_list (list, optional): Task-name column per sheet.
                (Mutable default is safe here: the list is only read.)
            task_types_list (list, optional): Task type per sheet.
            output_path (str, optional): Path where the JSON file is saved.

        Note:
            Creates one leaderboard per call (CoT, Direct, or Few-Shot).
            The output JSON contains model information, task performance,
            and metadata.
        """
        data = {}

        model_info, domain_info, license_info, accessibility_info, displayed_size_info, hidden_size_info, T_info = self.get_model_information()

        task_info = self.get_sheet_information(sheet_names_list, task_names_list, task_types_list)
        average_performance_info = self.add_average_performance(task_info)

        data["T"] = T_info
        data["Model"] = model_info
        data["Model: Domain"] = domain_info
        data["Model: License"] = license_info
        data["Model: Accessibility"] = accessibility_info
        data["Size (B)"] = displayed_size_info
        data["Model: Size Range"] = hidden_size_info
        data["Average Performance"] = average_performance_info

        # One column per task, after the metadata columns.
        for task in task_info:
            data[task] = task_info[task]

        with open(output_path, 'w') as file:
            json.dump(data, file, indent=4)

    def create_task_information(self, output_path: str):
        """Create a JSON file containing detailed task information.

        Args:
            output_path (str): Path where the task information JSON is saved.

        Note:
            Extracts task metadata from the "Task-all" sheet including
            language, task type, clinical context, data access requirements,
            applications, and clinical stage information.
        """
        task_sheet = self.load_sheet("Task-all")

        info = {}

        # Iterate through the "Task name" column, which holds all task names.
        for idx, task in enumerate(task_sheet["Task name"]):
            if task not in info:
                info[task] = {}

            # Collect all of the attributes for this task's row.
            language = task_sheet["Language"][idx]
            task_type = task_sheet["Task Type - fine grained"][idx]
            clinical_context = task_sheet["Clinical context"][idx]
            data_access = task_sheet["Data Access\nOpen Access (OA) / \nRegulated (R) / \nPhysionet (P) / \nn2c2 (N)"][idx]
            application = task_sheet['Clinical Application'][idx]
            clinical_stage = task_sheet['Clinical Stage'][idx]

            info[task]["Language"] = language.strip()
            info[task]["Task Type"] = task_type.strip()
            info[task]["Clinical Context"] = clinical_context.strip()
            info[task]["Data Access"] = DATA_ACCESS_MAP[data_access.strip()]
            info[task]['Applications'] = application.strip()
            info[task]['Clinical Stage'] = clinical_stage.strip()

        with open(output_path, 'w') as file:
            json.dump(info, file, indent=4)
scripts/helpers/leaderboards.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+
4
+
class LeaderboardProcessor:
    """Incremental leaderboard updater (not yet implemented).

    Holds the destination path for an updated leaderboard JSON; the merge
    logic itself is still a stub.
    """

    def __init__(self, output_path):
        # Destination for the updated leaderboard JSON.
        self.output_path = output_path

    def update_leaderboards(self, old_leaderboard_json, new_models):
        """Merge newly evaluated models into an existing leaderboard.

        Args:
            old_leaderboard_json: JSON file including the previous leaderboard data.
            new_models (list[str]): Names of new models to update the leaderboard with.

        Note:
            Stub — no behavior yet; returns None.
        """
        pass
scripts/helpers/reorganize_indices.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ from collections import OrderedDict
5
+ from typing import Dict, List, Tuple, Optional
6
+
def extract_model_size(model_name: str) -> float:
    """Parse the parameter count (in billions) out of *model_name*.

    Recognizes suffixes like "1.5B", "7B", "70b". Base DeepSeek-R1 (which
    carries no size suffix) is treated as a very large model (999.0); any
    other name without a recognizable size falls back to 50.0 (mid-sized).
    """
    if (match := re.search(r'(\d+(?:\.\d+)?)[Bb]', model_name)) is not None:
        return float(match.group(1))

    # Base DeepSeek-R1 has no explicit size; rank it at the large end.
    if 'DeepSeek-R1' in model_name and 'Distill' not in model_name:
        return 999.0

    # Unknown size: assume mid-sized.
    return 50.0
23
+
def get_size_based_order(models: Dict[str, str]) -> List[Tuple[str, str, float]]:
    """Return models ordered by parsed parameter size.

    Args:
        models: Mapping of original index -> model name.

    Returns:
        List of (original_index, model_name, size) tuples, sorted by size
        ascending, with model name as the tie-breaker.
    """
    entries = [(idx, name, extract_model_size(name)) for idx, name in models.items()]
    entries.sort(key=lambda entry: (entry[2], entry[1]))
    return entries
36
+
def create_size_based_mapping(leaderboard_json_path: str) -> Dict[str, str]:
    """
    Create a mapping from current indices to size-based indices for a specific leaderboard.

    Only the first 8 models are reordered (smallest first); every index from
    8 up to the maximum present maps to itself. Raises on any read/parse
    failure after logging it.
    """
    try:
        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        if 'Model' not in data:
            raise ValueError(f"No 'Model' section found in {leaderboard_json_path}")

        models = data['Model']

        # Get the first 8 models (the main ones we want to reorder)
        first_8_models = {k: v for k, v in list(models.items())[:8]}

        # Get size-based ordering
        sorted_models = get_size_based_order(first_8_models)

        # Create mapping from old index to new index
        mapping = {}
        for new_idx, (old_idx, model_name, size) in enumerate(sorted_models):
            mapping[old_idx] = str(new_idx)
            print(f" {model_name} ({size}B): {old_idx} → {new_idx}")

        # For indices 8 and beyond, they stay the same
        # Now properly handle all models (up to 99 instead of hard-coded 73)
        max_index = max(int(k) for k in models.keys())
        print(f" Total models: {len(models)}, max index: {max_index}")
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)

        return mapping

    except Exception as e:
        # Log and re-raise so callers see the original failure.
        print(f"Error creating mapping for {leaderboard_json_path}: {e}")
        raise
74
+
def reorganize_indices(leaderboard_json_path: str, custom_mapping: Optional[Dict[str, str]] = None):
    """
    Reorganize the indices of a leaderboard JSON file based on model size.

    Rewrites the file IN PLACE: every section's keys are remapped through the
    index mapping and re-emitted in ascending numeric order, so both the key
    values and the physical ordering in the file change.

    Args:
        leaderboard_json_path: Path to the leaderboard JSON file
        custom_mapping: Optional custom mapping dict. If None, will auto-generate based on model sizes.
    """
    try:
        print(f"\nProcessing {leaderboard_json_path}...")

        # Create mapping based on model sizes if not provided
        if custom_mapping is None:
            print(" Creating size-based mapping...")
            mapping = create_size_based_mapping(leaderboard_json_path)
        else:
            mapping = custom_mapping
            print(" Using provided custom mapping...")

        # Load the data
        with open(leaderboard_json_path, 'r') as f:
            data = json.load(f)

        # Create new data structure with proper ordering
        new_data = OrderedDict()

        # Process each section
        for section_name, section_data in data.items():
            new_section = OrderedDict()

            # First, collect all the remapped data with their new indices
            temp_dict = {}
            for old_idx, value in section_data.items():
                # Indices absent from the mapping keep their original key.
                new_idx = mapping.get(old_idx, old_idx)
                temp_dict[int(new_idx)] = value

            # Sort by new index and add to ordered dict - this ensures physical ordering
            for key in sorted(temp_dict.keys()):
                new_section[str(key)] = temp_dict[key]

            new_data[section_name] = new_section

        # Write the reorganized data with proper physical ordering
        with open(leaderboard_json_path, 'w') as f:
            json.dump(new_data, f, indent=4, ensure_ascii=False)

        print(f" ✅ Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

        # Print the new order for verification
        with open(leaderboard_json_path, 'r') as f:
            example_data = json.load(f)

        print(f'\n New model order (first 8) from {leaderboard_json_path}:')
        model_section = example_data['Model']
        model_keys = list(model_section.keys())[:8]
        for i, key in enumerate(model_keys):
            model_name = model_section[key]
            size = extract_model_size(model_name)
            print(f' Position {i} (Index {key}): {model_name} ({size}B)')

    except Exception as e:
        # Log and re-raise so callers see the original failure.
        print(f" ❌ Error processing {leaderboard_json_path}: {e}")
        raise
138
+
def reorganize_all_leaderboards(leaderboard_dir: str = "leaderboards"):
    """Reorganize every standard leaderboard JSON under *leaderboard_dir*.

    Processes the CoT, Zero-Shot, and Few-Shot leaderboards; files that do
    not exist are skipped with a warning.
    """
    print("🔄 Starting reorganization of all leaderboards based on model size...")

    for variant in ("CoT", "Zero-Shot", "Few-Shot"):
        file_path = f"{leaderboard_dir}/{variant}_leaderboard.json"
        if not os.path.exists(file_path):
            print(f" ⚠️ Warning: {file_path} not found, skipping...")
            continue
        reorganize_indices(file_path)

    print("\n✅ All leaderboards have been reorganized!")
158
+
# Legacy function for backward compatibility (but with dynamic range)
def reorganize_indices_legacy(leaderboard_json_path: str):
    """
    Legacy function that uses the old hard-coded mapping style but with dynamic range.
    This is kept for backward compatibility but now properly handles all 99 models.

    Rewrites the leaderboard JSON in place using a fixed mapping for the
    first 8 model indices; all higher indices map to themselves.
    """
    # Create the mapping from old indices to new indices (ordered by model size)
    mapping = {
        '0': '7',  # DeepSeek-R1-Distill-Llama-70B (70B) goes to 7 (end)
        '1': '0',  # DeepSeek-R1-Distill-Qwen-1.5B (1.5B) goes to 0 (start)
        '2': '6',  # DeepSeek-R1 (large model) goes to 6
        '3': '1',  # DeepSeek-R1-Distill-Qwen-7B (7B) goes to 1
        '4': '3',  # DeepSeek-R1-Distill-Qwen-14B (14B) goes to 3
        '5': '2',  # DeepSeek-R1-Distill-Llama-8B (8B) goes to 2
        '6': '5',  # Baichuan-M2-32B (32B) goes to 5
        '7': '4',  # Baichuan-M1-14B-Instruct (14B) goes to 4
    }

    # Dynamically determine the range based on actual data
    with open(leaderboard_json_path, 'r') as f:
        data = json.load(f)

    if 'Model' in data:
        max_index = max(int(k) for k in data['Model'].keys())
        print(f" Found {len(data['Model'])} models (indices 0-{max_index})")

        # For indices 8 and beyond, they stay the same
        for i in range(8, max_index + 1):
            mapping[str(i)] = str(i)
    else:
        print(" Warning: No 'Model' section found, using default range")
        # Fallback to 99 models (0-98)
        for i in range(8, 99):
            mapping[str(i)] = str(i)

    # Process each JSON file
    print(f"\nProcessing {leaderboard_json_path}...")

    # Re-read the file; the mapping pass above already loaded it once.
    with open(leaderboard_json_path, 'r') as f:
        data = json.load(f)

    # Create new data structure with proper ordering
    new_data = OrderedDict()

    # Process each section
    for section_name, section_data in data.items():
        new_section = OrderedDict()

        # First, collect all the remapped data with their new indices
        temp_dict = {}
        for old_idx, value in section_data.items():
            # Indices absent from the mapping keep their original key.
            new_idx = mapping.get(old_idx, old_idx)
            temp_dict[int(new_idx)] = value

        # Sort by new index and add to ordered dict - this ensures physical ordering
        for key in sorted(temp_dict.keys()):
            new_section[str(key)] = temp_dict[key]

        new_data[section_name] = new_section

    # Write the reorganized data with proper physical ordering
    with open(leaderboard_json_path, 'w') as f:
        json.dump(new_data, f, indent=4, ensure_ascii=False)

    print(f" Successfully reorganized indices and physical ordering in {leaderboard_json_path}")

    # Print the new order for verification
    with open(leaderboard_json_path, 'r') as f:
        example_data = json.load(f)

    print(f'\nNew model order (first 8) from {leaderboard_json_path}:')
    model_section = example_data['Model']
    # Since we're using OrderedDict and sorted insertion, the first 8 entries should be indices 0-7
    model_keys = list(model_section.keys())[:8]
    for i, key in enumerate(model_keys):
        print(f' Position {i} (Index {key}): {model_section[key]}')
scripts/main.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from helpers.excel_processor import ExcelProcessor
2
+ from helpers.reorganize_indices import reorganize_indices
3
+ import json
4
+
def update_ranks(leaderboard_paths=None):
    """Recompute the rank column ('T') of each leaderboard JSON in place.

    Models are ranked 1..N by descending 'Average Performance'; rank 1 is
    the best-performing model. Each file is rewritten with the updated 'T'
    values.

    Args:
        leaderboard_paths (list[str], optional): Paths of leaderboard JSON
            files to update. Defaults to the three standard leaderboards
            (kept for backward compatibility; note these are machine-specific
            absolute paths).
    """
    if leaderboard_paths is None:
        # Fix/generalization: the paths are now a parameter; the previous
        # hard-coded list is kept only as the default.
        leaderboard_paths = [
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/CoT_leaderboard.json",
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/Few-Shot_leaderboard.json",
            "/Users/kevinxie/Desktop/projects/BRIDGE-Medical-Leaderboard/leaderboards/Zero-Shot_leaderboard.json"
        ]

    for leaderboard_path in leaderboard_paths:
        with open(leaderboard_path, 'r') as f:
            data = json.load(f)

        avg_performance_dict = data['Average Performance']

        # (index, score) pairs, best score first; sorted() is stable so ties
        # keep their original relative order, matching the old list.sort().
        ranked = sorted(avg_performance_dict.items(),
                        key=lambda kv: float(kv[1]),
                        reverse=True)

        for rank, (original_idx, _score) in enumerate(ranked, start=1):
            data['T'][original_idx] = rank  # Rank starts from 1

        with open(leaderboard_path, 'w') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
33
+
34
+
def create_leaderboards(
        excel_path: str,
        output_path: str,
        sheet_names_list: list,
        invalid_models=None
        ):

    """
    Build one leaderboard JSON from the Excel workbook.

    Args:
        excel_path: Path to the excel file
        output_path: Path to the output file
        sheet_names_list: List of sheet names to create leaderboards from
        invalid_models: List of models to exclude from the leaderboards
    """
    processor = ExcelProcessor(excel_path, invalid_models)

    # Emit the leaderboard JSON, then rewrite its indices so models are
    # ordered by size.
    processor.create_leaderboards(sheet_names_list=sheet_names_list, output_path=output_path)
    reorganize_indices(output_path)

    # Also regenerate the task metadata JSON.
    processor.create_task_information('task_information.json')
61
+
62
+
def create_all_leaderboards(
        excel_path: str,
        leaderboard_configs: list,
        invalid_models=None
        ):

    """
    Build every configured leaderboard by invoking create_leaderboards once
    per config entry.

    Args:
        excel_path: Path to the excel file
        leaderboard_configs: List of leaderboard configs
        invalid_models: List of models to exclude from the leaderboards
    """
    for config in leaderboard_configs:
        name = config['name']
        print(f"Creating {name} leaderboard...")

        create_leaderboards(
            excel_path,
            config['output_path'],
            config['sheet_names'],
            invalid_models=invalid_models
        )

        print(f"{name} leaderboard created successfully!")
89
+
90
+
if __name__ == "__main__":
    print("***" * 50)
    print("Starting script...")

    # # ######################################################### #
    # # ######################################################### #

    # HOW TO UPDATE LEADERBOARDS
    # 1. Download the new excel sheet and/or update the path to the excel sheet
    # 2. Specify which models to exclude from the leaderboard in "invalid_models" list
    # 3. Run scripts/main.py
    # 4. Done! All leaderboards and task information have been updated.
    # 5. Push to GitHub and deploy to Hugging Face Spaces.

    # # ######################################################### #
    # # ######################################################### #

    # excel_path --> path to the Google Sheet version you want to use (Clinical Benchmark and LLM)
    # NOTE(review): machine-specific absolute path — update before running on
    # another machine.
    excel_path = "/Users/kevinxie/Desktop/projects/BRIDGE-Leaderboard-INTERNAL/Clinical Benchmark and LLM.xlsx"

    # Configuration for all leaderboards: one entry per prompting strategy,
    # each with its output JSON and the three result sheets (CLF/EXT/GEN).
    leaderboard_configs = [
        {
            'name': 'Zero-Shot',
            'output_path': 'leaderboards/Zero-Shot_leaderboard.json',
            'sheet_names': ["B-CLF", "B-EXT", "B-GEN"]
        },
        {
            'name': 'Few-Shot',
            'output_path': 'leaderboards/Few-Shot_leaderboard.json',
            'sheet_names': ["B-CLF-5shot", "B-EXT-5shot", "B-GEN-5shot"]
        },
        {
            'name': 'CoT',
            'output_path': 'leaderboards/CoT_leaderboard.json',
            'sheet_names': ["B-CLF-CoT", "B-EXT-CoT", "B-GEN-CoT"]
        }
    ]

    # Models excluded from every leaderboard.
    invalid_models = [
        "gemma-3-27b-pt",
        "gemma-3-12b-pt",
        "gemma-3-12b-pt-ylab-4-1-1",
        "gemma-3-12b-pt-ylab-8-1-1",
        "gemma-3-12b-pt-ylab-16-1-1"
    ]

    # Create all leaderboards with a single function call
    create_all_leaderboards(excel_path, leaderboard_configs, invalid_models)

    print("***" * 50)
    print("Leaderboards created successfully!")

    # Update the ranks of the leaderboards (leftmost column)
    update_ranks()

    print("***" * 50)
    print("Ranks updated successfully!")
    print("***" * 50)
    print("Complete!")
151
+