"""Compute the AUP leaderboard from the YAML result files of d3LLM_Code."""

import math
from pathlib import Path

import pandas as pd
import yaml


# AUP calculation (from d3LLM_Code/aup_utils.py)
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
    """Return the quality weight ``W(y) = min(exp(-alpha * (1 - y / y_max)), 1)``.

    Args:
        y: Accuracy of the point being weighted.
        y_max: Reference (best) accuracy the point is compared against.
        alpha: Decay rate of the exponential penalty.

    Returns:
        The weight, never larger than 1.0.
    """
    if y_max == 0:
        # The ratio y / y_max is undefined here. Use the limit of the formula
        # for y_max -> 0+ (with alpha > 0): full weight at or above the
        # reference, zero weight below it.
        return 1.0 if y >= 0 else 0.0
    exponent = -alpha * (1 - y / y_max)
    # Clamp before exponentiating: math.exp raises OverflowError for large
    # positive exponents, which min(..., 1.0) would have capped at 1 anyway.
    if exponent >= 0:
        return 1.0
    return math.exp(exponent)


def get_aup(rho: list, y: list, y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0) -> float:
    """Calculate the AUP (Accuracy Under Parallelism) score of one curve.

    The points are sorted by ``rho``. Points whose accuracy is more than
    ``y_min_offset`` below the accuracy of the lowest-``rho`` point are
    dropped. The score is ``rho * y`` of the first remaining point plus the
    trapezoidal area under ``y * weight_function(y)`` between consecutive
    remaining points.

    Args:
        rho: Parallelism values, one per point.
        y: Accuracies, paired positionally with ``rho`` (extra entries of the
            longer sequence are ignored).
        y_max: Reference accuracy handed to ``weight_function``.
        alpha: Decay rate handed to ``weight_function``.
        y_min_offset: Allowed accuracy drop relative to the first point.

    Returns:
        The AUP score, or 0.0 when there are no points.
    """
    points = sorted(zip(rho, y), key=lambda point: point[0])
    if not points:
        return 0.0

    first_rho, first_y = points[0]
    y_min = first_y - y_min_offset
    kept = [(r, acc) for r, acc in points if acc >= y_min]
    if not kept:
        # Only reachable when the threshold excludes the first point itself
        # (e.g. a negative y_min_offset).
        return first_rho * first_y

    aup = kept[0][0] * kept[0][1]
    for (rho_prev, y_prev), (rho_i, y_i) in zip(kept, kept[1:]):
        w_i = weight_function(y_i, y_max, alpha)
        w_prev = weight_function(y_prev, y_max, alpha)
        aup += 0.5 * (rho_i - rho_prev) * (y_i * w_i + y_prev * w_prev)
    return aup


# Directory holding the YAML result files: "d3LLM_Code", three levels above this file.
DATA_DIR = Path(__file__).parent.parent.parent / "d3LLM_Code"
DATA_FILES = ["data_dream.yaml", "data_llada.yaml", "data_dream_coder.yaml"]

# Merge HumanEval-Instruct -> HumanEval, MBPP-Instruct -> MBPP; exclude HumanEval+, MBPP+
TASK_MERGE = {"HumanEval-Instruct": "HumanEval", "MBPP-Instruct": "MBPP"}
TASK_EXCLUDE = {"HumanEval+", "MBPP+"}
TASK_ORDER = ["GSM8K-CoT", "MATH", "MBPP", "HumanEval", "Long-GSM8K"]
# Avg_AUP divides the summed AUP by this fixed count, so a task a method has
# no result for contributes 0 to that method's average.
AVG_AUP_DIVISOR = 5


def load_yaml_data(data_dir=None, data_files=None):
    """Load YAML files separately, compute y_max per file/task, then merge.

    Each file is read as a mapping ``{task: {method: [(rho, y), ...]}}`` with
    an optional ``_meta`` mapping keyed by method. Files that do not exist are
    skipped, as are empty files, tasks listed in ``TASK_EXCLUDE`` and methods
    without any data point.

    Args:
        data_dir: Directory containing the files; defaults to ``DATA_DIR``.
        data_files: File names to read, in order; defaults to ``DATA_FILES``.

    Returns:
        A tuple ``(all_results, all_meta, ordered_tasks, raw_data)``:
        ``all_results`` is ``{method: {task: (aup, tpf, acc)}}``, ``all_meta``
        the merged ``_meta`` mappings, ``ordered_tasks`` the loaded tasks that
        appear in ``TASK_ORDER`` (in that order) and ``raw_data`` is
        ``{task: {method: [(rho, y), ...]}}`` for curve plotting.
    """
    base_dir = DATA_DIR if data_dir is None else Path(data_dir)
    filenames = DATA_FILES if data_files is None else data_files

    all_results = {}  # {method: {task: (aup, tpf, acc)}}
    all_meta = {}
    all_tasks = set()
    raw_data = {}  # {task: {method: [(rho, y), ...]}} for curve plotting

    for filename in filenames:
        filepath = base_dir / filename
        if not filepath.exists():
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document.
            data = yaml.safe_load(f) or {}

        # A bare "_meta:" key parses as None, which dict.update rejects.
        all_meta.update(data.pop('_meta', None) or {})

        for task, methods in data.items():
            if task in TASK_EXCLUDE:
                continue

            # Drop methods without data points; max() below needs at least one.
            methods = {name: pairs for name, pairs in (methods or {}).items() if pairs}
            if not methods:
                continue

            # Compute y_max per task WITHIN this file only (as per main.py)
            y_max = max(y for pairs in methods.values() for _, y in pairs)

            target_task = TASK_MERGE.get(task, task)
            all_tasks.add(target_task)
            task_curves = raw_data.setdefault(target_task, {})

            for method, pairs in methods.items():
                rho_list = [p[0] for p in pairs]
                y_list = [p[1] for p in pairs]
                aup = get_aup(rho_list, y_list, y_max)

                # First point with the highest rho: its rho is reported as
                # TPF and its accuracy as Acc.
                fastest = max(pairs, key=lambda p: p[0])
                tpf, acc = fastest[0], fastest[1]

                all_results.setdefault(method, {})[target_task] = (
                    round(aup, 1),
                    round(tpf, 2),
                    round(acc, 1),
                )
                task_curves[method] = pairs

    # Return tasks in specified order
    ordered_tasks = [t for t in TASK_ORDER if t in all_tasks]
    return all_results, all_meta, ordered_tasks, raw_data


def compute_leaderboard(data_dir=None, data_files=None):
    """Compute leaderboard DataFrame from YAML data.

    Args:
        data_dir: Forwarded to ``load_yaml_data``; defaults to ``DATA_DIR``.
        data_files: Forwarded to ``load_yaml_data``; defaults to ``DATA_FILES``.

    Returns:
        A tuple ``(df, tasks, raw_data)``. ``df`` has one row per method with
        the columns ``Method``, ``Type``, ``Foundation``, ``Link``, then
        ``<task>_AUP`` / ``<task>_TPF`` / ``<task>_Acc`` per task and
        ``Avg_AUP``, sorted by ``Avg_AUP`` in descending order. ``tasks`` and
        ``raw_data`` are passed through from ``load_yaml_data``.
    """
    results_dict, meta, tasks, raw_data = load_yaml_data(data_dir, data_files)

    rows = []
    for method in sorted(results_dict):
        method_meta = meta.get(method, {})
        method_results = results_dict[method]
        row = {
            "Method": method,
            "Type": method_meta.get("type", "?"),
            "Foundation": method_meta.get("foundation", "?"),
            "Link": method_meta.get("link", ""),
        }
        aup_sum = 0.0
        for task in tasks:
            if task in method_results:
                aup, tpf, acc = method_results[task]
                aup_sum += aup
            else:
                aup = tpf = acc = None
            row[f"{task}_AUP"], row[f"{task}_TPF"], row[f"{task}_Acc"] = aup, tpf, acc
        row["Avg_AUP"] = round(aup_sum / AVG_AUP_DIVISOR, 1)
        rows.append(row)

    # Explicit columns keep "Avg_AUP" present even when there are no rows;
    # otherwise sort_values raises KeyError on an empty frame.
    columns = ["Method", "Type", "Foundation", "Link"]
    for task in tasks:
        columns += [f"{task}_AUP", f"{task}_TPF", f"{task}_Acc"]
    columns.append("Avg_AUP")

    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values("Avg_AUP", ascending=False).reset_index(drop=True)
    return df, tasks, raw_data


def get_leaderboard_df():
    """Return only the leaderboard DataFrame of ``compute_leaderboard``."""
    return compute_leaderboard()[0]


def get_tasks():
    """Return only the ordered task list of ``compute_leaderboard``."""
    return compute_leaderboard()[1]


def get_raw_data():
    """Return only the raw curve data of ``compute_leaderboard``."""
    return compute_leaderboard()[2]