| | import json |
| | import os |
| | from statistics import mean |
| |
|
| | import matplotlib.pyplot as plt |
| | from matplotlib.colors import LinearSegmentedColormap |
| |
|
| | |
| |
|
| |
|
| | |
| | |
def _model_id_from_filename(result_json: str) -> str:
    """Derive the model label used on the heatmap axes from a result file name.

    The label is the part of the file name before ``"_temp_0.2_"``. A trailing
    ``"-instruct"`` is dropped, and `` :: default`` is appended when the label
    carries no ``::`` separator.

    Raises:
        AssertionError: if the file name does not contain ``"_temp_0.2_"``.
    """
    assert "_temp_0.2_" in result_json, f"Invalid result file name: {result_json}"
    model_id = result_json.split("_temp_0.2_")[0]
    # NOTE(review): the leading space in " perf-instruct" is reproduced from the
    # original source; it may be unintentional -- confirm against real file names.
    if model_id.endswith("-instruct") and not model_id.endswith(" perf-instruct"):
        model_id = model_id[: -len("-instruct")]
        model_id += " :: default"
    if "::" not in model_id:
        model_id += " :: default"
    return model_id


def _load_task_scores(result_json_path: str) -> dict:
    """Load one result file and return a ``{task_id: score}`` mapping.

    A task's score is ``result["scores"]["max"]`` when ``scores`` is present,
    overridden by ``max(result["dps"])`` when ``dps`` is present. Tasks that
    have neither (or have them set to ``None``) are omitted.
    """
    with open(result_json_path, encoding="utf-8") as f:
        results = json.load(f)

    task2dps = {}
    for task_id, result in results.items():
        if result.get("scores") is not None:
            task2dps[task_id] = result["scores"]["max"]
        if result.get("dps") is not None:
            task2dps[task_id] = max(result["dps"])
    return task2dps


def main(result_dir: str):
    """Plot a pairwise model-vs-model score heatmap from a directory of results.

    Every ``*.json`` file in ``result_dir`` is treated as one model's results.
    Models are ordered by their mean per-task score (best first). For each pair
    above the diagonal, both models are averaged over the tasks they have in
    common; the cell shows the row model's mean and is coloured by the
    difference (row minus column). The figure is written to
    ``pairwise_heatmap.png`` and ``pairwise_heatmap.pdf`` in the current
    working directory.

    Args:
        result_dir: Directory containing the per-model result JSON files.

    Raises:
        AssertionError: if ``result_dir`` is not a directory or a JSON file
            name does not contain ``"_temp_0.2_"``.
    """
    assert os.path.isdir(result_dir), f"{result_dir} is not a directory."

    model2task2dps = {}
    model_scores = []  # (model_id, mean score over all of its scored tasks)
    for result_json in os.listdir(result_dir):
        if not result_json.endswith(".json"):
            continue
        model_id = _model_id_from_filename(result_json)
        print(f"Processing {model_id}")
        task2dps = _load_task_scores(os.path.join(result_dir, result_json))
        if not task2dps:
            # mean() of an empty sequence raises StatisticsError; a model with
            # no scored task cannot be ranked or compared, so leave it out.
            print(f"[Warning] no scored tasks for {model_id}, skipping")
            continue
        model2task2dps[model_id] = task2dps
        model_scores.append((model_id, mean(task2dps.values())))

    if not model_scores:
        # Nothing to rank or draw; bail out before creating a figure.
        print(f"[Warning] no usable result files in {result_dir}, nothing to plot")
        return

    # Best overall model first (stable sort keeps ties in listing order).
    model_scores.sort(key=lambda pair: pair[1], reverse=True)
    model_list = [model_id for model_id, _ in model_scores]

    fig, ax = plt.subplots(figsize=(30, 25))

    # NaN (rather than None) marks "no data" so the matrix stays a float
    # array that matshow can render.
    missing = float("nan")
    score_matrix_diff = []
    for i, model_x in enumerate(model_list):
        task2dps_x = model2task2dps[model_x]
        diff_row = []
        for j, model_y in enumerate(model_list):
            if j <= i:
                # Only the upper triangle is populated; the rest stays neutral.
                diff_row.append(0.0)
                continue
            task2dps_y = model2task2dps[model_y]
            common_tasks = task2dps_x.keys() & task2dps_y.keys()
            if not common_tasks:
                diff_row.append(missing)
                print(
                    f"[Warning] no common passing set between {model_x} and {model_y}"
                )
                continue
            dps_x = mean([task2dps_x[task_id] for task_id in common_tasks])
            dps_y = mean([task2dps_y[task_id] for task_id in common_tasks])
            diff_row.append(dps_x - dps_y)

            # Annotate with the row model's mean, plus the signed gap when it
            # is at least one point in either direction.
            text = f"{round(dps_x)}"
            if dps_x - dps_y >= 1:
                text += f"\n+{dps_x - dps_y:.1f}"
            elif dps_x - dps_y <= -1:
                text += f"\n-{dps_y - dps_x:.1f}"
            ax.text(
                j,
                i,
                text,
                va="center",
                ha="center",
                color="green" if dps_x > dps_y else "red",
            )
        score_matrix_diff.append(diff_row)

    # Red -> white -> lime: row model worse / equal / better than column model.
    cmap = LinearSegmentedColormap.from_list("rg", ["r", "w", "lime"], N=256)
    cax = ax.matshow(score_matrix_diff, cmap=cmap)
    cax.set_clim(-15, 15)
    fig.colorbar(cax)
    ax.set_xticks(range(len(model_list)))
    ax.set_yticks(range(len(model_list)))
    ax.set_xticklabels(model_list, rotation=45, ha="left", rotation_mode="anchor")
    ax.set_yticklabels(model_list)

    plt.savefig("pairwise_heatmap.png", dpi=120, bbox_inches="tight")
    plt.savefig("pairwise_heatmap.pdf", bbox_inches="tight")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: hand `main` to python-fire so that its parameters
    # become command-line arguments.
    import fire

    fire.Fire(main)
| |
|