import json
import os
from statistics import mean

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# EvalPerf result file name format: {MODEL}_temp_{TEMP}_ep-{TYPE}_results.json
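# (For illustration only: a hypothetical file "llama-3-70b_temp_0.2_ep-instruct_results.json"
#  would be split on "_temp_0.2_" below and yield the model id "llama-3-70b".)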


# Draw a heatmap of pairwise model comparisons:
# each pair of models is compared over the set of tasks that both models pass.
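# Assumed per-model result layout (illustrative sketch covering only the fields read
# below; real result files may carry additional metadata):
#   {
#       "<task_id>": {"scores": {"max": 87.5}, "norm_scores": {"max": 92.1}},
#       "<task_id>": {"dps": [80.0, 75.5], "dps_norm": [85.0, 79.0]},
#       ...
#   }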
def main(result_dir: str):
    """Plot a pairwise DPS-comparison heatmap for all EvalPerf result files in `result_dir`."""
    assert os.path.isdir(result_dir), f"{result_dir} is not a directory."

    model2task2dps = {}
    model2task2dps_norm = {}

    model_list = []
    model_e2e_dps = []
    for result_json in os.listdir(result_dir):
        if not result_json.endswith(".json"):
            continue
        result_json_path = os.path.join(result_dir, result_json)
        assert "_temp_0.2_" in result_json, f"Invalid result file name: {result_json}"
        model_id = result_json.split("_temp_0.2_")[0]
        # Normalize the model id: strip a trailing "-instruct" suffix and tag
        # models without an explicit mode as ":: default".
        if model_id.endswith("-instruct") and not model_id.endswith(" perf-instruct"):
            model_id = model_id[: -len("-instruct")]
            model_id += " :: default"
        if "::" not in model_id:
            model_id += " :: default"
        print(f"Processing {model_id}")
        with open(result_json_path) as f:
            results = json.load(f)
        task2dps = {}
        task2dps_norm = {}

        for task_id, result in results.items():
            # Per-task DPS: results store either "scores"/"norm_scores" dicts
            # (read the "max" field) or "dps"/"dps_norm" lists (take the best sample).
            if "scores" in result and result["scores"] is not None:
                task2dps[task_id] = result["scores"]["max"]
                task2dps_norm[task_id] = result["norm_scores"]["max"]
            if "dps" in result and result["dps"] is not None:
                task2dps[task_id] = max(result["dps"])
                task2dps_norm[task_id] = max(result["dps_norm"])

        model2task2dps[model_id] = task2dps
        model2task2dps_norm[model_id] = task2dps_norm
        model_list.append(model_id)
        model_e2e_dps.append(mean(task2dps.values()))

    # Sort models by their end-to-end DPS, best first.
    model_list, model_e2e_dps = zip(
        *sorted(zip(model_list, model_e2e_dps), key=lambda x: x[1], reverse=True)
    )

    # model_list = model_list[:32]

    fig, ax = plt.subplots(figsize=(30, 25))

    # Only the upper triangle is compared; the diagonal and lower triangle get a
    # neutral (0, 0) so their difference renders as white in the heatmap.
    score_matrix = []
    for i, model_x in enumerate(model_list):
        score_list = []
        task2dps_x = model2task2dps[model_x]
        for j, model_y in enumerate(model_list):
            if j <= i:
                score_list.append((0, 0))
                continue
            task2dps_y = model2task2dps[model_y]
            common_tasks = set(task2dps_x.keys()) & set(task2dps_y.keys())
            if len(common_tasks) == 0:
                score_list.append(None)
                print(
                    f"[Warning] no common passing set between {model_x} and {model_y}"
                )
                continue
            dps_x = mean([task2dps_x[task_id] for task_id in common_tasks])
            dps_y = mean([task2dps_y[task_id] for task_id in common_tasks])
            score_list.append((dps_x, dps_y))
            text = f"{round(dps_x)}"
            if dps_x - dps_y >= 1:
                text += f"\n+{dps_x - dps_y:.1f}"
            elif dps_x - dps_y <= -1:
                text += f"\n-{dps_y - dps_x:.1f}"
            ax.text(
                j,
                i,
                text,
                va="center",
                ha="center",
                color="green" if dps_x > dps_y else "red",
            )
        score_matrix.append(score_list)

    # print(score_matrix)

    # Cell (i, j) = DPS(model_i) - DPS(model_j) on their common passing tasks.
    # Use NaN rather than None for missing pairs so matshow receives a float array.
    score_matrix_diff = [
        [float("nan") if score is None else score[0] - score[1] for score in score_list]
        for score_list in score_matrix
    ]

    # Diverging colormap: red = row model worse, white = tie, green = row model better.
    cmap = LinearSegmentedColormap.from_list("rg", ["r", "w", "lime"], N=256)
    cax = ax.matshow(score_matrix_diff, cmap=cmap)
    cax.set_clim(-15, 15)  # clip the color scale to +/- 15 DPS
    fig.colorbar(cax)
    ax.set_xticks(range(len(model_list)))
    ax.set_yticks(range(len(model_list)))
    ax.set_xticklabels(model_list, rotation=45, ha="left", rotation_mode="anchor")
    ax.set_yticklabels(model_list)
    # Save the heatmap as both PNG and PDF.
    plt.savefig("pairwise_heatmap.png", dpi=120, bbox_inches="tight")
    plt.savefig("pairwise_heatmap.pdf", bbox_inches="tight")


if __name__ == "__main__":
    from fire import Fire

    Fire(main)
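
# A minimal usage sketch (the script name and results path below are illustrative
# assumptions, not defined by this file):
#
#   python draw_pairwise_heatmap.py --result_dir /path/to/evalperf_results
#
# Fire forwards the argument to main(result_dir); the heatmap is written to
# pairwise_heatmap.png and pairwise_heatmap.pdf in the current working directory.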