File size: 3,488 Bytes
0c51b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from typing import Any, Iterable, List, Tuple

import jsonlines
import numpy as np
from scipy.stats import spearmanr
from src.utils.correlation import fleiss_kappa


def read_data(
    prompting_path: str = "../data/openai_log_key_utterance.jsonl",
    human_path: str = "../data/human_log_attribution.jsonl",
) -> Tuple[Iterable[Any], Iterable[Any]]:
    """Load the prompting and human annotation datasets from JSON Lines files.

    Args:
        prompting_path: Path of the JSONL file holding the prompting
            (model) records. Defaults to the location the script has
            always used; a relative path is resolved against the current
            working directory.
        human_path: Path of the JSONL file holding the human annotation
            records. Same default/relative-path behaviour as above.

    Returns:
        A ``(prompting_dataset, human_dataset)`` pair. Each element is a
        fully materialised list with one entry per line of its file.
    """
    # Materialise inside the ``with`` block: the reader cannot be consumed
    # once the underlying file has been closed.
    with jsonlines.open(prompting_path, "r") as reader:
        prompting_dataset = list(reader)

    with jsonlines.open(human_path, "r") as reader:
        human_dataset = list(reader)

    return prompting_dataset, human_dataset


def hard_code_key(attributed_utterances: Any) -> Any:
    """Return a copy of the mapping with each key's utterance number shifted by +1.

    Every key is split on single spaces and its second token is parsed as an
    integer (presumably keys look like ``"Utterance 3 by <agent>"`` — confirm
    against the data files). Only that token is rewritten; the rest of the
    key is preserved verbatim, so digits that happen to appear elsewhere in
    the key (for example in an agent name) are left untouched.

    Args:
        attributed_utterances: Mapping from utterance key to its payload.

    Returns:
        A new dict with the re-numbered keys and the original values. The
        input mapping is not modified.

    Raises:
        IndexError: If a key has no second space-separated token.
        ValueError: If the second token is not an integer.
    """
    shifted = {}
    for key, value in attributed_utterances.items():
        tokens = key.split(" ")
        # Rebuild just the number token instead of a global str.replace,
        # which would also rewrite matching digits in other tokens.
        tokens[1] = str(int(tokens[1]) + 1)
        shifted[" ".join(tokens)] = value
    return shifted


def build_paired_scores(
    human_attributed_utterances: Any, prompt_attributed_utterances: Any
) -> List[Tuple[float, Any]]:
    """Pair the mean human score with the prompt judgement for each utterance.

    For every key of ``human_attributed_utterances`` the last element of its
    value and the last element of the value stored under the same key in
    ``prompt_attributed_utterances`` are compared. A pair is produced only
    when the human element is a non-empty dict of scores and the prompt
    element is not the ``-1`` sentinel.

    Args:
        human_attributed_utterances: Mapping from utterance key to a
            sequence whose last element is either a dict of numeric scores
            or some other value (which is skipped).
        prompt_attributed_utterances: Mapping with the same keys whose
            values end with the prompt judgement, or ``-1`` when there is
            none.

    Returns:
        A list of ``(mean_human_score, prompt_judgement)`` tuples in the
        iteration order of ``human_attributed_utterances``.

    Raises:
        KeyError: If a human key is absent from
            ``prompt_attributed_utterances``.
    """
    paired_scores = []
    for key, human_entry in human_attributed_utterances.items():
        human_scores = human_entry[-1]
        # Looked up unconditionally so a key mismatch between the two
        # mappings is reported instead of being silently ignored.
        prompt_score = prompt_attributed_utterances[key][-1]
        if not isinstance(human_scores, dict) or prompt_score == -1:
            continue
        if not human_scores:
            # No individual scores to average; skipping avoids a
            # ZeroDivisionError on an empty dict.
            continue
        mean_human_score = sum(human_scores.values()) / len(human_scores)
        paired_scores.append((mean_human_score, prompt_score))
    return paired_scores


def main() -> None:
    """Compare prompt key-utterance judgements against human annotations.

    Loads both datasets, pairs each human record with the first prompting
    record that has the same ``episode_id`` and ``agent``, binarises both
    sides, and prints the Spearman correlation, accuracy, precision, recall,
    F1 and Fleiss' kappa. Human labels are treated as the ground truth and
    prompt labels as the predictions.
    """
    # Kept as a local import, as in the original script: scikit-learn is
    # only needed when the evaluation is actually run.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    prompting_dataset, human_dataset = read_data()

    paired_scores_dataset = []
    for human_data in human_dataset:
        # First prompting record for the same episode and agent, if any.
        prompt_data = next(
            (
                candidate
                for candidate in prompting_dataset
                if human_data["episode_id"] == candidate["episode_id"]
                and human_data["agent"] == candidate["agent"]
            ),
            None,
        )
        if prompt_data is None:
            continue
        paired_scores_dataset.extend(
            build_paired_scores(
                human_data["attributed_utterances"],
                prompt_data["key_utterance_judgement"],
            )
        )

    if not paired_scores_dataset:
        raise SystemExit(
            "No paired human/prompt scores found; nothing to evaluate."
        )

    # Each pair is (mean human score, prompt judgement).
    # Human side: positive only when the mean score is exactly 3
    # (presumably the top rating from every annotator — confirm).
    human_labels = [
        1 if human_score == 3 else 0
        for human_score, _ in paired_scores_dataset
    ]
    # Prompt side: positive only for the literal judgement "YES".
    prompt_labels = [
        1 if prompt_score == "YES" else 0
        for _, prompt_score in paired_scores_dataset
    ]

    spearman_corr, _ = spearmanr(human_labels, prompt_labels)
    print("spearman correlation: {}".format(spearman_corr))

    # calculate the Recall / Precision / F1
    # scikit-learn expects (y_true, y_pred): human labels are the reference.
    accuracy = accuracy_score(human_labels, prompt_labels)
    precision = precision_score(human_labels, prompt_labels)
    recall = recall_score(human_labels, prompt_labels)
    f1 = f1_score(human_labels, prompt_labels)
    print("accuracy: {}".format(accuracy))
    print("precision: {}".format(precision))
    print("recall: {}".format(recall))
    print("f1: {}".format(f1))

    # One row per paired utterance, one column per label source.
    # NOTE(review): assumes fleiss_kappa takes an items-by-raters matrix —
    # confirm against src.utils.correlation.
    kappa_input = np.array([human_labels, prompt_labels]).T
    fleiss_kappa_score = fleiss_kappa(kappa_input)
    print("Fleiss' Kappa: {}".format(fleiss_kappa_score))


if __name__ == "__main__":
    main()