File size: 4,129 Bytes
0c51b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import os
import re
from collections import defaultdict
from copy import deepcopy
from typing import Any, Dict, List

from tqdm import tqdm

# TODO: Fill in REDIS OM URL in the form of `redis://:password@host:port`
# SECURITY NOTE(review): a real-looking password is hard-coded here — move it
# to an environment variable / secrets store before committing.
# The env var MUST be set before the import below, since redis_om reads
# REDIS_OM_URL at import time — do not reorder these lines.
os.environ["REDIS_OM_URL"] = "redis://:QzmCUD3C3RdsR@localhost:6381"

from reverse_engineering import run_reverse_by_pk_agent

# Load the per-episode key-utterance judgement records (one JSON object per line).
with open("../../data/sotopia_pi_openai_log_key_utterance.jsonl", 'r', encoding="utf-8") as f:
    data: List[Dict[str, Any]] = [json.loads(line) for line in f]

# Dump raw per-utterance records once; the directory's existence acts as a
# completion marker.
# NOTE(review): if a previous run died halfway, the directory exists but is
# incomplete and this guard skips regeneration — delete the directory to force
# a rebuild.
if not os.path.exists("../../data/episode_utterances"):
    os.makedirs("../../data/episode_utterances")
    for d in tqdm(data):
        # Called once per boolean flag — presumably selecting which of the two
        # agents in the episode to extract; confirm against run_reverse_by_pk_agent.
        run_reverse_by_pk_agent(d['episode_id'], True, "../../data/episode_utterances")
        run_reverse_by_pk_agent(d['episode_id'], False, "../../data/episode_utterances")

# Matches keys like "Utterance 3 by Jane Doe"; group 1 = turn index,
# group 2 = agent name (letters and spaces only — a name containing a hyphen
# or apostrophe would NOT match and would raise below).
utterance_pattern = r'Utterance (\d+) by ([A-Za-z ]+)'
print(len(data))
print("turning into attributed utterances")

# "<episode_id>-<agent_name>" -> highest utterance turn number seen for that agent.
max_turn_dict: Dict[str, int] = defaultdict(int)
# episode_id -> goal-completion score for that episode (filled in the loop below).
episode_id_goal_score: Dict[str, float] = defaultdict(float)

# First pass over the data: record each episode's goal score and, per
# (episode, agent) pair, the highest turn number that agent spoke on.
for d in tqdm(data):
    episode_id = d['episode_id']
    # Goal score is an episode-level value; set it once per episode (the
    # original re-assigned it on every utterance of the inner loop).
    episode_id_goal_score[episode_id] = d['goal_score']
    # Only the keys are needed — the judgement values are unused here.
    for uttr_key in d['key_utterance_judgement']:
        match = re.search(utterance_pattern, uttr_key)
        if not match:
            raise Exception(f"Utterance key not in correct format: {uttr_key}")
        turn_number = int(match.group(1))
        agent_name = match.group(2)
        # Skip utterances spoken by the other agent in the episode.
        if agent_name != d['agent']:
            continue

        hash_key = f"{episode_id}-{agent_name}"
        max_turn_dict[hash_key] = max(max_turn_dict[hash_key], turn_number)

# Freeze to a plain dict so later lookups of unseen keys fail loudly
# instead of silently inserting 0.
max_turn_dict = dict(max_turn_dict)

# Exponential discount applied per turn when spreading reward backwards
# from the final turn.
discounting_factor = 0.9
# "<episode_id>-<agent_name>" -> {turn_number: attribution weight in (0, 1]}.
attribution_dict: Dict[str, Dict[Any, Any]] = defaultdict(dict)

def get_attribution_dict(hash_key: str) -> None:
    """Populate ``attribution_dict[hash_key]`` with per-turn reward weights.

    The agent's final turn gets weight 1.0 and each earlier turn is discounted
    by ``discounting_factor``, i.e. turn ``i`` of ``T`` receives
    ``discounting_factor ** (T - i)``.

    Args:
        hash_key: "<episode_id>-<agent_name>". Assumes neither part contains a
            hyphen (holds here: agent names were matched with ``[A-Za-z ]+``).
    """
    episode_id, agent_name = hash_key.split("-")
    last_turn = max_turn_dict[f"{episode_id}-{agent_name}"]

    # Walk backwards from the final turn, discounting as we go.  Weights are
    # built by repeated multiplication (not `**`) to reproduce the exact float
    # values of the original computation.
    weights = [0.0] * (last_turn + 1)
    curr_reward = 1.0
    for turn in range(last_turn, -1, -1):
        weights[turn] = curr_reward
        curr_reward *= discounting_factor

    # No explicit normalization step is needed: the maximum weight is the
    # final turn's, which is exactly 1.0 — the original divided every entry
    # by that maximum, a no-op.  dict(enumerate(...)) keeps keys in ascending
    # turn order, matching the original insertion order (downstream iteration
    # order depends on it).
    attribution_dict[f"{episode_id}-{agent_name}"] = dict(enumerate(weights))

# Build the attribution table for every (episode, agent) pair seen above.
# Iterating the dict directly yields its keys; `.keys()` was redundant.
for hash_key in max_turn_dict:
    get_attribution_dict(hash_key)

# Freeze to a plain dict so later lookups of unseen keys fail loudly.
attribution_dict = dict(attribution_dict)

# NOTE(review): this progress message duplicates the one printed before the
# first pass — likely a copy-paste; kept byte-identical to preserve output.
print("turning into attributed utterances")

print(len(attribution_dict))

# Second pass: load each dumped utterance file and attach its attribution
# weight, turn number, and the episode's goal score.
attributed_data = []
for hash_key in attribution_dict:
    episode_id, agent_name = hash_key.split("-")
    for turn_number in attribution_dict[hash_key]:
        utterance_path = f"../../data/episode_utterances/{episode_id}-{agent_name}-{turn_number}.json"
        # Fail loudly if the extraction step above did not produce this file.
        if not os.path.exists(utterance_path):
            raise Exception(f"Utterance not found: {utterance_path}")
        # Reuse utterance_path rather than rebuilding the identical f-string
        # (the original duplicated the path expression in the open() call).
        with open(utterance_path, 'r', encoding="utf-8") as f:
            sotopia_utterance = json.load(f)

        # Copy before mutating so the loaded record is never aliased.
        new_utterance = deepcopy(sotopia_utterance)
        new_utterance['attribution'] = attribution_dict[hash_key][turn_number]
        new_utterance['turn_number'] = turn_number
        new_utterance['goal_score'] = episode_id_goal_score[episode_id]

        attributed_data.append(new_utterance)

def calc_reward(utter_attrib: float, goal_score: float) -> float:
    """Scale an utterance's attribution weight by the episode's goal score."""
    reward = goal_score * utter_attrib
    return reward

# Convert to the instruction-tuning record format expected downstream.
# A comprehension replaces the original append loop; tqdm still wraps the
# iterable, so the progress bar is preserved.
sotopia_pi_utterance_reward = [
    {
        "instruction": d['prompt'],
        "input": "",
        "output": d['result'],
        # Final reward = per-turn attribution weight x episode goal score.
        "value": calc_reward(d['attribution'], d['goal_score']),
        "system": "",
        "history": []
    }
    for d in tqdm(attributed_data)
]

with open("../../data/sotopia_pi_reward_baseline.json", 'w', encoding="utf-8") as f:
    json.dump(sotopia_pi_utterance_reward, f, indent=4)