File size: 4,942 Bytes
0c51b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
import os
from typing import Any, Callable, Dict, List, Optional, Tuple

from src.human_annotate.google_form_apis import get_form, get_form_responses
from tqdm import tqdm

from ..utils.preprocess import extract_goal_scores

# Alias of ``Any``. It is not referenced by the functions visible in this
# file; presumably it stands in for Google API client objects in annotations
# elsewhere -- TODO confirm before removing.
GoogleResource = Any


def retrieve_responses(data_dir: str, gcp_key: str) -> None:
    """Merge Google Form responses into the attribution log and save it.

    Loads ``openai_log_attribution.jsonl`` and ``form_uris.jsonl`` from
    ``data_dir`` (one JSON document per line), hands both to
    ``add_responses_to_sheet`` and writes the returned records, one JSON
    document per line, to ``human_log_attribution.jsonl`` in ``data_dir``.

    Args:
        data_dir: Directory containing the two input files; the output file
            is created (or overwritten) in the same directory.
        gcp_key: Forwarded unchanged to ``add_responses_to_sheet``.
    """
    log_path = os.path.join(data_dir, "openai_log_attribution.jsonl")
    log = []
    with open(log_path, "r") as log_file:
        for raw_line in log_file:
            log.append(json.loads(raw_line))

    uris_path = os.path.join(data_dir, "form_uris.jsonl")
    form_uris = []
    with open(uris_path, "r") as uris_file:
        for raw_line in uris_file:
            form_uris.append(json.loads(raw_line))

    log = add_responses_to_sheet(log, form_uris, gcp_key)

    out_path = os.path.join(data_dir, "human_log_attribution.jsonl")
    with open(out_path, "w") as out_file:
        for record in log:
            out_file.write(json.dumps(record) + "\n")


def get_episodes_from_form_ids(data_dir: str, gcp_key: str) -> None:
    """Export the episodes referenced by the forms listed in ``form_ids.txt``.

    Reads ``sotopia_episodes_v1.jsonl`` (one JSON episode per line) and
    ``form_ids.txt`` (one form ID per line) from ``data_dir``. Each form is
    fetched with ``get_form``; the last space-separated token of its title
    is taken as the episode ID. The first episode carrying that ID is
    selected, at most once, in the order the forms are listed.

    The selected episodes are written to ``example_episodes.jsonl`` and the
    result of ``extract_goal_scores`` on them is written to
    ``example_episodes_with_scores.jsonl``, both in ``data_dir``.

    Args:
        data_dir: Directory holding the input files and receiving the
            output files.
        gcp_key: Forwarded unchanged to ``get_form``.
    """
    with open(os.path.join(data_dir, "sotopia_episodes_v1.jsonl"), "r") as f:
        episodes = [json.loads(line) for line in f]

    with open(os.path.join(data_dir, "form_ids.txt"), "r") as f:
        # Drop blank lines: an empty ID would otherwise be sent to get_form.
        form_ids = [line.strip() for line in f if line.strip()]

    # Index episodes once instead of rescanning the whole list per form.
    # setdefault keeps the first episode seen for a duplicated ID.
    episodes_by_id: Dict[Any, Any] = {}
    for episode in episodes:
        episodes_by_id.setdefault(episode["episode_id"], episode)

    print("retrieving episodes from form ids")
    example_episodes = []
    for form_id in tqdm(form_ids):
        form = get_form(form_id, gcp_key)
        episode_id = form["info"]["title"].split(" ")[-1]
        # pop() hands each episode out at most once, so several forms that
        # point at the same episode do not produce duplicates.
        episode = episodes_by_id.pop(episode_id, None)
        if episode is not None:
            example_episodes.append(episode)

    with open(os.path.join(data_dir, "example_episodes.jsonl"), "w") as f:
        for episode in example_episodes:
            f.write(json.dumps(episode) + "\n")

    example_episodes_with_scores = extract_goal_scores(example_episodes)
    with open(
        os.path.join(data_dir, "example_episodes_with_scores.jsonl"), "w"
    ) as f:
        for episode in example_episodes_with_scores:
            f.write(json.dumps(episode) + "\n")


def _find_item_and_follow_up(
    items: List[Dict[str, Any]], key: str
) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
    """Return the first item titled ``"<key>:..."`` and the item after it.

    Returns ``(None, None)`` when no title matches, and ``None`` as the
    second element when the match is the last item.
    """
    for idx, candidate in enumerate(items):
        # Only the text before the first ":" of the title is compared.
        # Items without a title can never match.
        if candidate.get("title", "").split(":")[0] == key:
            follow_up = items[idx + 1] if idx + 1 < len(items) else None
            return candidate, follow_up
    return None, None


def _answers_by_submission_time(
    responses: List[Dict[str, Any]],
    question_id: str,
    convert: Callable[[str], Any],
) -> Dict[str, Any]:
    """Map each response's ``lastSubmittedTime`` to its converted answer.

    Only the first text answer of ``question_id`` is used. Responses that
    contain no answer for the question are skipped.
    """
    collected: Dict[str, Any] = {}
    for response in responses:
        answer = response.get("answers", {}).get(question_id)
        if answer is None:
            # This respondent left the question unanswered.
            continue
        raw_value = answer["textAnswers"]["answers"][0]["value"]
        collected[response["lastSubmittedTime"]] = convert(raw_value)
    return collected


def _store_in_slot(entry: List[Any], slot: int, values: Dict[str, Any]) -> None:
    """Merge ``values`` into the dict at ``entry[slot]``, creating it if needed.

    Does nothing when ``values`` is empty. Missing positions up to ``slot``
    are padded with empty dicts so later slots keep their index.
    """
    if not values:
        return
    while len(entry) <= slot:
        entry.append({})
    entry[slot].update(values)


def add_responses_to_sheet(
    log: List[Dict[str, Any]], form_uris: List[Dict[str, str]], gcp_key: str
) -> List[Dict[str, Any]]:
    """Add human form responses to the rewarded attribution log.

    ``log[i]`` is paired with ``form_uris[i]["formId"]``. For every key in
    ``log[i]["attributed_utterances"]`` the form item whose title starts
    with ``"<key>:"`` is looked up. If it is a question, each response's
    answer is stored as an ``int`` under index 2 of the utterance entry,
    keyed by the response's ``lastSubmittedTime``. If the item directly
    after it is also a question, its answers are stored as ``str`` under
    index 3 in the same way.

    Keys without a matching question item are reported on stdout and left
    untouched, as are questions nobody answered.

    Args:
        log: Attribution records; modified in place.
        form_uris: One mapping per log record, each with a ``"formId"``.
        gcp_key: Forwarded unchanged to ``get_form`` and
            ``get_form_responses``.

    Returns:
        The same ``log`` list that was passed in.

    Raises:
        IndexError: If ``form_uris`` is shorter than ``log``.
        ValueError: If an answer to the first question is not an integer.
    """
    for i, record in enumerate(log):
        form_id = form_uris[i]["formId"]
        print(f"Log: {i}")
        print(f"Form ID: {form_id}")
        form_schema = get_form(form_id, gcp_key)
        responses = get_form_responses(form_id, gcp_key)
        print(f"Responses: {responses}")

        # A form without items has nothing to match against.
        items = form_schema.get("items", [])
        # NOTE(review): assumes get_form_responses yields a list of response
        # dicts, possibly None/empty when nobody answered -- confirm.
        submitted = responses or []

        for key, entry in record["attributed_utterances"].items():
            print(f"  Key: {key}")
            item, next_item = _find_item_and_follow_up(items, key)
            if not item or "questionItem" not in item:
                print("  No question item found")
                continue

            question_id = item["questionItem"]["question"]["questionId"]
            _store_in_slot(
                entry,
                2,
                _answers_by_submission_time(submitted, question_id, int),
            )

            if next_item and "questionItem" in next_item:
                next_question_id = next_item["questionItem"]["question"][
                    "questionId"
                ]
                _store_in_slot(
                    entry,
                    3,
                    _answers_by_submission_time(
                        submitted, next_question_id, str
                    ),
                )
                # Index 3 only exists once at least one answer was stored.
                if len(entry) > 3:
                    print(entry[3])
        print("Updated log")
    return log