File size: 3,335 Bytes
0c51b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import glob
import json
import os
from typing import Any, Dict, List, Optional

import click
from db_free_reverse_engineering import run_reverse_by_pk_agent
from tqdm import tqdm


# Action-format instructions appended verbatim to every utterance prompt when
# building the SFT "input" field. Split at newline boundaries for readability;
# the concatenated value must stay exactly as-is because it is training data.
_ACTION_FORMAT_SUFFIX = (
    " Your available action types are\n"
    "speak none action leave non-verbal communication.\n"
    "Note: You can \"leave\" this conversation if 1. you have achieved your social goals, 2. this conversation makes you uncomfortable, 3. you find it uninteresting/you lose your patience, 4. or for other reasons you want to leave.\n"
    "\n"
    "Please only generate a JSON string including the action type and the argument.\n"
    "Your action should follow the given format:\n"
    "The output should be formatted as a JSON instance that conforms to the JSON schema below.\n"
    "\n"
    "As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\n"
    "the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n"
    "\n"
    "Here is the output schema:\n"
    "```\n"
    "{\"properties\": {\"action_type\": {\"description\": \"whether to speak at this turn or choose to not do anything\", \"enum\": [\"none\", \"speak\", \"non-verbal communication\", \"action\", \"leave\"], \"title\": \"Action Type\", \"type\": \"string\"}, \"argument\": {\"description\": \"the utterance if choose to speak, the expression or gesture if choose non-verbal communication, or the physical action if choose action\", \"title\": \"Argument\", \"type\": \"string\"}}, \"required\": [\"action_type\", \"argument\"]}\n"
    "```"
)


def _load_episodes(episode_path: str) -> List[Dict[str, Any]]:
    """Parse a JSON Lines file into one dict per non-blank line."""
    with open(episode_path, 'r', encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at EOF) are not valid JSON
        # documents, so skip them instead of letting json.loads fail.
        return [json.loads(line) for line in f if line.strip()]


def _load_cached_utterances(cache_dir: str) -> List[Dict[str, Any]]:
    """Load every ``*.json`` file directly inside *cache_dir*, in sorted path order."""
    # Escape the directory so glob metacharacters in the path are literal.
    pattern = os.path.join(glob.escape(cache_dir), "*.json")
    loaded = []
    # glob returns paths in arbitrary order; sort for a reproducible dataset.
    for record_path in sorted(glob.glob(pattern)):
        with open(record_path, 'r', encoding="utf-8") as f:
            loaded.append(json.load(f))
    return loaded


@click.command()
@click.option("--data_dir", type=str, required=True, help="Directory containing data files.")
@click.option("--utterances_output_subdir", type=str, required=True, help="Directory to save the utterances.")
@click.option("--episodes_file", type=str, required=True, help="Path to the raw JSON file.")
@click.option("--sft_output_file", type=str, required=False, help="Path to the processed JSON file.")
def main(data_dir: str, utterances_output_subdir: str, episodes_file: str, sft_output_file: Optional[str]) -> None:
    """Build an SFT dataset of (input, output) pairs from recorded episodes.

    Reads the JSON Lines episodes file under DATA_DIR, populates the
    utterance cache directory if it does not exist yet, then pairs every
    cached utterance's prompt (plus the action-format instructions) with its
    result. The pairs are written as a JSON list to SFT_OUTPUT_FILE under
    DATA_DIR; when that option is omitted, only the utterance count is
    printed. Raises FileNotFoundError if the episodes file is missing.
    """
    episode_path = os.path.join(data_dir, episodes_file)
    if not os.path.exists(episode_path):
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"Episodes file not found: {episode_path}")

    episodes = _load_episodes(episode_path)

    cache_dir = os.path.join(data_dir, utterances_output_subdir)
    # NOTE(review): generation is skipped whenever the directory already
    # exists, even if an earlier run was interrupted part-way. Delete the
    # directory to force regeneration.
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        for episode in tqdm(episodes):
            # Called once with True and once with False for each episode;
            # presumably one call per agent side. Confirm against
            # db_free_reverse_engineering.
            run_reverse_by_pk_agent(episode['episode_id'], True, cache_dir, episode_path)
            run_reverse_by_pk_agent(episode['episode_id'], False, cache_dir, episode_path)

    sft_utterances = [
        {
            "input": uttr['prompt'] + _ACTION_FORMAT_SUFFIX,
            "output": uttr['result'],
        }
        for uttr in _load_cached_utterances(cache_dir)
    ]

    print(f"Total utterances: {len(sft_utterances)}")

    # The option is declared optional, so click passes None when it is
    # omitted; joining None onto data_dir would raise TypeError.
    if not sft_output_file:
        print("No --sft_output_file given; skipping SFT dataset export.")
        return

    with open(os.path.join(data_dir, sft_output_file), 'w', encoding="utf-8") as f:
        json.dump(sft_utterances, f, indent=4)

# Script entry point: main is a click command, so calling it with no
# arguments lets click parse the options from the command line.
if __name__ == "__main__":
    main()