Other
English
minecraft
action prediction
Llamipa / bespoke /format_unannotated_jsonl.py
Kqte's picture
Upload 4 files
30b495e verified
"""
A dialogue is a list of samples, where each sample contains one new speaker turn.
This script takes a json of annotated minecraft games and converts it to
a turn format to be used in LLAMA parsing.
NB: when creating json-l, use '###PS' for 'predict structure'
"""
import os
import json
import jsonlines
from collections import defaultdict
def preprocess_edus(tlist):
    """
    Number and speaker-tag every edu of a dialogue, grouped by turn.

    Each element of tlist is read through its 'speaker' and 'edus' keys.
    The result is a list with one inner list per turn; each edu becomes
    the string '<index> <Spkr> <edu text>', where the index counts edus
    across the whole dialogue starting at 0 and the speaker name is cut
    down to its first four characters.
    Ex:
    [...['6 <Buil> What is D2'],
    ['7 <Arch> Ah there is no stack,', '8 <Arch> pick up the washer'],...]
    Here one turn holds edu 6 and the following turn holds edus 7 and 8.
    NB: in a dialogue, might be best to change speakers to "Arch" and "Buil" to
    reflect MSDC training data
    """
    tagged_turns = []
    next_index = 0  # index to give to the first edu of the current turn
    for turn in tlist:
        speaker = turn['speaker'][:4]
        #write code to change speaker names here
        tagged = [
            str(next_index + offset) + ' ' + '<' + speaker + '>' + ' ' + text
            for offset, text in enumerate(turn['edus'])
        ]
        next_index += len(tagged)
        tagged_turns.append(tagged)
    return tagged_turns
def get_windows(dial_turns, distance=15):
    """
    Takes the output from preprocess_edus() and
    returns a list of index pairs. Each pair gives the delimiting (inclusive)
    turn indexes for a window of turns whose total edus <= distance
    Ex.
    [(0, 11), (1, 12), (4, 13), (5, 14), ...]
    Here, turns 0 through 11 contain edus <=distance, but once the edus from turn
    12 are added, the window has to be adjusted in order for edus to remain <=distance.
    The window must be shifted to 1-12, then to 4-13, etc.

    dial_turns: list of turns, each turn being a list of edus.
    distance: maximum number of edus allowed in one window.

    The first pair always starts at turn 0. If the whole dialogue holds
    <= distance edus, the single pair (0, last turn index) is returned.
    If one turn alone holds more than distance edus, its pair is
    (i + 1, i) (or (0, -1) when it is the very first turn), i.e. an
    empty range. An empty dial_turns yields [(0, 0)].
    """
    edu_lens = [len(d) for d in dial_turns]

    # First window: grow from turn 0 until adding turn i would exceed distance.
    esum = 0
    for i, w in enumerate(edu_lens):
        esum += w
        if esum > distance:
            first_cutoff = i - 1
            break
    else:
        # The budget was never exceeded, so the first window covers the whole
        # dialogue. (Previously the cutoff stayed at 0 here, which silently
        # dropped every turn after the first for short dialogues.)
        first_cutoff = max(len(edu_lens) - 1, 0)
    windows = [(0, first_cutoff)]

    # Every later turn i ends its own window: walk backwards from i and stop
    # at the first turn r whose inclusion pushes the total over distance;
    # the window then begins right after it.
    for i in range(first_cutoff + 1, len(edu_lens)):
        esum = 0
        for r in range(i, -1, -1):
            esum += edu_lens[r]
            if esum > distance:
                windows.append((r + 1, i))
                break
    return windows
# Input and output files are resolved relative to the current working directory.
# NOTE(review): '<turns>' and '<parser>' look like placeholders to be replaced
# with the real file names before running -- confirm.
current_folder = os.getcwd()
data_path = current_folder + '/<turns>.json'
save_path = current_folder + '/<parser>.jsonl'

# Read the input explicitly as UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open(data_path, 'r', encoding='utf-8') as j:
    jfile = json.load(j)

dialogues = jfile
json_l = []  # every sample (one dict per output line), across all dialogues
dialogue_count = 0
DISTANCE = 15  # maximum number of edus kept in a context window
start_index = 0

for dial in dialogues:
    dialogue_count += 1
    dial_id = dial['id']
    print(dial_id)

    #if generating a test file for incremental parsing, add space marker between dialogues
    #for any other files (test for gold parsing or train), remove this ---->
    sample = {}
    sample['PS'] = ""
    sample['sample'] = "NEW DIALOGUE " + dial_id
    json_l.append(sample)
    #<-------------------------------

    turns = preprocess_edus(dial['turns']) #preprocess game edus
    print(turns)

    if not turns:
        # A dialogue without turns has no turn 0 to seed the context with;
        # skip it instead of failing with an IndexError below.
        continue

    windows = get_windows(turns, DISTANCE)
    print('------------------')
    print(windows)

    #start with first window
    global_context = []
    global_context.extend(turns[0]) #add 0 turn "mission has started"

    #go through each subsequent turn in first window and create a new sample;
    #the context grows by one turn after each sample
    for t in turns[1:windows[0][1]+1]:
        sample = {}
        c = "\n".join(global_context)
        n = "\n".join(t)
        sample['PS'] = ""
        sample['sample'] = 'Context: ' + c + "\nStructure: \nNew Turn: " + n
        json_l.append(sample)
        global_context.extend(t)

    #now for each new turn added beyond the first window, we need to adjust the context window:
    #the context is rebuilt from turns window[0] .. window[1]-1 and the new turn is turn window[1]
    for window in windows[1:]:
        global_context = []
        for tw in turns[window[0]:window[1]]:
            global_context.extend(tw)
        sample = {}
        c = "\n".join(global_context)
        n = "\n".join(turns[window[1]])
        sample['PS'] = ""
        sample['sample'] = 'Context: ' + c + "\nStructure: \nNew Turn: " + n
        json_l.append(sample)

#write one json object per line to the output file
with jsonlines.open(save_path, mode='w') as writer:
    for x in json_l:
        writer.write(x)

print('jsonl saved for {} games'.format(dialogue_count))