Llamipa / bespoke /format_unannotated_jsonl.py

Upload 4 files

30b495e verified 9 months ago

4.41 kB

	"""
	A dialogue is a list of samples, where each sample contains one new speaker turn.

	Takes a JSON file of annotated Minecraft games and converts each dialogue into
	the turn-by-turn sample format used for LLAMA parsing.

	NB: when creating json-l, use '###PS' for 'predict structure'
	"""

	import os
	import json
	import jsonlines
	from collections import defaultdict


	def preprocess_edus(tlist):
	    """
	    Group the edus of a dialogue by turn and label each one.

	    Every edu is rewritten as '<index> <Spkr> <text>', where <index> is a
	    counter that starts at 0 and keeps running across all turns of the
	    dialogue, and <Spkr> is the first four characters of the turn's speaker.

	    Ex:

	    [...['6 <Buil> What is D2'],
	    ['7 <Arch> Ah there is no stack,', '8 <Arch> pick up the washer'],...]

	    The first turn shown holds the edu with index 6, the next turn holds the
	    edus with indexes 7 and 8.

	    tlist: list of turns, each a dict with a 'speaker' string and an 'edus'
	    list of strings.
	    returns: a list with one list of labelled edu strings per turn.

	    NB: in a dialogue, might be best to change speakers to "Arch" and "Buil" to
	    reflect MSDC training data
	    """
	    labelled_turns = []
	    next_index = 0  # index the next edu will receive

	    for turn in tlist:
	        short_name = turn['speaker'][:4]
	        # write code to change speaker names here

	        labelled = [
	            str(edu_index) + ' <' + short_name + '> ' + edu_text
	            for edu_index, edu_text in enumerate(turn['edus'], start=next_index)
	        ]
	        next_index += len(labelled)
	        labelled_turns.append(labelled)

	    return labelled_turns

	def get_windows(dial_turns, distance = 15):
	    """
	    Compute the sliding context windows for one dialogue.

	    Takes the output from preprocess_edus() (a list of turns, each a list of
	    edus) and returns a list of index pairs. Each pair (first, last) gives the
	    inclusive delimiting turn indexes of a window whose total number of
	    edus is <= distance.

	    The first pair is always (0, k), where k is the last turn for which turns
	    0..k together hold <= distance edus. It is followed by one pair (s, i) for
	    every later turn i, where s is the earliest turn such that turns s..i still
	    hold <= distance edus.

	    Ex.
	    [(0, 11), (1, 12), (4, 13), (5, 14), ...]

	    Here, turns 0 through 11 contain edus <= distance, but once the edus from
	    turn 12 are added, the window has to be adjusted in order for edus to
	    remain <= distance. The window must be shifted to 1-12, then to 4-13, etc.

	    Edge cases:
	    - the whole dialogue fits: a single window (0, len(dial_turns) - 1)
	    - no turns, or turn 0 alone exceeds distance: the first window is (0, -1),
	      i.e. it is empty
	    - a later turn i that alone exceeds distance: its window is (i + 1, i)
	    """
	    edu_lens = [len(turn) for turn in dial_turns]

	    # Default to the last turn: if the running total never exceeds `distance`
	    # the whole dialogue is the first window. (Leaving this at 0 would report
	    # the window (0, 0) and silently drop every later turn.)
	    first_cutoff = len(edu_lens) - 1
	    running = 0
	    for i, n_edus in enumerate(edu_lens):
	        running += n_edus
	        if running > distance:
	            first_cutoff = i - 1
	            break
	    windows = [(0, first_cutoff)]

	    # Every turn past the first window ends a window of its own. Walk backwards
	    # from that turn until the budget is exceeded; the window starts one turn
	    # after the one that broke the budget. The prefix 0..end always exceeds
	    # `distance` here, so the inner loop is guaranteed to hit the break.
	    for end in range(first_cutoff + 1, len(edu_lens)):
	        running = 0
	        for start in range(end, -1, -1):
	            running += edu_lens[start]
	            if running > distance:
	                windows.append((start + 1, end))
	                break

	    return windows

	# Input and output are resolved against the directory the script is launched
	# from. '<turns>' and '<parser>' look like placeholders to be replaced with the
	# real file names before running -- TODO confirm.
	current_folder=os.getcwd()

	data_path = current_folder + '/<turns>.json'
	save_path = current_folder + '/<parser>.jsonl'

	# Max number of edus allowed in one context window (passed to get_windows).
	DISTANCE = 15


	def _make_sample(context_edus, new_turn_edus):
	    """Build one jsonl record: the context edus plus the new turn, with an empty 'PS' field."""
	    context_text = "\n".join(context_edus)
	    new_turn_text = "\n".join(new_turn_edus)
	    return {
	        'PS': "",
	        'sample': 'Context: ' + context_text + "\nStructure: \nNew Turn: " + new_turn_text,
	    }


	# Read as UTF-8 explicitly so the result does not depend on the platform's
	# default encoding.
	with open(data_path, 'r', encoding='utf-8') as j:
	    dialogues = json.load(j)

	json_l = []
	dialogue_count = 0

	for dial in dialogues:
	    dialogue_count += 1
	    dial_id = dial['id']
	    print(dial_id)

	    #if generating a test file for incremental parsing, add space marker between dialogues
	    #for any other files (test for gold parsing or train), remove this ---->
	    json_l.append({'PS': "", 'sample': "NEW DIALOGUE " + dial_id})
	    #<-------------------------------

	    turns = preprocess_edus(dial['turns']) #preprocess game edus
	    print(turns)

	    # A dialogue without turns has nothing to sample; skip it instead of
	    # failing on turns[0] below and losing the whole run.
	    if not turns:
	        continue

	    windows = get_windows(turns, DISTANCE)
	    print('------------------')
	    print(windows)

	    #start with first window: turn 0 is context only ("mission has started"),
	    #every subsequent turn in the window becomes a new sample and is then
	    #appended to the growing context
	    global_context = list(turns[0])
	    first_window_end = windows[0][1]
	    for t in turns[1:first_window_end + 1]:
	        json_l.append(_make_sample(global_context, t))
	        global_context.extend(t)

	    #for each new turn beyond the first window the context is rebuilt from
	    #the window: turns first..last-1 are context, turn `last` is the new turn
	    for first, last in windows[1:]:
	        global_context = []
	        for tw in turns[first:last]:
	            global_context.extend(tw)
	        json_l.append(_make_sample(global_context, turns[last]))


	#write one json object per line
	with jsonlines.open(save_path, mode='w') as writer:
	    for x in json_l:
	        writer.write(x)

	print('jsonl saved for {} games'.format(dialogue_count))