| |
| |
| |
| |
|
|
| from abc import ABC, abstractmethod |
|
|
| import jsonlines |
| import json |
| import copy |
| import glob |
| import random |
| import fire |
|
|
|
|
class Converter(ABC):
    """
    Abstract base class for dataset converters.

    Subclasses implement `process()` with dataset-specific logic;
    `convert()` wraps it with start/finish progress messages.
    """

    def __init__(self, filepath) -> None:
        super().__init__()
        # Input path (a file or a directory, depending on the subclass).
        self.filepath = filepath

    def convert(self):
        """Run the full conversion: announce start, process, announce finish."""
        self.start()
        self.process()
        self.end()

    def start(self):
        # Progress banner; shows the concrete subclass name and input path.
        print(f'Start processing {self.__class__.__name__} at {self.filepath}')

    def end(self):
        print(
            f'Finish processing {self.__class__.__name__} at {self.filepath}')

    @abstractmethod
    def process(self):
        """
        Implement your convert logics in this function
        """
|
|
|
class DSTC7Converter(Converter):
    '''
    Converter class for DSTC7 Grounded response generation.

    Reads a tab-separated conversation dump and writes
    Context/Knowledge/Response triples to ../data/dstc7.jsonl.
    '''

    def process(self):
        examples = []
        # Context manager so the input file is always closed
        # (the original leaked the handle).
        with open(self.filepath) as convs:
            for conv in convs:
                # Columns: hash, conversation id, score, facts, context, response.
                _, c_id, score, facts, context, response = conv.split('\t')
                if context.strip() == 'START':
                    # A bare START token means there is no real context yet.
                    continue
                # Strip the dataset's control-token prefix from the context.
                context = context.replace('START EOS TIL ', '')
                # A fresh dict is built per line, so no deepcopy is needed
                # (the original deep-copied each one redundantly).
                examples.append({
                    'Context': context.strip(),
                    # Drop the spaced-out paragraph markup around the facts.
                    'Knowledge': facts.replace(
                        ' < p > ', '').replace(' < /p > ', '').strip(),
                    'Response': response.strip(),
                })

        with jsonlines.open('../data/dstc7.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)
|
|
|
|
class MSMARCOConverter(Converter):
    '''
    Converter class for MS MARCO.

    Reads the MS MARCO train JSON and writes Context/Knowledge/Response
    triples to ../data/msmarco.jsonl.
    '''

    def process(self):
        # Context manager so the input file is closed
        # (the original's json.load(open(...)) leaked the handle).
        with open(self.filepath) as f:
            train_data = json.load(f)

        examples = []
        for ids in train_data['query']:
            query = train_data['query'][ids]
            answer = train_data['answers'][ids]
            passage = train_data['passages'][ids]
            # Keep only the passages annotators marked as relevant.
            knowledge = [p['passage_text'] for p in passage if p['is_selected']]
            # Fresh dict per example, so no deepcopy needed
            # (the original deep-copied each one redundantly).
            examples.append({
                'Context': query.strip(),
                'Knowledge': ' '.join(knowledge),
                # `answer` is a list of answer strings; join into one response.
                'Response': ' '.join(answer).strip(),
            })

        with jsonlines.open('../data/msmarco.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)
|
|
|
|
class UnifiedQAConverter(Converter):
    '''
    Converter class for UnifiedQA.

    Walks {filepath}/*/* for train.tsv / test.tsv files and writes
    Context/Knowledge/Response triples to ../data/unifiedqa.jsonl.
    '''

    def process(self):
        examples = []
        for fname in glob.glob(f'{self.filepath}/*/*'):
            if 'train.tsv' not in fname and 'test.tsv' not in fname:
                continue
            # Context manager: the original leaked the file handle.
            with open(fname) as data:
                for line in data:
                    line = line.strip()
                    # Lines look like "question\\nstory<TAB>answer"; skip
                    # malformed ones. The original swallowed everything with
                    # a bare `except: pass`, which also hid a NameError from
                    # an undefined counter `k` — both removed here.
                    try:
                        question, answer = line.split('\t')
                        question, story = question.split('\\n')
                    except ValueError:
                        continue
                    examples.append({
                        'Context': question,
                        'Response': answer,
                        'Knowledge': story,
                    })

        # Context manager closes the writer (the original never closed it).
        with jsonlines.open('../data/unifiedqa.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)
|
|
|
|
class SGDConverter(Converter):

    '''
    Converter class for SGD dataset
    '''

    def process(self):
        """
        Convert SGD dialogues into Context/Knowledge/Response triples and
        write them to ../data/sgd.jsonl.

        Per SYSTEM turn: Context is the ' EOS '-joined dialogue history,
        Knowledge is the belief-state string accumulated from the preceding
        USER turn, and Response is the system utterance with annotated slot
        values replaced by [slot_name] placeholders (delexicalized).
        """
        examples = []
        for split in ['train', 'dev', 'test']:
            # Map service_name -> schema entry for quick lookup.
            schema_info = json.load(
                open(f'{self.filepath}/{split}/schema.json'))
            schema_info = dict([(i['service_name'], i) for i in schema_info])
            for file in glob.glob(f'{self.filepath}/{split}/dialogues_*.json'):
                data = json.load(open(file))
                for dialogue in data:
                    dialogue_id = dialogue['dialogue_id']
                    # Only the dialogue's first service is used for the schema.
                    services = dialogue['services'][0]
                    schema = schema_info[services]
                    # NOTE: description / task_slots / task_intents(_description)
                    # are computed but never used below.
                    description = schema['description']
                    task_slots = [s['name'] for s in schema['slots']]
                    task_intents = [s['name'] for s in schema['intents']]
                    task_intents_description = [
                        s['description'] for s in schema['intents']]
                    turns = dialogue['turns']
                    history = []
                    example = {}
                    for idx, turn in enumerate(turns):
                        if idx == 0:
                            # SGD dialogues are expected to start with the user.
                            assert turn['speaker'] == 'USER'
                        frame = turn['frames'][0]
                        # Service prefix, e.g. 'Restaurants_1' -> 'restaurants'.
                        service = turn['frames'][0]['service'].split('_')[
                            0].lower()
                        if turn['speaker'] == 'USER':
                            user_utter = turn['utterance']
                            history.append(f'{user_utter}')
                            # Build "slot = value ; slot = value ..." from the
                            # belief state; used by the following SYSTEM turn.
                            belief_slot_values = frame['state']['slot_values']
                            slot_values_list = []
                            for slot_value in belief_slot_values.items():
                                slot, values = slot_value
                                # Only the first value per slot is kept.
                                value = values[0]
                                slot_values_list.append(f'{slot} = {value}')
                            slot_values_str = ' ; '.join(slot_values_list)

                        else:
                            # SYSTEM turn: delexicalize the utterance.
                            sys_utter = copy.copy(turn['utterance'])
                            # Relies on slot_values_str set by the preceding
                            # USER turn (guaranteed by the idx==0 assert).
                            slot_values_str = f'belief : {service} {slot_values_str}'

                            slots = frame['slots']
                            # Pass 1: overwrite each annotated span with a
                            # same-length run of its index digit so later
                            # spans' offsets stay valid (hence offset == 0).
                            offset = 0
                            len_ = len(sys_utter)
                            candidates = []
                            # NOTE(review): this inner loop shadows the outer
                            # `idx`; harmless, since enumerate reassigns it.
                            for idx, slot_info in enumerate(slots):
                                start, end, slot_name = slot_info['start'], slot_info['exclusive_end'], slot_info['slot']
                                sys_utter = sys_utter[:start+offset] + str(
                                    idx) * (end - start) + sys_utter[end+offset:]
                                candidates.append(
                                    (slot_name, str(idx) * (end - start)))
                            # Pass 2: replace each digit-run placeholder with
                            # its [slot_name] tag.
                            # NOTE(review): for idx >= 10 the placeholder is
                            # longer than the span, and digit runs can collide
                            # with real digits in the utterance — TODO confirm
                            # these cases don't occur in practice.
                            for idx, info in enumerate(candidates):
                                slotname, target = info
                                sys_utter = sys_utter.replace(
                                    target, f'[{slotname}]')

                            reply = f'{sys_utter}'
                            example['Context'] = ' EOS '.join(history)
                            example['Knowledge'] = slot_values_str
                            example['Response'] = reply
                            # Deep copy: `example` is reused across turns.
                            examples.append(copy.deepcopy(example))
                            history.append(reply)

        train_writer = jsonlines.open('../data/sgd.jsonl', mode='w')
        for i in examples:
            train_writer.write(i)

        return
|
|
|
|
def merge_and_split():
    """
    Merge the four converted jsonl files and split into train/valid sets.

    Reads dstc7, msmarco, sgd and unifiedqa jsonl files (in that order),
    then writes ~99% of examples to grounded_data_train.jsonl and ~1% to
    grounded_data_valid.jsonl. The split is reproducible (seeded RNG).
    """
    examples = []
    # One loop instead of four copy-pasted read stanzas; order preserved.
    for filepath in ('../data/dstc7.jsonl',
                     '../data/msmarco.jsonl',
                     '../data/sgd.jsonl',
                     '../data/unifiedqa.jsonl'):
        with open(filepath, "r", encoding="utf-8") as reader:
            for item in jsonlines.Reader(reader):
                examples.append(item)

    random.seed(2021)
    # Context managers close both writers (the originals were never closed).
    with jsonlines.open('../data/grounded_data_train.jsonl', mode='w') as train_writer, \
            jsonlines.open('../data/grounded_data_valid.jsonl', mode='w') as valid_writer:
        for example in examples:
            if random.random() < 0.01:
                valid_writer.write(example)
            else:
                train_writer.write(example)

    print('Done!')
|
|
|
|
def process(
    msmarco_path,
    sgd_path,
    dstc7_path,
    unified_qa_path
):
    """
    Run every dataset converter over its input path.

    Order matters only for log readability: MS MARCO, SGD, DSTC7, UnifiedQA.
    """
    jobs = [
        (MSMARCOConverter, f'{msmarco_path}/train_v2.1.json'),
        (SGDConverter, f'{sgd_path}'),
        (DSTC7Converter, f'{dstc7_path}'),
        (UnifiedQAConverter, unified_qa_path),
    ]
    for converter_cls, path in jobs:
        converter_cls(path).convert()
|
|
|
|
def main():
    """CLI entry point: run the converters, then build the final splits."""
    # python-fire turns `process`'s parameters into command-line flags.
    fire.Fire(process)
    merge_and_split()
|
|
|
|
# Script entry guard: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
|
|