File size: 8,787 Bytes

59d97af

#!/usr/bin/env python
#  coding=utf-8
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT license.

from abc import ABC, abstractmethod

import jsonlines
import json
import copy
import glob
import random
import fire


class Converter(ABC):
    """
    Base class for dataset converters.

    Subclasses implement ``process``; callers run a conversion through
    ``convert``, which brackets ``process`` with start/finish messages.
    """

    def __init__(self, filepath) -> None:
        """
        Remember the input location.

        Args:
            filepath: Location of the raw data. Whether it is a single file
                or a directory is decided by the subclass that reads it.
        """
        super().__init__()

        self.filepath = filepath

    def convert(self):
        """
        Run the full conversion: ``start()``, then ``process()``, then ``end()``.

        If ``process`` raises, the exception propagates and the finish
        message is not printed.
        """
        self.start()
        self.process()
        self.end()

    def start(self):
        """Print a message naming the converter class and its input path."""
        print(f'Start processing {self.__class__.__name__} at {self.filepath}')

    def end(self):
        """Print the matching message once processing has finished."""
        print(
            f'Finish processing {self.__class__.__name__} at {self.filepath}')

    @abstractmethod
    def process(self):
        """
        Implement your convert logic in this function.

        Called by ``convert`` between ``start`` and ``end``.
        """


class DSTC7Converter(Converter):

    '''
    Converter class for DSTC7 Grounded response generation
    '''

    def process(self):
        '''
        Convert the tab-separated file at ``self.filepath`` into
        ``../data/dstc7.jsonl``.

        Every input line must split into exactly six tab-separated fields;
        the fourth, fifth and sixth are used as facts, context and response.
        Lines whose context is only ``START`` are skipped. Each kept line
        becomes one JSON object with ``Context``, ``Knowledge`` and
        ``Response`` keys.

        Raises:
            ValueError: if a line does not have exactly six fields.
        '''
        examples = []
        # Context manager so the input handle is released even when a
        # malformed line aborts the loop. The encoding is pinned to UTF-8 so
        # the result does not depend on the platform's locale.
        with open(self.filepath, encoding='utf-8') as convs:
            for conv in convs:
                _, _c_id, _score, facts, context, response = conv.split('\t')
                if context.strip() == 'START':
                    continue
                context = context.replace('START EOS TIL ', '')
                # Drop the tokenised paragraph markers from the facts field.
                knowledge = facts.replace(
                    ' < p > ', '').replace(' < /p > ', '').strip()
                examples.append({
                    'Context': context.strip(),
                    'Knowledge': knowledge,
                    'Response': response.strip(),
                })

        with jsonlines.open('../data/dstc7.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)


class MSMARCOConverter(Converter):

    '''
    Converter class for MS MARCO
    '''

    def process(self):
        '''
        Convert the JSON file at ``self.filepath`` into
        ``../data/msmarco.jsonl``.

        The file is read as a JSON object with ``query``, ``answers`` and
        ``passages`` mappings that share the same ids. For each id the
        query becomes ``Context``, the space-joined answers become
        ``Response``, and the texts of the passages flagged ``is_selected``
        are space-joined into ``Knowledge``.
        '''
        # Context manager so the (potentially large) input handle is closed
        # as soon as parsing is done.
        with open(self.filepath, encoding='utf-8') as handle:
            train_data = json.load(handle)

        queries = train_data['query']
        answers = train_data['answers']
        passages = train_data['passages']

        examples = []
        for ids, query in queries.items():
            knowledge = [passage['passage_text']
                         for passage in passages[ids]
                         if passage['is_selected']]
            examples.append({
                'Context': query.strip(),
                'Knowledge': ' '.join(knowledge),
                'Response': ' '.join(answers[ids]).strip(),
            })

        with jsonlines.open('../data/msmarco.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)


class UnifiedQAConverter(Converter):

    '''
    Converter class for UnifiedQA-style TSV files
    '''

    def process(self):
        '''
        Collect examples from every ``train.tsv`` / ``test.tsv`` found two
        levels below ``self.filepath`` and write them to
        ``../data/unifiedqa.jsonl``.

        A usable line has exactly one tab (question <TAB> answer) and its
        question part contains exactly one literal backslash-n sequence
        separating the question from the story. Lines with any other shape
        are skipped.
        '''
        examples = []
        for fname in glob.glob(f'{self.filepath}/*/*'):
            if 'train.tsv' not in fname and 'test.tsv' not in fname:
                continue
            with open(fname, encoding='utf-8') as data:
                for line in data:
                    try:
                        question, answer = line.strip().split('\t')
                        # The separator is the two-character text "\n",
                        # not a real newline.
                        question, story = question.split('\\n')
                    except ValueError:
                        # Best effort: ignore lines that do not have the
                        # expected question/story/answer shape.
                        continue
                    examples.append({
                        'Context': question,
                        'Response': answer,
                        'Knowledge': story,
                    })

        # Context manager guarantees the output is flushed and closed.
        with jsonlines.open('../data/unifiedqa.jsonl', mode='w') as train_writer:
            for example in examples:
                train_writer.write(example)


class SGDConverter(Converter):

    '''
    Converter class for SGD dataset
    '''

    def process(self):
        '''
        Convert the ``train``, ``dev`` and ``test`` splits under
        ``self.filepath`` into ``../data/sgd.jsonl``.

        For each split, ``schema.json`` is parsed and every
        ``dialogues_*.json`` file is read. One example is emitted per
        system turn (see ``_dialogue_examples``).

        Raises:
            ValueError: if a dialogue has a system turn before any user turn.
        '''
        examples = []
        for split in ['train', 'dev', 'test']:
            # Nothing from the schema is emitted; it is still parsed so that a
            # wrong path or a broken split fails loudly instead of silently
            # producing an empty output file.
            with open(f'{self.filepath}/{split}/schema.json',
                      encoding='utf-8') as handle:
                json.load(handle)

            for file in glob.glob(f'{self.filepath}/{split}/dialogues_*.json'):
                with open(file, encoding='utf-8') as handle:
                    data = json.load(handle)
                for dialogue in data:
                    examples.extend(self._dialogue_examples(dialogue))

        # Context manager guarantees the output is flushed and closed.
        with jsonlines.open('../data/sgd.jsonl', mode='w') as train_writer:
            for example in examples:
                train_writer.write(example)

    @staticmethod
    def _belief_string(frame):
        '''Render a user frame's slot values as ``slot = value ; slot = value``.'''
        # Only the first listed value of each slot is kept.
        return ' ; '.join(
            f'{slot} = {values[0]}'
            for slot, values in frame['state']['slot_values'].items())

    @staticmethod
    def _delexicalize(utterance, slots):
        '''Replace every annotated slot span in ``utterance`` with ``[slot_name]``.'''
        # Splice from the right-most span to the left: replacing a span
        # changes the string length, but only to the right of the spans that
        # are still to be processed, so their offsets stay valid. Working on
        # offsets directly also leaves text outside the spans untouched.
        ordered = sorted(slots, key=lambda info: info['start'], reverse=True)
        for info in ordered:
            start, end = info['start'], info['exclusive_end']
            utterance = f"{utterance[:start]}[{info['slot']}]{utterance[end:]}"
        return utterance

    def _dialogue_examples(self, dialogue):
        '''
        Build one example per system turn of a single dialogue.

        ``Context`` is the ``' EOS '``-joined history so far (user utterances
        and earlier delexicalized system replies), ``Knowledge`` is
        ``belief : <service> <slot values of the latest user turn>`` and
        ``Response`` is the delexicalized system utterance. Only the first
        frame of each turn is used.
        '''
        examples = []
        history = []
        belief = None
        for turn in dialogue['turns']:
            frame = turn['frames'][0]
            if turn['speaker'] == 'USER':
                history.append(turn['utterance'])
                belief = self._belief_string(frame)
                continue

            if belief is None:
                # A system turn needs the belief state of a preceding user turn.
                raise ValueError('dialogue must start with a USER turn')

            # Service names look like "<Domain>_<n>"; keep the lowercase domain.
            service = frame['service'].split('_')[0].lower()
            reply = self._delexicalize(turn['utterance'], frame['slots'])
            examples.append({
                'Context': ' EOS '.join(history),
                'Knowledge': f'belief : {service} {belief}',
                'Response': reply,
            })
            history.append(reply)
        return examples


def merge_and_split(data_dir='../data', valid_ratio=0.01, seed=2021):
    """
    Merge the four converted datasets and split them into train/valid files.

    Reads ``dstc7.jsonl``, ``msmarco.jsonl``, ``sgd.jsonl`` and
    ``unifiedqa.jsonl`` (in that order) from ``data_dir``. Each example is
    then written to ``grounded_data_valid.jsonl`` with probability
    ``valid_ratio``, otherwise to ``grounded_data_train.jsonl``.

    Args:
        data_dir: Directory holding the inputs and receiving the outputs.
        valid_ratio: Probability that an example goes to the validation file.
        seed: Seed of the random generator that drives the split.
    """
    examples = []
    for name in ('dstc7', 'msmarco', 'sgd', 'unifiedqa'):
        with open(f'{data_dir}/{name}.jsonl', "r", encoding="utf-8") as reader:
            for item in jsonlines.Reader(reader):
                examples.append(item)

    # A private generator yields the same sequence as seeding the module-level
    # one with the same value, without disturbing global random state.
    rng = random.Random(seed)

    # Context managers guarantee both outputs are flushed and closed.
    with jsonlines.open(
            f'{data_dir}/grounded_data_train.jsonl', mode='w') as train_writer, \
            jsonlines.open(
                f'{data_dir}/grounded_data_valid.jsonl', mode='w') as valid_writer:
        for example in examples:
            if rng.random() < valid_ratio:
                valid_writer.write(example)
            else:
                train_writer.write(example)

    print('Done!')


def process(
    msmarco_path,
    sgd_path,
    dstc7_path,
    unified_qa_path
):
    """
    Run all four dataset converters, one after the other.

    Args:
        msmarco_path: Directory containing ``train_v2.1.json``.
        sgd_path: Location passed to ``SGDConverter``.
        dstc7_path: Location passed to ``DSTC7Converter``.
        unified_qa_path: Location passed to ``UnifiedQAConverter``.
    """
    # (converter class, input location) pairs, in execution order.
    jobs = (
        (MSMARCOConverter, f'{msmarco_path}/train_v2.1.json'),
        (SGDConverter, str(sgd_path)),
        (DSTC7Converter, str(dstc7_path)),
        (UnifiedQAConverter, unified_qa_path),
    )
    for converter_cls, location in jobs:
        converter_cls(location).convert()


def main():
    """Command-line entry point: convert every dataset, then merge and split."""
    # Stage 1: Fire maps the command-line arguments onto process().
    fire.Fire(component=process)
    # Stage 2: combine the generated files and carve out train/valid sets.
    merge_and_split()


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()