Upload 6 files

Browse files

Files changed (6) hide show

scripts/create_downstream_dataset.sh +25 -0
scripts/create_grounded_dataset.sh +24 -0
scripts/create_reddit.py +80 -0
scripts/create_reddit_dataset.sh +16 -0
scripts/downstream_tasks_converter.py +288 -0
scripts/grounded_converter.py +264 -0

scripts/create_downstream_dataset.sh ADDED Viewed

	@@ -0,0 +1,25 @@

#!/bin/bash
# ------------------------------------------------------------------
# create_downstream_dataset.sh
#   Converts downstream-task corpora with downstream_tasks_converter.py.
#   Only the MultiWOZ conversion is enabled; the WoW, WoI and CoQA
#   commands are kept below, commented out, for manual use.
#   MULTIWOZ_PATH may be overridden from the environment.
#   Run from the scripts/ directory: the converter is invoked by a
#   relative path.
# ------------------------------------------------------------------
# Stop at the first failing command or unset variable.
set -euo pipefail

VERSION=0.1.0
SUBJECT=DialoGLMDownstreamDataset
USAGE="Usage: [MULTIWOZ_PATH=<dir>] bash create_downstream_dataset.sh"

# Please follow ParlAI to download the WoW and WoI datasets.
# WOW_PATH=/home/bapeng/anaconda3/envs/parlai/lib/python3.8/site-packages/data/wizard_of_wikipedia
# python downstream_tasks_converter.py WoWConverter "${WOW_PATH}"
# WOI_PATH=/home/bapeng/anaconda3/envs/parlai/lib/python3.8/site-packages/data/wizard_of_internet
# python downstream_tasks_converter.py WoIConverter "${WOI_PATH}"

# Please follow https://github.com/stanfordnlp/coqa-baselines to prepare
# seq2seq-train-h2 and seq2seq-dev-h2.
# COQA_PATH=/home/bapeng/experiment/cqa/coqa-baselines/data
# python downstream_tasks_converter.py CoQAConverter "${COQA_PATH}"

# Please clone https://github.com/wenhuchen/HDSA-Dialog to download the data.
MULTIWOZ_PATH="${MULTIWOZ_PATH:-/home/bapeng/experiment/HDSA-Dialog/data}"
python downstream_tasks_converter.py MultiWOZConverter "${MULTIWOZ_PATH}"

scripts/create_grounded_dataset.sh ADDED Viewed

	@@ -0,0 +1,24 @@

#!/bin/bash
# ------------------------------------------------------------------
# create_grounded_dataset.sh
#   Runs grounded_converter.py over the MS MARCO, SGD, DSTC7 and
#   UnifiedQA corpora. Every *_PATH below may be overridden from the
#   environment; the defaults are the original author's local paths.
#   Run from the scripts/ directory: the converter is invoked by a
#   relative path.
# ------------------------------------------------------------------
# Stop at the first failing command or unset variable.
set -euo pipefail

VERSION=0.1.0
SUBJECT=DialoGLMGroundedDataset
USAGE="Usage: [MSMARCO_PATH=<dir>] [SGD_PATH=<dir>] [DSTC7_PATH=<tsv>] [UNIFIED_QA_PATH=<dir>] bash create_grounded_dataset.sh"

# Please follow https://microsoft.github.io/msmarco/ to download the MS MARCO dataset.
MSMARCO_PATH="${MSMARCO_PATH:-/home/bapeng/experiment/DialoGLM/data/dummy_data/msmarco}"
# Please follow https://github.com/google-research-datasets/dstc8-schema-guided-dialogue
SGD_PATH="${SGD_PATH:-/home/bapeng/experiment/dstc8-schema-guided-dialogue}"
# Please follow https://github.com/mgalley/DSTC7-End-to-End-Conversation-Modeling
DSTC7_PATH="${DSTC7_PATH:-/home/bapeng/experiment/DialoGLM/data/dummy_data/dstc7/dstc7_h100.tsv}"
# Please follow the instructions on https://github.com/allenai/unifiedqa to download the dataset.
UNIFIED_QA_PATH="${UNIFIED_QA_PATH:-/home/bapeng/experiment/DialoGLM/data/dummy_data/unifedqa}"

python grounded_converter.py "${MSMARCO_PATH}" "${SGD_PATH}" "${DSTC7_PATH}" "${UNIFIED_QA_PATH}"

scripts/create_reddit.py ADDED Viewed

	@@ -0,0 +1,80 @@

+#!/usr/bin/env python
+#  coding=utf-8
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT license.
+import jsonlines
+import fire
+def _norm_text(text):
+    w, *toks = text.strip().split()
+    try:
+        w = float(w)
+    except Exception:
+        toks = [w] + toks
+        w = 1.0
+    return w, ' '.join(toks)
def _get_inputs_from_text(text):
    """Parse one ``sources<TAB>target`` line into parallel weight/turn lists.

    The source side is split on ``' EOS '`` into turns and every turn is
    passed through ``_norm_text``. The target turn is appended only when its
    weight is non-zero.

    Args:
        text: A raw line holding exactly one tab between source and target.

    Returns:
        ``(weights, inputs)``: two lists of equal length.

    Raises:
        ValueError: If the stripped line does not split into exactly two
            tab-separated fields.
    """
    source_part, target_part = text.strip().split('\t')
    pairs = [_norm_text(turn) for turn in source_part.split(' EOS ')]
    target_weight, target_text = _norm_text(target_part)
    if target_weight != 0:
        pairs.append((target_weight, target_text))
    weights = [weight for weight, _ in pairs]
    inputs = [turn for _, turn in pairs]
    return weights, inputs
def process(reddit_path):
    """Build session-level and turn-level jsonl files from a Reddit dump.

    Pass 1 reads ``reddit_path`` line by line, drops every session that
    contains a zero-weight turn, and writes the rest to
    ``../data/reddit_session_level.jsonl`` as ``{'text': 'a EOS b EOS c'}``.

    Pass 2 re-reads that file and, for every turn after the first, writes a
    ``Context`` / ``Knowledge`` / ``Response`` example to
    ``../data/reddit.jsonl``. ``Knowledge`` is always the empty string.

    Args:
        reddit_path: Path to the tab-separated input file.
    """
    session_path = '../data/reddit_session_level.jsonl'
    turn_path = '../data/reddit.jsonl'

    # The writer must be closed (and therefore flushed) before pass 2 reads
    # the same file back, hence the context manager.
    with open(reddit_path, "r", encoding="utf-8") as reader, \
            jsonlines.open(session_path, 'w') as writer:
        for line_no, line in enumerate(reader, start=1):
            if line_no % 10000 == 0:
                print(line_no)
            weights, inputs = _get_inputs_from_text(line)
            if 0.0 in weights:
                continue
            writer.write({'text': ' EOS '.join(inputs)})

    with open(session_path, "r", encoding="utf-8") as reader, \
            jsonlines.open(turn_path, mode='w') as writer:
        for session_no, item in enumerate(jsonlines.Reader(reader), start=1):
            if session_no % 10000 == 0:
                print(session_no)
            # NOTE(review): splitting on the bare 'EOS' token also splits
            # inside words that contain it; kept so the emitted strings stay
            # byte-identical to earlier runs.
            context = item['text'].split('EOS')
            # A dedicated index keeps the progress counter above intact.
            for turn_idx in range(len(context) - 1):
                history = 'EOS'.join(context[:turn_idx + 1])
                if not history:
                    continue
                writer.write({
                    'Context': history,
                    'Knowledge': '',
                    'Response': context[turn_idx + 1].strip(),
                })
def main():
    """Expose ``process`` as a command-line interface through Fire."""
    fire.Fire(process)


if __name__ == '__main__':
    main()

scripts/create_reddit_dataset.sh ADDED Viewed

	@@ -0,0 +1,16 @@

#!/bin/bash
# ------------------------------------------------------------------
# create_reddit_dataset.sh
#   Runs create_reddit.py on a Reddit dialogue dump.
#   REDDIT_PATH may be overridden from the environment.
#   Run from the scripts/ directory: both the script and the default
#   input path are relative.
# ------------------------------------------------------------------
# Stop at the first failing command or unset variable.
set -euo pipefail

VERSION=0.1.0
SUBJECT=DialoGLMRedditDataset
USAGE="Usage: [REDDIT_PATH=<file>] bash create_reddit_dataset.sh"

# Input for create_reddit.py: one session per line, source turns joined by
# ' EOS ', then a tab, then the target turn.
REDDIT_PATH="${REDDIT_PATH:-../data/dummy_data/reddit/dialogpt.t1000.txt}"
python create_reddit.py "${REDDIT_PATH}"

scripts/downstream_tasks_converter.py ADDED Viewed

	@@ -0,0 +1,288 @@

+#!/usr/bin/env python
+#  coding=utf-8
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT license.
+from abc import ABC, abstractmethod
+import jsonlines
+import json
+import copy
+import random
+import fire
class Converter(ABC):
    """Abstract base class for dataset converters.

    Subclasses implement :meth:`process`. Callers invoke :meth:`convert`,
    which wraps it with start and finish messages.
    """

    def __init__(self, filepath) -> None:
        """Store ``filepath``, the input location handed to :meth:`process`."""
        super().__init__()
        self.filepath = filepath

    def convert(self):
        """Run the conversion: announce start, process, announce finish."""
        self.start()
        self.process()
        self.end()

    def start(self):
        """Print the start-of-processing message."""
        print(f'Start processing {self.__class__.__name__} at {self.filepath}')

    def end(self):
        """Print the end-of-processing message."""
        print(
            f'Finish processing {self.__class__.__name__} at {self.filepath}')

    @abstractmethod
    def process(self):
        """Implement your conversion logic in this function."""
class WoWConverter(Converter):
    """Convert Wizard of Wikipedia dialogs to Context/Knowledge/Response jsonl.

    Reads ``train.json``, ``valid_random_split.json`` and
    ``test_random_split.json`` from ``self.filepath`` and writes
    ``../data/wow/wow_{train,valid,test}.jsonl``. One example is emitted per
    Wizard turn.
    """

    @staticmethod
    def _load_json(path):
        """Load a JSON file, closing the handle afterwards."""
        with open(path) as handle:
            return json.load(handle)

    @staticmethod
    def _checked_sentence(turn):
        """Return the first value of ``turn['checked_sentence']``, or ``''``.

        Falls back to the empty string when the field is missing, empty, or
        not a dict.
        """
        checked = turn.get('checked_sentence')
        if isinstance(checked, dict) and checked:
            return next(iter(checked.values()))
        return ''

    @classmethod
    def _dialog_examples(cls, turns, history, strip_history):
        """Turn one dialog into a list of examples.

        Args:
            turns: The dialog's turn dicts (``speaker``, ``text``, ...).
            history: Initial context turns; extended in place.
            strip_history: Whether turns are stripped before being added to
                the history. The train split strips and the valid/test splits
                do not; that existing difference is preserved.
        """
        examples = []
        for turn in turns:
            text = turn['text']
            if 'Wizard' in turn['speaker']:
                examples.append({
                    'Context': ' EOS '.join(history),
                    'Knowledge': cls._checked_sentence(turn),
                    'Response': text.strip(),
                })
            history.append(text.strip() if strip_history else text)
        return examples

    def process(self):
        """Write the train subset and the full valid/test splits."""
        train_data = self._load_json(f'{self.filepath}/train.json')

        # Group dialogs by topic. setdefault keeps the first dialog of each
        # topic, which the earlier if/else silently dropped.
        topic_data = {}
        for dialog in train_data:
            topic_data.setdefault(dialog['chosen_topic'], []).append(
                (dialog['persona'], dialog['dialog']))
        topic_data_sorted = sorted(
            topic_data.items(), key=lambda k: -len(k[1]))

        # Training subset: every second topic among ranks 1..99 by dialog
        # count, one dialog per topic.
        examples = []
        for _topic, dialogs in topic_data_sorted[1:100:2]:
            for _persona, dialog in dialogs[:1]:
                # NOTE(review): the persona is deliberately left out of the
                # train context, matching existing behaviour (valid/test do
                # include it) -- confirm which is intended.
                examples.extend(
                    self._dialog_examples(dialog, [], strip_history=True))
        with jsonlines.open('../data/wow/wow_train.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)

        for split in ['valid', 'test']:
            data = self._load_json(
                f'{self.filepath}/{split}_random_split.json')
            examples = []
            for dialog in data:
                examples.extend(self._dialog_examples(
                    dialog['dialog'], [dialog['persona']],
                    strip_history=False))
            with jsonlines.open(f'../data/wow/wow_{split}.jsonl', mode='w') as writer:
                for example in examples:
                    writer.write(example)
        return super().process()
class WoIConverter(Converter):
    """Convert Wizard of the Internet dialogs to Context/Knowledge/Response jsonl.

    Reads ``{train,valid,test}.jsonl`` from ``self.filepath`` and writes
    ``../data/woi/woi_{split}.jsonl``. One example is emitted per
    ``'Wizard => Apprentice'`` message. The train split is randomly
    down-sampled; ``random`` is not seeded here, so the sample differs
    between runs.
    """

    # Probability of keeping each training example.
    TRAIN_SAMPLE_RATE = 0.006

    @staticmethod
    def _selected_knowledge(context):
        """Return the content sentences whose selection flag is truthy."""
        contents = [sentence
                    for document in context['contents']
                    for sentence in document['content']]
        flags = [flag
                 for group in context['selected_contents']
                 for flag in group]
        # The first flag is skipped before pairing with the sentences --
        # presumably it marks "no passage used"; TODO confirm against the
        # dataset format.
        return [sentence
                for sentence, flag in zip(contents, flags[1:]) if flag]

    def process(self):
        """Convert each split, sampling the train split."""
        for split in ['train', 'valid', 'test']:
            examples = []
            # Context manager so the input handle is released per split.
            with jsonlines.open(f'{self.filepath}/{split}.jsonl') as reader:
                for dialog in reader:
                    data = list(dialog.values())[0]
                    history = [data['apprentice_persona'].replace('\n', ' ')]
                    for message in data['dialog_history']:
                        if 'SearchAgent' in message['action']:
                            continue
                        if message['action'] == 'Wizard => Apprentice':
                            knowledge = self._selected_knowledge(
                                message['context'])
                            turn = message['text'].strip()
                            examples.append({
                                'Context': ' EOS '.join(history),
                                'Knowledge': ' '.join(knowledge),
                                'Response': turn,
                            })
                        else:
                            turn = message['text'].strip()
                        history.append(turn)
            with jsonlines.open(f'../data/woi/woi_{split}.jsonl', mode='w') as writer:
                for example in examples:
                    if split != 'train' or random.random() < self.TRAIN_SAMPLE_RATE:
                        writer.write(example)
        return super().process()
class CoQAConverter(Converter):
    """Convert CoQA seq2seq files to Context/Knowledge/Response jsonl.

    For each of the ``train`` and ``dev`` splits, reads
    ``seq2seq-{split}-h2-src.txt`` (``story||question`` per line) and
    ``seq2seq-{split}-h2-tgt.txt`` (one answer per line) from
    ``self.filepath``. Writes ``../data/coqa/coqa_{train,valid}.jsonl``. The
    train split is randomly down-sampled; ``random`` is not seeded here.
    """

    # Probability of keeping each training example.
    TRAIN_SAMPLE_RATE = 0.006

    def process(self):
        """Pair source and target lines and write one file per split."""
        for split in ['train', 'dev']:
            contexts = []
            with open(f'{self.filepath}/seq2seq-{split}-h2-src.txt') as source:
                for line in source:
                    line = line.strip()
                    if line:
                        # Exactly one '||' separator is expected; any other
                        # count raises ValueError rather than mis-parsing.
                        story, question = line.split('||')
                        contexts.append((story, question))
            with open(f'{self.filepath}/seq2seq-{split}-h2-tgt.txt') as target:
                responses = [line.strip() for line in target if line.strip()]

            # NOTE(review): blank lines are filtered independently on each
            # side and zip() truncates to the shorter list, so a stray blank
            # line would silently misalign the pairs.
            examples = [
                {'Context': question, 'Response': response, 'Knowledge': story}
                for (story, question), response in zip(contexts, responses)
            ]

            # The 'dev' split is published under the name 'valid'.
            out_split = 'valid' if split == 'dev' else split
            with jsonlines.open(f'../data/coqa/coqa_{out_split}.jsonl', mode='w') as writer:
                for example in examples:
                    if out_split != 'train' or random.random() < self.TRAIN_SAMPLE_RATE:
                        writer.write(example)
        return super().process()
class MultiWOZConverter(Converter):
    """Convert MultiWOZ dialogs to Context/Knowledge/Response jsonl.

    Reads ``{train,val,test}.json`` from ``self.filepath`` and writes
    ``../data/multiwoz/multiwoz_{train,valid,test}.jsonl``. For every turn,
    ``Knowledge`` is the flattened ``BS`` field plus a ``kb <count>`` tag,
    and ``Response`` is the act names followed by the system utterance. The
    train split is randomly down-sampled; ``random`` is not seeded here.
    """

    # Probability of keeping each training example.
    TRAIN_SAMPLE_RATE = 0.006

    @staticmethod
    def _kb_summary(db):
        """Render the ``KB`` value as ``'kb zero|one|two|more than two'``."""
        # Compare the KB value itself. The earlier code compared the
        # accumulated string to 1 and 2, so those branches never fired.
        if db == 0:
            word = 'zero'
        elif db == 1:
            word = 'one'
        elif db == 2:
            word = 'two'
        else:
            word = 'more than two'
        return 'kb ' + word

    @staticmethod
    def _belief_summary(bs):
        """Flatten ``{domain: [(slot, value), ...]}`` into a single string."""
        return ' | '.join(
            domain + ' ' + ' ; '.join(
                state[0] + ' = ' + state[1] for state in states)
            for domain, states in bs.items())

    def process(self):
        """Convert each split, sampling the train split."""
        for split in ['train', 'val', 'test']:
            with open(f'{self.filepath}/{split}.json') as handle:
                data = json.load(handle)

            examples = []
            for dialog in data:
                history = []
                for turn in dialog['info']:
                    history.append(turn['user_orig'])
                    knowledge = (self._belief_summary(turn['BS'])
                                 + ' | ' + self._kb_summary(turn['KB']))
                    act_seq = ' '.join(turn['act'].keys())
                    system_utterance = turn['sys'].strip()
                    examples.append({
                        'Context': ' EOS '.join(history),
                        'Knowledge': knowledge,
                        'Response': act_seq + ' | ' + system_utterance,
                    })
                    history.append(system_utterance)

            # The 'val' split is published under the name 'valid'.
            out_split = 'valid' if split == 'val' else split
            with jsonlines.open(f'../data/multiwoz/multiwoz_{out_split}.jsonl', mode='w') as writer:
                for example in examples:
                    if out_split != 'train' or random.random() < self.TRAIN_SAMPLE_RATE:
                        writer.write(example)
        return super().process()
def convert(class_name, file_path):
    """Instantiate the named converter class and run it.

    The class is looked up by name among this module's globals instead of
    being passed to ``eval``, so a command-line argument can no longer
    execute arbitrary expressions.

    Args:
        class_name: Name of a converter class defined in this module, e.g.
            ``'MultiWOZConverter'``.
        file_path: Input location forwarded to the converter's constructor.

    Raises:
        ValueError: If ``class_name`` does not name a class in this module
            that exposes a ``convert`` method.
    """
    converter_cls = globals().get(class_name)
    if not (isinstance(converter_cls, type)
            and callable(getattr(converter_cls, 'convert', None))):
        raise ValueError(f'Unknown converter class: {class_name!r}')
    converter_cls(file_path).convert()
def main():
    """Expose ``convert`` as a command-line interface through Fire."""
    fire.Fire(convert)


if __name__ == '__main__':
    main()

scripts/grounded_converter.py ADDED Viewed

	@@ -0,0 +1,264 @@

+#!/usr/bin/env python
+#  coding=utf-8
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT license.
+from abc import ABC, abstractmethod
+import jsonlines
+import json
+import copy
+import glob
+import random
+import fire
class Converter(ABC):
    """Abstract base class for the grounded-dataset converters.

    Subclasses implement :meth:`process`. Callers invoke :meth:`convert`,
    which wraps it with start and finish messages.
    """

    def __init__(self, filepath) -> None:
        """Store ``filepath``, the input location handed to :meth:`process`."""
        super().__init__()
        self.filepath = filepath

    def convert(self):
        """Run the conversion: announce start, process, announce finish."""
        self.start()
        self.process()
        self.end()

    def start(self):
        """Print the start-of-processing message."""
        print(f'Start processing {self.__class__.__name__} at {self.filepath}')

    def end(self):
        """Print the end-of-processing message."""
        print(
            f'Finish processing {self.__class__.__name__} at {self.filepath}')

    @abstractmethod
    def process(self):
        """Implement your conversion logic in this function."""
class DSTC7Converter(Converter):
    '''
    Converter class for DSTC7 Grounded response generation.

    Reads the tab-separated file at ``self.filepath`` (six fields per line;
    the last three are facts, context and response) and writes
    ``../data/dstc7.jsonl``.
    '''

    def process(self):
        """Convert every line whose context is more than the bare 'START'."""
        examples = []
        # Context manager so the input handle is closed once parsing ends.
        with open(self.filepath) as convs:
            for conv in convs:
                # The first three fields are not used by the conversion.
                _, _c_id, _score, facts, context, response = conv.split('\t')
                if context.strip() == 'START':
                    continue
                context = context.replace('START EOS TIL ', '')
                knowledge = facts.replace(
                    ' < p > ', '').replace(' < /p > ', '').strip()
                examples.append({
                    'Context': context.strip(),
                    'Knowledge': knowledge,
                    'Response': response.strip(),
                })
        with jsonlines.open('../data/dstc7.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)
        return
class MSMARCOConverter(Converter):
    '''
    Converter class for MS MARCO.

    Reads the JSON file at ``self.filepath``, which holds parallel
    ``query`` / ``answers`` / ``passages`` mappings keyed by id, and writes
    ``../data/msmarco.jsonl``.
    '''

    def process(self):
        """Emit one example per query, grounded on its selected passages."""
        # Context manager so the input handle is closed after loading.
        with open(self.filepath) as handle:
            train_data = json.load(handle)

        examples = []
        for ids, query in train_data['query'].items():
            answer = train_data['answers'][ids]
            passages = train_data['passages'][ids]
            knowledge = [passage['passage_text']
                         for passage in passages if passage['is_selected']]
            examples.append({
                'Context': query.strip(),
                'Knowledge': ' '.join(knowledge),
                'Response': ' '.join(answer).strip(),
            })
        with jsonlines.open('../data/msmarco.jsonl', mode='w') as writer:
            for example in examples:
                writer.write(example)
        return
class UnifiedQAConverter(Converter):
    """Convert UnifiedQA tsv files to Context/Knowledge/Response jsonl.

    Scans ``{self.filepath}/*/*`` for paths containing ``train.tsv`` or
    ``test.tsv`` and writes ``../data/unifiedqa.jsonl``. Each usable line is
    ``question\\nstory<TAB>answer``, where ``\\n`` is the literal
    two-character sequence backslash + ``n``. Lines that do not match this
    shape are skipped.
    """

    def process(self):
        """Collect examples from all matching files and write them out."""
        examples = []
        # glob order is unspecified; sorting keeps the output order (and the
        # seeded train/valid split applied later) reproducible.
        for fname in sorted(glob.glob(f'{self.filepath}/*/*')):
            if 'train.tsv' not in fname and 'test.tsv' not in fname:
                continue
            with open(fname) as data:
                for line in data:
                    try:
                        question, answer = line.strip().split('\t')
                        question, story = question.split('\\n')
                    except ValueError:
                        # Wrong number of fields: skip just this line. This
                        # replaces a bare `except` that also hid an
                        # increment of an undefined counter.
                        continue
                    examples.append({
                        'Context': question,
                        'Response': answer,
                        'Knowledge': story,
                    })
        # Context manager so the output is flushed before it is read back.
        with jsonlines.open('../data/unifiedqa.jsonl', mode='w') as train_writer:
            for example in examples:
                train_writer.write(example)
        return
class SGDConverter(Converter):
    '''
    Converter class for SGD dataset.

    Reads ``{train,dev,test}/dialogues_*.json`` under ``self.filepath`` and
    writes every system turn of all three splits to ``../data/sgd.jsonl``.
    ``Knowledge`` is the belief state of the preceding user turn, and
    ``Response`` is the system utterance with slot values replaced by
    ``[slot_name]`` placeholders.
    '''

    @staticmethod
    def _delexicalize(utterance, slots):
        """Replace each annotated span in ``utterance`` with ``[slot_name]``.

        Spans are spliced from right to left so the character offsets of the
        spans still to be processed stay valid. This avoids digit-run
        placeholders, which break for ten or more slots and can collide with
        digits already present in the text.
        """
        for slot_info in sorted(
                slots, key=lambda info: info['start'], reverse=True):
            start, end = slot_info['start'], slot_info['exclusive_end']
            utterance = (
                f"{utterance[:start]}[{slot_info['slot']}]{utterance[end:]}")
        return utterance

    @staticmethod
    def _belief_summary(frame):
        """Render ``frame['state']['slot_values']`` as ``slot = value ; ...``.

        Only the first value of each slot is used.
        """
        return ' ; '.join(
            f'{slot} = {values[0]}'
            for slot, values in frame['state']['slot_values'].items())

    def process(self):
        """Convert all dialogues of all splits into one jsonl file."""
        examples = []
        for split in ['train', 'dev', 'test']:
            # glob order is unspecified; sorting keeps the output order
            # reproducible.
            for file in sorted(
                    glob.glob(f'{self.filepath}/{split}/dialogues_*.json')):
                with open(file) as handle:
                    data = json.load(handle)
                for dialogue in data:
                    history = []
                    belief = ''
                    for turn_idx, turn in enumerate(dialogue['turns']):
                        if turn_idx == 0 and turn['speaker'] != 'USER':
                            raise ValueError(
                                f'{file}: dialogue does not start with a '
                                'USER turn')
                        # Only the first frame of a turn is considered.
                        frame = turn['frames'][0]
                        service = frame['service'].split('_')[0].lower()
                        if turn['speaker'] == 'USER':
                            history.append(f"{turn['utterance']}")
                            belief = self._belief_summary(frame)
                        else:
                            reply = self._delexicalize(
                                turn['utterance'], frame['slots'])
                            examples.append({
                                'Context': ' EOS '.join(history),
                                # Built into a fresh string so the stored
                                # belief is never prefixed twice.
                                'Knowledge': f'belief : {service} {belief}',
                                'Response': reply,
                            })
                            history.append(reply)
        # Context manager so the output is flushed before it is read back.
        with jsonlines.open('../data/sgd.jsonl', mode='w') as train_writer:
            for example in examples:
                train_writer.write(example)
        return
def merge_and_split():
    """Merge the four converted corpora and split them into train/valid.

    Reads ``dstc7``, ``msmarco``, ``sgd`` and ``unifiedqa`` jsonl files from
    ``../data`` in that order. With ``random`` seeded to 2021, each example
    goes to ``../data/grounded_data_valid.jsonl`` with probability 0.01 and
    to ``../data/grounded_data_train.jsonl`` otherwise.
    """
    source_paths = (
        '../data/dstc7.jsonl',
        '../data/msmarco.jsonl',
        '../data/sgd.jsonl',
        '../data/unifiedqa.jsonl',
    )
    examples = []
    for filepath in source_paths:
        with open(filepath, "r", encoding="utf-8") as reader:
            examples.extend(jsonlines.Reader(reader))

    random.seed(2021)
    # Context managers so both outputs are flushed and closed on return.
    with jsonlines.open(
            '../data/grounded_data_train.jsonl', mode='w') as train_writer, \
            jsonlines.open(
                '../data/grounded_data_valid.jsonl', mode='w') as valid_writer:
        for example in examples:
            if random.random() < 0.01:
                valid_writer.write(example)
            else:
                train_writer.write(example)
    print('Done!')
def process(
    msmarco_path,
    sgd_path,
    dstc7_path,
    unified_qa_path
):
    """Run the four grounded-corpus converters in a fixed order.

    Args:
        msmarco_path: Directory containing ``train_v2.1.json``.
        sgd_path: Input location handed to ``SGDConverter``.
        dstc7_path: Input location handed to ``DSTC7Converter``.
        unified_qa_path: Input location handed to ``UnifiedQAConverter``.
    """
    jobs = (
        (MSMARCOConverter, f'{msmarco_path}/train_v2.1.json'),
        (SGDConverter, f'{sgd_path}'),
        (DSTC7Converter, f'{dstc7_path}'),
        (UnifiedQAConverter, unified_qa_path),
    )
    for converter_cls, path in jobs:
        converter_cls(path).convert()
def main():
    """Run the converters through Fire, then merge and split their output."""
    fire.Fire(process)
    # Merge the generated data and split it into train and valid.
    merge_and_split()


if __name__ == '__main__':
    main()