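"""Command-line entry points for QARAC, an experimental NLP system aimed
at improving factual accuracy: corpus-preparation and model-training
tasks, dispatched via the 'task' positional argument."""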

import os
import argparse
import tokenizers
import transformers
import huggingface_hub
import qarac.corpora.BNCorpus
import qarac.corpora.Batcher
import qarac.models.qarac_base_model
import qarac.models.QaracTrainerModel
import qarac.corpora.CombinedCorpus
import keras
import spacy
import pandas
import qarac.utils.CoreferenceResolver



def decoder_loss(y_true,y_pred):
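    """Sparse categorical cross-entropy on the logits of a transformers
    model output; from_logits=True because the logits are unnormalised."""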
    return keras.losses.sparse_categorical_crossentropy(y_true,
                                                        y_pred.logits,
                                                        from_logits=True)

def capitalise(token,i):
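    """Title-case a token if it starts the sentence (i == 0) or is
    tagged as a proper noun (NNP*); lower-case it otherwise."""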
    return (token.text_with_ws.title()
            if i == 0 or token.tag_.startswith('NNP')
            else token.text_with_ws.lower())

def clean_question(doc):
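    """Normalise the capitalisation of a parsed question and make sure
    it ends with a question mark."""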
    words = [capitalise(token,i) for (i,token) in enumerate(doc)]
    if not words[-1].rstrip().endswith('?'):
        words.append('?')
    return ''.join(words)

def prepare_wiki_qa(filename,outfilename):
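    """Prepare the WikiQA dataset: resolve coreferences in the answer
    sentences, normalise the questions, and write the cleaned
    question/answer/label columns to outfilename."""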
    data = pandas.read_csv(filename,sep='\t')
    data['QNum']=data['QuestionID'].apply(lambda x: int(x[1:]))
    nlp = spacy.load('en_core_web_trf')
    predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
    data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
    unique_questions = data.groupby('QNum')['Question'].first()
    cleaned_questions = pandas.Series([clean_question(doc)
                                       for doc in nlp.pipe(unique_questions)],
                                      index = unique_questions.index)
    data['Cleaned_question'] = data['QNum'].map(cleaned_questions)
    data[['Cleaned_question','Resolved_answer','Label']].to_csv(outfilename)

        
def train_base_model(task,filename):
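    """Train a base model on the corpus provided by qarac.corpora.BNCorpus
    for the given task and save it to filename; the model is built as a
    generative (decoder) model when task is 'decode'."""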
    tokenizer = tokenizers.Tokenizer.from_pretrained('xlm-roberta-base')
    tokenizer.add_special_tokens(['<start>','<end>','<pad>'])
    tokenizer.save(os.path.join(os.environ['HOME'],
                                'QARAC',
                                'models',
                                'tokenizer.json'))
    bnc = qarac.corpora.BNCorpus.BNCorpus(tokenizer=tokenizer,
                                          task=task)
    (train,test)=bnc.split(0.01)
    train_data=qarac.corpora.Batcher.Batcher(train)
    model = qarac.models.qarac_base_model.qarac_base_model(tokenizer.get_vocab_size(), 
                                                           768, 
                                                           12,
                                                           task=='decode')
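    # Nadam with a learning rate decaying exponentially from 1e-5
    # (factor 0.99 every 100 steps)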
    optimizer = keras.optimizers.Nadam(
        learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5,
                                                                  100,
                                                                  0.99))
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_data,
              epochs=100,
              workers = 16,
              use_multiprocessing=True)
    test_data=qarac.corpora.Batcher.Batcher(test)
    print(model.evaluate(test_data))
    model.save(filename)
    
def prepare_training_datasets():
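    """Build the training CSVs (all_text, question_answering,
    reasoning_train, consistency) from the WikiQA, Avicenna and
    SNLI corpora."""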
    wikiqa = pandas.read_csv('corpora/WikiQA.csv')
    avicenna = pandas.read_csv('corpora/Avicenna_Train.csv',encoding='iso-8859-1')
    snli = pandas.read_csv('corpora/snli_1.0_train.csv')
    question_answering = wikiqa.loc[wikiqa['Label']==1,
                                    ['Cleaned_question',
                                     'Resolved_answer']].rename(columns={'Cleaned_question':'question',
                                                                         'Resolved_answer':'answer'})
    reasoning = avicenna.loc[avicenna['Syllogistic relation']=='yes',
                             ['Premise 1',
                              'Premise 2',
                              'Conclusion']].rename(columns={'Premise 1':'proposition0',
                                                              'Premise 2':'proposition1',
                                                              'Conclusion':'conclusion'})
    consistency = snli.loc[snli['gold_label']!='-',
                           ['sentence1',
                            'sentence2']].rename(columns={'sentence1':'statement0',
                                                          'sentence2':'statement1'})
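    # Score label agreement: entailment -> 1.0, neutral -> 0.0,
    # contradiction -> -1.0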
    mapping = {'entailment':1.0,
               'neutral':0.0,
               'contradiction':-1.0}
    consistency['consistency'] = snli.loc[snli['gold_label']!='-',
                                          'gold_label'].map(mapping)
    all_text = pandas.concat([wikiqa['Resolved_answer'],
                              avicenna['Premise 1'],
                              avicenna['Premise 2'],
                              reasoning['conclusion'],
                              snli['sentence1'],
                              snli['sentence2']]).to_frame(name='all_text').reset_index(drop=True)
    all_text.to_csv('corpora/all_text.csv')
    question_answering.to_csv('corpora/question_answering.csv')
    reasoning.to_csv('corpora/reasoning_train.csv')
    consistency.to_csv('corpora/consistency.csv')
    
def train_models(path):
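    """Train the QARAC trainer model, built from RoBERTa encoder and
    decoder bases, on the combined corpus; push the question encoder,
    answer encoder and decoder to the Hugging Face Hub under path; and
    write model summaries and architecture diagrams."""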
    encoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base')
    config = encoder_base.config
    config.is_decoder = True
    decoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base',
                                                               config=config)
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel(encoder_base, 
                                                               decoder_base, 
                                                               tokenizer)
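    # Generative outputs are scored with cross-entropy on logits;
    # question answering and consistency with mean squared error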
    losses={'encode_decode':decoder_loss,
            'question_answering':keras.losses.mean_squared_error,
            'reasoning':decoder_loss,
            'consistency':keras.losses.mean_squared_error}
    optimizer = keras.optimizers.Nadam(
        learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5,
                                                                  100,
                                                                  0.99))
    trainer.compile(optimizer=optimizer,
                    loss=losses)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv')
    trainer.fit(training_data,
                epochs=10,
                workers=16,
                use_multiprocessing=True)
    huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
    trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
    trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
    trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
    with open('model_summaries.txt','w') as summaries:
        summaries.write('TRAINER MODEL\n')
        trainer.summary(print_fn=lambda line: summaries.write(line+'\n'))
        summaries.write('QUESTION ENCODER\n')
        trainer.question_encoder.summary(print_fn=lambda line: summaries.write(line+'\n'))
        summaries.write('ANSWER ENCODER\n')
        trainer.answer_encoder.summary(print_fn=lambda line: summaries.write(line+'\n'))
        summaries.write('DECODER\n')
        trainer.decoder.summary(print_fn=lambda line: summaries.write(line+'\n'))
    keras.utils.plot_model(trainer,'trainer_model.png')
    keras.utils.plot_model(trainer.answer_encoder,'encoder_model.png')
    keras.utils.plot_model(trainer.decoder,'decoder_model.png')
                                                              
    
    
    
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='QARAC',
                                     description='Experimental NLP system, aimed at improving factual accuracy')
    parser.add_argument('task')
    parser.add_argument('-f','--filename')
    parser.add_argument('-t','--training-task')
    parser.add_argument('-o','--outputfile')
    args = parser.parse_args()
    if args.task == 'train_base_model': 
        train_base_model(args.training_task,args.filename)
    elif args.task == 'prepare_wiki_qa':
        prepare_wiki_qa(args.filename,args.outputfile)
    elif args.task == 'prepare_training_datasets':
        prepare_training_datasets()
    elif args.task == 'train_models':
        train_models(args.filename)