PeteBleackley committed on
Commit
65ae142
·
1 Parent(s): 6ebe943

Testing scripts

Browse files
Files changed (1) hide show
  1. scripts.py +92 -7
scripts.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import re
4
  import argparse
5
  import pickle
 
6
  import numpy
7
  import tokenizers
8
  import transformers
@@ -20,6 +21,7 @@ import qarac.utils.CoreferenceResolver
20
  import nltk.corpus
21
  import difflib
22
  import scipy.stats
 
23
 
24
 
25
 
@@ -133,10 +135,12 @@ def train_models(path):
133
  question_answering='corpora/question_answering.csv',
134
  reasoning='corpora/reasoning_train.csv',
135
  consistency='corpora/consistency.csv')
136
- trainer.fit(training_data,
137
- epochs=10,
138
- workers=16,
139
- use_multiprocessing=True)
 
 
140
  huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
141
  trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
142
  trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
@@ -211,9 +215,86 @@ def test_encode_decode(path):
211
  axes = pandas.Series(percentiles, index=percent).plot.bar()
212
  axes.get_figure().savefig('encode_decode_percentile.svg')
213
 
214
-
215
-
216
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
 
219
  if __name__ == '__main__':
@@ -232,4 +313,8 @@ if __name__ == '__main__':
232
  prepare_training_datasets()
233
  elif args.task == 'train_models':
234
  train_models(args.filename)
 
 
 
 
235
 
 
3
  import re
4
  import argparse
5
  import pickle
6
+ import json
7
  import numpy
8
  import tokenizers
9
  import transformers
 
21
  import nltk.corpus
22
  import difflib
23
  import scipy.stats
24
+ import scipy.spatial
25
 
26
 
27
 
 
135
  question_answering='corpora/question_answering.csv',
136
  reasoning='corpora/reasoning_train.csv',
137
  consistency='corpora/consistency.csv')
138
+ history = trainer.fit(training_data,
139
+ epochs=10,
140
+ workers=16,
141
+ use_multiprocessing=True)
142
+ with open('history.json','w') as jsonfile:
143
+ json.dump(history.history,jsonfile)
144
  huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
145
  trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
146
  trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
 
215
  axes = pandas.Series(percentiles, index=percent).plot.bar()
216
  axes.get_figure().savefig('encode_decode_percentile.svg')
217
 
218
+
219
+ def test_question_answering(path):
220
+ question_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-question-encoder'.format(path))
221
+ answer_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
222
+ tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
223
+ data = pandas.read_csv('WikiQA.tsv',sep='\t')
224
+ data['QNum']=data['QuestionID'].apply(lambda x: int(x[1:]))
225
+ nlp = spacy.load('en_core_web_trf')
226
+ predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
227
+ data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
228
+ unique_questions = data.groupby('QNum')['Question'].first()
229
+ cleaned_questions = pandas.Series([clean_question(doc)
230
+ for doc in nlp.pipe(unique_questions)],
231
+ index = unique_questions.index)
232
+
233
+ def tokenize(column):
234
+ return tokenizer.encode_batch(column.apply(lambda x:tokenizers.TextInputSequence(x)),
235
+ add_special_tokens=False)
236
+ questions = tokenize(cleaned_questions)
237
+ maxlen=max((len(question) for question in questions))
238
+ pad_token = tokenizer.token_to_id('<pad>')
239
+ for question in questions:
240
+ question.pad(maxlen,pad_id=pad_token)
241
+ question_ids = tensorflow.constant([question.ids
242
+ for question in questions])
243
+ attention_mask = tensorflow.constant(numpy.not_equal(question_ids.numpy(),
244
+ pad_token).astype(int))
245
+ q_vectors = question_encoder(question_ids,
246
+ attention_mask=attention_mask).numpy()
247
+ answers = tokenize(data['Resolved_answer'])
248
+ maxlen = max((len(answer) for answer in answers))
249
+ for answer in answers:
250
+ answer.pad(maxlen,pad_id=pad_token)
251
+ answer_ids = tensorflow.constant([answer.ids
252
+ for answer in answers])
253
+ attention_mask = tensorflow.constant(numpy.not_equal(answer_ids.numpy(),
254
+ pad_token).astype(int))
255
+ answer_lookup = scipy.spatial.KDTree(answer_encoder(answer_ids,
256
+ attention_mask=attention_mask).numpy())
257
+ n_correct = 0
258
+ all_distances = 0.0
259
+ correct_distances = 0.0
260
+ wrong_distances = 0.0
261
+ all_sq = 0.0
262
+ correct_sq = 0.0
263
+ wrong_sq = 0.0
264
+ for (i,qv) in enumerate(q_vectors):
265
+ (d,row) = answer_lookup.query(qv)
266
+ dsq=d**2.0
267
+ correct = (row['QNum']==i and row['Label']==1)
268
+ all_distances+=d
269
+ all_sq+=dsq
270
+ if correct:
271
+ n_correct+=1
272
+ correct_distances+=d
273
+ correct_sq+=dsq
274
+ else:
275
+ wrong_distances+=d
276
+ wrong_sq+=dsq
277
+ N = cleaned_questions.shape[0]
278
+ print("{0} questions, {1} possible answers, {2} correct answers".format(N,
279
+ data.shape[0],
280
+ n_correct))
281
+ accuracy = n_correct/N
282
+ baseline = N/data.shape[0]
283
+ kappa = 1.0 - ((1.0-accuracy)/(1.0-baseline))
284
+ print(("Accuracy: {0}, Baseline {1}, kappa{2} ".format(accuracy,baseline,kappa)))
285
+ mean_dist =all_distances/N
286
+ mean_sq = all_sq/N
287
+ all_sd = numpy.sqrt(mean_sq-(mean_dist**2.0))
288
+ print("Question-answer distances")
289
+ print("All: mean {0}, sd {1}".format(mean_dist,all_sd))
290
+ correct_mean = correct_distances/n_correct
291
+ correct_meansq = correct_sq/n_correct
292
+ correct_sd = numpy.sqrt(correct_meansq - (correct_mean**2.0))
293
+ print("Correct: mean {0}, sd {1}".format(correct_mean,correct_sd))
294
+ wrong_mean = wrong_distances/(N-n_correct)
295
+ wrong_meansq = wrong_sq/(N-n_correct)
296
+ wrong_sd = numpy.sqrt(wrong_meansq - (wrong_mean**2.0))
297
+ print("Wrong: mean {0}, sd {1}".format(wrong_mean,wrong_sd))
298
 
299
 
300
  if __name__ == '__main__':
 
313
  prepare_training_datasets()
314
  elif args.task == 'train_models':
315
  train_models(args.filename)
316
+ elif args.task == 'test_encode_decode':
317
+ test_encode_decode(args.filename)
318
+ elif args.task== 'test_question_answering':
319
+ test_question_answering(args.filename)
320