PeteBleackley committed on
Commit
6d6bb62
·
1 Parent(s): 65ae142

Testing script for reasoning

Browse files
Files changed (1) hide show
  1. scripts.py +89 -1
scripts.py CHANGED
@@ -294,7 +294,95 @@ def test_question_answering(path):
294
  wrong_mean = wrong_distances/(N-n_correct)
295
  wrong_meansq = wrong_sq/(N-n_correct)
296
  wrong_sd = numpy.sqrt(wrong_meansq - (wrong_mean**2.0))
297
- print("Wrong: mean {0}, sd {1}".format(wrong_mean,wrong_sd))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
 
300
  if __name__ == '__main__':
 
294
  wrong_mean = wrong_distances/(N-n_correct)
295
  wrong_meansq = wrong_sq/(N-n_correct)
296
  wrong_sd = numpy.sqrt(wrong_meansq - (wrong_mean**2.0))
297
+ print("Wrong: mean {0}, sd {1}".format(wrong_mean,wrong_sd))
298
+
299
def test_reasoning(path):
    """Evaluate the trained reasoning model on the Avicenna syllogism test set.

    Loads the answer-encoder and decoder from ``path``, generates a conclusion
    for each pair of premises whose 'Syllogistic relation' is 'yes', and scores
    each generated conclusion against the reference conclusion with
    difflib.SequenceMatcher. Prints summary statistics, fits a Beta
    distribution to the match ratios, and saves histogram / percentile plots
    as SVG files.

    Parameters
    ----------
    path : str
        Directory containing the pretrained
        'qarac-roberta-answer-encoder' and 'qarac-roberta-decoder' models.

    Side effects: reads 'corpora/Avicenna_Test.csv', writes
    'reasoning_histogram.svg' and 'reasoning_percentile.svg', prints results.
    """
    encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    # FIX: original said 'qarac-robeerta-decoder'; assumed a typo for
    # 'qarac-roberta-decoder' to match the encoder naming — TODO confirm the
    # actual directory name on disk.
    decoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-decoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    # Special tokens are treated as junk by the sequence matcher.
    exclude = tokenizer.encode('<s> </s> <pad>').ids
    analyser = difflib.SequenceMatcher(lambda x: x in exclude)
    data = pandas.read_csv('corpora/Avicenna_Test.csv', encoding='iso-8859-1')
    data = data.loc[data['Syllogistic relation'] == 'yes']
    pad_token = tokenizer.token_to_id('<pad>')

    def tokenize(column):
        # Batch-encode one dataframe column without adding special tokens.
        return tokenizer.encode_batch(column.apply(lambda x: tokenizers.TextInputSequence(x)),
                                      add_special_tokens=False)

    def encode_batch(batch):
        # Pad a batch of encodings to uniform length and return
        # (token-id tensor, attention-mask tensor).
        # FIX: original computed max((len(sample for sample in batch))) —
        # len() of a generator raises TypeError.
        maxlen = max(len(sample) for sample in batch)
        for sample in batch:
            sample.pad(maxlen, pad_token)
        # FIX: original read `p0.batch` / `p1.batch` (attribute access on the
        # tokenized column) instead of the local batch list.
        ids = tensorflow.constant([sample.ids for sample in batch])
        # FIX: original built the premise-2 mask from the premise-1 ids;
        # each mask must come from its own id tensor.
        attention = tensorflow.constant(numpy.not_equal(ids.numpy(),
                                                        pad_token).astype(int))
        return (ids, attention)

    def score_batch(p0_batch, p1_batch, c_batch, matches):
        # Generate conclusions for one batch and append match ratios
        # (generated vs. reference conclusion) to `matches`.
        (p0_in, p0_attn) = encode_batch(p0_batch)
        (p1_in, p1_attn) = encode_batch(p1_batch)
        predictions = decoder.generate(vector=(encoder(p0_in,
                                                       attention_mask=p0_attn)
                                               + encoder(p1_in,
                                                         attention_mask=p1_attn)))
        for (s1, s2) in zip(c_batch, predictions):
            analyser.set_seqs(s1.ids, s2)
            matches.append(analyser.ratio())

    p0 = tokenize(data['Premise 1'])
    p1 = tokenize(data['Premise 2'])
    c = tokenize(data['Conclusion'])
    p0_batch = []
    p1_batch = []
    c_batch = []
    matches = []
    for (p0_sample, p1_sample, c_sample) in zip(p0, p1, c):
        p0_batch.append(p0_sample)
        p1_batch.append(p1_sample)
        c_batch.append(c_sample)
        if len(p0_batch) == 32:
            score_batch(p0_batch, p1_batch, c_batch, matches)
            p0_batch = []
            p1_batch = []
            c_batch = []
    # Final partial batch, if any.
    if p0_batch:
        score_batch(p0_batch, p1_batch, c_batch, matches)
    matches = numpy.array(matches)
    # FIX: numpy arrays have .std(), not .sd().
    print("Accuracy: mean = {0}, sd = {1}".format(matches.mean(),
                                                  matches.std()))
    (alpha, beta, loc, scale) = scipy.stats.beta.fit(matches, floc=0.0, fscale=1.0)
    print("Beta distribution parameters alpha = {0}, beta = {1}".format(alpha, beta))
    (hist, bins) = numpy.histogram(matches, bins='fd')
    # FIX: backend typo 'matploblib...' -> 'matplotlib...'.
    with pandas.option_context('plotting.backend', 'matplotlib.backends.backend_svg'):
        # FIX: bin centres need (left+right)/2; original divided only bins[:-1].
        axes = pandas.Series(hist, index=(bins[1:] + bins[:-1]) / 2).plot.bar()
        axes.get_figure().savefig('reasoning_histogram.svg')
    percent = numpy.linspace(0.0, 1.0, 101)
    percentiles = numpy.quantile(matches, percent)
    with pandas.option_context('plotting.backend', 'matplotlib.backends.backend_svg'):
        axes = pandas.Series(percentiles, index=percent).plot.bar()
        axes.get_figure().savefig('reasoning_percentile.svg')
386
 
387
 
388
  if __name__ == '__main__':