Spaces:
Build error
Build error
PeteBleackley
committed on
Commit
·
65ae142
1
Parent(s):
6ebe943
Testing scripts
Browse files- scripts.py +92 -7
scripts.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
| 3 |
import re
|
| 4 |
import argparse
|
| 5 |
import pickle
|
|
|
|
| 6 |
import numpy
|
| 7 |
import tokenizers
|
| 8 |
import transformers
|
|
@@ -20,6 +21,7 @@ import qarac.utils.CoreferenceResolver
|
|
| 20 |
import nltk.corpus
|
| 21 |
import difflib
|
| 22 |
import scipy.stats
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
|
|
@@ -133,10 +135,12 @@ def train_models(path):
|
|
| 133 |
question_answering='corpora/question_answering.csv',
|
| 134 |
reasoning='corpora/reasoning_train.csv',
|
| 135 |
consistency='corpora/consistency.csv')
|
| 136 |
-
trainer.fit(training_data,
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
| 140 |
huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
|
| 141 |
trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
|
| 142 |
trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
|
|
@@ -211,9 +215,86 @@ def test_encode_decode(path):
|
|
| 211 |
axes = pandas.Series(percentiles, index=percent).plot.bar()
|
| 212 |
axes.get_figure().savefig('encode_decode_percentile.svg')
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
|
| 219 |
if __name__ == '__main__':
|
|
@@ -232,4 +313,8 @@ if __name__ == '__main__':
|
|
| 232 |
prepare_training_datasets()
|
| 233 |
elif args.task == 'train_models':
|
| 234 |
train_models(args.filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
|
|
|
| 3 |
import re
|
| 4 |
import argparse
|
| 5 |
import pickle
|
| 6 |
+
import json
|
| 7 |
import numpy
|
| 8 |
import tokenizers
|
| 9 |
import transformers
|
|
|
|
| 21 |
import nltk.corpus
|
| 22 |
import difflib
|
| 23 |
import scipy.stats
|
| 24 |
+
import scipy.spatial
|
| 25 |
|
| 26 |
|
| 27 |
|
|
|
|
| 135 |
question_answering='corpora/question_answering.csv',
|
| 136 |
reasoning='corpora/reasoning_train.csv',
|
| 137 |
consistency='corpora/consistency.csv')
|
| 138 |
+
history = trainer.fit(training_data,
|
| 139 |
+
epochs=10,
|
| 140 |
+
workers=16,
|
| 141 |
+
use_multiprocessing=True)
|
| 142 |
+
with open('history.json','w') as jsonfile:
|
| 143 |
+
json.dump(history.history,jsonfile)
|
| 144 |
huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
|
| 145 |
trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
|
| 146 |
trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
|
|
|
|
| 215 |
axes = pandas.Series(percentiles, index=percent).plot.bar()
|
| 216 |
axes.get_figure().savefig('encode_decode_percentile.svg')
|
| 217 |
|
| 218 |
+
|
| 219 |
+
def test_question_answering(path):
    """Evaluate the trained question/answer encoders on the WikiQA dataset.

    Encodes every (coreference-resolved) candidate answer sentence, builds a
    KD-tree over the answer vectors, and for each question checks whether its
    nearest answer vector is a labelled-correct answer to that question.
    Prints accuracy against a random-choice baseline (a kappa-style score)
    and the mean/sd of question-answer distances, split by correct vs wrong.

    Parameters
    ----------
    path : str
        Hugging Face namespace under which the trained encoders were pushed
        (the same ``path`` passed to ``train_models``).
    """
    # NOTE(review): `transformers.Transformer` does not exist in the
    # transformers API — this presumably needs the concrete model class the
    # encoders were saved as (e.g. a TF auto/custom class); confirm against
    # what train_models pushes to the hub.
    question_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-question-encoder'.format(path))
    answer_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    data = pandas.read_csv('WikiQA.tsv', sep='\t')
    # QuestionIDs look like 'Q123'; strip the leading letter to get a number.
    data['QNum'] = data['QuestionID'].apply(lambda x: int(x[1:]))
    nlp = spacy.load('en_core_web_trf')
    predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
    # Resolve coreferences within each question's block of candidate answers.
    data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
    unique_questions = data.groupby('QNum')['Question'].first()
    cleaned_questions = pandas.Series([clean_question(doc)
                                       for doc in nlp.pipe(unique_questions)],
                                      index=unique_questions.index)
    pad_token = tokenizer.token_to_id('<pad>')

    def tokenize(column):
        # Batch-encode a Series of strings; no special tokens are added.
        return tokenizer.encode_batch(column.apply(lambda x: tokenizers.TextInputSequence(x)),
                                      add_special_tokens=False)

    def vectorise(column, encoder):
        # Tokenize, pad to the batch maximum, mask padding, and encode.
        # (Shared by questions and answers — the original duplicated this.)
        encodings = tokenize(column)
        maxlen = max(len(encoding) for encoding in encodings)
        for encoding in encodings:
            encoding.pad(maxlen, pad_id=pad_token)
        ids = tensorflow.constant([encoding.ids for encoding in encodings])
        mask = tensorflow.constant(numpy.not_equal(ids.numpy(),
                                                   pad_token).astype(int))
        return encoder(ids, attention_mask=mask).numpy()

    q_vectors = vectorise(cleaned_questions, question_encoder)
    # KD-tree over answer vectors for nearest-neighbour lookup; tree indices
    # correspond to row positions in `data`.
    answer_lookup = scipy.spatial.KDTree(vectorise(data['Resolved_answer'],
                                                   answer_encoder))
    n_correct = 0
    all_distances = 0.0
    correct_distances = 0.0
    wrong_distances = 0.0
    all_sq = 0.0
    correct_sq = 0.0
    wrong_sq = 0.0
    for (i, qv) in enumerate(q_vectors):
        (d, row) = answer_lookup.query(qv)
        dsq = d**2.0
        # BUG FIX: KDTree.query returns an integer row index, not a record,
        # so the original `row['QNum']` raised TypeError.  Also compare
        # against this question's actual QNum (the Series index), not the
        # 0-based positional counter `i`.
        answer = data.iloc[row]
        correct = (answer['QNum'] == cleaned_questions.index[i]
                   and answer['Label'] == 1)
        all_distances += d
        all_sq += dsq
        if correct:
            n_correct += 1
            correct_distances += d
            correct_sq += dsq
        else:
            wrong_distances += d
            wrong_sq += dsq
    N = cleaned_questions.shape[0]
    print("{0} questions, {1} possible answers, {2} correct answers".format(N,
                                                                            data.shape[0],
                                                                            n_correct))
    accuracy = n_correct / N
    # Baseline = expected accuracy of picking one answer per question at random.
    baseline = N / data.shape[0]
    kappa = 1.0 - ((1.0 - accuracy) / (1.0 - baseline))
    print("Accuracy: {0}, Baseline {1}, kappa {2}".format(accuracy, baseline, kappa))
    mean_dist = all_distances / N
    mean_sq = all_sq / N
    all_sd = numpy.sqrt(mean_sq - (mean_dist**2.0))
    print("Question-answer distances")
    print("All: mean {0}, sd {1}".format(mean_dist, all_sd))
    # BUG FIX: guard the per-class statistics against division by zero when
    # every answer is wrong (n_correct == 0) or every answer is right.
    if n_correct > 0:
        correct_mean = correct_distances / n_correct
        correct_meansq = correct_sq / n_correct
        correct_sd = numpy.sqrt(correct_meansq - (correct_mean**2.0))
        print("Correct: mean {0}, sd {1}".format(correct_mean, correct_sd))
    if n_correct < N:
        wrong_mean = wrong_distances / (N - n_correct)
        wrong_meansq = wrong_sq / (N - n_correct)
        wrong_sd = numpy.sqrt(wrong_meansq - (wrong_mean**2.0))
        print("Wrong: mean {0}, sd {1}".format(wrong_mean, wrong_sd))
|
| 298 |
|
| 299 |
|
| 300 |
if __name__ == '__main__':
|
|
|
|
| 313 |
prepare_training_datasets()
|
| 314 |
elif args.task == 'train_models':
|
| 315 |
train_models(args.filename)
|
| 316 |
+
elif args.task == 'test_encode_decode':
|
| 317 |
+
test_encode_decode(args.filename)
|
| 318 |
+
elif args.task== 'test_question_answering':
|
| 319 |
+
test_question_answering(args.filename)
|
| 320 |
|