| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | ''' |
| | MRPC : Microsoft Research Paraphrase (detection) Corpus |
| | ''' |
| | from __future__ import absolute_import, division, unicode_literals |
| |
|
| | import os |
| | import logging |
| | import numpy as np |
| | import io |
| |
|
| | from senteval.tools.validation import KFoldClassifier |
| |
|
| | from sklearn.metrics import f1_score |
| |
|
| |
|
class MRPCEval(object):
    """SentEval evaluation for MRPC (Microsoft Research Paraphrase Corpus).

    Loads the train/test TSV files, embeds both sentences of every pair with
    a user-supplied batcher, and trains a k-fold classifier on the
    [|u - v|, u * v] feature combination of the two sentence embeddings.
    """

    def __init__(self, task_path, seed=1111):
        logging.info('***** Transfer task : MRPC *****\n\n')
        self.seed = seed
        # Both splits are read eagerly at construction time.
        self.mrpc_data = {
            'train': self.loadFile(os.path.join(task_path,
                                                'msr_paraphrase_train.txt')),
            'test': self.loadFile(os.path.join(task_path,
                                               'msr_paraphrase_test.txt')),
        }

    def do_prepare(self, params, prepare):
        # Hand every sentence (both sides, both splits) to the caller's
        # vocabulary-building hook, in train-A, train-B, test-A, test-B order.
        samples = []
        for split in ('train', 'test'):
            for side in ('X_A', 'X_B'):
                samples += self.mrpc_data[split][side]
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse one MRPC TSV file into tokenized sentence pairs + labels."""
        sent_a, sent_b, labels = [], [], []
        with io.open(fpath, 'r', encoding='utf-8') as fin:
            for row in fin:
                cols = row.strip().split('\t')
                sent_a.append(cols[3].split())   # first sentence, tokenized
                sent_b.append(cols[4].split())   # second sentence, tokenized
                labels.append(cols[0])           # paraphrase quality label
        # The first row of the file is the TSV header -> drop it everywhere.
        return {'X_A': sent_a[1:],
                'X_B': sent_b[1:],
                'y': [int(lab) for lab in labels[1:]]}

    def run(self, params, batcher):
        embed = {'train': {}, 'test': {}}

        for split in self.mrpc_data:
            logging.info('Computing embedding for {0}'.format(split))
            data = self.mrpc_data[split]
            # Sort pairs by (len A, len B, label) so batches group
            # similar-length sentences, which speeds up most batchers.
            ordered = sorted(zip(data['X_A'], data['X_B'], data['y']),
                             key=lambda trip: (len(trip[0]),
                                               len(trip[1]),
                                               trip[2]))
            sents = {'A': [trip[0] for trip in ordered],
                     'B': [trip[1] for trip in ordered]}
            labels = [trip[2] for trip in ordered]

            for side in ('A', 'B'):
                chunks = []
                for start in range(0, len(labels), params.batch_size):
                    batch = sents[side][start:start + params.batch_size]
                    chunks.append(batcher(params, batch))
                embed[split][side] = np.vstack(chunks)
            embed[split]['y'] = np.array(labels)
            logging.info('Computed {0} embeddings'.format(split))

        # Pair features: element-wise |u - v| concatenated with u * v.
        trainA = embed['train']['A']
        trainB = embed['train']['B']
        train_X = np.c_[np.abs(trainA - trainB), trainA * trainB]
        train_y = embed['train']['y']

        testA = embed['test']['A']
        testB = embed['test']['B']
        test_X = np.c_[np.abs(testA - testB), testA * testB]
        test_y = embed['test']['y']

        config = {'nclasses': 2, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid, 'kfold': params.kfold}
        clf = KFoldClassifier(train={'X': train_X, 'y': train_y},
                              test={'X': test_X, 'y': test_y}, config=config)

        devacc, testacc, yhat = clf.run()
        testf1 = round(100*f1_score(test_y, yhat), 2)
        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
                      .format(devacc, testacc, testf1))
        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
                'ndev': len(trainA), 'ntest': len(testA)}
| |
|