| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | ''' |
| | SNLI - Entailment |
| | ''' |
| | from __future__ import absolute_import, division, unicode_literals |
| |
|
| | import codecs |
| | import os |
| | import io |
| | import copy |
| | import logging |
| | import numpy as np |
| |
|
| | from senteval.tools.validation import SplitClassifier |
| |
|
| |
|
class SNLIEval(object):
    """SNLI entailment transfer task.

    Loads the pre-tokenized SNLI splits from ``taskpath`` (``s1.*`` /
    ``s2.*`` sentence files plus ``labels.*`` files), encodes sentence
    pairs with a caller-supplied batcher and trains a 3-way classifier
    (entailment / neutral / contradiction) on the pair features.
    """

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
        self.seed = seed

        train1 = self.loadFile(os.path.join(taskpath, 's1.train'))
        train2 = self.loadFile(os.path.join(taskpath, 's2.train'))
        trainlabels = self._loadLabels(os.path.join(taskpath, 'labels.train'))

        valid1 = self.loadFile(os.path.join(taskpath, 's1.dev'))
        valid2 = self.loadFile(os.path.join(taskpath, 's2.dev'))
        validlabels = self._loadLabels(os.path.join(taskpath, 'labels.dev'))

        test1 = self.loadFile(os.path.join(taskpath, 's1.test'))
        test2 = self.loadFile(os.path.join(taskpath, 's2.test'))
        testlabels = self._loadLabels(os.path.join(taskpath, 'labels.test'))

        # Sort each split by (len(s2), len(s1), label) so that minibatches
        # group sentences of similar length, which speeds up encoding.
        train2, train1, trainlabels = self._sortSplit(train2, train1,
                                                      trainlabels)
        valid2, valid1, validlabels = self._sortSplit(valid2, valid1,
                                                      validlabels)
        test2, test1, testlabels = self._sortSplit(test2, test1, testlabels)

        self.samples = train1 + train2 + valid1 + valid2 + test1 + test2
        self.data = {'train': (train1, train2, trainlabels),
                     'valid': (valid1, valid2, validlabels),
                     'test': (test1, test2, testlabels)
                     }

    @staticmethod
    def _loadLabels(fpath):
        # Read one label file (one label string per line). Uses a context
        # manager so the handle is closed; the original left it to the GC.
        with io.open(fpath, encoding='utf-8') as f:
            return f.read().splitlines()

    @staticmethod
    def _sortSplit(s2, s1, labels):
        # Sort one split by (len(s2), len(s1), label); this triplicated
        # logic was previously inlined for each of train/valid/test.
        ordered = sorted(zip(s2, s1, labels),
                         key=lambda z: (len(z[0]), len(z[1]), z[2]))
        return map(list, zip(*ordered))

    def do_prepare(self, params, prepare):
        """Run the caller's prepare hook (e.g. vocab building) on all
        sentences of all splits."""
        return prepare(params, self.samples)

    def loadFile(self, fpath):
        """Read one sentence file: one whitespace-tokenized sentence per
        line, returned as a list of token lists."""
        # NOTE(review): sentence files are decoded as latin-1 while label
        # files are read as utf-8 -- kept as-is to preserve behavior.
        with codecs.open(fpath, 'rb', 'latin-1') as f:
            return [line.split() for line in
                    f.read().splitlines()]

    def run(self, params, batcher):
        """Encode all sentence pairs and train/evaluate the classifier.

        Pair features are [u; v; u*v; |u-v|] for embeddings u, v produced
        by ``batcher``. Returns dev/test accuracy and split sizes.
        """
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                                np.abs(enc1 - enc2))))
                # Log every 20000 batches. The original condition
                # (ii*batch_size) % (20000*batch_size) == 0 reduces to
                # ii % 20000 == 0, but ii advances in steps of batch_size,
                # so it fired at irregular intervals.
                if (ii // params.batch_size) % 20000 == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'cudaEfficient': True,
                  'nhid': params.nhid, 'noreg': True}

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                      .format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(self.data['valid'][0]),
                'ntest': len(self.data['test'][0])}
| |
|