| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | ''' |
| | TREC question-type classification |
| | ''' |
| |
|
| | from __future__ import absolute_import, division, unicode_literals |
| |
|
| | import os |
| | import io |
| | import logging |
| | import numpy as np |
| |
|
| | from senteval.tools.validation import KFoldClassifier |
| |
|
| |
|
| | class TRECEval(object): |
| | def __init__(self, task_path, seed=1111): |
| | logging.info('***** Transfer task : TREC *****\n\n') |
| | self.seed = seed |
| | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) |
| | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) |
| |
|
| | def do_prepare(self, params, prepare): |
| | samples = self.train['X'] + self.test['X'] |
| | return prepare(params, samples) |
| |
|
| | def loadFile(self, fpath): |
| | trec_data = {'X': [], 'y': []} |
| | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, |
| | 'HUM': 3, 'LOC': 4, 'NUM': 5} |
| | with io.open(fpath, 'r', encoding='latin-1') as f: |
| | for line in f: |
| | target, sample = line.strip().split(':', 1) |
| | sample = sample.split(' ', 1)[1].split() |
| | assert target in tgt2idx, target |
| | trec_data['X'].append(sample) |
| | trec_data['y'].append(tgt2idx[target]) |
| | return trec_data |
| |
|
| | def run(self, params, batcher): |
| | train_embeddings, test_embeddings = [], [] |
| |
|
| | |
| | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), |
| | key=lambda z: (len(z[0]), z[1])) |
| | train_samples = [x for (x, y) in sorted_corpus_train] |
| | train_labels = [y for (x, y) in sorted_corpus_train] |
| |
|
| | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), |
| | key=lambda z: (len(z[0]), z[1])) |
| | test_samples = [x for (x, y) in sorted_corpus_test] |
| | test_labels = [y for (x, y) in sorted_corpus_test] |
| |
|
| | |
| | for ii in range(0, len(train_labels), params.batch_size): |
| | batch = train_samples[ii:ii + params.batch_size] |
| | embeddings = batcher(params, batch) |
| | train_embeddings.append(embeddings) |
| | train_embeddings = np.vstack(train_embeddings) |
| | logging.info('Computed train embeddings') |
| |
|
| | |
| | for ii in range(0, len(test_labels), params.batch_size): |
| | batch = test_samples[ii:ii + params.batch_size] |
| | embeddings = batcher(params, batch) |
| | test_embeddings.append(embeddings) |
| | test_embeddings = np.vstack(test_embeddings) |
| | logging.info('Computed test embeddings') |
| |
|
| | config_classifier = {'nclasses': 6, 'seed': self.seed, |
| | 'usepytorch': params.usepytorch, |
| | 'classifier': params.classifier, |
| | 'kfold': params.kfold} |
| | clf = KFoldClassifier({'X': train_embeddings, |
| | 'y': np.array(train_labels)}, |
| | {'X': test_embeddings, |
| | 'y': np.array(test_labels)}, |
| | config_classifier) |
| | devacc, testacc, _ = clf.run() |
| | logging.debug('\nDev acc : {0} Test acc : {1} \ |
| | for TREC\n'.format(devacc, testacc)) |
| | return {'devacc': devacc, 'acc': testacc, |
| | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} |
| |
|