| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | ''' |
| | SST - binary classification |
| | ''' |
| |
|
| | from __future__ import absolute_import, division, unicode_literals |
| |
|
| | import os |
| | import io |
| | import logging |
| | import numpy as np |
| |
|
| | from senteval.tools.validation import SplitClassifier |
| |
|
| |
|
| | class SSTEval(object): |
| | def __init__(self, task_path, nclasses=2, seed=1111): |
| | self.seed = seed |
| |
|
| | |
| | assert nclasses in [2, 5] |
| | self.nclasses = nclasses |
| | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' |
| | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) |
| |
|
| | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) |
| | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) |
| | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) |
| | self.sst_data = {'train': train, 'dev': dev, 'test': test} |
| |
|
| | def do_prepare(self, params, prepare): |
| | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ |
| | self.sst_data['test']['X'] |
| | return prepare(params, samples) |
| |
|
| | def loadFile(self, fpath): |
| | sst_data = {'X': [], 'y': []} |
| | with io.open(fpath, 'r', encoding='utf-8') as f: |
| | for line in f: |
| | if self.nclasses == 2: |
| | sample = line.strip().split('\t') |
| | sst_data['y'].append(int(sample[1])) |
| | sst_data['X'].append(sample[0].split()) |
| | elif self.nclasses == 5: |
| | sample = line.strip().split(' ', 1) |
| | sst_data['y'].append(int(sample[0])) |
| | sst_data['X'].append(sample[1].split()) |
| | assert max(sst_data['y']) == self.nclasses - 1 |
| | return sst_data |
| |
|
| | def run(self, params, batcher): |
| | sst_embed = {'train': {}, 'dev': {}, 'test': {}} |
| | bsize = params.batch_size |
| |
|
| | for key in self.sst_data: |
| | logging.info('Computing embedding for {0}'.format(key)) |
| | |
| | sorted_data = sorted(zip(self.sst_data[key]['X'], |
| | self.sst_data[key]['y']), |
| | key=lambda z: (len(z[0]), z[1])) |
| | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) |
| |
|
| | sst_embed[key]['X'] = [] |
| | for ii in range(0, len(self.sst_data[key]['y']), bsize): |
| | batch = self.sst_data[key]['X'][ii:ii + bsize] |
| | embeddings = batcher(params, batch) |
| | sst_embed[key]['X'].append(embeddings) |
| | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) |
| | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) |
| | logging.info('Computed {0} embeddings'.format(key)) |
| |
|
| | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, |
| | 'usepytorch': params.usepytorch, |
| | 'classifier': params.classifier} |
| |
|
| | clf = SplitClassifier(X={'train': sst_embed['train']['X'], |
| | 'valid': sst_embed['dev']['X'], |
| | 'test': sst_embed['test']['X']}, |
| | y={'train': sst_embed['train']['y'], |
| | 'valid': sst_embed['dev']['y'], |
| | 'test': sst_embed['test']['y']}, |
| | config=config_classifier) |
| |
|
| | devacc, testacc = clf.run() |
| | logging.debug('\nDev acc : {0} Test acc : {1} for \ |
| | SST {2} classification\n'.format(devacc, testacc, self.task_name)) |
| |
|
| | return {'devacc': devacc, 'acc': testacc, |
| | 'ndev': len(sst_embed['dev']['X']), |
| | 'ntest': len(sst_embed['test']['X'])} |
| |
|