| | """ |
| | Validation and classification |
| | (train) : inner-kfold classifier |
| | (train, test) : kfold classifier |
| | (train, dev, test) : split classifier |
| | |
| | """ |
| | from __future__ import absolute_import, division, unicode_literals |
| |
|
| | import logging |
| | import numpy as np |
| | from senteval.tools.classifier import MLP |
| |
|
import sklearn
# Compare parsed (major, minor) tuples rather than raw strings: lexicographic
# string comparison misorders versions (e.g. "0.9" >= "0.18.0" is True).
# Assumes numeric major/minor components, which holds for sklearn releases.
assert tuple(map(int, sklearn.__version__.split('.')[:2])) >= (0, 18), \
    "need to update sklearn to version >= 0.18.0"
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def get_classif_name(classifier_config, usepytorch):
    if not usepytorch:
        modelname = 'sklearn-LogReg'
    else:
        nhid = classifier_config['nhid']
        optim = classifier_config.get('optim', 'adam')
        bs = classifier_config.get('batch_size', 64)
        modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)
    return modelname

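# Illustrative only (not part of the original module): with the defaults
# above, the model names come out as
#   get_classif_name({'nhid': 0}, usepytorch=True)   -> 'pytorch-MLP-nhid0-adam-bs64'
#   get_classif_name({'nhid': 0}, usepytorch=False)  -> 'sklearn-LogReg'
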
class InnerKFoldClassifier(object):
    """
    (train) split classifier : InnerKFold.
    """
    def __init__(self, X, y, config):
        self.X = X
        self.y = y
        self.featdim = X.shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.devresults = []
        self.testresults = []
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
        self.k = config.get('kfold', 5)
    def run(self):
        logging.info('Training {0} with (inner) {1}-fold cross-validation'
                     .format(self.modelname, self.k))

        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)
        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
                                   random_state=1111)
        count = 0
        for train_idx, test_idx in skf.split(self.X, self.y):
            count += 1
            X_train, X_test = self.X[train_idx], self.X[test_idx]
            y_train, y_test = self.y[train_idx], self.y[test_idx]
            # Inner loop: select the l2 regularization strength using only
            # the training portion of this outer fold.
            scores = []
            for reg in regs:
                regscores = []
                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
                    if self.usepytorch:
                        clf = MLP(self.classifier_config, inputdim=self.featdim,
                                  nclasses=self.nclasses, l2reg=reg,
                                  seed=self.seed)
                        clf.fit(X_in_train, y_in_train,
                                validation_data=(X_in_test, y_in_test))
                    else:
                        clf = LogisticRegression(C=reg, random_state=self.seed)
                        clf.fit(X_in_train, y_in_train)
                    regscores.append(clf.score(X_in_test, y_in_test))
                scores.append(round(100*np.mean(regscores), 2))
            optreg = regs[np.argmax(scores)]
            logging.info('Best param found at split {0}: l2reg = {1} with score {2}'
                         .format(count, optreg, np.max(scores)))
            self.devresults.append(np.max(scores))

            # Retrain on the full outer-fold training set with the best
            # regularization, then score on the held-out outer test fold.
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=optreg,
                          seed=self.seed)
                clf.fit(X_train, y_train, validation_split=0.05)
            else:
                clf = LogisticRegression(C=optreg, random_state=self.seed)
                clf.fit(X_train, y_train)
            self.testresults.append(round(100*clf.score(X_test, y_test), 2))

        devaccuracy = round(np.mean(self.devresults), 2)
        testaccuracy = round(np.mean(self.testresults), 2)
        return devaccuracy, testaccuracy

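# A minimal usage sketch (illustrative; the names `embeddings` and `labels`
# are not from the module). `embeddings` is an (n_samples, featdim) float
# array and `labels` an (n_samples,) integer array:
#
#   config = {'nclasses': 2, 'seed': 1111, 'usepytorch': False,
#             'kfold': 5, 'classifier': {'nhid': 0}}
#   devacc, testacc = InnerKFoldClassifier(embeddings, labels, config).run()
#
# devacc averages the best inner-CV score per outer split; testacc averages
# the outer-fold accuracies (both in %).
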
class KFoldClassifier(object):
    """
    (train, test) split classifier : cross-validation on train.
    """
    def __init__(self, train, test, config):
        self.train = train
        self.test = test
        self.featdim = self.train['X'].shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
        self.k = config.get('kfold', 5)

    def run(self):
        logging.info('Training {0} with {1}-fold cross-validation'
                     .format(self.modelname, self.k))
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-1, 6, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
                              random_state=self.seed)
        scores = []

        # Cross-validate on the training set to pick the l2 regularization.
        for reg in regs:
            scanscores = []
            for train_idx, test_idx in skf.split(self.train['X'],
                                                 self.train['y']):
                X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]
                X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]
                if self.usepytorch:
                    clf = MLP(self.classifier_config, inputdim=self.featdim,
                              nclasses=self.nclasses, l2reg=reg,
                              seed=self.seed)
                    clf.fit(X_train, y_train, validation_data=(X_test, y_test))
                else:
                    clf = LogisticRegression(C=reg, random_state=self.seed)
                    clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)
                scanscores.append(score)
            scores.append(round(100*np.mean(scanscores), 2))

        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Cross-validation : best param found is reg = {0} with score {1}'
                     .format(optreg, devaccuracy))

        # Retrain on the full training set with the best regularization,
        # then evaluate once on the held-out test set.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed)
            clf.fit(self.train['X'], self.train['y'], validation_split=0.05)
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.train['X'], self.train['y'])
        yhat = clf.predict(self.test['X'])

        testaccuracy = clf.score(self.test['X'], self.test['y'])
        testaccuracy = round(100*testaccuracy, 2)
        return devaccuracy, testaccuracy, yhat

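# A minimal usage sketch (illustrative; the dict layout mirrors what
# __init__ reads): train/test are {'X': array, 'y': array} dicts, and
# run() additionally returns the test-set predictions.
#
#   train = {'X': X_tr, 'y': y_tr}
#   test = {'X': X_te, 'y': y_te}
#   config = {'nclasses': 2, 'seed': 1111, 'usepytorch': False,
#             'kfold': 5, 'classifier': {'nhid': 0}}
#   devacc, testacc, yhat = KFoldClassifier(train, test, config).run()
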
class SplitClassifier(object):
    """
    (train, valid, test) split classifier.
    """
    def __init__(self, X, y, config):
        self.X = X
        self.y = y
        self.nclasses = config['nclasses']
        self.featdim = self.X['train'].shape[1]
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.cudaEfficient = config.get('cudaEfficient', False)
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
        self.noreg = config.get('noreg', False)
        self.config = config

    def run(self):
        logging.info('Training {0} with standard validation..'
                     .format(self.modelname))
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            # Effectively disable regularization: near-zero l2 weight for
            # the MLP, near-infinite C (inverse regularization) for sklearn.
            regs = [1e-9 if self.usepytorch else 1e9]

        # Select the l2 regularization on the fixed validation split.
        scores = []
        for reg in regs:
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=reg,
                          seed=self.seed, cudaEfficient=self.cudaEfficient)
                clf.fit(self.X['train'], self.y['train'],
                        validation_data=(self.X['valid'], self.y['valid']))
            else:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
            scores.append(round(100*clf.score(self.X['valid'],
                                self.y['valid']), 2))
        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Validation : best param found is reg = {0} with score {1}'
                     .format(optreg, devaccuracy))

        # Retrain with the best regularization and evaluate on the test set.
        # (The stray LogisticRegression created here in the original was dead
        # code: both branches below rebind clf before it was ever used.)
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed, cudaEfficient=self.cudaEfficient)
            clf.fit(self.X['train'], self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.X['train'], self.y['train'])

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100*testaccuracy, 2)
        return devaccuracy, testaccuracy
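
# Standalone smoke test: a hedged sketch, not part of the original module.
# It exercises only the sklearn path (usepytorch=False) on synthetic data;
# the MLP import at the top must still resolve (senteval on PYTHONPATH).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    rng = np.random.RandomState(1111)
    n, dim = 200, 16
    X_all = rng.randn(n, dim)
    y_all = (X_all[:, 0] > 0).astype(int)  # linearly separable toy labels
    X = {'train': X_all[:120], 'valid': X_all[120:160], 'test': X_all[160:]}
    y = {'train': y_all[:120], 'valid': y_all[120:160], 'test': y_all[160:]}
    config = {'nclasses': 2, 'seed': 1111, 'usepytorch': False,
              'classifier': {'nhid': 0}}
    print(SplitClassifier(X, y, config).run())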