# (upload-page metadata, commented out so the module parses)
# nvan13's picture
# Add files using upload-large-folder tool
# f4dcc30 verified
#
# Author: penhe@microsoft.com
# Date: 01/25/2019
#
import os
import csv
import copy
from collections import OrderedDict,defaultdict,Counter
from collections.abc import Sequence
import numpy as np
from ...utils import get_logger
from ...utils import xtqdm as tqdm
from ...data import example_to_feature
from .metrics import *
from ..models import SequenceClassificationModel
logger=get_logger()
__all__ = ['EvalData', 'Task']
class EvalData:
    """Bundle an evaluation dataset with its metric and prediction hooks.

    Attributes:
        name: identifier of this evaluation split.
        data: the example list passed in as ``examples``.
        ignore_metric: when True, metrics for this split are not reported.
        critial_metrics: metric names considered critical (spelling kept for
            backward compatibility with existing callers).
        metrics_fn: ``fn(logits, labels) -> OrderedDict`` of metric values;
            defaults to plain accuracy.
        predict_fn: ``fn(logits, output_dir, name, prefix)`` that dumps
            predictions; defaults to a two-column TSV submission file.
    """

    def __init__(self, name, examples, metrics_fn=None, predict_fn=None, ignore_metric=False, critial_metrics=None):
        # Default metric: accuracy over argmax predictions.
        def _default_metrics(logits, labels):
            return OrderedDict(accuracy=metric_accuracy(logits, labels))

        # Default prediction dump: GLUE-style "index<TAB>prediction" TSV.
        def _default_predict(logits, output_dir, name, prefix):
            out_path = os.path.join(output_dir, 'submit-{}-{}.tsv'.format(name, prefix))
            predictions = np.argmax(logits, axis=-1)
            with open(out_path, 'w', encoding='utf-8') as out:
                out.write('index\tpredictions\n')
                for idx, pred in enumerate(predictions):
                    out.write('{}\t{}\n'.format(idx, pred))

        self.name = name
        self.data = examples
        self.ignore_metric = ignore_metric
        self.critial_metrics = critial_metrics
        self.metrics_fn = _default_metrics if metrics_fn is None else metrics_fn
        self.predict_fn = _default_predict if predict_fn is None else predict_fn

    def __repr__(self):
        return f'{self.name}, {type(self.data)}: {len(self.data)}, {self.predict_fn}, {self.metrics_fn}'
class Task():
    """Base class for a fine-tuning task (data loading, metrics, prediction).

    Subclasses are expected to override ``train_data``/``eval_data``/
    ``test_data`` and ``get_labels``; the remaining hooks have usable
    defaults or return ``None`` to signal "use the framework default".
    """

    # Task-level metadata; NOTE(review): mutable class attribute shared by
    # all subclasses unless a subclass shadows it with its own dict.
    _meta = {}

    def __init__(self, tokenizer, args, **kwargs):
        """Store the tokenizer and parsed run arguments for later use."""
        self.tokenizer = tokenizer
        self.args = args

    def eval_data(self, **kwargs):
        """Return the evaluation datasets. Subclasses must override."""
        raise NotImplementedError('eval_data method not implemented yet.')

    def train_data(self, **kwargs):
        """Return the training dataset. Subclasses must override."""
        # Fixed copy-paste bug: the message previously named eval_data.
        raise NotImplementedError('train_data method not implemented yet.')

    def test_data(self, **kwargs):
        """Return the test datasets. Subclasses must override."""
        # Fixed copy-paste bug: the message previously named eval_data.
        raise NotImplementedError('test_data method not implemented yet.')

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def label2id(self, labelstr):
        """Map a label string to its index in ``get_labels()``; -1 if unknown."""
        label_dict = {label: i for i, label in enumerate(self.get_labels())}
        return label_dict.get(labelstr, -1)

    def get_train_fn(self, *args, **kwargs):
        """Optional custom training function; None means use the default."""
        return None

    def get_eval_fn(self, *args, **kwargs):
        """Optional custom evaluation function; None means use the default."""
        return None

    def get_pred_fn(self, *args, **kwargs):
        """Optional custom prediction function; None means use the default."""
        return None

    def get_loss_fn(self, *args, **kwargs):
        """Optional custom loss function; None means use the default."""
        return None

    def get_metrics_fn(self):
        """Calculate metrics based on prediction results."""
        def metrics_fn(logits, labels):
            return OrderedDict(accuracy= metric_accuracy(logits, labels))
        return metrics_fn

    def get_predict_fn(self):
        """Build a function that writes label predictions to a TSV submission file."""
        def predict_fn(logits, output_dir, name, prefix):
            output = os.path.join(output_dir, 'submit-{}-{}.tsv'.format(name, prefix))
            preds = np.argmax(logits, axis=-1)
            labels = self.get_labels()
            with open(output, 'w', encoding='utf-8') as fs:
                fs.write('index\tpredictions\n')
                for i, p in enumerate(preds):
                    # Emit the label string, not the raw class index.
                    fs.write('{}\t{}\n'.format(i, labels[p]))
        return predict_fn

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file into a list of row lists."""
        with open(input_file, "r", encoding='utf-8') as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return list(reader)

    def get_feature_fn(self, max_seq_len=512, mask_gen=None, label_type='int', training=False):
        """Return a closure converting one example to model features.

        Args:
          max_seq_len: maximum token length of the encoded sequence.
          mask_gen: optional mask generator forwarded to example_to_feature.
          label_type: label dtype tag forwarded to example_to_feature.
          training: accepted for interface compatibility; not used here.
        """
        tokenizer = self.tokenizer
        def _example_to_feature(example, rng=None, ext_params=None, **kwargs):
            return example_to_feature(tokenizer, example, max_seq_len=max_seq_len,
                rng=rng, mask_generator=mask_gen, ext_params=ext_params, label_type=label_type, **kwargs)
        return _example_to_feature

    def get_model_class_fn(self):
        """Return the factory used to load the model for this task."""
        return SequenceClassificationModel.load_model

    @classmethod
    def add_arguments(cls, parser):
        """Add task specific arguments
        e.g. parser.add_argument('--data_dir', type=str, help='The path of data directory.')
        """
        parser.add_argument('--task_example_arg', type=str, default=None, help='An example task specific argument')
        return parser