"""
Evaluation helpers and command-line entry point for lemma classifier models.

Provides per-class precision / recall / F1 computation, a weighted F1 score,
batch prediction, and a `main()` which loads a saved model and evaluates it
on an evaluation file.
"""

import os
import sys

# Add the directory two levels above this file's directory to sys.path
# (presumably so that the `stanza` package resolves when run as a script).
parentdir = os.path.dirname(__file__)
parentdir = os.path.dirname(parentdir)
parentdir = os.path.dirname(parentdir)
sys.path.append(parentdir)

import logging
import argparse
import os

from typing import Any, List, Optional, Tuple, Mapping
from collections import defaultdict
from numpy import random

import torch
import torch.nn as nn

import stanza

from stanza.models.common.utils import default_device
from stanza.models.lemma_classifier import utils
from stanza.models.lemma_classifier.base_model import LemmaClassifier
from stanza.models.lemma_classifier.lstm_model import LemmaClassifierLSTM
from stanza.models.lemma_classifier.transformer_model import LemmaClassifierWithTransformer
from stanza.utils.confusion import format_confusion
from stanza.utils.get_tqdm import get_tqdm

tqdm = get_tqdm()

logger = logging.getLogger('stanza.lemmaclassifier')


def get_weighted_f1(mcc_results: Mapping[int, Mapping[str, float]], confusion: Mapping[int, Mapping[int, int]]) -> float:
    """
    Computes the weighted F1 score across an evaluation set.

    The weight of a class's F1 score is equal to the number of gold examples
    of that class in the evaluation set (the sum of that class's row in
    `confusion`).  Classes with more examples therefore have more impact on
    the weighted F1.

    Args:
        mcc_results: Map from class to a map containing at least the key "f1".
        confusion: Map from gold class to a map of predicted class -> count.

    Returns:
        The example-weighted average of the per-class F1 scores, or 0.0 if
        there are no examples at all.
    """
    num_total_examples = 0
    weighted_f1 = 0.0
    for class_id, class_scores in mcc_results.items():
        # .get() rather than indexing so a defaultdict confusion matrix is not mutated
        num_class_examples = sum(confusion.get(class_id, {}).values())
        weighted_f1 += class_scores["f1"] * num_class_examples
        num_total_examples += num_class_examples

    # An empty evaluation set has no meaningful F1; avoid dividing by zero
    if num_total_examples == 0:
        return 0.0
    return weighted_f1 / num_total_examples


def evaluate_sequences(gold_tag_sequences: List[Any], pred_tag_sequences: List[Any], label_decoder: Mapping, verbose=True):
    """
    Evaluates a model's predicted tags against a set of gold tags.
    Computes precision, recall, and f1 for all classes.

    Precision = true positives / (true positives + false positives)
    Recall = true positives / (true positives + false negatives)
    F1 = 2 * (Precision * Recall) / (Precision + Recall)

    Any of the three scores is reported as 0.0 when its denominator is 0.

    Tags are translated through the inverse of `label_decoder` before being
    counted, so the keys of both returned maps are the keys of `label_decoder`.
    Only classes which occur at least once as a gold tag get an entry in the
    results.

    Args:
        gold_tag_sequences: The gold tags, one per example.
        pred_tag_sequences: The predicted tags, aligned with the gold tags.
        label_decoder: Map whose values are the tags used in the sequences.
        verbose: If True, log the precision, recall and F1 of each class.

    Returns:
        1. Multi class result dictionary, where each class is a key and maps to another map of its
           F1, precision, and recall scores.
           e.g. multiclass_results[c]["precision"] would give class c's precision.
        2. Confusion matrix, where each key is a gold tag and its value is another map with a key of
           the predicted tag with value of that (gold, pred) count.
           e.g. confusion[a][b] = 6 would mean that for gold tag a, the model predicted tag b a total of 6 times.
        3. The weighted F1 score, as computed by `get_weighted_f1`.
    """
    assert len(gold_tag_sequences) == len(pred_tag_sequences), \
        f"Length of gold tag sequences is {len(gold_tag_sequences)}, while length of predicted tag sequence is {len(pred_tag_sequences)}"

    confusion = defaultdict(lambda: defaultdict(int))

    reverse_label_decoder = {y: x for x, y in label_decoder.items()}
    for gold, pred in zip(gold_tag_sequences, pred_tag_sequences):
        confusion[reverse_label_decoder[gold]][reverse_label_decoder[pred]] += 1

    multi_class_result = defaultdict(lambda: defaultdict(float))
    # compute precision, recall and f1 for each class and store inside of `multi_class_result`
    for gold_tag, gold_row in confusion.items():
        # .get() is used throughout so that the nested defaultdicts do not grow new entries
        true_positives = gold_row.get(gold_tag, 0)
        # column sum: how many times this tag was predicted, regardless of the gold tag
        num_predicted = sum(row.get(gold_tag, 0) for row in confusion.values())
        # row sum: how many examples have this gold tag
        num_gold = sum(gold_row.values())

        prec = true_positives / num_predicted if num_predicted else 0.0
        recall = true_positives / num_gold if num_gold else 0.0
        f1 = 2 * (prec * recall) / (prec + recall) if (prec + recall) else 0.0

        multi_class_result[gold_tag] = {
            "precision": prec,
            "recall": recall,
            "f1": f1
        }

    if verbose:
        for lemma in multi_class_result:
            logger.info(f"Lemma '{lemma}' had precision {100 * multi_class_result[lemma]['precision']}, recall {100 * multi_class_result[lemma]['recall']} and F1 score of {100 * multi_class_result[lemma]['f1']}")

    weighted_f1 = get_weighted_f1(multi_class_result, confusion)

    return multi_class_result, confusion, weighted_f1


def model_predict(model: nn.Module, position_indices: torch.Tensor, sentences: List[List[str]], upos_tags: Optional[List[List[int]]] = None) -> torch.Tensor:
    """
    Runs `model` on a batch of examples without tracking gradients and returns
    the highest scoring class for each example.

    Args:
        model (LemmaClassifier): A trained LemmaClassifier that is able to predict on a target token.
        position_indices (Tensor[int]): A tensor of the (zero-indexed) position of the target token in each sentence of the batch.
        sentences (List[List[str]]): A list of lists of the tokenized strings of the input sentences.
        upos_tags (List[List[int]], optional): UPOS tags passed through to the model.  Defaults to an empty list.

    Returns:
        (Tensor): The index of the predicted class in `model`'s output for each example in the batch.
    """
    # Use None as the default instead of a shared mutable list
    if upos_tags is None:
        upos_tags = []

    with torch.no_grad():
        logits = model(position_indices, sentences, upos_tags)  # should be size (batch_size, output_size)
        predicted_class = torch.argmax(logits, dim=1)  # argmax over dim 1 removes that dim: should be size (batch_size, )

    return predicted_class


def evaluate_model(model: nn.Module, eval_path: str, verbose: bool = True, is_training: bool = False) -> Tuple[Mapping, Mapping, float, float]:
    """
    Helper function for model evaluation

    Moves `model` to the default device, reads the dataset at `eval_path`
    using the model's label decoder, predicts each batch, and scores the
    predictions against the dataset's labels.

    Args:
        model (LemmaClassifierLSTM or LemmaClassifierWithTransformer): The model to evaluate.  It must provide a `label_decoder`.
        eval_path (str): Path to the saved evaluation dataset.
        verbose (bool, optional): True if `evaluate_sequences()` should log the F1, Precision, and Recall for each class,
                                  and the accuracy and label decoder should be logged.  Defaults to True.
        is_training (bool, optional): Whether the model is in training mode. If the model is training, we do not change it to eval mode.

    Returns:
        1. Multi-class results: first map has keys as the classes and value is another map with key of "f1", "precision",
           or "recall" with corresponding values.
        2. Confusion Matrix: a map with keys of the gold class, and a value of the map with the key as the predicted class
           and corresponding count of that (gold, pred) pair.
        3. Accuracy (float): the total accuracy (num correct / total examples) across the evaluation set, or 0.0 if
           there were no examples.
        4. Weighted F1 (float): the per-class F1 scores averaged with weights equal to the number of gold examples of each class.
    """
    # move the model to the device used for evaluation
    device = default_device()
    model.to(device)

    if not is_training:
        model.eval()  # set to eval mode

    # load in eval data
    dataset = utils.Dataset(eval_path, label_decoder=model.label_decoder, shuffle=False)

    logger.info(f"Evaluating on evaluation file {eval_path}")

    correct, total = 0, 0
    gold_tags, pred_tags = dataset.labels, []

    # run eval on each batch from dataset
    for sentences, pos_indices, upos_tags, labels in tqdm(dataset, "Evaluating examples from data file"):
        pred = model_predict(model, pos_indices, sentences, upos_tags)  # Pred should be size (batch_size, )
        correct_preds = pred == labels.to(device)
        # .item() keeps `correct` a plain Python int, so `accuracy` is a float rather than a tensor
        correct += int(torch.sum(correct_preds).item())
        total += len(correct_preds)
        pred_tags += pred.tolist()

    logger.info("Finished evaluating on dataset. Computing scores...")
    # guard against an evaluation set with no examples
    accuracy = correct / total if total else 0.0

    mc_results, confusion, weighted_f1 = evaluate_sequences(gold_tags, pred_tags, dataset.label_decoder, verbose=verbose)
    if verbose:
        logger.info(f"Accuracy: {accuracy} ({correct}/{total})")
        logger.info(f"Label decoder: {dataset.label_decoder}")

    return mc_results, confusion, accuracy, weighted_f1


def main(args=None, predefined_args=None):
    """
    Loads a saved lemma classifier and evaluates it on an evaluation file.

    Args:
        args: Command line arguments to parse.  If None, argparse reads them from sys.argv.
        predefined_args: If given (and truthy), used in place of the parsed arguments.
                         It is passed to `vars()`, so it needs to be a namespace-like object.

    Returns:
        The multi-class results, confusion matrix, accuracy and weighted F1
        returned by `evaluate_model`.
    """
    # TODO: can unify this script with train_lstm_model.py?
    # TODO: can save the model type in the model .pt, then
    # automatically figure out what type of model we are using by
    # looking in the file
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_size", type=int, default=10000, help="Number of tokens in vocab")
    parser.add_argument("--embedding_dim", type=int, default=100, help="Number of dimensions in word embeddings (currently using GloVe)")
    parser.add_argument("--hidden_dim", type=int, default=256, help="Size of hidden layer")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
    parser.add_argument("--charlm", action='store_true', default=False, help="Whether not to use the charlm embeddings")
    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
    parser.add_argument("--charlm_forward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_forward.pt"), help="Path to forward charlm file")
    parser.add_argument("--charlm_backward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_backwards.pt"), help="Path to backward charlm file")
    parser.add_argument("--save_name", type=str, default=os.path.join(os.path.dirname(__file__), "saved_models", "lemma_classifier_model.pt"), help="Path to model save file")
    parser.add_argument("--model_type", type=str, default="roberta", help="Which transformer to use ('bert' or 'roberta' or 'lstm')")
    parser.add_argument("--bert_model", type=str, default=None, help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument("--eval_file", type=str, help="path to evaluation file")

    args = parser.parse_args(args) if not predefined_args else predefined_args

    # this is the evaluation script, not the training script
    logger.info("Running evaluation script with the following args:")
    args = vars(args)
    for arg in args:
        logger.info(f"{arg}: {args[arg]}")
    logger.info("------------------------------------------------------------")

    logger.info(f"Attempting evaluation of model from {args['save_name']} on file {args['eval_file']}")
    model = LemmaClassifier.load(args['save_name'], args)

    mcc_results, confusion, acc, weighted_f1 = evaluate_model(model, args['eval_file'])

    logger.info(f"MCC Results: {dict(mcc_results)}")
    logger.info("______________________________________________")
    logger.info("Confusion:\n%s", format_confusion(confusion))
    logger.info("______________________________________________")
    logger.info(f"Accuracy: {acc}")
    logger.info("______________________________________________")
    logger.info(f"Weighted f1: {weighted_f1}")

    return mcc_results, confusion, acc, weighted_f1


if __name__ == "__main__":
    main()