import os
import sys

parentdir = os.path.dirname(__file__)
parentdir = os.path.dirname(parentdir)
parentdir = os.path.dirname(parentdir)
sys.path.append(parentdir)

import logging
import argparse
import os

from typing import Any, List, Tuple, Mapping
from collections import defaultdict
from numpy import random

import torch
import torch.nn as nn

import stanza

from stanza.models.common.utils import default_device
from stanza.models.lemma_classifier import utils
from stanza.models.lemma_classifier.base_model import LemmaClassifier
from stanza.models.lemma_classifier.lstm_model import LemmaClassifierLSTM
from stanza.models.lemma_classifier.transformer_model import LemmaClassifierWithTransformer
from stanza.utils.confusion import format_confusion
from stanza.utils.get_tqdm import get_tqdm

tqdm = get_tqdm()

logger = logging.getLogger('stanza.lemmaclassifier')


def get_weighted_f1(mcc_results: Mapping[int, Mapping[str, float]], confusion: Mapping[int, Mapping[int, int]]) -> float:
    """
    Computes the weighted F1 score across an evaluation set.

    Each class's F1 score is weighted by the number of examples of that class in the
    evaluation set, so classes with more examples have a larger impact on the weighted F1.
    """
    num_total_examples = 0
    weighted_f1 = 0

    for class_id in mcc_results:
        class_f1 = mcc_results.get(class_id).get("f1")
        num_class_examples = sum(confusion.get(class_id).values())
        weighted_f1 += class_f1 * num_class_examples
        num_total_examples += num_class_examples

    return weighted_f1 / num_total_examples

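# Worked example (hypothetical numbers): if class 0 has F1 0.50 over 10 examples
# and class 1 has F1 0.90 over 30 examples, the weighted F1 is
# (0.50 * 10 + 0.90 * 30) / 40 = 0.80.
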
def evaluate_sequences(gold_tag_sequences: List[Any], pred_tag_sequences: List[Any], label_decoder: Mapping, verbose=True):
    """
    Evaluates a model's predicted tags against a set of gold tags. Computes precision, recall, and F1 for all classes.

    Precision = true positives / (true positives + false positives)
    Recall = true positives / (true positives + false negatives)
    F1 = 2 * (Precision * Recall) / (Precision + Recall)

    Returns:
        1. Multi-class result dictionary, where each class is a key and maps to another map of its F1, precision, and recall scores.
           e.g. multiclass_results[0]["precision"] would give class 0's precision.
        2. Confusion matrix, where each key is a gold tag and its value is another map keyed by the predicted tag, with a value of that (gold, pred) count.
           e.g. confusion[0][1] = 6 would mean that for gold tag 0, the model predicted tag 1 a total of 6 times.
        3. Weighted F1 (float) across all classes, as computed by get_weighted_f1().
    """
    assert len(gold_tag_sequences) == len(pred_tag_sequences), \
        f"Length of gold tag sequences is {len(gold_tag_sequences)}, while length of predicted tag sequence is {len(pred_tag_sequences)}"

    confusion = defaultdict(lambda: defaultdict(int))

    reverse_label_decoder = {y: x for x, y in label_decoder.items()}
    for gold, pred in zip(gold_tag_sequences, pred_tag_sequences):
        confusion[reverse_label_decoder[gold]][reverse_label_decoder[pred]] += 1

    multi_class_result = defaultdict(lambda: defaultdict(float))

    for gold_tag in confusion.keys():

        try:
            prec = confusion.get(gold_tag, {}).get(gold_tag, 0) / sum([confusion.get(k, {}).get(gold_tag, 0) for k in confusion.keys()])
        except ZeroDivisionError:
            prec = 0.0

        try:
            recall = confusion.get(gold_tag, {}).get(gold_tag, 0) / sum(confusion.get(gold_tag, {}).values())
        except ZeroDivisionError:
            recall = 0.0

        try:
            f1 = 2 * (prec * recall) / (prec + recall)
        except ZeroDivisionError:
            f1 = 0.0

        multi_class_result[gold_tag] = {
            "precision": prec,
            "recall": recall,
            "f1": f1
        }

    if verbose:
        for lemma in multi_class_result:
            logger.info(f"Lemma '{lemma}' had precision {100 * multi_class_result[lemma]['precision']}, recall {100 * multi_class_result[lemma]['recall']} and F1 score of {100 * multi_class_result[lemma]['f1']}")

    weighted_f1 = get_weighted_f1(multi_class_result, confusion)

    return multi_class_result, confusion, weighted_f1

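# Illustrative (hypothetical) values: with label_decoder {"be": 0, "have": 1},
# gold tags [0, 0, 1] and predictions [0, 1, 1] yield the confusion matrix
# {"be": {"be": 1, "have": 1}, "have": {"have": 1}}, giving "be" precision 1.0,
# recall 0.5, F1 ~0.67 and "have" precision 0.5, recall 1.0, F1 ~0.67.
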
def model_predict(model: nn.Module, position_indices: torch.Tensor, sentences: List[List[str]], upos_tags: List[List[int]]=[]) -> torch.Tensor:
    """
    Runs a LemmaClassifierLSTM or LemmaClassifierWithTransformer over a batch of examples, given the position index of the target token in each sentence.

    Args:
        model (LemmaClassifier): A trained LemmaClassifier that is able to predict on a target token.
        position_indices (Tensor[int]): A tensor of the (zero-indexed) position of the target token in its sentence, for each example in the batch.
        sentences (List[List[str]]): A list of lists of the tokenized strings of the input sentences.
        upos_tags (List[List[int]], optional): The UPOS tag ids of the tokens in each input sentence, for models that use them.

    Returns:
        (torch.Tensor): The index of the predicted class in `model`'s output, for each example in the batch.
    """
    with torch.no_grad():
        logits = model(position_indices, sentences, upos_tags)
        predicted_class = torch.argmax(logits, dim=1)

    return predicted_class

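# Shape sketch (hypothetical values): for a batch of two sentences, position_indices
# might be tensor([3, 1]); the model returns logits of shape (2, num_classes), so the
# returned tensor has shape (2,), e.g. tensor([0, 2]), one class id per example.
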
def evaluate_model(model: nn.Module, eval_path: str, verbose: bool = True, is_training: bool = False) -> Tuple[Mapping, Mapping, float, float]:
    """
    Helper function for model evaluation

    Args:
        model (LemmaClassifierLSTM or LemmaClassifierWithTransformer): A trained instance of the LemmaClassifier class to evaluate.
        eval_path (str): Path to the saved evaluation dataset.
        verbose (bool, optional): True if `evaluate_sequences()` should print the F1, Precision, and Recall for each class. Defaults to True.
        is_training (bool, optional): Whether the model is in training mode. If the model is training, we do not change it to eval mode.

    Returns:
        1. Multi-class results (Mapping[int, Mapping[str, float]]): first map has keys as the classes (lemma indices) and value is
           another map with key of "f1", "precision", or "recall" with corresponding values.
        2. Confusion Matrix (Mapping[int, Mapping[int, int]]): A confusion matrix with keys equal to the index of the gold tag, and a value of the
           map with the key as the predicted tag and corresponding count of that (gold, pred) pair.
        3. Accuracy (float): the total accuracy (num correct / total examples) across the evaluation set.
        4. Weighted F1 (float): the class-frequency-weighted F1 across the evaluation set.
    """
    device = default_device()
    model.to(device)

    if not is_training:
        # switch to eval mode unless evaluation is happening mid-training
        model.eval()

    dataset = utils.Dataset(eval_path, label_decoder=model.label_decoder, shuffle=False)

    logger.info(f"Evaluating on evaluation file {eval_path}")

    correct, total = 0, 0
    gold_tags, pred_tags = dataset.labels, []

    for sentences, pos_indices, upos_tags, labels in tqdm(dataset, "Evaluating examples from data file"):
        pred = model_predict(model, pos_indices, sentences, upos_tags)
        correct_preds = pred == labels.to(device)
        correct += torch.sum(correct_preds)
        total += len(correct_preds)
        pred_tags += pred.tolist()

    logger.info("Finished evaluating on dataset. Computing scores...")
    accuracy = correct / total

    mc_results, confusion, weighted_f1 = evaluate_sequences(gold_tags, pred_tags, dataset.label_decoder, verbose=verbose)

    if verbose:
        logger.info(f"Accuracy: {accuracy} ({correct}/{total})")
        logger.info(f"Label decoder: {dataset.label_decoder}")

    return mc_results, confusion, accuracy, weighted_f1

def main(args=None, predefined_args=None):

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_size", type=int, default=10000, help="Number of tokens in vocab")
    parser.add_argument("--embedding_dim", type=int, default=100, help="Number of dimensions in word embeddings (currently using GloVe)")
    parser.add_argument("--hidden_dim", type=int, default=256, help="Size of hidden layer")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
    parser.add_argument("--charlm", action='store_true', default=False, help="Whether or not to use the charlm embeddings")
    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
    parser.add_argument("--charlm_forward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_forward.pt"), help="Path to forward charlm file")
    parser.add_argument("--charlm_backward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_backwards.pt"), help="Path to backward charlm file")
    parser.add_argument("--save_name", type=str, default=os.path.join(os.path.dirname(__file__), "saved_models", "lemma_classifier_model.pt"), help="Path to model save file")
    parser.add_argument("--model_type", type=str, default="roberta", help="Which model type to use ('bert', 'roberta', or 'lstm')")
    parser.add_argument("--bert_model", type=str, default=None, help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument("--eval_file", type=str, help="path to evaluation file")

    args = parser.parse_args(args) if not predefined_args else predefined_args

    logger.info("Running evaluation script with the following args:")
    args = vars(args)
    for arg in args:
        logger.info(f"{arg}: {args[arg]}")
    logger.info("------------------------------------------------------------")

    logger.info(f"Attempting evaluation of model from {args['save_name']} on file {args['eval_file']}")
    model = LemmaClassifier.load(args['save_name'], args)

    mcc_results, confusion, acc, weighted_f1 = evaluate_model(model, args['eval_file'])

    logger.info(f"MCC Results: {dict(mcc_results)}")
    logger.info("______________________________________________")
    logger.info("Confusion:\n%s", format_confusion(confusion))
    logger.info("______________________________________________")
    logger.info(f"Accuracy: {acc}")
    logger.info("______________________________________________")
    logger.info(f"Weighted f1: {weighted_f1}")

    return mcc_results, confusion, acc, weighted_f1

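# Example invocation (the file name and data paths here are hypothetical):
#   python evaluate_models.py --save_name saved_models/lemma_classifier_model.pt --eval_file data/en_ewt_test.txt
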
if __name__ == "__main__":
    main()