Spaces:
Running
Running
| import os | |
| import logging | |
| import pickle | |
| import time | |
| import json | |
| import torch | |
| from os import path | |
| from collections import OrderedDict, Counter | |
| from coref_utils.metrics import CorefEvaluator, F1Evaluator | |
| from coref_utils.conll import evaluate_conll | |
| from coref_utils.utils import get_mention_to_cluster, is_aligned, filter_clusters | |
| from model.utils import action_sequences_to_clusters | |
| from model.entity_ranking_model import EntityRankingModel | |
| from omegaconf import DictConfig | |
| from typing import Dict | |
| from torch import Tensor | |
| from collections import defaultdict | |
| import time | |
# Module-level logging setup: timestamped INFO messages on the root logger.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logger = logging.getLogger()
def get_log_file_name(
    config,
    dataset,
    teacher_force,
    gold_mentions,
    split,
    _iter,
):
    """Build the paths of the per-example log files for one evaluation run.

    Creates the log directory if needed and encodes the evaluation settings
    (gold-mention training/eval, teacher forcing, external mentions) into the
    file name so different runs never overwrite each other.

    Args:
        config: Experiment configuration. Reads ``config.paths.model_dir``,
            ``config.model.mention_params`` and the optional ``log_dir_add``.
        dataset: Name of the coreference dataset; used as a subdirectory.
        teacher_force: Whether this is a teacher-forced evaluation.
        gold_mentions: Whether gold mentions are supplied during evaluation.
        split: Partition of the dataset - train/dev/test.
        _iter: Iteration tag embedded in the file name ("" for final eval).

    Returns:
        Tuple ``(log_file, log_file_link)``: paths of the ``.log.jsonl``
        and ``.link.jsonl`` files.
    """
    log_dir = path.join(config.paths.model_dir, dataset)
    # Used for special experiments where we want to save logs in a different directory
    if config.get("log_dir_add", None) is not None:
        log_dir = path.join(log_dir, config.log_dir_add)
    # exist_ok avoids the race between a separate existence check and creation
    os.makedirs(log_dir, exist_ok=True)

    # Mode where the model was trained with golden mentions
    gold_ment_str = "_gold" if config.model.mention_params.use_gold_ments else ""
    # Teacher-forced evaluation
    tf_str = "_tf" if teacher_force else ""
    # Golden mentions supplied at evaluation time
    gold_str = "_gold(eval)" if gold_mentions else ""
    # External mention evaluation
    ext_ment_str = "_ext_ment" if config.model.mention_params.ext_ment else ""

    # Single stem keeps the .log.jsonl and .link.jsonl names in sync
    stem = split + gold_ment_str + gold_str + tf_str + _iter + ext_ment_str
    log_file = path.join(log_dir, stem + ".log.jsonl")
    log_file_link = path.join(log_dir, stem + ".link.jsonl")
    print("Log file: ", log_file)
    return log_file, log_file_link
def get_logs(example, raw_predicted_clusters, coref_scores):
    """Assemble a JSON-serializable log record for one document.

    Copies ``example``, attaches the predicted clusters and coref scores,
    then strips the tensorized sentence and every remaining tensor-valued
    entry (tensors cannot be serialized by ``json``). The input dict itself
    is never mutated.
    """
    record = dict(example)
    record["predicted_clusters"] = raw_predicted_clusters
    record["coref_scores"] = coref_scores
    del record["tensorized_sent"]
    tensor_keys = [key for key, value in record.items() if isinstance(value, Tensor)]
    for key in tensor_keys:
        del record[key]
    return record
def full_coref_evaluation(
    config: DictConfig,
    model: EntityRankingModel,
    data_iter_map: Dict,
    dataset: str,
    split="dev",
    _iter="",
    teacher_force=False,
    gold_mentions=False,
    final_eval=False,
    conll_data_dir: Dict = None,
) -> Dict:
    """Function to evaluate full coreference chains.

    Args:
        config: Experiment configuration
        model: Coreference model
        data_iter_map: Data iterator
        dataset: Name of the coreference dataset
        split: Partition of the dataset - train/dev/test
        _iter: Iteration tag for the log file name; per-example logs are only
            written when this is "" (the final, untagged evaluation).
        teacher_force: Whether to run the model with teacher forcing.
        gold_mentions: Whether the model is given gold mentions at eval time.
        final_eval: Whether this is a periodic evaluation or final evaluation
            For final evaluation, official CoNLL scores can be calculated if possible.
        conll_data_dir: Data directory dictionary which maps datasets to their gold CoNLL files.

    Returns:
        dict: Dictionary with results for all the metrics.
    """
    # Capture the auxiliary action accuracy (diagnostic counter only)
    total_actions = 0.0

    evaluator = CorefEvaluator()
    f1evaluator = F1Evaluator()
    coref_predictions, subtoken_maps = {}, {}

    logger.info(f"Evaluating on {len(data_iter_map[split][dataset])} examples")
    log_file, log_file_link = get_log_file_name(
        config,
        dataset,
        teacher_force,
        gold_mentions,
        split,
        _iter,
    )
    # Context managers guarantee both log files are closed even if model
    # inference or an assertion fails mid-loop (previously a handle leak).
    with open(log_file, "w") as f, open(log_file_link, "w") as f_link:
        for example in data_iter_map[split][dataset]:
            # Model outputs for one document
            (
                pred_mentions,
                pred_mentions_emb,
                mention_scores,
                gt_actions,
                pred_actions,
                coref_scores,
                entity_cluster_states,
                link_time,
            ) = model(example, teacher_force=teacher_force, gold_mentions=gold_mentions)

            num_major_entities = len(example["representatives"])
            raw_predicted_clusters = action_sequences_to_clusters(
                pred_actions, pred_mentions, num_major_entities
            )
            assert (
                len(raw_predicted_clusters)
                == len(example["clusters"])
                == num_major_entities + 1
            ), "Number of clusters should be equal to number of major entities + 1"

            # Remove clusters less than the threshold of 1 and remove others from
            # evaluation in MET here. Remove empty clusters for coref.
            predicted_clusters_coref = filter_clusters(raw_predicted_clusters, threshold=1)
            # Keep cluster numbers same as the number of major entities.
            predicted_clusters_f1 = filter_clusters(raw_predicted_clusters, threshold=0)
            # Golden clusters cannot be empty so we can use the threshold as 1,
            # but we remove the last cluster anyways.
            gold_clusters = filter_clusters(example["clusters"], threshold=1)

            mention_to_predicted_coref = get_mention_to_cluster(predicted_clusters_coref)
            mention_to_gold = get_mention_to_cluster(gold_clusters)
            evaluator.update(
                predicted_clusters_coref,
                gold_clusters,
                mention_to_predicted_coref,
                mention_to_gold,
            )
            assert (
                len(predicted_clusters_f1) == len(gold_clusters) == num_major_entities
            ), "Predicted and Gold clusters should be of same length and equal to number of major entities + 1"
            f1evaluator.update(predicted_clusters_f1, gold_clusters)

            coref_predictions[example["doc_key"]] = raw_predicted_clusters
            # Prefer the original subtoken map when the example carries one
            subtoken_maps[example["doc_key"]] = (
                example["orig_subtoken_map"]
                if "orig_subtoken_map" in example
                else example["subtoken_map"]
            )
            total_actions += len(pred_actions)

            # Keep only the winning score per mention for the logs
            max_coref_scores = [max(coref_score) for coref_score in coref_scores]
            # Removed oracle clustering for now. Code is now at the bottom of this file.
            log_example = get_logs(
                example,
                raw_predicted_clusters=raw_predicted_clusters,
                coref_scores=max_coref_scores,
            )
            log_link_example = {
                "doc_key": example["doc_key"],
                "num_mentions": len(pred_mentions),
                "link_time": link_time,
            }
            # Per-example logs are written only for the final (untagged) eval
            if _iter == "":
                f.write(json.dumps(log_example) + "\n")
                f_link.write(json.dumps(log_link_example) + "\n")

    result_dict: Dict = OrderedDict()
    perf_str: str = ""
    # Print individual metrics (MUC / B-cubed / CEAF as configured)
    for indv_metric, indv_evaluator in zip(config.metrics, evaluator.evaluators):
        perf_str += ", " + indv_metric + ": {}".format(indv_evaluator.get_f1() * 100)
        result_dict[indv_metric] = OrderedDict()
        result_dict[indv_metric]["recall"] = indv_evaluator.get_recall() * 100
        result_dict[indv_metric]["precision"] = indv_evaluator.get_precision() * 100
        result_dict[indv_metric]["fscore"] = indv_evaluator.get_f1() * 100

    result_dict["fscore"] = evaluator.get_f1() * 100
    result_dict["f1_macro"], result_dict["f1_micro"] = f1evaluator.get_numbers()
    logger.info("F-score: %.1f %s" % (result_dict["fscore"], perf_str))
    return result_dict
def coref_evaluation(
    config: DictConfig,
    model: EntityRankingModel,
    data_iter_map: Dict,
    dataset: str,
    split="dev",
    _iter="",
    teacher_force=False,
    gold_mentions=False,
    final_eval=False,
    conll_data_dir: Dict = None,
) -> Dict:
    """Evaluation function which calls the dataset-appropriate coreference evaluation function."""
    eval_options = {
        "split": split,
        "_iter": _iter,
        "teacher_force": teacher_force,
        "gold_mentions": gold_mentions,
        "final_eval": final_eval,
        "conll_data_dir": conll_data_dir,
    }
    return full_coref_evaluation(config, model, data_iter_map, dataset, **eval_options)