thanks to NVIDIA ❤

7934b29 almost 3 years ago

13.1 kB

	# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from math import ceil
	from time import perf_counter
	from typing import List

	import numpy as np
	import torch.nn as nn
	from tqdm import tqdm

	from nemo.collections.nlp.data.text_normalization import TextNormalizationTestDataset, constants
	from nemo.collections.nlp.data.text_normalization.utils import input_preprocessing
	from nemo.collections.nlp.models.duplex_text_normalization.utils import get_formatted_string
	from nemo.utils import logging

	try:
	from nemo_text_processing.text_normalization.data_loader_utils import post_process_punct

	PYNINI_AVAILABLE = True
	except (ImportError, ModuleNotFoundError):
	PYNINI_AVAILABLE = False


	__all__ = ['DuplexTextNormalizationModel']


	class DuplexTextNormalizationModel(nn.Module):
	    """
	    DuplexTextNormalizationModel is a wrapper class that can be used to
	    encapsulate a trained tagger and a trained decoder. The class is intended
	    to be used for inference only (e.g., for evaluation).

	    The tagger produces one tag per input word plus the boundaries of the spans
	    that need rewriting; the decoder produces the replacement text for each of
	    those spans. ``_infer`` stitches both results into the final output string.
	    """

	    # Names of the per-instance columns accumulated while running the model
	    # over a dataset (see ``_predict_dataset``).
	    _COLUMN_NAMES = (
	        'dirs',
	        'inputs',
	        'targets',
	        'classes',
	        'nb_spans',
	        'span_ends',
	        'tag_preds',
	        'output_spans',
	        'final_preds',
	    )

	    def __init__(self, tagger, decoder, lang):
	        """
	        Args:
	            tagger: Trained tagger; must expose ``_infer(sents, inst_directions)``.
	            decoder: Trained decoder; must expose ``_infer(...)`` and a ``processor``
	                with ``tokenize`` / ``detokenize`` methods.
	            lang: Language identifier forwarded to ``input_preprocessing``.
	        """
	        super().__init__()

	        self.tagger = tagger
	        self.decoder = decoder
	        self.lang = lang

	    def evaluate(
	        self, dataset: TextNormalizationTestDataset, batch_size: int, errors_log_fp: str, verbose: bool = True
	    ):
	        """ Function for evaluating the performance of the model on a dataset

	        Args:
	            dataset: The dataset to be used for evaluation.
	            batch_size: Batch size to use during inference. You can set it to be 1
	                (no batching) if you want to measure the running time of the model
	                per individual example (assuming requests are coming to the model one-by-one).
	            errors_log_fp: Path to the file for logging the errors
	            verbose: if true prints and logs various evaluation results

	        Returns:
	            results: A Dict containing the evaluation results. For every direction in
	                ``constants.INST_DIRECTIONS`` it holds a dict with the keys
	                ``sent_accuracy``, ``nb_instances`` and ``class_accuracy``; at the top
	                level it also holds ``itn_error_ctx``, ``tn_error_ctx`` (number of
	                wrong predictions per direction) and ``running_time`` (average batch
	                time in milliseconds divided by ``batch_size``).
	        """
	        results = {}
	        tn_error_ctx, itn_error_ctx = 0, 0

	        # The log is opened before inference so that an unwritable path fails fast,
	        # and through a context manager so the handle is released even when
	        # inference or metric computation raises. UTF-8 is set explicitly because
	        # the logged sentences may contain non-ASCII characters.
	        with open(errors_log_fp, 'w+', encoding='utf-8') as error_f:
	            all_run_times, columns = self._predict_dataset(dataset, batch_size)

	            for direction in constants.INST_DIRECTIONS:
	                # Keep only the instances that belong to the current direction.
	                mask = [inst_dir == direction for inst_dir in columns['dirs']]
	                cur = {name: self._select(values, mask) for name, values in columns.items()}

	                nb_instances = len(cur['final_preds'])
	                cur_targets_sent = [" ".join(x) for x in cur['targets']]

	                sent_accuracy = TextNormalizationTestDataset.compute_sent_accuracy(
	                    cur['final_preds'], cur_targets_sent, cur['dirs']
	                )
	                class_accuracy = TextNormalizationTestDataset.compute_class_accuracy(
	                    [x.split() for x in cur['inputs']],
	                    cur['targets'],
	                    cur['tag_preds'],
	                    cur['dirs'],
	                    cur['output_spans'],
	                    cur['classes'],
	                    cur['nb_spans'],
	                    cur['span_ends'],
	                )
	                # Always computed: it is stored in ``results`` below, so it must not
	                # depend on ``verbose`` (it used to be unbound when verbose=False).
	                log_class_accuracies = self._format_class_accuracy(class_accuracy)

	                if verbose:
	                    logging.info(f'\n============ Direction {direction} ============')
	                    logging.info(f'Sentence Accuracy: {sent_accuracy}')
	                    logging.info(f'nb_instances: {nb_instances}')
	                    logging.info(f'class accuracies: {log_class_accuracies}')

	                # Update results
	                results[direction] = {
	                    'sent_accuracy': sent_accuracy,
	                    'nb_instances': nb_instances,
	                    "class_accuracy": log_class_accuracies,
	                }

	                # Write errors to log file
	                for _input, tag_pred, final_pred, target, classes in zip(
	                    cur['inputs'], cur['tag_preds'], cur['final_preds'], cur_targets_sent, cur['classes']
	                ):
	                    if TextNormalizationTestDataset.is_same(final_pred, target, direction):
	                        continue
	                    if direction == constants.INST_BACKWARD:
	                        error_f.write('Backward Problem (ITN)\n')
	                        itn_error_ctx += 1
	                    elif direction == constants.INST_FORWARD:
	                        error_f.write('Forward Problem (TN)\n')
	                        tn_error_ctx += 1
	                    self._write_error_entry(error_f, _input, tag_pred, final_pred, target, classes)

	        results['itn_error_ctx'] = itn_error_ctx
	        results['tn_error_ctx'] = tn_error_ctx

	        # Running Time
	        avg_running_time = np.average(all_run_times) / batch_size  # in ms
	        if verbose:
	            logging.info(f'Average running time (normalized by batch size): {avg_running_time} ms')
	        results['running_time'] = avg_running_time

	        logging.info(f'Errors are saved at {errors_log_fp}.')
	        return results

	    def _predict_dataset(self, dataset, batch_size):
	        """
	        Run ``_infer`` over ``dataset`` in consecutive slices of ``batch_size``.

	        Each dataset slice is expected to yield 7-tuples of
	        (direction, input, target, classes, nb_spans, span_starts, span_ends).

	        Returns:
	            run_times: Wall-clock time of every ``_infer`` call, in milliseconds.
	            columns: Dict mapping each name in ``_COLUMN_NAMES`` to a list with one
	                entry per evaluated instance.
	        """
	        run_times = []
	        columns = {name: [] for name in self._COLUMN_NAMES}

	        nb_iters = ceil(len(dataset) / batch_size)
	        for i in tqdm(range(nb_iters)):
	            batch_insts = dataset[i * batch_size : (i + 1) * batch_size]
	            (
	                batch_dirs,
	                batch_inputs,
	                batch_targets,
	                batch_classes,
	                batch_nb_spans,
	                _batch_span_starts,  # not needed by any metric
	                batch_span_ends,
	            ) = zip(*batch_insts)

	            # Inference and Running Time Measurement (only the model call is timed)
	            batch_start_time = perf_counter()
	            batch_tag_preds, batch_output_spans, batch_final_preds = self._infer(
	                batch_inputs, batch_dirs, processed=True
	            )
	            run_times.append((perf_counter() - batch_start_time) * 1000)  # milliseconds

	            columns['dirs'].extend(batch_dirs)
	            columns['inputs'].extend(batch_inputs)
	            columns['targets'].extend(batch_targets)
	            columns['classes'].extend(batch_classes)
	            columns['nb_spans'].extend(batch_nb_spans)
	            columns['span_ends'].extend(batch_span_ends)
	            columns['tag_preds'].extend(batch_tag_preds)
	            columns['output_spans'].extend(batch_output_spans)
	            columns['final_preds'].extend(batch_final_preds)
	        return run_times, columns

	    @staticmethod
	    def _select(values, mask):
	        """Return the items of ``values`` whose matching ``mask`` entry is truthy."""
	        return [value for value, keep in zip(values, mask) if keep]

	    @staticmethod
	    def _format_class_accuracy(class_accuracy):
	        """
	        Render per-class accuracies as a loggable string.

	        A ``str`` input is returned unchanged; otherwise the input is treated as a
	        mapping whose values are indexable as (value[0], value[1], value[2]).
	        """
	        if isinstance(class_accuracy, str):
	            return class_accuracy
	        return "".join(
	            f"\n\t{key}:\t{value[0]}\t{value[1]}/{value[2]}" for key, value in class_accuracy.items()
	        )

	    def _write_error_entry(self, error_f, _input, tag_pred, final_pred, target, classes):
	        """Append the details of one wrong prediction to the open error log ``error_f``."""
	        formatted_input_str = get_formatted_string(self.decoder.processor.tokenize(_input).split())
	        formatted_tag_pred_str = get_formatted_string(tag_pred)
	        class_str = " ".join(classes)
	        error_f.write(f'Original Input : {_input}\n')
	        error_f.write(f'Input : {formatted_input_str}\n')
	        error_f.write(f'Predicted Tags : {formatted_tag_pred_str}\n')
	        error_f.write(f'Ground Classes : {class_str}\n')
	        error_f.write(f'Predicted Str : {final_pred}\n')
	        error_f.write(f'Ground-Truth : {target}\n')
	        error_f.write('\n')

	    # Functions for inference
	    def _infer(self, sents: List[str], inst_directions: List[str], processed=False):
	        """
	        Main function for Inference

	        If the 'joint' mode is used, "sents" will include both spoken and written forms on each input sentence,
	        and "inst_directions" will include both constants.INST_BACKWARD and constants.INST_FORWARD

	        Args:
	            sents: A list of input texts.
	            inst_directions: A list of str where each str indicates the direction of the corresponding instance
	                (i.e., constants.INST_BACKWARD for ITN or constants.INST_FORWARD for TN).
	            processed: Set to True when used with TextNormalizationTestDataset, the data is already tokenized
	                with moses, repetitive moses tokenization could lead to the number of tokens and class span
	                mismatch

	        Returns:
	            tag_preds: A list of lists where the inner list contains the tag predictions from the tagger for
	                each word in the input text.
	            output_spans: A list of lists where each list contains the decoded semiotic spans from the decoder
	                for an input text.
	            final_outputs: A list of str where each str is the final output text for an input text. When an
	                IndexError occurs while assembling a sentence, the original input sentence is returned for it.
	        """
	        # Keep the untouched inputs: they are needed for punctuation post-processing
	        # and as the fallback output.
	        original_sents = list(sents)

	        # Separate into words
	        if processed:
	            sents = [x.split() for x in sents]
	        else:
	            sents = [input_preprocessing(x, lang=self.lang) for x in sents]
	            sents = [self.decoder.processor.tokenize(x).split() for x in sents]

	        # Tagging
	        # span_ends included, returns index wrt to words in input without auxiliary words
	        tag_preds, nb_spans, span_starts, span_ends = self.tagger._infer(sents, inst_directions)
	        output_spans = self.decoder._infer(sents, nb_spans, span_starts, span_ends, inst_directions)

	        # Tag that marks the continuation of a span that is being transformed.
	        inside_transform_tag = constants.I_PREFIX + constants.TRANSFORM_TAG

	        # Prepare final outputs
	        final_outputs = []
	        for ix, (sent, tags) in enumerate(zip(sents, tag_preds)):
	            try:
	                cur_words, jx, span_idx = [], 0, 0
	                cur_spans = output_spans[ix]
	                while jx < len(sent):
	                    tag, word = tags[jx], sent[jx]
	                    jx += 1
	                    if constants.SAME_TAG in tag:
	                        # Word is kept verbatim.
	                        cur_words.append(word)
	                        continue
	                    # Start of a transformed span: emit the decoded span once and
	                    # skip the remaining words that belong to the same span.
	                    cur_words.append(cur_spans[span_idx])
	                    span_idx += 1
	                    while jx < len(sent) and tags[jx] == inside_transform_tag:
	                        jx += 1

	                if processed:
	                    # for Class-based evaluation, don't apply Moses detokenization
	                    cur_output_str = " ".join(cur_words)
	                else:
	                    # detokenize the output with Moses and fix punctuation marks to match the input
	                    # for interactive inference or inference from a file
	                    cur_output_str = self.decoder.processor.detokenize(cur_words)
	                    if PYNINI_AVAILABLE:
	                        cur_output_str = post_process_punct(input=original_sents[ix], normalized_text=cur_output_str)
	                    else:
	                        logging.warning(
	                            "`pynini` not installed, please install via nemo_text_processing/pynini_install.sh"
	                        )
	                final_outputs.append(cur_output_str)
	            except IndexError:
	                # Tags/spans shorter than the sentence: fall back to the raw input.
	                logging.warning(f"Input sent is too long and will be skipped - {original_sents[ix]}")
	                final_outputs.append(original_sents[ix])
	        return tag_preds, output_spans, final_outputs