camenduru's picture
thanks to NVIDIA ❀
7934b29
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
from copy import deepcopy
import regex as re
from tqdm import tqdm
from nemo.collections.nlp.data.text_normalization import constants
# Names exported by a star-import of this module. Functions defined in this
# file but not listed here (e.g. input_preprocessing, remove_puncts) must be
# imported by name.
__all__ = [
'read_data_file',
'normalize_str',
'flatten',
'convert_fraction',
'convert_superscript',
'add_space_around_dash',
]
def flatten(l):
    """ Flatten a list of lists by one level.

    Args:
        l: an iterable whose items are themselves iterable
    Returns:
        a new list holding the items of every sub-list, in their original order
    """
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged
def add_space_around_dash(input: str):
    """ Adds spaces around a dash that directly joins a digit and a non-digit.

    A dash between two digits (e.g. "2010-2020"), between two non-digits
    (e.g. "well-known"), or with whitespace on either side is left unchanged.

    Args:
        input: text to process
    Returns:
        the text with " - " in place of every "-" whose neighbours are an ASCII
        digit on one side and a non-whitespace, non-digit character on the other
    """
    # A single pass per direction is sufficient: two matches of the same
    # pattern can never overlap (the shared character would have to be both a
    # digit and a non-digit), and the replacements only insert spaces, which
    # cannot create a new match. Running the pair a second time is a no-op.
    input = re.sub(r"([^\s0-9])-([0-9])", r"\1 - \2", input)
    input = re.sub(r"([0-9])-([^\s0-9])", r"\1 - \2", input)
    return input
def convert_superscript(written: str):
    """ Converts superscript digits to regular digits, e.g. "m²" -> "m2".

    Args:
        written: written form
    Returns:
        written: the text with SUPERSCRIPT TWO (U+00B2) replaced by "2" and
            SUPERSCRIPT THREE (U+00B3) replaced by "3"
    """
    # The characters are written as escapes so the literals stay correct even
    # if the file is re-saved or transferred with the wrong encoding; a
    # mis-decoded literal silently stops matching.
    written = written.replace("\u00b2", "2")
    written = written.replace("\u00b3", "3")
    return written
# Vulgar-fraction characters paired with their "numerator/denominator" form.
# Written as escapes so the table cannot be damaged by a wrong file encoding.
# The order is significant for the digit rule in convert_fraction.
_VULGAR_FRACTIONS = (
    ("\u00bd", "1/2"),  # VULGAR FRACTION ONE HALF
    ("\u2153", "1/3"),  # VULGAR FRACTION ONE THIRD
    ("\u2154", "2/3"),  # VULGAR FRACTION TWO THIRDS
    ("\u00bc", "1/4"),  # VULGAR FRACTION ONE QUARTER
    ("\u00be", "3/4"),  # VULGAR FRACTION THREE QUARTERS
    ("\u2155", "1/5"),  # VULGAR FRACTION ONE FIFTH
    ("\u2156", "2/5"),  # VULGAR FRACTION TWO FIFTHS
    ("\u2157", "3/5"),  # VULGAR FRACTION THREE FIFTHS
    ("\u2158", "4/5"),  # VULGAR FRACTION FOUR FIFTHS
    ("\u2159", "1/6"),  # VULGAR FRACTION ONE SIXTH
    ("\u215a", "5/6"),  # VULGAR FRACTION FIVE SIXTHS
    ("\u215b", "1/8"),  # VULGAR FRACTION ONE EIGHTH
    ("\u215c", "3/8"),  # VULGAR FRACTION THREE EIGHTHS
    ("\u215d", "5/8"),  # VULGAR FRACTION FIVE EIGHTHS
    ("\u215e", "7/8"),  # VULGAR FRACTION SEVEN EIGHTHS
)


def convert_fraction(written: str):
    """
    converts fraction to standard form, e.g "½" -> "1/2", "1 ½" -> "1 1/2"

    A fraction character is converted only when it is at the very start of the
    text or directly follows a space, a "-", or an ASCII digit (optionally
    separated from the digit by one whitespace character). In any other
    position, e.g. right after a letter, it is left as is.

    Args:
        written: written form
    Returns:
        written: modified form
    """
    # Rules run one after another, each over the whole table, in this order:
    # after-space, start-of-text, after-dash, after-digit. The order is kept
    # deliberately: the after-digit rule can match a digit produced by an
    # earlier replacement, so reordering would change results for adjacent
    # fraction characters.

    # Rule 1: " <fraction>" -> " n/d"
    for symbol, plain in _VULGAR_FRACTIONS:
        written = written.replace(" " + symbol, " " + plain)

    # Rule 2: fraction as the first character of the text -> "n/d"
    for symbol, plain in _VULGAR_FRACTIONS:
        if written.startswith(symbol):
            written = plain + written[len(symbol):]

    # Rule 3: "-<fraction>" -> "-n/d"
    for symbol, plain in _VULGAR_FRACTIONS:
        written = written.replace("-" + symbol, "-" + plain)

    # Rule 4: "<digit><optional whitespace><fraction>" -> "<digit> n/d"
    for symbol, plain in _VULGAR_FRACTIONS:
        written = re.sub(r"([0-9])\s?" + symbol, r"\1 " + plain, written)

    return written
def input_preprocessing(sent: str, lang: str):
    """ Function for preprocessing the input texts. The function first does
    some basic tokenization. For English, it then also processes Greek letters
    such as Δ or λ (if any). For every language it finally converts
    superscripts and fraction characters and adds spaces around dashes that
    join a digit and a non-digit.

    Args:
        sent: input text.
        lang: language
    Returns: preprocessed input text.
    """
    if lang == constants.ENGLISH:
        # Basic Preprocessing and Tokenization: spell out a few symbols
        sent = sent.replace('+', ' plus ')
        sent = sent.replace('=', ' equals ')
        sent = sent.replace('@', ' at ')
        sent = sent.replace('*', ' times ')
        # Greek letters processing. The text is rebuilt character by character
        # rather than spliced in place by index: a spoken form is usually
        # longer than the single letter it replaces, so indices taken from the
        # original string stop lining up after the first substitution and
        # later letters would be inserted at the wrong position.
        greek_to_spoken = constants.EN_GREEK_TO_SPOKEN
        sent = ''.join(greek_to_spoken[char] if char in greek_to_spoken else char for char in sent)

    sent = convert_superscript(sent)
    sent = convert_fraction(sent)
    sent = add_space_around_dash(sent)
    return sent
def read_data_file(fp: str, lang: str, max_insts: int = -1):
    """ Reading the raw data from a file of NeMo format
    For more info about the data format, refer to the
    `text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.

    Every line is stripped, run through input_preprocessing and split on tabs.
    A line whose first field is '<eos>' closes the current instance; any other
    line contributes its first three fields (presumably semiotic class, written
    form and spoken form - confirm against the format doc) to the current
    instance. Lines read after the last '<eos>' are not returned.

    Args:
        fp: path of the file to read
        lang: language, forwarded to input_preprocessing
        max_insts: Maximum number of instances (-1 means no limit)
    Returns:
        insts: List of instances, each a tuple of three parallel lists built
            from the first, second and third field of the instance's lines
    Raises:
        IndexError: if a non-'<eos>' line has fewer than three fields
    """
    instances = []
    first_fields, second_fields, third_fields = [], [], []
    with open(fp, 'r', encoding='utf-8') as handle:
        for raw_line in tqdm(handle):
            processed = input_preprocessing(raw_line.strip(), lang=lang)
            fields = [field.strip() for field in processed.split('\t')]

            if fields[0] != '<eos>':
                first_fields.append(fields[0])
                second_fields.append(fields[1])
                third_fields.append(fields[2])
                continue

            # End of sentence: store the finished instance and start a new one
            instances.append((deepcopy(first_fields), deepcopy(second_fields), deepcopy(third_fields)))
            first_fields, second_fields, third_fields = [], [], []
            if 0 < max_insts <= len(instances):
                break
    return instances
def normalize_str(input_str):
    """ Normalize an input string: strip surrounding whitespace, lowercase it
    and collapse every run of spaces into a single space.

    Args:
        input_str: string to normalize
    Returns:
        the normalized string
    """
    lowered = input_str.strip().lower()
    # Replacing a single space with a single space changes nothing, so the
    # squeeze is done by splitting on the literal space and dropping the empty
    # pieces that repeated spaces produce. Splitting on ' ' rather than on all
    # whitespace leaves tabs and newlines inside the string untouched.
    return " ".join(piece for piece in lowered.split(" ") if piece)
def remove_puncts(input_str):
    """ Drop every ASCII punctuation character (string.punctuation) from an input string """
    kept_chars = [char for char in input_str if char not in string.punctuation]
    return ''.join(kept_chars)