# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import string from copy import deepcopy import regex as re from tqdm import tqdm from nemo.collections.nlp.data.text_normalization import constants __all__ = [ 'read_data_file', 'normalize_str', 'flatten', 'convert_fraction', 'convert_superscript', 'add_space_around_dash', ] def flatten(l): """ flatten a list of lists """ return [item for sublist in l for item in sublist] def add_space_around_dash(input: str): """ adds space around dash between numbers and non-numbers""" input = re.sub(r"([^\s0-9])-([0-9])", r"\1 - \2", input) input = re.sub(r"([0-9])-([^\s0-9])", r"\1 - \2", input) input = re.sub(r"([^\s0-9])-([0-9])", r"\1 - \2", input) input = re.sub(r"([0-9])-([^\s0-9])", r"\1 - \2", input) return input def convert_superscript(written: str): """convert superscript to regular character""" written = re.sub("²", "2", written) written = re.sub("³", "3", written) return written def convert_fraction(written: str): """ converts fraction to standard form, e.g "½" -> "1/2", "1 ½" -> "1 1/2" Args: written: written form Returns: written: modified form """ written = re.sub(" ½", " 1/2", written) written = re.sub(" ⅓", " 1/3", written) written = re.sub(" ⅔", " 2/3", written) written = re.sub(" ¼", " 1/4", written) written = re.sub(" ¾", " 3/4", written) written = re.sub(" ⅕", " 1/5", written) written = re.sub(" ⅖", " 2/5", written) written = re.sub(" ⅗", " 3/5", written) written = re.sub(" ⅘", " 4/5", written) written = re.sub(" ⅙", " 1/6", written) written = re.sub(" ⅚", " 5/6", written) written = re.sub(" ⅛", " 1/8", written) written = re.sub(" ⅜", " 3/8", written) written = re.sub(" ⅝", " 5/8", written) written = re.sub(" ⅞", " 7/8", written) written = re.sub("^½", "1/2", written) written = re.sub("^⅓", "1/3", written) written = re.sub("^⅔", "2/3", written) written = re.sub("^¼", "1/4", written) written = re.sub("^¾", "3/4", written) written = re.sub("^⅕", "1/5", written) written = re.sub("^⅖", "2/5", written) written = re.sub("^⅗", "3/5", written) written = re.sub("^⅘", "4/5", written) written = re.sub("^⅙", "1/6", written) written = re.sub("^⅚", "5/6", written) written = re.sub("^⅛", "1/8", written) written = re.sub("^⅜", "3/8", written) written = re.sub("^⅝", "5/8", written) written = re.sub("^⅞", "7/8", written) written = re.sub("-½", "-1/2", written) written = re.sub("-⅓", "-1/3", written) written = re.sub("-⅔", "-2/3", written) written = re.sub("-¼", "-1/4", written) written = re.sub("-¾", "-3/4", written) written = re.sub("-⅕", "-1/5", written) written = re.sub("-⅖", "-2/5", written) written = re.sub("-⅗", "-3/5", written) written = re.sub("-⅘", "-4/5", written) written = re.sub("-⅙", "-1/6", written) written = re.sub("-⅚", "-5/6", written) written = re.sub("-⅛", "-1/8", written) written = re.sub("-⅜", "-3/8", written) written = re.sub("-⅝", "-5/8", written) written = re.sub("-⅞", "-7/8", written) written = re.sub("([0-9])\s?½", "\\1 1/2", written) written = re.sub("([0-9])\s?⅓", "\\1 1/3", written) written = re.sub("([0-9])\s?⅔", "\\1 2/3", written) written = re.sub("([0-9])\s?¼", "\\1 1/4", written) written = re.sub("([0-9])\s?¾", "\\1 3/4", written) written = re.sub("([0-9])\s?⅕", "\\1 1/5", written) written = re.sub("([0-9])\s?⅖", "\\1 2/5", written) written = re.sub("([0-9])\s?⅗", "\\1 3/5", written) written = re.sub("([0-9])\s?⅘", "\\1 4/5", written) written = re.sub("([0-9])\s?⅙", "\\1 1/6", written) written = re.sub("([0-9])\s?⅚", "\\1 5/6", written) written = re.sub("([0-9])\s?⅛", "\\1 1/8", written) written = re.sub("([0-9])\s?⅜", "\\1 3/8", written) written = re.sub("([0-9])\s?⅝", "\\1 5/8", written) written = re.sub("([0-9])\s?⅞", "\\1 7/8", written) return written def input_preprocessing(sent: str, lang: str): """ Function for preprocessing the input texts. The function first does some basic tokenization. For English, it then also processes Greek letters such as Δ or λ (if any). Args: sent: input text. lang: language Returns: preprocessed input text. """ # Basic Preprocessing and Tokenization if lang == constants.ENGLISH: sent = sent.replace('+', ' plus ') sent = sent.replace('=', ' equals ') sent = sent.replace('@', ' at ') sent = sent.replace('*', ' times ') # Greek letters processing for jx, tok in enumerate(sent): if tok in constants.EN_GREEK_TO_SPOKEN: sent = sent[:jx] + constants.EN_GREEK_TO_SPOKEN[tok] + sent[jx + 1 :] sent = convert_superscript(sent) sent = convert_fraction(sent) sent = add_space_around_dash(sent) return sent def read_data_file(fp: str, lang: str, max_insts: int = -1): """ Reading the raw data from a file of NeMo format For more info about the data format, refer to the `text_normalization doc `. Args: fp: file paths lang: language max_insts: Maximum number of instances (-1 means no limit) Returns: insts: List of sentences parsed as list of words """ insts, w_words, s_words, classes = [], [], [], [] # Read input file with open(fp, 'r', encoding='utf-8') as f: for line in tqdm(f): es = [e.strip() for e in input_preprocessing(line.strip(), lang=lang).split('\t')] if es[0] == '': inst = (deepcopy(classes), deepcopy(w_words), deepcopy(s_words)) insts.append(inst) # Reset w_words, s_words, classes = [], [], [] if max_insts > 0 and len(insts) >= max_insts: break else: classes.append(es[0]) w_words.append(es[1]) s_words.append(es[2]) return insts def normalize_str(input_str): """ Normalize an input string """ return input_str.strip().lower().replace(" ", " ") def remove_puncts(input_str): """ Remove punctuations from an input string """ return input_str.translate(str.maketrans('', '', string.punctuation))