|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import string |
|
|
from copy import deepcopy |
|
|
|
|
|
import regex as re |
|
|
from tqdm import tqdm |
|
|
|
|
|
from nemo.collections.nlp.data.text_normalization import constants |
|
|
|
|
|
# Names exported by a wildcard import (`from <module> import *`) of this module.
# NOTE(review): `input_preprocessing` and `remove_puncts` are defined in this file
# but are not listed here - confirm that leaving them out is intentional.
__all__ = [
    'read_data_file',
    'normalize_str',
    'flatten',
    'convert_fraction',
    'convert_superscript',
    'add_space_around_dash',
]
|
|
|
|
|
|
|
|
def flatten(l):
    """ Flatten a list of lists by one level.

    Args:
        l: iterable whose elements are themselves iterable (e.g. a list of lists)
    Returns:
        a new list holding the items of every sub-iterable, in their original order
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
|
|
|
|
|
|
|
|
def add_space_around_dash(input: str):
    """ Adds space around a dash that sits directly between a number and a non-number.

    A dash is rewritten as " - " when one of its neighbours is an ASCII digit and the
    other one is a character that is neither whitespace nor an ASCII digit. Dashes
    between two digits, between two non-digits, or next to whitespace are kept as is.

    Args:
        input: text to process
    Returns:
        the text with every qualifying dash surrounded by single spaces
    """
    # One pass per direction is sufficient: two candidate matches of the same pattern
    # can never overlap (the shared character would have to be a digit and a non-digit
    # at once), and the inserted spaces cannot produce a new match for either pattern.
    input = re.sub(r"([^\s0-9])-([0-9])", r"\1 - \2", input)
    input = re.sub(r"([0-9])-([^\s0-9])", r"\1 - \2", input)
    return input
|
|
|
|
|
|
|
|
def convert_superscript(written: str):
    """ Convert the superscript digits "²" and "³" to the regular characters "2" and "3".

    Args:
        written: written form
    Returns:
        written: modified form
    """
    # Unicode escapes keep the two literals intact whatever encoding the file is
    # read or re-saved with; the patterns are plain characters, so no regex is needed.
    return written.replace("\u00b2", "2").replace("\u00b3", "3")
|
|
|
|
|
|
|
|
def convert_fraction(written: str):
    """
    converts fraction characters to standard form, e.g "½" -> "1/2", "1 ½" -> "1 1/2"

    A fraction character is converted only when it
      1. directly follows a space,
      2. is the first character of the string,
      3. directly follows a dash, or
      4. follows an ASCII digit, optionally separated from it by one whitespace
         character; the result is always "<digit> <fraction>".
    The rules are applied in that order, each one over all supported fractions.
    A fraction character in any other position (e.g. right after a letter) is kept.

    Args:
        written: written form
    Returns:
        written: modified form
    """
    # (fraction character, spelled-out form). Unicode escapes keep the literals
    # intact whatever encoding the file is read or re-saved with.
    fractions = (
        ("\u00bd", "1/2"),
        ("\u2153", "1/3"),
        ("\u2154", "2/3"),
        ("\u00bc", "1/4"),
        ("\u00be", "3/4"),
        ("\u2155", "1/5"),
        ("\u2156", "2/5"),
        ("\u2157", "3/5"),
        ("\u2158", "4/5"),
        ("\u2159", "1/6"),
        ("\u215a", "5/6"),
        ("\u215b", "1/8"),
        ("\u215c", "3/8"),
        ("\u215d", "5/8"),
        ("\u215e", "7/8"),
    )

    # Rule 1: fraction right after a space.
    for char, text in fractions:
        written = written.replace(" " + char, " " + text)

    # Rule 2: fraction at the very beginning of the string.
    for char, text in fractions:
        if written.startswith(char):
            written = text + written[len(char):]

    # Rule 3: fraction right after a dash.
    for char, text in fractions:
        written = written.replace("-" + char, "-" + text)

    # Rule 4: fraction attached to a digit, e.g. "1½" -> "1 1/2". The digits inserted
    # by an earlier replacement can make a later fraction match, so the table order matters.
    for char, text in fractions:
        written = re.sub(r"([0-9])\s?" + char, r"\1 " + text, written)

    return written
|
|
|
|
|
|
|
|
def input_preprocessing(sent: str, lang: str):
    """ Function for preprocessing the input texts. The function first does
    some basic tokenization. For English, it then also processes Greek letters
    such as Δ or λ (if any).

    For English input the symbols "+", "=", "@" and "*" are spelled out as
    " plus ", " equals ", " at " and " times ", and every character found in
    ``constants.EN_GREEK_TO_SPOKEN`` is replaced by its mapped spoken form.
    For every language the text is then passed through ``convert_superscript``,
    ``convert_fraction`` and ``add_space_around_dash``, in that order.

    Args:
        sent: input text.
        lang: language

    Returns: preprocessed input text.
    """
    if lang == constants.ENGLISH:
        sent = sent.replace('+', ' plus ')
        sent = sent.replace('=', ' equals ')
        sent = sent.replace('@', ' at ')
        sent = sent.replace('*', ' times ')

        # Rebuild the string in a single pass. Splicing replacements into the string
        # by index while enumerating it would shift the position of every later
        # character, so a second Greek letter would be replaced at the wrong offset.
        greek_to_spoken = constants.EN_GREEK_TO_SPOKEN
        sent = ''.join(greek_to_spoken[tok] if tok in greek_to_spoken else tok for tok in sent)

    sent = convert_superscript(sent)
    sent = convert_fraction(sent)
    sent = add_space_around_dash(sent)

    return sent
|
|
|
|
|
|
|
|
def read_data_file(fp: str, lang: str, max_insts: int = -1):
    """ Reading the raw data from a file of NeMo format
    For more info about the data format, refer to the
    `text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.

    Every line is stripped, run through ``input_preprocessing`` and split on tabs;
    the first three fields are used as (class, written form, spoken form). A line
    whose first field is ``<eos>`` closes the current sentence. Lines that come
    after the last ``<eos>`` line are not returned.

    Args:
        fp: file path
        lang: language
        max_insts: Maximum number of instances (any value <= 0, such as the default -1, means no limit)
    Returns:
        insts: List of sentences, each one a tuple ``(classes, w_words, s_words)``
            of three lists of strings with one entry per line of the sentence
    Raises:
        IndexError: if a line that is not ``<eos>`` has fewer than three tab-separated fields
    """
    insts, w_words, s_words, classes = [], [], [], []

    with open(fp, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            es = [e.strip() for e in input_preprocessing(line.strip(), lang=lang).split('\t')]
            if es[0] == '<eos>':
                # The three lists are rebound to fresh lists right below and never
                # mutated afterwards, so they can be stored without copying them.
                insts.append((classes, w_words, s_words))
                w_words, s_words, classes = [], [], []

                if max_insts > 0 and len(insts) >= max_insts:
                    break
            else:
                classes.append(es[0])
                w_words.append(es[1])
                s_words.append(es[2])
    return insts
|
|
|
|
|
|
|
|
def normalize_str(input_str):
    """ Normalize an input string

    The string is stripped of leading/trailing whitespace, lower-cased, and every
    run of consecutive spaces is reduced to a single space.

    Args:
        input_str: string to normalize
    Returns:
        the normalized string
    """
    lowered = input_str.strip().lower()
    # Splitting on a single space yields an empty piece for each extra space in a run;
    # dropping those pieces collapses the run. Tabs and newlines inside the string are kept.
    return " ".join(piece for piece in lowered.split(" ") if piece)
|
|
|
|
|
|
|
|
def remove_puncts(input_str):
    """ Remove punctuations from an input string

    Args:
        input_str: string to clean
    Returns:
        the string without any character contained in ``string.punctuation``
    """
    return "".join(char for char in input_str if char not in string.punctuation)
|
|
|