|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
This script takes as input "text_full", which includes the three transcript
files (train_S, train_M and train_L) for AISHELL4:
    - text_full
and applies text_normalize to every line, writing the normalized text to
the output file:
    - text
|
|
""" |
|
|
|
|
|
|
|
|
import argparse |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
def get_parser():
    """Build the command-line parser for this script.

    Two string options are registered:

      --input   path of the transcript file to read
      --output  path of the normalized transcript file to write

    Returns:
      The configured ``argparse.ArgumentParser``; defaults are shown in the
      ``--help`` output via ``ArgumentDefaultsHelpFormatter``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Normalizing for text",
    )

    # (flag, default path, help text) for each path-valued option.
    path_options = (
        (
            "--input",
            "data/lang_char/text_full",
            "the input text files for AISHELL4",
        ),
        (
            "--output",
            "data/lang_char/text",
            "the text implemented with normalizer for AISHELL4",
        ),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, default=default_path, type=str, help=help_text)

    return cli
|
|
|
|
|
|
|
|
# Bracketed annotation markers. They are removed one after another, in this
# order, after spaces have been stripped (so "< sil >" is matched as well).
_ANNOTATION_MARKERS = ("<sil>", "<%>", "<->", "<$>", "<#>", "<_>", "<space>")

# Single characters deleted in one pass once the markers are gone. Only "<"
# is listed, so a stray ">" is left in place.
# NOTE(review): every upper-case ASCII "A" already present in the input is
# deleted here, while a lower-case "a" is upper-cased to "A" below. That looks
# unintended (possibly a full-width letter that was transcoded to ASCII);
# the behaviour is kept unchanged -- confirm against the data before editing.
_DELETED_CHARS = str.maketrans("", "", "`&,A丶。、?·*!$+-\\¥%.<")


def text_normalize(str_line: str) -> str:
    """Normalize one transcript line.

    Steps, in order:
      1. Strip surrounding whitespace and remove every ASCII space.
      2. Remove the annotation markers in ``_ANNOTATION_MARKERS``.
      3. Delete the punctuation/symbol characters in ``_DELETED_CHARS``.
      4. Upper-case the remaining text.

    Args:
      str_line: A raw line of text; a trailing newline is allowed.

    Returns:
      The normalized line, without a trailing newline.
    """
    line = str_line.strip().replace(" ", "")

    # Sequential on purpose: removing one marker may expose another, and only
    # markers later in the tuple are then still removed.
    for marker in _ANNOTATION_MARKERS:
        line = line.replace(marker, "")

    # One translate pass replaces the long chain of single-character
    # replace() calls; upper() covers the lower-case letters.
    return line.translate(_DELETED_CHARS).upper()
|
|
|
|
|
|
|
|
def main():
    """Normalize every line of the input file and write the result.

    Parses ``--input``/``--output`` from the command line, reads the whole
    input file, applies ``text_normalize`` to each line (with a tqdm progress
    bar) and writes one normalized line per input line to the output file.
    Both files are handled as UTF-8.
    """
    args = get_parser().parse_args()

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if normalization or writing raises.
    # The input is read completely and closed before the output is opened.
    with open(args.input, "r", encoding="utf-8") as f:
        lines = f.readlines()

    new_lines = [text_normalize(line) for line in tqdm(lines)]

    with open(args.output, "w", encoding="utf-8") as f_new:
        f_new.writelines(line + "\n" for line in new_lines)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script (not imported): run the normalization end to end.
    main()
|
|
|