# Source path: MFA/scripts/training_models/generate_g2p.py
# Hosting-page residue kept for provenance: "niobures's picture", "MFA",
# "2f6b10b verified".
import os.path
import re
import sys
from montreal_forced_aligner.command_line.mfa import mfa_cli
from montreal_forced_aligner.config import TEMPORARY_DIRECTORY
MODEL_VERSION = "3.0.0"

# Repository root: three directory levels above this script's absolute path.
root_dir = os.path.abspath(__file__)
for _ in range(3):
    root_dir = os.path.dirname(root_dir)

# Input/output locations, all resolved relative to the repository root.
dictionary_dir = os.path.join(root_dir, "dictionary", "training")
g2p_dir = os.path.join(root_dir, "g2p", "staging")
output_dir = os.path.join(dictionary_dir, "g2pped")
config_dir = os.path.join(root_dir, "config", "acoustic")
temp_dir = TEMPORARY_DIRECTORY

# The output directory is created at import time so later runs can write to it.
os.makedirs(output_dir, exist_ok=True)

# Same corpus location, spelled as a native Windows path or a /mnt/d path.
training_root = (
    "D:/Data/speech/model_training_corpora"
    if sys.platform == "win32"
    else "/mnt/d/Data/speech/model_training_corpora"
)
# Full list of language codes this script knows about. It is kept for
# reference only: the assignment further down replaces it before it is used.
lang_codes = [
    "czech",
    "russian",
    "french",
    "german",
    "portuguese_brazil",
    "portuguese_portugal",
    "spanish_spain",
    "spanish_latin_america",
    "swedish",
    "thai",
    "turkish",
    "english_us",
    "english_us_arpa",
    "english_uk",
    "english_nigeria",
    "korean_jamo",
    "korean",
    "hausa",
    "swahili",
    "vietnamese_hanoi",
    "vietnamese_hue",
    "vietnamese_ho_chi_minh_city",
    "ukrainian",
    "polish",
    "croatian",
    "bulgarian",
    "japanese",
    "japanese_katakana",
    # Disabled entries: 'mandarin_china', 'mandarin_erhua', 'mandarin_taiwan'
    "tamil",
    "hindi",
    "urdu",
]

# Active selection: overrides the list above, so only these codes are run.
lang_codes = [
    "hindi-urdu",
    # Disabled entries: 'vietnamese_hanoi', 'vietnamese_hue',
    # 'vietnamese_ho_chi_minh_city'
]
# For language codes that bundle several varieties: maps each sub-corpus name
# to the dialect whose dictionary and G2P model are used for it. Insertion
# order is the order in which the sub-corpora are processed.
corpus_variety_mapping = {
    "hindi-urdu": {
        # Hindi sub-corpora
        "shrutilipi_hindi": "hindi",
        "common_voice_hindi": "hindi",
        "musc2021_cs_hindi": "hindi",
        "musc2021_hindi": "hindi",
        # Urdu sub-corpora
        "common_voice_urdu": "urdu",
        "shrutilipi_urdu": "urdu",
    },
}
def build_g2p_command(
    corpus_dir: str,
    model_path: str,
    output_file: str,
    dictionary_path: str,
    mode_args: list,
    config_path: str,
) -> list:
    """Assemble the argument list for one ``g2p`` invocation of ``mfa_cli``.

    Args:
        corpus_dir: Corpus directory passed as the first positional argument.
        model_path: G2P model archive passed as the second positional argument.
        output_file: Destination dictionary file (third positional argument).
        dictionary_path: Value for ``--dictionary_path``.
        mode_args: Extra flags spliced in verbatim between the
            ``--dictionary_path`` pair and ``--num_pronunciations``.
        config_path: Candidate config file; ``--config_path`` is appended only
            when this path exists on disk.

    Returns:
        The command as a new list of strings.
    """
    command = [
        "g2p",
        corpus_dir,
        model_path,
        output_file,
        "--clean",
        "-j",
        "10",
        "--dictionary_path",
        dictionary_path,
        *mode_args,
        "--num_pronunciations",
        "1",
    ]
    if os.path.exists(config_path):
        command += ["--config_path", config_path]
    return command


def main() -> None:
    """Run ``g2p`` through ``mfa_cli`` for every entry in ``lang_codes``.

    Languages listed in ``corpus_variety_mapping`` get one run per sub-corpus,
    using that sub-corpus's dialect dictionary and model. All other languages
    get a single run with ``--evaluate``. Any run whose G2P model archive is
    missing from ``g2p_dir`` is skipped silently.
    """
    for lang in lang_codes:
        print(lang)
        lang_corpus_dir = os.path.join(training_root, lang)
        config_path = os.path.join(config_dir, lang + ".yaml")

        if lang in corpus_variety_mapping:
            for subcorpus, dialect in corpus_variety_mapping[lang].items():
                print(subcorpus, dialect)
                model_path = os.path.join(g2p_dir, f"{dialect}_mfa.zip")
                if not os.path.exists(model_path):
                    continue
                # NOTE(review): each sub-corpus run is pointed at the whole
                # language corpus directory; `subcorpus` only affects the
                # output file name. Confirm whether
                # os.path.join(lang_corpus_dir, subcorpus) was intended.
                command = build_g2p_command(
                    lang_corpus_dir,
                    model_path,
                    os.path.join(output_dir, f"{dialect}_{subcorpus}_mfa.dict"),
                    os.path.join(dictionary_dir, f"{dialect}_mfa.dict"),
                    ["--oov_count_threshold", "3", "--use_mp", "--no_use_postgres"],
                    config_path,
                )
                # standalone_mode=False keeps the CLI from exiting the
                # interpreter, so the loop can move on to the next run.
                mfa_cli(command, standalone_mode=False)
            # A bare, undefined `error` name used to sit in this branch and
            # aborted the script with NameError; it has been removed so the
            # remaining sub-corpora and languages are processed.
            continue

        model_path = os.path.join(g2p_dir, f"{lang}_mfa.zip")
        if not os.path.exists(model_path):
            continue
        command = build_g2p_command(
            lang_corpus_dir,
            model_path,
            os.path.join(output_dir, f"{lang}_mfa.dict"),
            os.path.join(dictionary_dir, f"{lang}_mfa.dict"),
            ["--use_mp", "--evaluate"],
            config_path,
        )
        mfa_cli(command, standalone_mode=False)


if __name__ == "__main__":
    main()