|
|
import os.path |
|
|
import re |
|
|
import sys |
|
|
|
|
|
from montreal_forced_aligner.command_line.mfa import mfa_cli |
|
|
from montreal_forced_aligner.config import TEMPORARY_DIRECTORY |
|
|
|
|
|
# Version string; not referenced in the visible part of this script.
MODEL_VERSION = "3.0.0"

# The repository root sits three directory levels above this file.
root_dir = os.path.dirname(
    os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))
    )
)

# Source pronunciation dictionaries, and the sub-folder that receives the
# dictionaries written by this script (created on import if missing).
dictionary_dir = os.path.join(root_dir, "dictionary", "training")
output_dir = os.path.join(dictionary_dir, "g2pped")
os.makedirs(output_dir, exist_ok=True)

# G2P model archives and per-language YAML configuration files.
g2p_dir = os.path.join(root_dir, "g2p", "staging")
config_dir = os.path.join(root_dir, "config", "acoustic")

# Temporary directory as configured by montreal_forced_aligner.
temp_dir = TEMPORARY_DIRECTORY
|
|
|
|
|
# Root folder of the training corpora: a Windows drive path on win32,
# otherwise the same drive addressed through /mnt/d.
training_root = (
    "D:/Data/speech/model_training_corpora"
    if sys.platform == "win32"
    else "/mnt/d/Data/speech/model_training_corpora"
)
|
|
|
|
|
|
|
|
# Full list of language / variety codes.
# NOTE: `lang_codes` is rebound to a shorter list further down in this file,
# so this list is not the one the main loop ends up iterating.
lang_codes = [
    "czech", "russian", "french", "german",
    "portuguese_brazil", "portuguese_portugal",
    "spanish_spain", "spanish_latin_america",
    "swedish", "thai", "turkish",
    "english_us", "english_us_arpa", "english_uk", "english_nigeria",
    "korean_jamo", "korean",
    "hausa", "swahili",
    "vietnamese_hanoi", "vietnamese_hue", "vietnamese_ho_chi_minh_city",
    "ukrainian", "polish", "croatian", "bulgarian",
    "japanese", "japanese_katakana",
    "tamil", "hindi", "urdu",
]
|
|
|
|
|
# Languages actually processed by the main loop; this assignment replaces
# any value bound to `lang_codes` earlier in the file.
lang_codes = ["hindi-urdu"]
|
|
|
|
|
# Maps a language code to {sub-corpus name: dialect}. The dialect selects the
# dictionary and G2P model file names used for that sub-corpus in the main
# loop. Key order is kept: Hindi sub-corpora first, then Urdu.
corpus_variety_mapping = {
    "hindi-urdu": {
        **dict.fromkeys(
            (
                "shrutilipi_hindi",
                "common_voice_hindi",
                "musc2021_cs_hindi",
                "musc2021_hindi",
            ),
            "hindi",
        ),
        **dict.fromkeys(
            (
                "common_voice_urdu",
                "shrutilipi_urdu",
            ),
            "urdu",
        ),
    }
}
|
|
|
|
|
def _build_g2p_command(
    corpus_dir,
    model_path,
    output_file,
    dictionary_path,
    extra_args=(),
    config_path=None,
):
    """Assemble the argument list for one ``g2p`` invocation of ``mfa_cli``.

    Args:
        corpus_dir: Corpus directory passed as the first positional argument.
        model_path: Path of the G2P model archive.
        output_file: Path of the dictionary file to write.
        dictionary_path: Value for ``--dictionary_path``.
        extra_args: Extra option tokens placed between ``--dictionary_path``
            and ``--num_pronunciations``.
        config_path: Optional YAML config; ``--config_path`` is appended only
            when this path exists on disk.

    Returns:
        The command as a list of strings.
    """
    command = [
        "g2p",
        corpus_dir,
        model_path,
        output_file,
        "--clean",
        "-j",
        "10",
        "--dictionary_path",
        dictionary_path,
        *extra_args,
        "--num_pronunciations",
        "1",
    ]
    if config_path is not None and os.path.exists(config_path):
        command += ["--config_path", config_path]
    return command


def main():
    """Run the ``g2p`` command for every language in ``lang_codes``.

    Languages listed in ``corpus_variety_mapping`` get one run per
    (sub-corpus, dialect) pair; every other language gets a single run with
    ``--evaluate``. A run is skipped when its G2P model archive is missing.
    """
    for lang in lang_codes:
        print(lang)
        lang_corpus_dir = os.path.join(training_root, lang)
        config_path = os.path.join(config_dir, lang + ".yaml")

        if lang in corpus_variety_mapping:
            for subcorpus, dialect in corpus_variety_mapping[lang].items():
                print(subcorpus, dialect)
                model_path = os.path.join(g2p_dir, f"{dialect}_mfa.zip")
                if not os.path.exists(model_path):
                    continue
                # NOTE(review): every sub-corpus run is pointed at the whole
                # language directory; `subcorpus` only affects the output file
                # name. Confirm whether the sub-corpus folder was intended.
                command = _build_g2p_command(
                    lang_corpus_dir,
                    model_path,
                    os.path.join(output_dir, f"{dialect}_{subcorpus}_mfa.dict"),
                    os.path.join(dictionary_dir, f"{dialect}_mfa.dict"),
                    extra_args=[
                        "--oov_count_threshold",
                        "3",
                        "--use_mp",
                        "--no_use_postgres",
                    ],
                    config_path=config_path,
                )
                # A stray bare `error` expression used to follow this call and
                # raised NameError after the first sub-corpus; it is removed
                # so that all sub-corpora are processed.
                mfa_cli(command, standalone_mode=False)
        else:
            model_path = os.path.join(g2p_dir, f"{lang}_mfa.zip")
            if not os.path.exists(model_path):
                continue
            command = _build_g2p_command(
                lang_corpus_dir,
                model_path,
                os.path.join(output_dir, f"{lang}_mfa.dict"),
                os.path.join(dictionary_dir, f"{lang}_mfa.dict"),
                extra_args=["--use_mp", "--evaluate"],
                config_path=config_path,
            )
            mfa_cli(command, standalone_mode=False)


if __name__ == "__main__":
    main()
|
|
|