# MFA
#
# File size: 4,142 Bytes
#
# 2f6b10b

import os.path
import re
import sys

from montreal_forced_aligner.command_line.mfa import mfa_cli
from montreal_forced_aligner.config import TEMPORARY_DIRECTORY

MODEL_VERSION = "3.0.0"

# Walk three directory levels up from this script's absolute path; the
# result is treated as the root under which all folders below are resolved.
root_dir = os.path.abspath(__file__)
for _level in range(3):
    root_dir = os.path.dirname(root_dir)

# Input locations: base dictionaries, staged G2P models, per-language configs.
dictionary_dir = os.path.join(root_dir, "dictionary", "training")
g2p_dir = os.path.join(root_dir, "g2p", "staging")
config_dir = os.path.join(root_dir, "config", "acoustic")

# Generated dictionaries are written to a "g2pped" folder inside the
# training dictionary directory.
output_dir = os.path.join(dictionary_dir, "g2pped")

temp_dir = TEMPORARY_DIRECTORY

# Make sure the output folder exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# Root folder of the training corpora. The location is hard-coded per
# platform: a drive-letter path on Windows, a /mnt/d path everywhere else
# (presumably the same drive mounted under Linux/WSL -- confirm).
training_root = (
    "D:/Data/speech/model_training_corpora"
    if sys.platform == "win32"
    else "/mnt/d/Data/speech/model_training_corpora"
)


# Inventory of language/variety codes. Each code is used further down as a
# sub-folder name under `training_root` and as the stem of the
# `<code>_mfa.dict` / `<code>_mfa.zip` / `<code>.yaml` file names.
#
# NOTE: this list is rebound by the second `lang_codes` assignment below,
# so as written it has no effect at runtime; it only documents the full set.
lang_codes = [
    "czech",
    "russian",
    "french",
    "german",
    "portuguese_brazil",
    "portuguese_portugal",
    "spanish_spain",
    "spanish_latin_america",
    "swedish",
    "thai",
    "turkish",
    "english_us",
    "english_us_arpa",
    "english_uk",
    "english_nigeria",
    "korean_jamo",
    "korean",
    "hausa",
    "swahili",
    "vietnamese_hanoi",
    "vietnamese_hue",
    "vietnamese_ho_chi_minh_city",
    "ukrainian",
    "polish",
    "croatian",
    "bulgarian",
    "japanese",
    "japanese_katakana",
    # Currently disabled: 'mandarin_china', 'mandarin_erhua', 'mandarin_taiwan'
    "tamil",
    "hindi",
    "urdu",
]

# Active selection: this assignment replaces the list above, so only the
# codes listed here are processed when the script runs.
lang_codes = [
    "hindi-urdu",
    # Currently disabled: 'vietnamese_hanoi', 'vietnamese_hue', 'vietnamese_ho_chi_minh_city'
]

# For language codes whose corpus folder bundles several varieties: maps
# each sub-corpus name to the dialect whose dictionary and G2P model are
# used for it. Entries are grouped by dialect; insertion order is hindi
# sub-corpora first, then urdu.
corpus_variety_mapping = {
    "hindi-urdu": {
        **dict.fromkeys(
            (
                "shrutilipi_hindi",
                "common_voice_hindi",
                "musc2021_cs_hindi",
                "musc2021_hindi",
            ),
            "hindi",
        ),
        **dict.fromkeys(
            (
                "common_voice_urdu",
                "shrutilipi_urdu",
            ),
            "urdu",
        ),
    }
}

def build_g2p_command(
    corpus_dir, model_path, output_file, dictionary_path, extra_args=(), config_path=None
):
    """Assemble the argument list for one ``g2p`` invocation of ``mfa_cli``.

    The list always starts with the ``g2p`` subcommand, the three positional
    paths, ``--clean``, ``-j 10`` and ``--dictionary_path``; it always ends
    with ``--num_pronunciations 1``. Caller-specific flags are spliced in
    between so the argument order of each call site stays exactly as before.

    Args:
        corpus_dir: Input path handed to ``g2p`` (first positional argument).
        model_path: Path of the G2P model archive (second positional argument).
        output_file: Path of the dictionary to write (third positional argument).
        dictionary_path: Value passed to ``--dictionary_path``.
        extra_args: Additional CLI tokens inserted verbatim after
            ``--dictionary_path`` and before ``--num_pronunciations``.
        config_path: Optional config file; ``--config_path`` is appended only
            when this is given and the file exists on disk.

    Returns:
        A new list of command-line tokens.
    """
    command = [
        "g2p",
        corpus_dir,
        model_path,
        output_file,
        "--clean",
        "-j",
        "10",
        "--dictionary_path",
        dictionary_path,
        *extra_args,
        "--num_pronunciations",
        "1",
    ]
    # A missing config file is not an error: the run simply goes ahead
    # without a --config_path argument.
    if config_path is not None and os.path.exists(config_path):
        command += ["--config_path", config_path]
    return command


if __name__ == "__main__":
    for lang in lang_codes:
        print(lang)
        lang_corpus_dir = os.path.join(training_root, lang)
        # One config file per language code, shared by all of its sub-corpora.
        config_path = os.path.join(config_dir, lang + ".yaml")

        if lang in corpus_variety_mapping:
            # Multi-variety language: one G2P run per sub-corpus, using the
            # dictionary and model of the dialect that sub-corpus maps to.
            for subcorpus, dialect in corpus_variety_mapping[lang].items():
                print(subcorpus, dialect)
                model_path = os.path.join(g2p_dir, f"{dialect}_mfa.zip")
                if not os.path.exists(model_path):
                    # No staged model for this dialect; skip the sub-corpus.
                    continue
                # NOTE(review): every sub-corpus run is given the whole
                # language folder as input; `subcorpus` only affects the
                # output file name. If the sub-corpora live in sub-folders
                # the input path probably should include it -- confirm.
                command = build_g2p_command(
                    lang_corpus_dir,
                    model_path,
                    os.path.join(output_dir, f"{dialect}_{subcorpus}_mfa.dict"),
                    os.path.join(dictionary_dir, f"{dialect}_mfa.dict"),
                    extra_args=(
                        "--oov_count_threshold",
                        "3",
                        "--use_mp",
                        "--no_use_postgres",
                    ),
                    config_path=config_path,
                )
                mfa_cli(command, standalone_mode=False)
                # A stray bare `error` name used to follow the call above; it
                # raised NameError after the first sub-corpus and aborted the
                # script, so the remaining sub-corpora were never processed.
            continue

        # Single-variety language: one run over the language's corpus folder.
        model_path = os.path.join(g2p_dir, f"{lang}_mfa.zip")
        if not os.path.exists(model_path):
            # No staged model for this language; move on to the next one.
            continue
        command = build_g2p_command(
            lang_corpus_dir,
            model_path,
            os.path.join(output_dir, f"{lang}_mfa.dict"),
            os.path.join(dictionary_dir, f"{lang}_mfa.dict"),
            extra_args=("--use_mp", "--evaluate"),
            config_path=config_path,
        )
        mfa_cli(command, standalone_mode=False)