File size: 4,142 Bytes
2f6b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os.path
import re
import sys

from montreal_forced_aligner.command_line.mfa import mfa_cli
from montreal_forced_aligner.config import TEMPORARY_DIRECTORY

# Version string of the model release; not referenced by the rest of this script.
MODEL_VERSION = "3.0.0"

# Climb three directory levels from this script's absolute path to reach the
# root that contains the "dictionary", "g2p" and "config" folders used below.
root_dir = os.path.abspath(__file__)
root_dir = os.path.dirname(root_dir)
root_dir = os.path.dirname(root_dir)
root_dir = os.path.dirname(root_dir)

# Input locations: base pronunciation dictionaries, G2P model archives and
# optional per-language YAML configuration files.
dictionary_dir = os.path.join(root_dir, "dictionary", "training")
g2p_dir = os.path.join(root_dir, "g2p", "staging")
config_dir = os.path.join(root_dir, "config", "acoustic")

# Generated dictionaries are written to a "g2pped" folder beside the base
# dictionaries; the folder is created at import time if it is missing.
output_dir = os.path.join(dictionary_dir, "g2pped")
temp_dir = TEMPORARY_DIRECTORY
os.makedirs(output_dir, exist_ok=True)

# Hard-coded, machine-specific corpus root: a drive path on Windows, otherwise
# the same drive under /mnt/d (presumably a WSL mount -- confirm).
training_root = (
    "D:/Data/speech/model_training_corpora"
    if sys.platform == "win32"
    else "/mnt/d/Data/speech/model_training_corpora"
)


# Full list of language codes. It used to be assigned to ``lang_codes`` and was
# then immediately overwritten, so it never had any effect; it now lives under
# its own name so the inventory is kept without relying on a dead assignment.
all_lang_codes = [
    "czech",
    "russian",
    "french",
    "german",
    "portuguese_brazil",
    "portuguese_portugal",
    "spanish_spain",
    "spanish_latin_america",
    "swedish",
    "thai",
    "turkish",
    "english_us",
    "english_us_arpa",
    "english_uk",
    "english_nigeria",
    "korean_jamo",
    "korean",
    "hausa",
    "swahili",
    "vietnamese_hanoi",
    "vietnamese_hue",
    "vietnamese_ho_chi_minh_city",
    "ukrainian",
    "polish",
    "croatian",
    "bulgarian",
    "japanese",
    "japanese_katakana",
    # Not included: "mandarin_china", "mandarin_erhua", "mandarin_taiwan"
    "tamil",
    "hindi",
    "urdu",
]

# Language codes actually processed when the script runs. To process the whole
# inventory instead, use ``lang_codes = list(all_lang_codes)``.
lang_codes = [
    "hindi-urdu",
    # Other selections used previously:
    # "vietnamese_hanoi", "vietnamese_hue", "vietnamese_ho_chi_minh_city",
]

# For language codes whose corpus directory combines several varieties, maps
# each subcorpus name to the variety whose "<variety>_mfa" dictionary and G2P
# model should be used for it. Grouped per variety; the merge keeps the Hindi
# corpora first, then the Urdu ones.
corpus_variety_mapping = {
    "hindi-urdu": {
        **dict.fromkeys(
            (
                "shrutilipi_hindi",
                "common_voice_hindi",
                "musc2021_cs_hindi",
                "musc2021_hindi",
            ),
            "hindi",
        ),
        **dict.fromkeys(
            (
                "common_voice_urdu",
                "shrutilipi_urdu",
            ),
            "urdu",
        ),
    }
}

def build_g2p_command(
    corpus_dir, model_path, output_file, dictionary_path, config_path, extra_args=()
):
    """Assemble the argument list for one ``mfa g2p`` invocation.

    Args:
        corpus_dir: Corpus directory passed to the command.
        model_path: Path of the G2P model archive.
        output_file: Path the generated dictionary is written to.
        dictionary_path: Existing dictionary passed via ``--dictionary_path``.
        config_path: Candidate YAML config; ``--config_path`` is appended only
            when this path exists on disk.
        extra_args: Additional flags inserted between the dictionary option
            and ``--num_pronunciations`` (this keeps the flag order each call
            site used before the command was factored out).

    Returns:
        The command as a list of strings, ready to hand to ``mfa_cli``.
    """
    command = [
        "g2p",
        corpus_dir,
        model_path,
        output_file,
        "--clean",
        "-j",
        "10",
        "--dictionary_path",
        dictionary_path,
        *extra_args,
        "--num_pronunciations",
        "1",
    ]
    if os.path.exists(config_path):
        command += ["--config_path", config_path]
    return command


if __name__ == "__main__":
    for lang in lang_codes:
        print(lang)
        lang_corpus_dir = os.path.join(training_root, lang)
        config_path = os.path.join(config_dir, lang + ".yaml")
        if lang in corpus_variety_mapping:
            # Combined corpus: one G2P run per subcorpus, using the dictionary
            # and model of the variety that subcorpus is mapped to.
            for subcorpus, dialect in corpus_variety_mapping[lang].items():
                print(subcorpus, dialect)
                model_path = os.path.join(g2p_dir, f"{dialect}_mfa.zip")
                if not os.path.exists(model_path):
                    # No staged model for this variety; nothing to run.
                    continue
                # NOTE(review): every subcorpus is run against the whole
                # ``lang_corpus_dir``; ``subcorpus`` only affects the output
                # file name. Confirm whether the subcorpus folder was intended.
                command = build_g2p_command(
                    lang_corpus_dir,
                    model_path,
                    os.path.join(output_dir, f"{dialect}_{subcorpus}_mfa.dict"),
                    os.path.join(dictionary_dir, f"{dialect}_mfa.dict"),
                    config_path,
                    extra_args=(
                        "--oov_count_threshold",
                        "3",
                        "--use_mp",
                        "--no_use_postgres",
                    ),
                )
                mfa_cli(command, standalone_mode=False)
                # A stray bare ``error`` name used to follow this call; it
                # raised NameError after the first subcorpus and prevented the
                # remaining ones from being processed, so it was removed.
        else:
            model_path = os.path.join(g2p_dir, f"{lang}_mfa.zip")
            if not os.path.exists(model_path):
                continue
            command = build_g2p_command(
                lang_corpus_dir,
                model_path,
                os.path.join(output_dir, f"{lang}_mfa.dict"),
                os.path.join(dictionary_dir, f"{lang}_mfa.dict"),
                config_path,
                extra_args=("--use_mp", "--evaluate"),
            )
            mfa_cli(command, standalone_mode=False)