File size: 7,254 Bytes
2f6b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import os

from montreal_forced_aligner.command_line.mfa import mfa_cli

# Root folder for benchmark outputs; the driver below writes each run under
# <root_dir>/alignments/<condition>/<corpus>.
root_dir = r"D:\Data\experiments\alignment_benchmarking"

# Folders holding the dictionary/acoustic-model pairs for each model release.
mfa10_dir = r"D:\Data\models\1.0_archived"
mfa20_dir = r"D:\Data\models\2.0_archived"
mfa20a_dir = r"D:\Data\models\2.0.0a_archived"
mfa21_dir = r"D:\Data\models\2.1_trained"
mfa22_dir = r"D:\Data\models\2.2_trained"
mfa30_dir = r"D:\Data\models\3.0_trained"
mfa31_dir = r"D:\Data\models\3.1_trained"

# Destination folder for models produced by the "adapt" step.
adapted_dir = r"D:\Data\models\adapted"

# "buckeye" subfolders of the 2.2 and 3.0 model folders, used by the
# "trained_*" conditions.
trained22_dir = r"D:\Data\models\2.2_trained\buckeye"
trained30_dir = r"D:\Data\models\3.0_trained\buckeye"

# The phone-mapping YAML files live in a "mapping_files" folder that sits next
# to the folder containing this script (i.e. one level above the script folder).
_this_script = os.path.abspath(__file__)
_script_parent = os.path.dirname(os.path.dirname(_this_script))
mapping_directory = os.path.join(_script_parent, "mapping_files")

# One row per benchmark corpus: (name, corpus folder, reference folder).
# Keeping both paths in one row guarantees the two lookup tables below always
# share exactly the same keys, in the same order.
_corpus_layout = (
    (
        "timit",
        r"D:\Data\speech\benchmark_datasets\timit\benchmark",
        r"D:\Data\speech\benchmark_datasets\timit\reference",
    ),
    (
        "buckeye",
        r"D:\Data\speech\Buckeye\buckeye_corpus_benchmark",
        r"D:\Data\speech\Buckeye\buckeye_reference_alignments",
    ),
)

# Corpus name -> folder passed to MFA as the corpus to adapt on / align.
corpus_directories = {name: corpus_path for name, corpus_path, _ in _corpus_layout}

# Corpus name -> folder passed to MFA via --reference_directory.
reference_directories = {name: reference_path for name, _, reference_path in _corpus_layout}

# (dictionary file name, acoustic model file name) for each model family.
_ARPA_10_FILES = ("english.dict", "english.zip")
_ARPA_FILES = ("english_us_arpa.dict", "english_us_arpa.zip")
_MFA_FILES = ("english_us_mfa.dict", "english_mfa.zip")

# One row per benchmark condition: (condition name, model folder, file names).
# Row order is the order in which the driver runs the conditions.
# The suffixes "_adapted" and "_finetune" do not change the files used here;
# the driver below reacts to them by substring match on the condition name.
_condition_sources = (
    ("arpa_1.0", mfa10_dir, _ARPA_10_FILES),
    ("arpa_1.0_finetune", mfa10_dir, _ARPA_10_FILES),
    ("arpa_1.0_adapted", mfa10_dir, _ARPA_10_FILES),
    ("arpa_1.0_adapted_finetune", mfa10_dir, _ARPA_10_FILES),
    ("arpa_2.0", mfa20_dir, _ARPA_FILES),
    ("arpa_2.0a", mfa20a_dir, _ARPA_FILES),
    ("mfa_2.0", mfa20_dir, _MFA_FILES),
    ("mfa_2.0a", mfa20a_dir, _MFA_FILES),
    ("mfa_2.1", mfa21_dir, _MFA_FILES),
    ("mfa_2.2", mfa22_dir, _MFA_FILES),
    ("mfa_3.0", mfa30_dir, _MFA_FILES),
    ("mfa_3.0_finetune", mfa30_dir, _MFA_FILES),
    ("mfa_3.1", mfa31_dir, _MFA_FILES),
    ("mfa_3.1_finetune", mfa31_dir, _MFA_FILES),
    ("mfa_3.1_adapted", mfa31_dir, _MFA_FILES),
    ("mfa_3.1_adapted_finetune", mfa31_dir, _MFA_FILES),
    ("arpa_3.0", mfa30_dir, _ARPA_FILES),
    ("arpa_3.0_finetune", mfa30_dir, _ARPA_FILES),
    ("arpa_3.0_adapted", mfa30_dir, _ARPA_FILES),
    ("arpa_3.0_adapted_finetune", mfa30_dir, _ARPA_FILES),
    ("trained_2.2", trained22_dir, _MFA_FILES),
    ("trained_3.0", trained30_dir, _MFA_FILES),
    # NOTE(review): the "arpa_2.2*" conditions read from mfa20a_dir, not
    # mfa22_dir -- presumably intentional; confirm.
    ("arpa_2.2", mfa20a_dir, _ARPA_FILES),
    ("arpa_2.2_adapted", mfa20a_dir, _ARPA_FILES),
    ("arpa_2.2_adapted_finetune", mfa20a_dir, _ARPA_FILES),
)

# Condition name -> (dictionary path, acoustic model path).
conditions = {
    name: (
        os.path.join(model_dir, dictionary_file),
        os.path.join(model_dir, model_file),
    )
    for name, model_dir, (dictionary_file, model_file) in _condition_sources
}
# (condition name, corpus name) -> phone-mapping YAML used when evaluating that
# pair. Conditions whose name contains "arpa" use the "arpa" mapping for the
# corpus; every other condition uses the "mfa" mapping.
mapping_files = {
    (condition_name, corpus_name): os.path.join(
        mapping_directory,
        ("arpa" if "arpa" in condition_name else "mfa") + f"_{corpus_name}_mapping.yaml",
    )
    for condition_name in conditions
    for corpus_name in corpus_directories
}

if __name__ == "__main__":
    # Benchmark driver: for every (condition, corpus) pair that has no output
    # directory yet, optionally adapt the condition's acoustic model to the
    # corpus, then align the corpus and evaluate against its reference
    # alignments.
    for condition, (dictionary_path, model_path) in conditions.items():
        print(condition)
        for corpus, root in corpus_directories.items():
            output_directory = os.path.join(root_dir, "alignments", condition, corpus)
            # An existing output directory means this pair was already run.
            if os.path.exists(output_directory):
                continue
            # Conditions whose base model or dictionary is not available are
            # skipped silently.
            if not os.path.exists(model_path):
                continue
            if not os.path.exists(dictionary_path):
                continue

            # Model used to align THIS corpus. It is a fresh local on every
            # iteration: rebinding `model_path` itself would leak the adapted
            # model of one corpus into the next corpus of the same condition.
            align_model_path = model_path
            if "adapt" in condition:
                os.makedirs(adapted_dir, exist_ok=True)
                # Adaptation is corpus-specific, so the cached model name
                # carries the corpus. "_finetune" is stripped so that
                # "<x>_adapted" and "<x>_adapted_finetune" share one adapted
                # model per corpus. Models cached under the older
                # "<condition>.zip" naming are not reused.
                adapted_name = f"{condition.replace('_finetune', '')}_{corpus}.zip"
                align_model_path = os.path.join(adapted_dir, adapted_name)
                if not os.path.exists(align_model_path):
                    command = [
                        "adapt",
                        root,
                        str(dictionary_path),
                        str(model_path),
                        str(align_model_path),
                        "-j",
                        "10",
                        "--clean",
                        "--no_debug",
                        "--use_mp",
                        "--use_cutoff_model",
                        "--use_postgres",
                        "--beam",
                        "10",
                        "--retry_beam",
                        "40",
                    ]
                    print(command)
                    mfa_cli(command, standalone_mode=False)

            command = [
                "align",
                root,
                dictionary_path,
                align_model_path,
                output_directory,
                "-j",
                "10",
                "--clean",
                "--no_debug",
                "--use_mp",
                "--use_cutoff_model",
                "--use_postgres",
                "--cleanup_textgrids",
                "--reference_directory",
                reference_directories[corpus],
                "--custom_mapping_path",
                mapping_files[(condition, corpus)],
                "--beam",
                "10",
                "--retry_beam",
                "40",
            ]
            # Conditions named "*finetune*" additionally pass --fine_tune.
            if "finetune" in condition:
                command += ["--fine_tune"]
            print(command)
            mfa_cli(command, standalone_mode=False)