File size: 2,432 Bytes
2f6b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import hangul_jamo
import click.testing
from montreal_forced_aligner.command_line.mfa import mfa_cli

# Directory layout, resolved relative to this script's own location:
#   <root_dir>/dictionary/filter_lists  -> '<language>.txt' skip-word lists
#   <root_dir>/dictionary/training      -> '.dict' files processed below
_script_dir = os.path.dirname(os.path.abspath(__file__))
# The repository root is the parent of the directory holding this script.
root_dir = os.path.dirname(_script_dir)
dictionary_dir = os.path.join(root_dir, 'dictionary')
skip_words_dir, staging_dir = (
    os.path.join(dictionary_dir, sub_dir)
    for sub_dir in ('filter_lists', 'training')
)


if __name__ == '__main__':
    # For every eligible dictionary in the staging directory: run MFA's
    # `model inspect dictionary` command on it (aborting if that fails), then
    # drop skip-listed words and duplicate lines. The file is rewritten, in
    # sorted order, only when the number of lines read differs from the number
    # of unique lines kept.
    target_languages = {'japanese'}
    excluded_suffixes = ("_cv.dict", "prosodylab.dict")

    for dict_name in os.listdir(staging_dir):
        print(dict_name)
        # Only '.dict' files are handled, and the '_cv' / 'prosodylab'
        # variants are left alone.
        if not dict_name.endswith('.dict') or dict_name.endswith(excluded_suffixes):
            continue
        # The language is the part of the file name before the first '_'.
        language = dict_name.split('_')[0]
        if language not in target_languages:
            continue

        # Load the optional per-language skip list (one word per line).
        skip_words_path = os.path.join(skip_words_dir, language + '.txt')
        skip_word_set = set()
        if os.path.exists(skip_words_path):
            # Skip-list entries for Korean jamo dictionaries are passed
            # through hangul_jamo.compose before being lower-cased.
            needs_compose = language == 'korean' and 'jamo' in dict_name
            with open(skip_words_path, 'r', encoding='utf8') as skip_file:
                stripped_entries = (raw.strip() for raw in skip_file)
                skip_word_set = {
                    (hangul_jamo.compose(entry) if needs_compose else entry).lower()
                    for entry in stripped_entries
                    if entry
                }
        print(skip_word_set)

        # Inspect the dictionary through the MFA command line interface and
        # echo whatever it reported; any captured exception is re-raised.
        dict_path = os.path.join(staging_dir, dict_name)
        runner = click.testing.CliRunner(mix_stderr=False, echo_stdin=True)
        result = runner.invoke(
            mfa_cli,
            ['model', 'inspect', 'dictionary', dict_path],
            catch_exceptions=True,
        )
        print(result.stdout)
        print(result.stderr)
        if result.exception:
            print(result.exc_info)
            raise result.exception

        # Collect the unique, non-empty lines whose word (the first
        # tab-separated field, lower-cased) is not in the skip list, while
        # counting every line read, blank ones included.
        kept_lines = set()
        total_lines = 0
        with open(dict_path, 'r', encoding='utf8') as dict_file:
            for total_lines, raw in enumerate(dict_file, start=1):
                entry = raw.strip()
                if entry and entry.split('\t')[0].lower() not in skip_word_set:
                    kept_lines.add(entry)

        # Nothing was dropped, so leave the file untouched.
        if total_lines == len(kept_lines):
            continue

        with open(dict_path, 'w', encoding='utf8') as out_file:
            out_file.writelines(entry + '\n' for entry in sorted(kept_lines))
        print(f"Reduced {dict_name} from {total_lines} to {len(kept_lines)}")