"""Clean the staged pronunciation dictionaries in place.

For every eligible ``.dict`` file in the training directory this script:

1. loads the per-language filter list (if one exists) of words to drop,
2. runs ``mfa model inspect dictionary`` on the file and aborts if the
   command raised,
3. removes blank lines, duplicate lines and entries whose word is in the
   filter list, and
4. rewrites the file (sorted) only when that actually removed something.
"""
import os
import hangul_jamo
import click.testing
from montreal_forced_aligner.command_line.mfa import mfa_cli

# Directory layout is resolved relative to the parent of this script's folder.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
dictionary_dir = os.path.join(root_dir, 'dictionary')
skip_words_dir = os.path.join(dictionary_dir, 'filter_lists')
staging_dir = os.path.join(dictionary_dir, 'training')

if __name__ == '__main__':
    for fn in os.listdir(staging_dir):
        print(fn)

        # Only plain dictionaries are processed; the "_cv" and "prosodylab"
        # variants are left untouched.
        if not fn.endswith('.dict') or fn.endswith(("_cv.dict", "prosodylab.dict")):
            continue

        # The language is the part of the file name before the first underscore.
        language = fn.split('_')[0]
        if language not in {'japanese'}:
            continue

        # Collect the lower-cased words that must be removed for this language.
        skip_word_set = set()
        filter_path = os.path.join(skip_words_dir, language + '.txt')
        if os.path.exists(filter_path):
            # Jamo-based Korean dictionaries need the filter words composed
            # before comparison (only reachable once 'korean' is enabled above).
            compose_jamo = language == 'korean' and 'jamo' in fn
            with open(filter_path, 'r', encoding='utf8') as filter_file:
                for raw in filter_file:
                    entry = raw.strip()
                    if not entry:
                        continue
                    if compose_jamo:
                        entry = hangul_jamo.compose(entry)
                    skip_word_set.add(entry.lower())
        print(skip_word_set)

        # Run the MFA dictionary inspection and surface its output; any
        # exception captured by the runner is re-raised to stop the script.
        dict_path = os.path.join(staging_dir, fn)
        runner = click.testing.CliRunner(mix_stderr=False, echo_stdin=True)
        result = runner.invoke(
            mfa_cli,
            ['model', 'inspect', 'dictionary', dict_path],
            catch_exceptions=True,
        )
        print(result.stdout)
        print(result.stderr)
        if result.exception:
            print(result.exc_info)
            raise result.exception

        # Gather the unique, non-blank entries whose word (first tab-separated
        # field, lower-cased) is not filtered. line_counter ends up as the
        # number of physical lines read, blank ones included.
        line_set = set()
        line_counter = 0
        with open(dict_path, 'r', encoding='utf8') as dict_file:
            for line_counter, raw in enumerate(dict_file, start=1):
                entry = raw.strip()
                if entry and entry.split('\t')[0].lower() not in skip_word_set:
                    line_set.add(entry)

        # Rewrite only when something was dropped, so clean files stay as-is.
        if line_counter != len(line_set):
            with open(dict_path, 'w', encoding='utf8') as dict_file:
                dict_file.writelines(entry + '\n' for entry in sorted(line_set))
            print(f"Reduced {fn} from {line_counter} to {len(line_set)}")