niobures
/

MFA

Model card Files Files and versions

MFA / scripts /training_models /generate_g2p.py

niobures's picture

MFA

2f6b10b verified 3 months ago

history blame contribute delete

4.14 kB

	import os.path
	import re
	import sys

	from montreal_forced_aligner.command_line.mfa import mfa_cli
	from montreal_forced_aligner.config import TEMPORARY_DIRECTORY

	# Version label for the models; not referenced in the visible part of this script.
	MODEL_VERSION = "3.0.0"

	# Repository root: three directory levels above this script's absolute path.
	root_dir = os.path.abspath(__file__)
	for _level in range(3):
	    root_dir = os.path.dirname(root_dir)

	# Input locations, all relative to the repository root.
	dictionary_dir = os.path.join(root_dir, "dictionary", "training")
	g2p_dir = os.path.join(root_dir, "g2p", "staging")
	config_dir = os.path.join(root_dir, "config", "acoustic")

	# Generated dictionaries go into a "g2pped" folder beside the training
	# dictionaries; the folder is created on import if it does not exist yet.
	output_dir = os.path.join(dictionary_dir, "g2pped")
	os.makedirs(output_dir, exist_ok=True)

	temp_dir = TEMPORARY_DIRECTORY

	# Machine-specific corpus location: a drive-letter path on Windows, otherwise
	# a /mnt/d path (presumably the same drive seen through a WSL-style mount).
	training_root = (
	    "D:/Data/speech/model_training_corpora"
	    if sys.platform == "win32"
	    else "/mnt/d/Data/speech/model_training_corpora"
	)


	# Full inventory of language codes this script knows about.  It used to be
	# assigned to ``lang_codes`` and then immediately overwritten, which made it
	# dead code; it is kept under its own name so the active selection below is
	# explicit and the inventory stays available for reference.
	all_lang_codes = [
	    "czech",
	    "russian",
	    "french",
	    "german",
	    "portuguese_brazil",
	    "portuguese_portugal",
	    "spanish_spain",
	    "spanish_latin_america",
	    "swedish",
	    "thai",
	    "turkish",
	    "english_us",
	    "english_us_arpa",
	    "english_uk",
	    "english_nigeria",
	    "korean_jamo",
	    "korean",
	    "hausa",
	    "swahili",
	    "vietnamese_hanoi",
	    "vietnamese_hue",
	    "vietnamese_ho_chi_minh_city",
	    "ukrainian",
	    "polish",
	    "croatian",
	    "bulgarian",
	    "japanese",
	    "japanese_katakana",
	    # Disabled in the original list:
	    #'mandarin_china', 'mandarin_erhua', 'mandarin_taiwan'
	    "tamil",
	    "hindi",
	    "urdu",
	]

	# Languages actually processed when the script runs.  Edit this list (or set
	# it to ``all_lang_codes``) to change what gets generated.
	lang_codes = [
	    "hindi-urdu",
	    # Disabled in the original selection:
	    #'vietnamese_hanoi', 'vietnamese_hue', 'vietnamese_ho_chi_minh_city',
	]

	# Sub-corpora of the combined "hindi-urdu" corpus, each paired with the
	# dialect whose dictionary/G2P model names are used for it.  Insertion order
	# is the order in which the sub-corpora are processed.
	_hindi_urdu_subcorpora = {}
	_hindi_urdu_subcorpora["shrutilipi_hindi"] = "hindi"
	_hindi_urdu_subcorpora["common_voice_hindi"] = "hindi"
	_hindi_urdu_subcorpora["musc2021_cs_hindi"] = "hindi"
	_hindi_urdu_subcorpora["musc2021_hindi"] = "hindi"
	_hindi_urdu_subcorpora["common_voice_urdu"] = "urdu"
	_hindi_urdu_subcorpora["shrutilipi_urdu"] = "urdu"

	# Language code -> {sub-corpus name -> dialect}.  Languages absent from this
	# mapping are processed as a single corpus.
	corpus_variety_mapping = {"hindi-urdu": _hindi_urdu_subcorpora}

	def build_g2p_command(
	    corpus_dir, model_path, output_file, dictionary_path, extra_flags, config_path
	):
	    """Build the argument list for one ``mfa g2p`` invocation.

	    Args:
	        corpus_dir: Corpus directory passed as the first positional argument.
	        model_path: Path of the G2P model archive.
	        output_file: Path of the dictionary file to be written.
	        dictionary_path: Value passed with ``--dictionary_path``.
	        extra_flags: Branch-specific arguments, inserted between the
	            dictionary path and ``--num_pronunciations``.
	        config_path: Config file path; ``--config_path`` is appended only
	            when this file exists.

	    Returns:
	        A new list of command-line arguments.
	    """
	    command = [
	        "g2p",
	        corpus_dir,
	        model_path,
	        output_file,
	        "--clean",
	        "-j",
	        "10",
	        "--dictionary_path",
	        dictionary_path,
	        *extra_flags,
	        "--num_pronunciations",
	        "1",
	    ]
	    if os.path.exists(config_path):
	        command += ["--config_path", config_path]
	    return command


	if __name__ == "__main__":
	    # Arguments that differ between the two processing paths.
	    variety_flags = ["--oov_count_threshold", "3", "--use_mp", "--no_use_postgres"]
	    single_corpus_flags = ["--use_mp", "--evaluate"]

	    for lang in lang_codes:
	        print(lang)
	        lang_corpus_dir = os.path.join(training_root, lang)
	        config_path = os.path.join(config_dir, lang + ".yaml")

	        if lang in corpus_variety_mapping:
	            # One run per sub-corpus, using the dictionary and model of the
	            # dialect that the sub-corpus is mapped to.
	            for subcorpus, dialect in corpus_variety_mapping[lang].items():
	                print(subcorpus, dialect)
	                model_path = os.path.join(g2p_dir, f"{dialect}_mfa.zip")
	                if not os.path.exists(model_path):
	                    continue  # no staged model for this dialect
	                # NOTE(review): every sub-corpus run receives the whole
	                # language directory as its corpus, not a per-sub-corpus
	                # folder; confirm whether
	                # os.path.join(lang_corpus_dir, subcorpus) was intended.
	                command = build_g2p_command(
	                    lang_corpus_dir,
	                    model_path,
	                    os.path.join(output_dir, f"{dialect}_{subcorpus}_mfa.dict"),
	                    os.path.join(dictionary_dir, f"{dialect}_mfa.dict"),
	                    variety_flags,
	                    config_path,
	                )
	                mfa_cli(command, standalone_mode=False)
	            # A bare ``error`` name used to follow the mfa_cli call above; it
	            # raised NameError and aborted the whole batch, so it was removed.
	        else:
	            model_path = os.path.join(g2p_dir, f"{lang}_mfa.zip")
	            if not os.path.exists(model_path):
	                continue  # no staged model for this language
	            command = build_g2p_command(
	                lang_corpus_dir,
	                model_path,
	                os.path.join(output_dir, f"{lang}_mfa.dict"),
	                os.path.join(dictionary_dir, f"{lang}_mfa.dict"),
	                single_corpus_flags,
	                config_path,
	            )
	            mfa_cli(command, standalone_mode=False)