Spaces:

KdaiP
/

StableTTS1.1

Running

App Files Files Community

StableTTS1.1 / text /mandarin.py

KdaiP

Upload 80 files

3dd84f8 verified over 1 year ago

raw

history blame contribute delete

4.28 kB

	import re
	from typing import Dict, List
	from pypinyin import lazy_pinyin, Style
	from .custom_pypinyin_dict import phrase_pinyin_data
	import jieba
	from .cn2an import an2cn

	# Load the custom pinyin phrase dictionary data. This runs once at import
	# time, before any pinyin lookup below is performed.
	phrase_pinyin_data.load()

	# Character-level normalization map, applied in a single pass through
	# str.translate (see PUNC_TABLE below). Every key is exactly one character;
	# a value may be a longer replacement string.
	PUNC_MAP: Dict[str, str] = {
	    # Sentence punctuation -> ASCII equivalents.
	    "：": ",",
	    "；": ",",
	    "，": ",",
	    "。": ".",
	    "！": "!",
	    "？": "?",
	    "\n": ".",
	    "·": ",",
	    "、": ",",
	    "$": ".",
	    # Quotes and every kind of bracket collapse to a single quote.
	    "“": "'",
	    "”": "'",
	    '"': "'",
	    "‘": "'",
	    "’": "'",
	    "（": "'",
	    "）": "'",
	    "(": "'",
	    ")": "'",
	    "《": "'",
	    "》": "'",
	    "【": "'",
	    "】": "'",
	    "[": "'",
	    "]": "'",
	    "「": "'",
	    "」": "'",
	    "『": "'",
	    "』": "'",
	    # Dashes and tildes.
	    "—": "-",
	    "～": "~",
	    # The entries below come from
	    # GPT_SoVITS.text.zh_normalization.text_normlization.
	    # "/" is read as "每" (per); this is the only mapping for "/".
	    "/": "每",
	    # Circled numbers.
	    "①": "一",
	    "②": "二",
	    "③": "三",
	    "④": "四",
	    "⑤": "五",
	    "⑥": "六",
	    "⑦": "七",
	    "⑧": "八",
	    "⑨": "九",
	    "⑩": "十",
	    # Greek letters, spelled out in Chinese.
	    "α": "阿尔法",
	    "β": "贝塔",
	    "γ": "伽玛",
	    "Γ": "伽玛",
	    "δ": "德尔塔",
	    "Δ": "德尔塔",
	    "ε": "艾普西龙",
	    "ζ": "捷塔",
	    "η": "依塔",
	    "θ": "西塔",
	    "Θ": "西塔",
	    "ι": "艾欧塔",
	    "κ": "喀帕",
	    "λ": "拉姆达",
	    "Λ": "拉姆达",
	    "μ": "缪",
	    "ν": "拗",
	    "ξ": "克西",
	    "Ξ": "克西",
	    "ο": "欧米克伦",
	    "π": "派",
	    "Π": "派",
	    "ρ": "肉",
	    "ς": "西格玛",
	    "σ": "西格玛",
	    "Σ": "西格玛",
	    "τ": "套",
	    "υ": "宇普西龙",
	    "φ": "服艾",
	    "Φ": "服艾",
	    "χ": "器",
	    "ψ": "普赛",
	    "Ψ": "普赛",
	    "ω": "欧米伽",
	    "Ω": "欧米伽",
	    # Arithmetic operators.
	    "+": "加",
	    "-": "减",
	    "×": "乘",
	    "÷": "除",
	    "=": "等",
	    # Interjections replaced by a substitute character.
	    "嗯": "恩",
	    "呣": "母",
	}

	# Translation table consumed by str.translate.
	PUNC_TABLE = str.maketrans(PUNC_MAP)

	# Number normalization: matches a run of digits, optionally followed by
	# more digits that may be separated from the first run by a single '.'.
	NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?')

	# Arabic numerals -> Chinese characters
	def replace_number(match: re.Match) -> str:
	    """Return the ``an2cn`` conversion of the text captured by *match*.

	    Intended as the replacement callback for a regex substitution.
	    """
	    matched_text = match.group(0)
	    return an2cn(matched_text)

	def normalize_number(text: str) -> str:
	    """Return *text* with every ``NUMBER_PATTERN`` match passed through ``replace_number``."""
	    return re.sub(NUMBER_PATTERN, replace_number, text)

	# get symbols of phones, not used
	def load_pinyin_symbols(path):
	    """Print every toned phone symbol of a pinyin dictionary and return the dictionary.

	    Each non-blank line of the file has the form ``<pinyin>,<phone> <phone> ...``.
	    Every distinct phone is combined with the tone digits 1-5 and the
	    resulting symbols are printed, shortest first.

	    Args:
	        path: Path to the UTF-8 encoded dictionary file.

	    Returns:
	        A dict mapping each pinyin syllable to its list of phones.

	    Raises:
	        IndexError: If a non-blank line contains no comma.
	    """
	    pinyin_dict = {}
	    symbols = set()
	    with open(path, "r", encoding='utf-8') as f:
	        for line in f:
	            entry = line.strip()
	            if not entry:
	                # Blank (e.g. trailing) lines carry no entry.
	                continue
	            cuts = entry.split(',')
	            # split() without an argument ignores repeated spaces, so no
	            # empty phone ends up in the result (matches load_pinyin_dict).
	            phones = cuts[1].split()
	            pinyin_dict[cuts[0]] = phones
	            symbols.update(phones)

	    # Sort the phones first so the printed order is reproducible: the
	    # length sort below is stable, and set iteration order is not.
	    toned = [phone + str(tone) for phone in sorted(symbols) for tone in range(1, 6)]
	    print(sorted(toned, key=len))
	    return pinyin_dict

	def load_pinyin_dict(path: str) -> Dict[str, List[str]]:
	    """Load a pinyin-to-phones dictionary from a text file.

	    Each non-blank line has the form ``<pinyin>,<phone> <phone> ...``: the
	    text before the first comma is the key, and the remainder is split on
	    whitespace into the list of phones.

	    Args:
	        path: Path to the UTF-8 encoded dictionary file.

	    Returns:
	        A dict mapping each pinyin syllable to its list of phones. If a
	        key occurs more than once, the last occurrence wins.

	    Raises:
	        ValueError: If a non-blank line contains no comma.
	    """
	    pinyin_dict: Dict[str, List[str]] = {}
	    with open(path, "r", encoding='utf-8') as f:
	        for line_no, line in enumerate(f, start=1):
	            entry = line.strip()
	            if not entry:
	                # Blank (e.g. trailing) lines carry no entry; skipping them
	                # avoids an unpacking error on otherwise valid files.
	                continue
	            key, sep, value = entry.partition(',')
	            if not sep:
	                raise ValueError(
	                    f"{path}:{line_no}: expected '<pinyin>,<phones>', got {entry!r}"
	                )
	            pinyin_dict[key] = value.split()
	    return pinyin_dict

	import os
	# Resolve the dictionary relative to this module's own directory rather
	# than the current working directory.
	pinyin_dict = load_pinyin_dict(
	    os.path.join(
	        os.path.dirname(os.path.abspath(__file__)),
	        'cnm3',
	        'ds_CNM3.txt',
	    )
	)

	def chinese_to_cnm3(text: str) -> List[str]:
	    """Convert Chinese text into a flat list of toned phone symbols.

	    Steps:
	      1. Map punctuation and special symbols through ``PUNC_TABLE``.
	      2. Rewrite Arabic numbers with ``normalize_number``.
	      3. Strip a small set of special characters.
	      4. Segment with ``jieba.lcut`` and convert each word with
	         ``lazy_pinyin`` (``Style.TONE3``, ``neutral_tone_with_five=True``).
	      5. For every token ending in a digit, treat that digit as the tone,
	         look the remaining syllable up in ``pinyin_dict`` and append the
	         tone to each of its phones.

	    Tokens ending in a letter (anything ``str.isalpha`` accepts) are
	    dropped. Any other token, such as punctuation, is emitted one
	    character at a time. Syllables missing from ``pinyin_dict`` are
	    skipped so that one unknown reading does not abort the whole
	    conversion.

	    Args:
	        text: Input text.

	    Returns:
	        The phone symbols in reading order.
	    """
	    # Punctuation and number normalization.
	    text = text.translate(PUNC_TABLE)
	    text = normalize_number(text)
	    # Filter out special characters.
	    text = re.sub(r'[#&@“”^_\|\\]', '', text)

	    phones: List[str] = []
	    for word in jieba.lcut(text, cut_all=False):
	        pinyin_list: List[str] = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
	        for pinyin in pinyin_list:
	            if not pinyin:
	                # Defensive: an empty token has no last character to inspect.
	                continue
	            last_char = pinyin[-1]
	            if last_char.isdigit():
	                syllable_phones = pinyin_dict.get(pinyin[:-1])
	                if syllable_phones is None:
	                    # Unknown syllable (or a token that merely ends in a
	                    # digit-like character): skip it instead of raising
	                    # KeyError.
	                    continue
	                phones.extend(ph + last_char for ph in syllable_phones)
	            elif not last_char.isalpha():
	                # Punctuation and similar tokens: one symbol per character.
	                phones.extend(pinyin)
	            # Tokens ending in a letter are intentionally dropped.

	    return phones