Spaces:

wasmdashai
/

wasm-ara

Paused

App Files Files Community

wasm-ara / text /__init__.py

wasmdashai

first commit

4451360 over 1 year ago

raw

history blame contribute delete

2.17 kB

	from text.symbols import symbols, DOUBLING_TOKEN, EOS_TOKEN, SEPARATOR_TOKEN
	from text.phonetise_buckwalter import (
	arabic_to_buckwalter,
	buckwalter_to_arabic,
	process_utterance
	)

	# Each row pairs a plain vowel symbol with the variant spellings that collapse
	# onto it. Row order and in-row order are significant: together they fix the
	# key order of ``vowel_map`` and the element order of ``vowels``.
	_VOWEL_VARIANTS = (
	    ('aa', ('aa', 'AA')),
	    ('uu', ('uu0', 'uu1', 'UU0', 'UU1')),
	    ('ii', ('ii0', 'ii1', 'II0', 'II1')),
	    ('a', ('a', 'A')),
	    ('u', ('u0', 'u1', 'U0', 'U1')),
	    ('i', ('i0', 'i1', 'I0', 'I1')),
	)

	# Variant vowel symbol -> plain vowel symbol.
	vowel_map = {
	    variant: plain
	    for plain, variants in _VOWEL_VARIANTS
	    for variant in variants
	}

	# Every symbol treated as a vowel, in the same order as the keys of
	# ``vowel_map``.
	vowels = list(vowel_map)

	# Default lookup table: each entry of ``symbols`` maps to its position in
	# ``symbols``. Used by tokens_to_ids when no explicit table is supplied.
	phon_to_id_ = dict((phon, idx) for idx, phon in enumerate(symbols))


	def tokens_to_ids(phonemes, phon_to_id=None):
	    """Translate a sequence of tokens into their integer ids.

	    Args:
	        phonemes: Iterable of tokens to look up.
	        phon_to_id: Optional mapping from token to id. When ``None``, the
	            module-level ``phon_to_id_`` table is used.

	    Returns:
	        A list holding the id of each token, in input order.

	    Raises:
	        KeyError: If a token is missing from the lookup table in use.
	    """
	    # Only ``None`` selects the default; an explicitly passed empty mapping
	    # is still honoured.
	    lookup = phon_to_id_ if phon_to_id is None else phon_to_id
	    return [lookup[token] for token in phonemes]


	def ids_to_tokens(ids):
	    """Translate integer ids back into tokens by indexing ``symbols``.

	    Args:
	        ids: Iterable of indices into ``symbols``.

	    Returns:
	        A list with ``symbols[index]`` for each index, in input order.
	    """
	    return [symbols[index] for index in ids]


	def arabic_to_phonemes(arabic):
	    """Phonetise Arabic-script text.

	    The input is first passed through ``arabic_to_buckwalter`` and the
	    result is handed to ``process_utterance``.

	    Args:
	        arabic: Text accepted by ``arabic_to_buckwalter``.

	    Returns:
	        Whatever ``process_utterance`` returns for the transliterated text
	        (presumably a phoneme string, since phonemes_to_tokens consumes it
	        as ``str`` -- confirm against the phonetiser).
	    """
	    return process_utterance(arabic_to_buckwalter(arabic))


	def buckwalter_to_phonemes(buckw):
	    """Phonetise text that is already in Buckwalter transliteration.

	    Args:
	        buckw: Input forwarded unchanged to ``process_utterance``.

	    Returns:
	        The result of ``process_utterance(buckw)``.
	    """
	    phoneme_seq = process_utterance(buckw)
	    return phoneme_seq


	def phonemes_to_tokens(phonemes: str, append_space=True):
	    """Convert a whitespace-separated phoneme string into a token list.

	    Processing steps:
	      1. Every occurrence of the substring "sil" is removed and every "+"
	         is rewritten to "_+_"; the result is split on whitespace.
	      2. A two-character phoneme made of one repeated character that is not
	         listed in ``vowels`` is emitted as that single character followed
	         by ``DOUBLING_TOKEN``.
	      3. Any phoneme listed in ``vowels`` is replaced by its ``vowel_map``
	         entry.
	      4. ``SEPARATOR_TOKEN`` (optional) and ``EOS_TOKEN`` are appended.

	    Args:
	        phonemes: Whitespace-separated phoneme string.
	        append_space: When true, ``SEPARATOR_TOKEN`` is appended just before
	            the final ``EOS_TOKEN``.

	    Returns:
	        A new list of tokens that always ends with ``EOS_TOKEN``.
	    """
	    cleaned = phonemes.replace("sil", "").replace("+", "_+_")

	    # Build a fresh list rather than inserting into the list being iterated:
	    # the inserted DOUBLING_TOKEN is then never re-examined as though it were
	    # an input phoneme, and no element shifting is needed.
	    tokens = []
	    for phon in cleaned.split():
	        is_doubled = (
	            len(phon) == 2 and phon not in vowels and phon[0] == phon[1]
	        )
	        base = phon[0] if is_doubled else phon
	        tokens.append(vowel_map[base] if base in vowels else base)
	        if is_doubled:
	            tokens.append(DOUBLING_TOKEN)

	    if append_space:
	        tokens.append(SEPARATOR_TOKEN)
	    tokens.append(EOS_TOKEN)

	    return tokens


	def buckwalter_to_tokens(buckw, append_space=True):
	    """Turn Buckwalter-transliterated text into a token list.

	    Chains ``buckwalter_to_phonemes`` and ``phonemes_to_tokens``.

	    Args:
	        buckw: Input forwarded to ``buckwalter_to_phonemes``.
	        append_space: Forwarded to ``phonemes_to_tokens``.

	    Returns:
	        The token list produced by ``phonemes_to_tokens``.
	    """
	    return phonemes_to_tokens(
	        buckwalter_to_phonemes(buckw),
	        append_space=append_space,
	    )


	def arabic_to_tokens(arabic, append_space=True):
	    """Turn Arabic-script text into a token list.

	    The text is passed through ``arabic_to_buckwalter`` and the result is
	    handed to ``buckwalter_to_tokens``.

	    Args:
	        arabic: Text accepted by ``arabic_to_buckwalter``.
	        append_space: Forwarded to ``buckwalter_to_tokens``.

	    Returns:
	        The token list produced by ``buckwalter_to_tokens``.
	    """
	    return buckwalter_to_tokens(
	        arabic_to_buckwalter(arabic),
	        append_space=append_space,
	    )


	def simplify_phonemes(phonemes):
	    """Collapse variant vowel spellings in a phoneme string.

	    Each key of ``vowel_map`` is replaced by its value using plain substring
	    replacement over the whole string, one key after another in the map's
	    iteration order. Later replacements therefore see the output of earlier
	    ones.

	    Args:
	        phonemes: Phoneme string to rewrite.

	    Returns:
	        The string after all replacements have been applied.
	    """
	    simplified = phonemes
	    for variant, plain in vowel_map.items():
	        simplified = simplified.replace(variant, plain)
	    return simplified