Spaces:

saeedzou
/

Persian_Phonemizer

Sleeping

App Files Files Community

Persian_Phonemizer / phonemizer.py

saeedzou

Update phonemizer.py

b130bd0 verified 9 months ago

raw

history blame contribute delete

7.82 kB

	import subprocess
	import re
	import string
	import pandas as pd
	from fastapi import FastAPI, Request
	from pydantic import BaseModel
	from hazm import POSTagger, word_tokenize
	from parsnorm import ParsNorm

	def map_words_dict(csv_file="map_words.csv"):
	    """Load the word-correction table from a CSV file.

	    The CSV must contain ``original`` and ``corrected`` columns.

	    Args:
	        csv_file: Path of the CSV file to read.

	    Returns:
	        A ``dict`` mapping ``str(original)`` to ``str(corrected)``. When an
	        original appears on several rows, the last row wins. Pairs whose
	        key or final value is missing (NaN) are dropped.

	    Raises:
	        KeyError: If one of the two required columns is absent.
	    """
	    map_df = pd.read_csv(csv_file)
	    # Pair the two columns row by row; a later duplicate overrides an earlier one.
	    raw_mapping = dict(zip(map_df["original"], map_df["corrected"]))
	    # Filter after building so a trailing row with an empty correction
	    # removes the word, exactly as the row-by-row assignment would.
	    return {
	        str(original): str(corrected)
	        for original, corrected in raw_mapping.items()
	        if pd.notna(original) and pd.notna(corrected)
	    }

	def create_pattern_from_mapping_dict_words(mapping_dict):
	    """Build a regex that matches any key of ``mapping_dict`` as a whole word.

	    Args:
	        mapping_dict: Mapping whose (string) keys are the words to match.

	    Returns:
	        A regex pattern string. Keys are escaped, joined with ``|`` and
	        wrapped in ``\\b`` word boundaries. An empty mapping yields a
	        pattern that never matches.
	    """
	    if not mapping_dict:
	        # r"\b()\b" would match the empty string at every word boundary.
	        return r"(?!)"
	    # Longest key first, so a phrase is preferred over a shorter key that is
	    # its prefix. The sort is stable, so equal-length keys keep their order.
	    keys = sorted(mapping_dict, key=len, reverse=True)
	    # Use \b (word boundary) to ensure only full words are matched
	    return r"\b(" + "|".join(map(re.escape, keys)) + r")\b"

	def multiple_replace(text, mapping_dict, mapping_pattern):
	    """Replace every match of ``mapping_pattern`` using ``mapping_dict``.

	    Args:
	        text: Input to process; it is converted with ``str()`` first.
	        mapping_dict: Mapping from matched text to its replacement.
	        mapping_pattern: Regex pattern (string or compiled) to search for.

	    Returns:
	        The text with each match replaced by its mapped value. A match that
	        has no entry in ``mapping_dict`` is left unchanged.
	    """
	    def _substitute(match):
	        found = match.group()
	        # Fall back to the matched text rather than raising KeyError.
	        return mapping_dict.get(found, found)

	    return re.sub(mapping_pattern, _substitute, str(text))

	# Setup: module-level objects, built once when the module is imported.
	app = FastAPI()
	normalizer = ParsNorm(remove_diacritics=False)
	# The POS tagger model file must be present at this path.
	tagger = POSTagger(model='./pos_tagger.model')
	# Word-correction table and the regex that finds its keys in text.
	words_mapping_dict = map_words_dict(csv_file='./final_map_words.csv')
	words_mapping_pattern = create_pattern_from_mapping_dict_words(mapping_dict=words_mapping_dict)
	# ASCII punctuation plus the Persian punctuation marks handled by the phonemizer.
	punctuation = string.punctuation + "؟:؛»«،"
	# Zero-width match at every boundary between a word character and a
	# punctuation mark, in either order. Substituting a space there detaches
	# punctuation from the neighbouring word before tokenisation.
	pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"

	# Pronunciation overrides for Persian words whose spelling is ambiguous.
	# Each key is a written word; its value holds two parallel lists:
	#   'phonemes' - candidate pronunciations
	#   'pos'      - the POS tag that selects the pronunciation at the same index
	# The 'diff' flag is True for every entry and is not read anywhere in the
	# visible code.
	# NOTE(review): the pairing looks swapped for the entries with phonemes
	# ['ʃeʃ', 'ʃoʃ'], ['safar', 'sefr'] and ['por', 'par'] (e.g. NUM currently
	# selects 'ʃoʃ' and 'safar', ADJ selects 'par') - confirm against the
	# tagger's tag set before changing any data.
	ambiguity_dict = {
	'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
	'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
	'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
	'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
	'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
	'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
	'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
	'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
	'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
	'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
	'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
	'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
	'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
	'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
	'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
	}
	def get_phoneme_for_pos(entry, target_pos):
	    """Return the pronunciation that ``entry`` associates with ``target_pos``.

	    Args:
	        entry: Dict with parallel ``'pos'`` and ``'phonemes'`` lists.
	        target_pos: POS tag to look up.

	    Returns:
	        The phoneme string at the index of the first matching POS tag, or
	        ``None`` when ``target_pos`` is not listed.
	    """
	    for pos_tag, phoneme in zip(entry['pos'], entry['phonemes']):
	        if pos_tag == target_pos:
	            return phoneme
	    return None  # Target POS tag not found


	def get_phonemes(word):
	    """Get the IPA phonemes of a word from espeak-ng without playing audio.

	    espeak-ng is invoked directly with an argument list (no shell), so text
	    coming from API requests cannot inject shell commands. The clean-up the
	    former ``sed`` pipeline performed is done in Python instead.

	    Args:
	        word: The text to phonemize.

	    Returns:
	        The phoneme string with the stress marks ``ˈ``/``ˌ`` and the length
	        mark ``ː`` removed, ``q1`` rewritten to ``q``, and surrounding
	        whitespace stripped. Returns ``""`` for empty input or when
	        espeak-ng cannot be started, and ``None`` when its output cannot be
	        decoded.
	    """
	    if not word:
	        # Nothing to pronounce; skip spawning a process.
	        return ""

	    # "--" ends option parsing so a word starting with "-" is read as text.
	    cmd = ["espeak-ng", "-v", "fa", "--ipa", "-q", "--", word]
	    try:
	        # Decode explicitly as UTF-8 instead of relying on the locale.
	        result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8")
	    except UnicodeDecodeError as e:
	        print(f"UnicodeDecodeError: {e}\n{word}")
	        return None  # Or handle the error appropriately
	    except OSError as e:
	        # With the shell, a missing binary silently produced empty output;
	        # keep returning "" but report the problem.
	        print(f"espeak-ng could not be run: {e}\n{word}")
	        return ""

	    # Equivalent of: sed "s/[ˈˌː]//g" | sed "s/q1/q/g"
	    without_marks = result.stdout.translate(str.maketrans("", "", "ˈˌː"))
	    return without_marks.replace("q1", "q").strip()


	def _add_ezafe(phonemes):
	    """Return ``phonemes`` with the Ezafe linking vowel attached."""
	    if phonemes.endswith(('jeː', 'je')):  # e.g برای
	        return phonemes
	    if phonemes.endswith(('ː', 'i', 'e')):  # long vowel, e.g زندگی, e.g مدرسه
	        return phonemes + 'je'
	    return phonemes + 'e'


	def process_sentence(sentence, tagger, pattern, punctuation):
	    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

	    Args:
	        sentence: Text to convert.
	        tagger: Object whose ``tag(tokens)`` returns one ``(token, tag)``
	            pair per input token.
	        pattern: Regex marking word/punctuation boundaries; a space is
	            inserted at every match before tokenisation.
	        punctuation: String of all characters treated as punctuation.

	    Returns:
	        The phoneme string. Punctuation is attached to the preceding word's
	        phonemes and runs of whitespace are collapsed to one space.
	    """
	    sentence = re.sub(pattern, r' ', sentence)
	    tokens = word_tokenize(sentence)
	    tagged_tokens = tagger.tag(tokens)

	    phoneme_list = []
	    # Each token is paired with its own tag. A counter advanced only on
	    # non-punctuation tokens falls out of step with the tag list as soon as
	    # a punctuation token (which is tagged too) has been seen.
	    for token, tagged in zip(tokens, tagged_tokens):
	        # Every character must be punctuation, so multi-character tokens
	        # such as "?!" are recognised as well.
	        if all(ch in punctuation for ch in token):
	            if phoneme_list:
	                phoneme_list[-1] += token
	            else:
	                phoneme_list.append(token)
	            continue

	        tag = tagged[1]
	        base_tag = tag.replace(',EZ', '')
	        has_ezafe = 'EZ' in tag

	        # Underscores become spaces; ZWNJ-joined parts are phonemized
	        # separately and concatenated.
	        parts = token.replace('_', ' ').split("\u200c")
	        last_index = len(parts) - 1
	        part_phonemes = []
	        for index, part in enumerate(parts):
	            entry = ambiguity_dict.get(part)
	            if entry and base_tag in entry['pos']:
	                # Known ambiguous word: the POS tag picks the pronunciation.
	                phonemes = get_phoneme_for_pos(entry, base_tag)
	            else:
	                # get_phonemes may return None on a decoding failure.
	                phonemes = get_phonemes(part) or ""

	            # If word has Ezafe (EZ tag), modify phoneme. The linking vowel
	            # belongs at the end of the whole token, so only the final part
	            # receives it.
	            if has_ezafe and index == last_index:
	                phonemes = _add_ezafe(phonemes)

	            part_phonemes.append(phonemes)
	        phoneme_list.append("".join(part_phonemes))

	    phoneme_text = ' '.join(phoneme_list)
	    phoneme_text = re.sub(r"\s+", " ", phoneme_text)

	    return phoneme_text

	# FastAPI input model
	class InputText(BaseModel):
	    """Request body accepted by the ``/phonemize`` endpoint."""

	    # Text to normalize and convert to phonemes.
	    text: str

	# Route
	@app.get("/")
	async def root():
	    """Return a welcome message that points clients at ``/phonemize``."""
	    return {"message": "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."}

	@app.post("/phonemize")
	async def phonemize(input_data: InputText):
	    """Normalize the submitted text, apply word corrections and phonemize it.

	    Args:
	        input_data: Request body carrying the text to convert.

	    Returns:
	        A dict with a single ``"phonemes"`` key holding the phoneme string.
	    """
	    # NOTE(review): every step below is synchronous, and process_sentence
	    # runs an external program per word, so this coroutine presumably blocks
	    # the event loop while it works - consider a plain `def` endpoint after
	    # confirming the tagger and normalizer are safe to use from worker threads.
	    normalized = normalizer.normalize(input_data.text, remove_punct=False)
	    normalized = multiple_replace(normalized, words_mapping_dict, words_mapping_pattern)
	    result = process_sentence(normalized, tagger, pattern, punctuation)
	    return {"phonemes": result}