Spaces:

MLSpeech
/

FALCON

Running

App Files Files Community

FALCON / dutch_preprocess.py

MLSpeech

Deploy FALCON demo (app + bundled MFA G2P assets + example inputs)

0cf1a58 verified 3 days ago

Raw

History Blame Contribute Delete

7.63 kB

	import re
	import panphon
	import panphon.distance

	# Module-level panphon helpers, constructed once when the module is imported.
	# `ft` is not referenced by any other code visible in this file.
	ft = panphon.FeatureTable()
	# `dst` provides feature_edit_distance(), used by _find_best_leehon39 to
	# score an IPA string against each LH39_IPA candidate.
	dst = panphon.distance.Distance()

	# IFA_TO_IPA = TODO - FUNCTION THAT COMPUTES SCORE
	# NOT A MAPPING; IT NEEDS TO BE A FUNCTION

	# IPA_TO_LEEHON39 = TODO - NOT MAPPING NEEDS TO BE A FUNCTION BUT ONE TIME - MAYBE IT IS MAPPING?

	# IFA phone symbol -> IPA string. Symbols are case-sensitive
	# (e.g. "e" and "E" map to different IPA strings).
	IFA_TO_IPA = {
	    # stops
	    "p": "p",
	    "b": "b",
	    "t": "t",
	    "d": "d",
	    "k": "k",
	    "g": "ɡ",
	    # fricatives
	    "f": "f",
	    "v": "v",
	    "s": "s",
	    "z": "z",
	    "h": "h",
	    "x": "x",
	    "G": "ɣ",
	    # nasals, liquids and glides
	    "m": "m",
	    "n": "n",
	    "N": "ŋ",
	    "l": "l",
	    "r": "r",
	    "w": "ʋ",
	    "j": "j",
	    # post-alveolar / palatal consonants
	    "S": "ʃ",
	    "Z": "ʒ",
	    "J": "ɲ",
	    "L": "ʎ",
	    # vowels
	    "i": "i",
	    "I": "ɪ",
	    "e": "eː",
	    "E": "ɛ",
	    "a": "aː",
	    "A": "ɑ",
	    "o": "oː",
	    "O": "ɔ",
	    "u": "u",
	    "y": "y",
	    "Y": "ʏ",
	    "2": "øː",
	    "9": "œ",
	    "@": "ə",
	    # diphthongs
	    "!": "ɛi",
	    "V": "ʌu",
	    "W": "œy",
	    # boundary marker, passed through unchanged
	    "h#": "h#",
	}

	# Upper-case phone label -> IPA string; this is the candidate inventory
	# scanned by _find_best_leehon39 (which lower-cases the winning label).
	LH39_IPA = {
	    # vowels and diphthongs
	    "AA": "ɑ",
	    "AE": "æ",
	    "AH": "ʌ",
	    "AO": "ɔ",
	    "AW": "aʊ",
	    "AY": "aɪ",
	    "EH": "ɛ",
	    "ER": "ɝ",
	    "EY": "eɪ",
	    "IH": "ɪ",
	    "IY": "i",
	    "OW": "oʊ",
	    "OY": "ɔɪ",
	    "UH": "ʊ",
	    "UW": "u",
	    # consonants
	    "B": "b",
	    "CH": "tʃ",
	    "D": "d",
	    "DH": "ð",
	    "F": "f",
	    "G": "ɡ",
	    "HH": "h",
	    "JH": "dʒ",
	    "K": "k",
	    "L": "l",
	    "M": "m",
	    "N": "n",
	    "NG": "ŋ",
	    "P": "p",
	    "R": "ɹ",
	    "S": "s",
	    "SH": "ʃ",
	    "T": "t",
	    "TH": "θ",
	    "V": "v",
	    "W": "w",
	    "Y": "j",
	    "Z": "z",
	    "ZH": "ʒ",
	}

	# Lower-case labels that get_ipa_from_ifa and _find_best_leehon39 accept
	# as-is (no IPA conversion / distance search). Order is kept unchanged.
	timit_leehon_39_phonemes = [
	    "ao", "ae", "ah", "aw", "er", "ay", "b", "sil",
	    "ch", "d", "dh", "dx", "eh", "el", "m", "en",
	    "ng", "ey", "f", "g", "hh", "ih", "iy", "jh",
	    "k", "v", "w", "y", "z", "sh", "t", "r",
	    "s", "th", "uh", "uw", "oy", "ow", "p",
	]

	def get_ipa_from_ifa(ifa_label):
	    """Convert one IFA label into a list of IPA (or pass-through) symbols.

	    Args:
	        ifa_label: A label string. It may hold several phones separated by
	            spaces, underscores or hyphens, and may carry length colons,
	            stress quotes, syllable dots or nasal tildes.

	    Returns:
	        A list of strings:
	        * ``[label.lower()]`` when the lower-cased label is already in
	          ``timit_leehon_39_phonemes``;
	        * ``["sil"]`` for the markers ``"h#"`` and ``"tcl"``;
	        * ``[]`` when nothing is left after cleaning;
	        * otherwise one entry per phone, looked up in ``IFA_TO_IPA`` and
	          kept unchanged when the phone is not in that table.
	    """
	    lowered = ifa_label.lower()
	    # NOTE(review): this check is case-insensitive, so upper-case IFA symbols
	    # whose lower-case form is in timit_leehon_39_phonemes (e.g. "S", "Y")
	    # are returned as that label and never reach IFA_TO_IPA - confirm that
	    # this is intended.
	    if lowered in timit_leehon_39_phonemes:
	        return [lowered]
	    if ifa_label in ('h#', 'tcl'):
	        return ["sil"]

	    # Underscores and hyphens separate phones; colons are dropped because
	    # length is carried by the base vowel mapping (or discarded).
	    cleaned = ifa_label.replace(':', '').replace('_', ' ').replace('-', ' ')
	    # Drop stress ("), secondary stress ('), syllable dots (.) and tildes (~).
	    cleaned = re.sub(r'[".\'~]', '', cleaned)

	    parts = cleaned.split()
	    if not parts:
	        return []

	    if len(parts) == 1:
	        token = parts[0]
	        # A single unknown multi-character token is a run of phones written
	        # without separators: split it into individual symbols. The cleaned
	        # token is split (not the raw label) so that stripped characters
	        # such as ':' cannot leak back in: "e:" must give ["eː"], not
	        # ["eː", ":"].
	        if len(token) > 1 and token not in IFA_TO_IPA:
	            parts = list(token)

	    return [IFA_TO_IPA.get(part, part) for part in parts]

	# Memo table for find_best_leehon39: IPA string -> result of
	# _find_best_leehon39 for that string.
	_leehon39_cache = {}


	def find_best_leehon39(target_ipa):
	    """Return the memoized result of ``_find_best_leehon39(target_ipa)``.

	    The mapping is deterministic, so each distinct ``target_ipa`` is
	    computed once and then served from ``_leehon39_cache``; this avoids
	    repeating the panphon feature-edit-distance scan for every occurrence.
	    """
	    try:
	        return _leehon39_cache[target_ipa]
	    except KeyError:
	        pass
	    # Cache miss: compute outside the try so errors from the search itself
	    # propagate untouched, then remember the answer.
	    outcome = _find_best_leehon39(target_ipa)
	    _leehon39_cache[target_ipa] = outcome
	    return outcome

	def _find_best_leehon39(target_ipa):
	    """Pick the label whose IPA form is closest to ``target_ipa``.

	    Returns a ``(label, distance)`` tuple. Shortcut cases return distance
	    ``0.0``: empty/blank input gives ``"sil"``; a value whose lower-cased
	    form is in ``timit_leehon_39_phonemes`` is returned as that label;
	    ``"h#"``/``"sil"`` give ``"sil"``; ``"r"``/``"ɾ"`` give ``"r"``.
	    Otherwise every ``LH39_IPA`` entry is scored with
	    ``dst.feature_edit_distance`` and the first entry with the smallest
	    distance wins (label lower-cased, distance rounded to 3 decimals).
	    If no candidate scores below 100.0 the result is ``("sil", 100.0)``.
	    """
	    if not target_ipa or not target_ipa.strip():
	        return "sil", 0.0

	    lowered = target_ipa.lower()
	    if lowered in timit_leehon_39_phonemes:
	        return lowered, 0.0
	    if lowered in ("h#", "sil"):
	        return "sil", 0.0
	    if lowered in ("r", "ɾ"):
	        return "r", 0.0

	    winner = "sil"
	    winner_dist = 100.0
	    for candidate_label, candidate_ipa in LH39_IPA.items():
	        score = dst.feature_edit_distance(target_ipa, candidate_ipa)
	        # Strict comparison: on ties the earlier LH39_IPA entry is kept.
	        if score < winner_dist:
	            winner, winner_dist = candidate_label.lower(), score
	    return winner, round(winner_dist, 3)

	def aligner_pipeline(ifa_input):
	    """Map an IFA label string to its best-matching labels, phone by phone.

	    The input is split into symbols by ``get_ipa_from_ifa``; each symbol is
	    then resolved with ``find_best_leehon39``.

	    Returns:
	        A list with one dict per symbol, with keys ``"ifa_ipa_part"`` (the
	        symbol), ``"lh39"`` (the chosen label) and ``"dist"`` (its distance).
	        The list is empty when the input yields no symbols.
	    """
	    scored = ((symbol, find_best_leehon39(symbol)) for symbol in get_ipa_from_ifa(ifa_input))
	    return [
	        {"ifa_ipa_part": symbol, "lh39": label, "dist": distance}
	        for symbol, (label, distance) in scored
	    ]


	import os

	# def convert_all_lab_files(directory):
	# for filename in os.listdir(directory):
	# if filename.endswith(".lab"):
	# path = os.path.join(directory, filename)
	# with open(path, 'r') as f:
	# content = f.read().strip()

	# # Use your existing pipeline logic
	# # Note: We split the content by space to process each phone
	# ifa_phones = content.split()
	# ipa_output = []
	# for p in ifa_phones:
	# # Get the IPA parts from your existing function
	# ipa_parts = get_ipa_from_ifa(p)
	# ipa_output.extend(ipa_parts)

	# # Join with spaces and write back
	# new_content = " ".join(ipa_output)
	# with open(path, 'w') as f:
	# f.write(new_content)
	# print(f"Done! All .lab files in {directory} converted to IPA.")

	# # Run this in your main block
	# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')

	# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')



	import os

	def create_lab_files(phn_folder, lab_folder):
	    """Write one ``.lab`` transcript for every ``.phn`` file in a folder.

	    Each ``.phn`` line is split on whitespace; lines with fewer than three
	    fields are skipped and the third field is taken as the phone label.
	    Labels are converted with ``get_ipa_from_ifa`` and the resulting symbols
	    are written, space-separated, to ``<name>.lab`` in ``lab_folder``.

	    Args:
	        phn_folder: Directory scanned (non-recursively) for ``*.phn`` files.
	        lab_folder: Output directory; created if it does not exist. It may
	            be the same directory as ``phn_folder``.
	    """
	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(lab_folder, exist_ok=True)

	    suffix = ".phn"
	    for filename in os.listdir(phn_folder):
	        if not filename.endswith(suffix):
	            continue

	        ipa_sequence = []
	        with open(os.path.join(phn_folder, filename), 'r', encoding="utf-8") as f:
	            for line in f:
	                fields = line.split()
	                if len(fields) < 3:
	                    continue
	                # 'sil' entries are kept in the sequence, as before.
	                ipa_sequence.extend(get_ipa_from_ifa(fields[2]))

	        # Swap only the trailing extension; str.replace would also rewrite
	        # any ".phn" occurring earlier in the file name.
	        lab_filename = filename[:-len(suffix)] + ".lab"
	        # The output holds IPA characters, so the encoding is set explicitly
	        # instead of relying on the platform default (which can fail to
	        # encode them).
	        with open(os.path.join(lab_folder, lab_filename), 'w', encoding="utf-8") as f:
	            f.write(" ".join(ipa_sequence))

	def generate_ipa_lexicon(all_ipa_symbols, output_path):
	    """Write an identity lexicon (``symbol<TAB>symbol``) to ``output_path``.

	    The first line is always ``sil<TAB>sil``. It is followed by one line per
	    unique symbol in ``all_ipa_symbols`` (excluding ``"sil"``), in sorted
	    order.

	    Args:
	        all_ipa_symbols: Iterable of symbol strings; duplicates are ignored.
	        output_path: Path of the file to create or overwrite.
	    """
	    unique_symbols = sorted(set(all_ipa_symbols) - {"sil"})
	    # IPA symbols are non-ASCII: write UTF-8 explicitly rather than relying
	    # on the platform default encoding.
	    with open(output_path, 'w', encoding="utf-8") as f:
	        # Silence mapping is always present, exactly once.
	        f.write("sil\tsil\n")
	        f.writelines(f"{symbol}\t{symbol}\n" for symbol in unique_symbols)

	# Run it
	# create_lab_files("/home/rotem/projects/datasets/IFA_dutch_split/test", "/home/rotem/projects/datasets/IFA_dutch_split/test")

	if __name__ == "__main__":
	    # Demo run: print the mapping chosen for every symbol of each test
	    # input, then regenerate the .lab files of the test split in place.
	    test_cases = [
	        "sil n Y l sil e: n sil t w e: sil d r i sil v i r sil v Ei f sil z E s sil z e: v @ n sil A x t sil n e: x @ sil t i n sil E l f sil t w a: l f sil n Y l sil sil",
	    ]
	    # Alternative inputs, one label per case:
	    # test_cases = ["x@l", "@-r-h-a", "e:-j", "r9y", "ao", "sil", "@", "E", "he:l-@_hAr", "t_b", "o:", "N"]
	    for case in test_cases:
	        print(f"\nINPUT: {case}")
	        output = aligner_pipeline(case)
	        if output:
	            for item in output:
	                print(f" Mapped '{item['ifa_ipa_part']}' -> {item['lh39']} (dist_score: {item['dist']})")
	            continue
	        print("Results: None")

	    # Source and destination are the same folder: each .lab file is written
	    # next to its .phn file.
	    create_lab_files(
	        "/home/rotem/projects/datasets/IFA_dutch_split/test",
	        "/home/rotem/projects/datasets/IFA_dutch_split/test",
	    )