FALCON / dutch_preprocess.py
MLSpeech's picture
Deploy FALCON demo (app + bundled MFA G2P assets + example inputs)
0cf1a58 verified
Raw
History Blame Contribute Delete
7.63 kB
import re
import panphon
import panphon.distance
# panphon feature table; created at import time but not referenced by any code
# visible in this file.
ft = panphon.FeatureTable()
# Shared panphon distance calculator; _find_best_leehon39 calls its
# feature_edit_distance method to compare IPA strings.
dst = panphon.distance.Distance()
# TODO: IFA_TO_IPA - should be a function that computes a score,
# not a mapping; it needs to be a function.
# TODO: IPA_TO_LEEHON39 - not a mapping either; needs to be a function, but evaluated only once - so maybe it is a mapping after all?
# Case-sensitive lookup from a single IFA phone label to its IPA string.
# Upper- and lower-case keys are distinct symbols (e.g. "s" vs "S").
IFA_TO_IPA = {
    # Consonants
    "p": "p",
    "b": "b",
    "t": "t",
    "d": "d",
    "k": "k",
    "g": "ɡ",
    "f": "f",
    "v": "v",
    "s": "s",
    "z": "z",
    "h": "h",
    "x": "x",
    "G": "ɣ",
    "m": "m",
    "n": "n",
    "N": "ŋ",
    "l": "l",
    "r": "r",
    "w": "ʋ",
    "j": "j",
    "S": "ʃ",
    "Z": "ʒ",
    "J": "ɲ",
    "L": "ʎ",
    # Vowels (length is encoded in the IPA value, not in the key)
    "i": "i",
    "I": "ɪ",
    "e": "eː",
    "E": "ɛ",
    "a": "aː",
    "A": "ɑ",
    "o": "oː",
    "O": "ɔ",
    "u": "u",
    "y": "y",
    "Y": "ʏ",
    "2": "øː",
    "9": "œ",
    "@": "ə",
    # Two-symbol IPA targets
    "!": "ɛi",
    "V": "ʌu",
    "W": "œy",
    # Boundary marker, kept verbatim
    "h#": "h#",
}
# Upper-case phone label -> IPA string. This is the candidate inventory that
# _find_best_leehon39 scans when looking for the closest match.
LH39_IPA = {
    # Vowels and diphthongs
    "AA": "ɑ",
    "AE": "æ",
    "AH": "ʌ",
    "AO": "ɔ",
    "AW": "aʊ",
    "AY": "aɪ",
    "EH": "ɛ",
    "ER": "ɝ",
    "EY": "eɪ",
    "IH": "ɪ",
    "IY": "i",
    "OW": "oʊ",
    "OY": "ɔɪ",
    "UH": "ʊ",
    "UW": "u",
    # Consonants
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "P": "p",
    "R": "ɹ",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}
# Lower-case target phone labels, in a fixed order. Labels already in this
# list are passed through unchanged by the conversion functions in this file.
# NOTE(review): the list contains 'el'/'en' but not 'l'/'n'/'aa'/'zh', while
# LH39_IPA has L/N/AA/ZH -- confirm this matches the intended label set.
timit_leehon_39_phonemes = (
    "ao ae ah aw er ay "
    "b sil ch d dh dx eh el m en ng ey "
    "f g hh ih iy jh k v w y z sh t r s th uh uw oy ow p"
).split()
def get_ipa_from_ifa(ifa_label):
    """Convert one IFA phone label (possibly compound) into a list of symbols.

    Resolution order:

    1. A label that is already one of ``timit_leehon_39_phonemes`` is returned
       as-is (lower-cased). The case-insensitive match is skipped when the
       exact label is a key of ``IFA_TO_IPA``, because IFA symbols are
       case-sensitive there ("S" is not "s", "Y" is not "y").
    2. ``'h#'`` and ``'tcl'`` become ``["sil"]``.
    3. Otherwise ``:``, ``"``, ``'``, ``.`` and ``~`` are stripped, ``_`` and
       ``-`` act as separators, and every resulting token is looked up in
       ``IFA_TO_IPA`` (unknown tokens are passed through unchanged).
       When cleaning leaves exactly one token that is longer than one
       character and is not in ``IFA_TO_IPA``, that token is split into
       single characters first (e.g. ``"x@l"`` -> x, @, l).

    Args:
        ifa_label: The raw label string.

    Returns:
        A list of strings; empty when nothing is left after cleaning.
    """
    folded = ifa_label.lower()
    # Pass target-inventory labels straight through. Only fold case when that
    # cannot collide with a case-sensitive IFA symbol such as "S", "G" or "Y";
    # otherwise the upper-case IFA phone would be silently replaced by an
    # unrelated lower-case label.
    if folded in timit_leehon_39_phonemes and (
        ifa_label == folded or ifa_label not in IFA_TO_IPA
    ):
        return [folded]

    if ifa_label in ('h#', 'tcl'):
        return ["sil"]

    # Underscores and hyphens separate phones; colons are dropped because
    # vowel length is already encoded in the IFA_TO_IPA values.
    cleaned = ifa_label.replace(':', '').replace('_', ' ').replace('-', ' ')
    # Remove stress ("), secondary stress ('), syllable dots (.) and nasal
    # tildes (~).
    cleaned = re.sub(r'[".\'~]', '', cleaned)

    parts = cleaned.split()
    if not parts:
        return []

    # A single unknown multi-character token is a run of phones written
    # without separators. Split the *cleaned* token, not the raw label, so
    # the characters removed above (':', '"', ...) do not come back as
    # bogus symbols.
    if len(parts) == 1 and len(parts[0]) > 1 and parts[0] not in IFA_TO_IPA:
        parts = list(parts[0])

    # NOTE(review): in a multi-token label, unknown multi-character tokens
    # (e.g. "hel" from "he:l-@") are still passed through unsplit, as before.
    return [IFA_TO_IPA.get(p, p) for p in parts]
# Memo table for find_best_leehon39: input string -> (label, distance) tuple.
_leehon39_cache = {}


def find_best_leehon39(target_ipa):
    """Return the cached ``(label, distance)`` match for ``target_ipa``.

    The underlying lookup is deterministic over a fixed inventory, so each
    distinct input is computed once by ``_find_best_leehon39`` and then
    served from ``_leehon39_cache``.
    """
    hit = _leehon39_cache.get(target_ipa)
    if hit is None:
        # _find_best_leehon39 always returns a tuple, so None means "absent".
        hit = _find_best_leehon39(target_ipa)
        _leehon39_cache[target_ipa] = hit
    return hit
def _find_best_leehon39(target_ipa):
    """Pick the closest label for ``target_ipa`` and report its distance.

    Shortcuts (all with distance 0.0): empty or whitespace-only input and
    'h#'/'sil' give "sil"; a string whose lower-case form is already in
    ``timit_leehon_39_phonemes`` is returned lower-cased; 'r' and 'ɾ' give "r".

    Otherwise every entry of ``LH39_IPA`` is scored with
    ``dst.feature_edit_distance`` and the first entry with the smallest
    distance below 100.0 wins. If no entry scores below 100.0 the result is
    ``("sil", 100.0)``.

    Returns:
        A ``(label, distance)`` tuple; the distance is rounded to 3 decimals.
    """
    if not target_ipa or not target_ipa.strip():
        return "sil", 0.0

    folded = target_ipa.lower()
    if folded in timit_leehon_39_phonemes:
        return folded, 0.0
    # if folded in ('h#', 'tcl', 'sil'):
    if folded in ("h#", "sil"):
        return "sil", 0.0
    if folded in ("r", "ɾ"):
        return "r", 0.0

    ceiling = 100.0
    scored = (
        (dst.feature_edit_distance(target_ipa, candidate_ipa), candidate)
        for candidate, candidate_ipa in LH39_IPA.items()
    )
    # Only distances strictly below the ceiling are eligible.
    eligible = [pair for pair in scored if pair[0] < ceiling]
    if not eligible:
        return "sil", round(ceiling, 3)

    # min() keeps the first of several equal minima, i.e. dict order wins ties.
    best_dist, best_candidate = min(eligible, key=lambda pair: pair[0])
    return best_candidate.lower(), round(best_dist, 3)
def aligner_pipeline(ifa_input):
    """Map an IFA label to its closest target labels, one entry per symbol.

    The input is expanded with ``get_ipa_from_ifa`` and each resulting symbol
    is matched with ``find_best_leehon39``.

    Returns:
        A list of dicts with keys ``"ifa_ipa_part"`` (the symbol),
        ``"lh39"`` (the matched label) and ``"dist"`` (its distance).
    """
    segments = get_ipa_from_ifa(ifa_input)
    matches = map(find_best_leehon39, segments)
    return [
        {"ifa_ipa_part": segment, "lh39": label, "dist": distance}
        for segment, (label, distance) in zip(segments, matches)
    ]
import os
# def convert_all_lab_files(directory):
# for filename in os.listdir(directory):
# if filename.endswith(".lab"):
# path = os.path.join(directory, filename)
# with open(path, 'r') as f:
# content = f.read().strip()
# # Use your existing pipeline logic
# # Note: We split the content by space to process each phone
# ifa_phones = content.split()
# ipa_output = []
# for p in ifa_phones:
# # Get the IPA parts from your existing function
# ipa_parts = get_ipa_from_ifa(p)
# ipa_output.extend(ipa_parts)
# # Join with spaces and write back
# new_content = " ".join(ipa_output)
# with open(path, 'w') as f:
# f.write(new_content)
# print(f"Done! All .lab files in {directory} converted to IPA.")
# # Run this in your main block
# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')
# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')
import os
def create_lab_files(phn_folder, lab_folder):
    """Write a ``.lab`` transcript next to/for every ``.phn`` file.

    For each ``*.phn`` file in ``phn_folder`` the third whitespace-separated
    field of every line is taken as the phone label (lines with fewer than
    three fields are skipped), converted with ``get_ipa_from_ifa``, and the
    resulting symbols are written space-separated to a same-named ``.lab``
    file in ``lab_folder``.

    Args:
        phn_folder: Directory scanned (non-recursively) for ``.phn`` files.
        lab_folder: Output directory; created if it does not exist. May be
            the same directory as ``phn_folder``.
    """
    phn_suffix = ".phn"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(lab_folder, exist_ok=True)

    for filename in os.listdir(phn_folder):
        if not filename.endswith(phn_suffix):
            continue

        ipa_sequence = []
        with open(os.path.join(phn_folder, filename), 'r', encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if len(fields) < 3:
                    continue
                # 'sil' symbols are kept; they are fine for phone-level
                # alignment.
                ipa_sequence.extend(get_ipa_from_ifa(fields[2]))

        # Swap only the trailing extension; str.replace would also rewrite a
        # ".phn" occurring earlier in the name.
        lab_filename = filename[:-len(phn_suffix)] + ".lab"
        # The transcript contains non-ASCII IPA characters, so the encoding
        # must not depend on the platform's locale default.
        with open(os.path.join(lab_folder, lab_filename), 'w', encoding="utf-8") as f:
            f.write(" ".join(ipa_sequence))
def generate_ipa_lexicon(all_ipa_symbols, output_path):
    """Write a tab-separated lexicon that maps every symbol to itself.

    The first line is always ``sil<TAB>sil``; it is followed by one
    ``symbol<TAB>symbol`` line per unique symbol in sorted order ("sil" is
    not repeated).

    Args:
        all_ipa_symbols: Iterable of symbol strings; duplicates are ignored.
        output_path: Destination file, overwritten if it exists. Written as
            UTF-8 because IPA symbols are non-ASCII and the locale default
            encoding may not be able to represent them.
    """
    unique_symbols = sorted(set(all_ipa_symbols) - {"sil"})
    with open(output_path, 'w', encoding="utf-8") as f:
        # Silence entry goes first, regardless of the input.
        f.write("sil\tsil\n")
        f.writelines(f"{symbol}\t{symbol}\n" for symbol in unique_symbols)
# Run it
# create_lab_files("/home/rotem/projects/datasets/IFA_dutch_split/test", "/home/rotem/projects/datasets/IFA_dutch_split/test")
if __name__ == "__main__":
    # Manual smoke test: print the per-symbol mapping for each sample input,
    # then convert the .phn files of the test split in place.
    test_cases = ["sil n Y l sil e: n sil t w e: sil d r i sil v i r sil v Ei f sil z E s sil z e: v @ n sil A x t sil n e: x @ sil t i n sil E l f sil t w a: l f sil n Y l sil sil"]
    # test_cases = ["x@l", "@-r-h-a", "e:-j", "r9y", "ao", "sil", "@", "E", "he:l-@_hAr", "t_b", "o:", "N"]
    for utterance in test_cases:
        print(f"\nINPUT: {utterance}")
        mapped = aligner_pipeline(utterance)
        # [x["lh39"] for x in mapped]
        if mapped:
            for row in mapped:
                print(f" Mapped '{row['ifa_ipa_part']}' -> {row['lh39']} (dist_score: {row['dist']})")
        else:
            print("Results: None")
    # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')
    # Run it
    create_lab_files("/home/rotem/projects/datasets/IFA_dutch_split/test", "/home/rotem/projects/datasets/IFA_dutch_split/test")