import re
import panphon
import panphon.distance

# Shared panphon helpers, constructed once at import time.
# NOTE(review): no use of `ft` is visible in this file — confirm whether it is still needed.
ft = panphon.FeatureTable()
# `dst` supplies feature_edit_distance() for the nearest-phone search in _find_best_leehon39.
dst = panphon.distance.Distance()

# IFA_TO_IPA = TODO - FUNCTION THAT COMPUTES SCORE
# NOT A MAPPING BUT NEEDS TO BE A FUNCTION

# IPA_TO_LEEHON39 = TODO - NOT MAPPING NEEDS TO BE A FUNCTION BUT ONE TIME - MAYBE IT IS MAPPING?

# Case-sensitive lookup from a single IFA phone label to its IPA string; consulted by
# get_ipa_from_ifa via IFA_TO_IPA.get(label, label), so unknown labels pass through unchanged.
# Upper- and lower-case keys are distinct symbols (e.g. "s" -> "s" but "S" -> "ʃ").
# Some values are multi-character IPA strings (long vowels such as "eː", diphthongs such as "ɛi").
# NOTE(review): the label set looks SAMPA-like for Dutch — confirm against the IFA corpus docs.
IFA_TO_IPA = {
    "p":"p", "b":"b", "t":"t", "d":"d", "k":"k", "g":"ɡ",
    "f":"f", "v":"v", "s":"s", "z":"z", "h":"h", "x":"x", "G":"ɣ",
    "m":"m", "n":"n", "N":"ŋ", "l":"l", "r":"r", "w":"ʋ", "j":"j",
    "S":"ʃ", "Z":"ʒ", "J":"ɲ", "L":"ʎ",
    "i":"i", "I":"ɪ", "e":"eː", "E":"ɛ", "a":"aː", "A":"ɑ",
    "o":"oː", "O":"ɔ", "u":"u", "y":"y", "Y":"ʏ", "2":"øː",
    "9":"œ", "@":"ə", "!" : "ɛi", "V" : "ʌu", "W" : "œy", "h#" : "h#"
}

# Candidate inventory for the nearest-phone search: upper-case phone label -> IPA string.
# _find_best_leehon39 iterates this dict in insertion order, compares each IPA value with the
# target via dst.feature_edit_distance, and returns the lower-cased key of the closest entry.
LH39_IPA = {
    "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
    "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
    "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "tʃ", "D": "d",
    "DH": "ð", "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k",
    "L": "l", "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "ɹ",
    "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v", "W": "w",
    "Y": "j", "Z": "z", "ZH": "ʒ"
}

# Lower-case labels that are treated as already-valid targets: get_ipa_from_ifa and
# _find_best_leehon39 both return a label unchanged (lower-cased) when it appears here.
# NOTE(review): this is not the same set as the lower-cased keys of LH39_IPA — it contains
# 'sil', 'dx', 'el' and 'en' but lacks 'aa', 'l', 'n' and 'zh'. Confirm that is intended,
# since the distance search can return labels that are absent from this list.
timit_leehon_39_phonemes = [
    'ao', 'ae', 'ah','aw', 'er', 'ay', 
    'b', 'sil', 'ch', 'd', 'dh', 'dx', 'eh', 'el', 'm', 'en', 'ng', 'ey',
    'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'v', 'w', 'y', 'z', 'sh', 't', 'r', 's', 'th','uh', 'uw', 'oy', 'ow','p'
]

def get_ipa_from_ifa(ifa_label):
    """Convert an IFA phone label into a list of IPA symbols.

    Args:
        ifa_label: A single IFA label (e.g. ``"o:"``), a compound label whose
            parts are joined by ``_`` or ``-`` (e.g. ``"e:-j"``), or a
            whitespace-separated sequence of labels.

    Returns:
        A list of strings. Each element is either a label from
        ``timit_leehon_39_phonemes`` passed through unchanged, ``"sil"``,
        the ``IFA_TO_IPA`` translation of a part, or the part itself when
        the table has no entry for it. The list is empty when nothing is
        left after cleaning.
    """
    lowered = ifa_label.lower()
    # Labels that are already valid target phones pass straight through.
    # An upper/mixed-case label is only lower-cased when it is NOT itself a
    # case-sensitive IFA symbol; otherwise "S" (ʃ), "G" (ɣ), "Z", "Y", "W"
    # and "V" would be collapsed onto "s", "g", "z", "y", "w" and "v" and
    # their IFA_TO_IPA entries could never be reached for a lone label.
    if lowered in timit_leehon_39_phonemes and (
        ifa_label == lowered or ifa_label not in IFA_TO_IPA
    ):
        return [lowered]
    if ifa_label in ['h#', 'tcl']:
        return ["sil"]

    # Underscores and hyphens separate parts; colons (length marks) are
    # dropped because length is carried by the base-vowel mapping.
    cleaned = ifa_label.replace(':', '').replace('_', ' ').replace('-', ' ')
    # Remove stress ("), secondary stress ('), syllable dots (.) and nasal tildes (~).
    cleaned = re.sub(r'[".\'~]', '', cleaned)
    parts = cleaned.split()
    if not parts:
        return []

    if len(parts) == 1:
        token = parts[0]
        # A lone multi-character token with no table entry is a run of
        # single-character phones. Split the *cleaned* token, not the raw
        # label, so the markers stripped above are not re-introduced
        # (previously "o:" produced ["oː", ":"]).
        if len(token) > 1 and token not in IFA_TO_IPA:
            parts = list(token)

    # NOTE: when the label has several parts, a multi-character part that is
    # missing from IFA_TO_IPA is passed through as-is rather than split.
    return [IFA_TO_IPA.get(part, part) for part in parts]

# Memo table for find_best_leehon39: target IPA string -> (label, distance).
_leehon39_cache = {}


def find_best_leehon39(target_ipa):
    """Return the ``(label, distance)`` pair for ``target_ipa``, memoized.

    The underlying search in ``_find_best_leehon39`` compares the target
    against a fixed inventory, so the answer for a given string never
    changes; each distinct input is therefore computed once and then served
    from ``_leehon39_cache``.
    """
    try:
        return _leehon39_cache[target_ipa]
    except KeyError:
        pass

    answer = _find_best_leehon39(target_ipa)
    _leehon39_cache[target_ipa] = answer
    return answer

def _find_best_leehon39(target_ipa):
    """Uncached search for the inventory label closest to ``target_ipa``.

    Returns:
        A ``(label, distance)`` tuple. The distance is ``0.0`` for the
        short-circuit cases below; otherwise it is the smallest
        ``dst.feature_edit_distance`` against the values of ``LH39_IPA``,
        rounded to three decimals, and the label is the lower-cased key of
        that entry. If no candidate scores below 100.0 the result is
        ``("sil", 100.0)``.
    """
    # Empty or whitespace-only input counts as silence.
    if not target_ipa or not target_ipa.strip():
        return "sil", 0.0

    lowered = target_ipa.lower()

    # Already a known target label: nothing to search for.
    if lowered in timit_leehon_39_phonemes:
        return lowered, 0.0
    # Explicit silence markers.
    if lowered in ('h#', 'sil'):
        return "sil", 0.0
    # Rhotics are pinned to "r" instead of being left to the distance search.
    if lowered in ('r', 'ɾ'):
        return "r", 0.0

    # Linear scan; the strict "<" keeps the first entry among equal distances.
    winner, winner_dist = "sil", 100.0
    for candidate_label, candidate_ipa in LH39_IPA.items():
        dist = dst.feature_edit_distance(target_ipa, candidate_ipa)
        if dist < winner_dist:
            winner, winner_dist = candidate_label.lower(), dist
    return winner, round(winner_dist, 3)

def aligner_pipeline(ifa_input):
    """Map an IFA label (or label sequence) to per-segment match records.

    The input is expanded with ``get_ipa_from_ifa`` and every resulting
    segment is looked up with ``find_best_leehon39``.

    Returns:
        A list with one dict per segment, holding the keys
        ``"ifa_ipa_part"`` (the segment), ``"lh39"`` (the matched label) and
        ``"dist"`` (the reported distance). Empty when the input yields no
        segments.
    """
    def _record(segment):
        label, distance = find_best_leehon39(segment)
        return {"ifa_ipa_part": segment, "lh39": label, "dist": distance}

    return [_record(segment) for segment in get_ipa_from_ifa(ifa_input)]


import os

# def convert_all_lab_files(directory):
#     for filename in os.listdir(directory):
#         if filename.endswith(".lab"):
#             path = os.path.join(directory, filename)
#             with open(path, 'r') as f:
#                 content = f.read().strip()
            
#             # Use your existing pipeline logic
#             # Note: We split the content by space to process each phone
#             ifa_phones = content.split()
#             ipa_output = []
#             for p in ifa_phones:
#                 # Get the IPA parts from your existing function
#                 ipa_parts = get_ipa_from_ifa(p)
#                 ipa_output.extend(ipa_parts)
            
#             # Join with spaces and write back
#             new_content = " ".join(ipa_output)
#             with open(path, 'w') as f:
#                 f.write(new_content)
#     print(f"Done! All .lab files in {directory} converted to IPA.")

# # Run this in your main block
# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')

# # convert_all_lab_files('/home/rotem/projects/datasets/IFA_dutch_split/test')


import os

def create_lab_files(phn_folder, lab_folder):
    """Write one space-separated ``.lab`` transcript per ``.phn`` file.

    Every line of a ``.phn`` file that has at least three whitespace-separated
    fields contributes its third field (the phone label), converted with
    ``get_ipa_from_ifa``. Shorter lines are skipped. Silence labels are kept
    in the output.

    Args:
        phn_folder: Directory scanned (non-recursively) for ``*.phn`` files.
        lab_folder: Directory that receives the ``.lab`` files; created if
            missing. May be the same directory as ``phn_folder``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(lab_folder, exist_ok=True)

    suffix = ".phn"
    for filename in os.listdir(phn_folder):
        if not filename.endswith(suffix):
            continue

        ipa_sequence = []
        with open(os.path.join(phn_folder, filename), 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 3:
                    continue
                ipa_sequence.extend(get_ipa_from_ifa(parts[2]))

        # Swap only the trailing extension; str.replace would also rewrite a
        # ".phn" that happens to occur earlier in the file name.
        lab_filename = filename[:-len(suffix)] + ".lab"
        # The transcript contains IPA characters, so the encoding is pinned to
        # UTF-8 instead of relying on the platform default.
        with open(os.path.join(lab_folder, lab_filename), 'w', encoding='utf-8') as f:
            f.write(" ".join(ipa_sequence))
                
def generate_ipa_lexicon(all_ipa_symbols, output_path):
    """Write an identity lexicon (``symbol<TAB>symbol``) to ``output_path``.

    The first line is always ``sil<TAB>sil``. It is followed by one line per
    distinct symbol in ``all_ipa_symbols`` other than ``"sil"``, in sorted
    order.

    Args:
        all_ipa_symbols: Iterable of symbol strings; duplicates are ignored.
        output_path: File to create or overwrite.
    """
    lines = ["sil\tsil\n"]
    lines.extend(
        f"{symbol}\t{symbol}\n"
        for symbol in sorted(set(all_ipa_symbols))
        if symbol != "sil"
    )
    # IPA symbols are non-ASCII; pin UTF-8 so the output does not depend on
    # the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

# Run it
# create_lab_files("/home/rotem/projects/datasets/IFA_dutch_split/test", "/home/rotem/projects/datasets/IFA_dutch_split/test")

if __name__ == "__main__":
    # Demo: run the pipeline on a sample utterance and print each mapping,
    # then build the .lab transcripts for the test split (in place).
    dataset_dir = "/home/rotem/projects/datasets/IFA_dutch_split/test"

    test_cases = ["sil n Y l sil e: n sil t w e: sil d r i sil v i r sil v Ei f sil z E s sil z e: v @ n sil A x t sil n e: x @ sil t i n sil E l f sil t w a: l f sil n Y l sil sil"]
    # Alternative single-label cases:
    # test_cases = ["x@l", "@-r-h-a", "e:-j", "r9y", "ao", "sil", "@", "E", "he:l-@_hAr", "t_b", "o:", "N"]
    for case in test_cases:
        print(f"\nINPUT: {case}")
        mappings = aligner_pipeline(case)
        if mappings:
            for entry in mappings:
                segment = entry['ifa_ipa_part']
                label = entry['lh39']
                score = entry['dist']
                print(f" Mapped '{segment}' -> {label} (dist_score: {score})")
        else:
            print("Results: None")

    # The .phn sources and the generated .lab files share one directory.
    create_lab_files(dataset_dir, dataset_dir)