| | import os |
| | import json |
| | import re |
| | import string |
| |
|
| | |
def load_custom_dict(dict_path):
    """Load the word -> IPA pronunciation dictionary from a JSON file.

    Args:
        dict_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON object (expected: dict mapping words to IPA strings).
    """
    with open(dict_path, encoding='utf-8') as fp:
        return json.load(fp)
| |
|
# Path to the word -> syllabified-IPA dictionary (JSON), relative to the CWD.
custom_dict_path = "./syllable_dict_ENNI_refine.json"
# Loaded once at import time; consulted by get_pronunciation_from_dict().
# NOTE(review): import fails if the file is missing — presumably intentional
# since the whole script is useless without it; confirm.
custom_dict = load_custom_dict(custom_dict_path)
| |
|
# IPA vowel phonemes: long vowels, short vowels/schwa, and diphthongs.
# Membership here marks a phoneme as 'V' in CV patterns (phoneme_type);
# any symbol not listed is treated as a consonant ('C'). The multi-character
# entries are also what split_syllable_into_phonemes matches greedily.
vowels_phonemes = [
    "iː", "uː", "ɜː", "ɔː", "ɑː",
    "ɪ", "ʊ", "e", "ə", "æ", "ʌ", "ɛ", "ɒ",
    "eɪ", "aɪ", "ɔɪ", "aʊ", "əʊ", "ɪə", "eə", "ʊə"
]
| |
|
def phoneme_type(phoneme):
    """Classify an IPA phoneme as vowel ('V') or consonant ('C').

    Any symbol not present in ``vowels_phonemes`` counts as a consonant.
    """
    if phoneme in vowels_phonemes:
        return 'V'
    return 'C'
| |
|
def get_pronunciation_from_dict(word):
    """Look up the IPA pronunciation of *word* in the custom dictionary.

    The word is normalized (surrounding punctuation stripped, lowercased)
    before lookup. Returns "" when there is no dictionary entry.
    """
    key = word.strip(string.punctuation).lower()
    return custom_dict.get(key, "")
| |
|
def split_ipa_into_syllables(ipa_str):
    """Split an IPA string into its syllables.

    Primary (ˈ) and secondary (ˌ) stress marks are treated as syllable
    boundaries, just like the explicit '.' separators; empty or
    whitespace-only fragments are discarded.
    """
    normalized = ipa_str.replace("ˈ", ".").replace("ˌ", ".")
    return [part for part in normalized.split('.') if part.strip()]
| |
|
def split_syllable_into_phonemes(syllable):
    """Tokenize an IPA syllable string into a list of phonemes.

    At each position the longest matching vowel from ``vowels_phonemes``
    is consumed greedily (longest-first so diphthongs beat their single
    characters); any character that starts no known vowel is emitted as a
    single-character phoneme (treated downstream as a consonant).
    """
    by_length = sorted(vowels_phonemes, key=len, reverse=True)
    phonemes = []
    pos = 0
    end = len(syllable)
    while pos < end:
        for vowel in by_length:
            if syllable.startswith(vowel, pos):
                phonemes.append(vowel)
                pos += len(vowel)
                break
        else:
            # No vowel starts here: take one raw character.
            phonemes.append(syllable[pos])
            pos += 1
    return phonemes
| |
|
def analyze_word_syllables(word):
    """Break *word* into syllables annotated with phonemes and CV pattern.

    Returns a list of dicts, one per syllable, each carrying the keys
    "syllable" (joined phonemes), "phonemes" (list of IPA symbols) and
    "CV_pattern" (e.g. "CVC"). Returns [] when the word has no entry in
    the pronunciation dictionary.
    """
    ipa = get_pronunciation_from_dict(word)
    if not ipa:
        return []
    result = []
    for syllable in split_ipa_into_syllables(ipa):
        phonemes = split_syllable_into_phonemes(syllable)
        result.append({
            "syllable": "".join(phonemes),
            "phonemes": phonemes,
            "CV_pattern": "".join(phoneme_type(p) for p in phonemes),
        })
    return result
| |
|
def annotate_syllables(session_id, base_dir="session_data"):
    """Annotate a session's transcription JSON with syllable information.

    Reads ``{base_dir}/{session_id}/{session_id}_transcriptionCW.json``,
    attaches a "syllables" list to every segment — skipping bracketed
    tokens such as "[noise]" — rewrites the file in place (UTF-8, indented,
    non-ASCII preserved) and returns the updated data structure.
    Returns None if the transcription file does not exist.
    """
    json_file = os.path.join(base_dir, session_id, f"{session_id}_transcriptionCW.json")
    if not os.path.exists(json_file):
        print(f"[Error] Cannot find file: {json_file}")
        return

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    bracketed = re.compile(r"\[.*?\]")
    for segment in data.get("segments", []):
        annotated = []
        for idx, word_obj in enumerate(segment.get("words", [])):
            token = word_obj.get("word", "")
            # Bracketed tokens (e.g. "[laugh]") are annotations, not words.
            if bracketed.fullmatch(token):
                continue
            for syl in analyze_word_syllables(token):
                # word_index is the token's position in the segment's word
                # list (skipped bracketed tokens still consume an index).
                syl["word_index"] = idx
                annotated.append(syl)
        segment["syllables"] = annotated

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Session {session_id} syllable annotation done: {json_file}")
    return data
| |
|
| |
|
if __name__ == "__main__":
    # Example invocation: annotate session "000030" under the default
    # base directory ("session_data").
    annotate_syllables("000030")
| |
|