Spaces:
Sleeping
Sleeping
| """ | |
| phonological_features.py | |
| ======================== | |
| Defines the 35 phonological features from Table 1 of Shahin et al. (2025) | |
| and provides the phoneme-to-feature mapping for the 39-phoneme CMU set. | |
| Feature categories (paper Table 1): | |
| Manners: consonant, sonorant, fricative, nasal, stop, approximant, | |
| affricate, liquid, vowel, semivowel, continuant | |
| Places: alveolar, palatal, dental, glottal, labial, velar, mid, high, | |
| low, front, back, central, anterior, posterior, retroflex, | |
| bilabial, coronal, dorsal | |
| Others: long, short, monophthong, diphthong, round, voiced | |
| The model output has 71 nodes: 35 (+att) + 35 (-att) + 1 (shared blank). | |
| """ | |
# -----------------------------------------------------------------------------
# The 35 phonological features (paper Table 1), in a fixed canonical order.
# A feature's position in this list is its feature index (see FEATURE_TO_IDX),
# and the output-node layout further down is computed from the list length.
# -----------------------------------------------------------------------------
PHONOLOGICAL_FEATURES = [
    # Manners (11)
    "consonant", "sonorant", "fricative", "nasal", "stop",
    "approximant", "affricate", "liquid", "vowel", "semivowel", "continuant",
    # Places (18)
    "alveolar", "palatal", "dental", "glottal", "labial", "velar",
    "mid", "high", "low", "front", "back", "central",
    "anterior", "posterior", "retroflex", "bilabial", "coronal", "dorsal",
    # Others (6)
    "long", "short", "monophthong", "diphthong", "round", "voiced",
]
assert len(PHONOLOGICAL_FEATURES) == 35, "Must have exactly 35 features"
# A repeated name would still satisfy the length check above but would
# collapse to a single key in FEATURE_TO_IDX, so check uniqueness explicitly.
assert len(set(PHONOLOGICAL_FEATURES)) == len(PHONOLOGICAL_FEATURES), (
    "Feature names must be unique"
)

# Feature name -> feature index (position in PHONOLOGICAL_FEATURES).
FEATURE_TO_IDX = {feat: i for i, feat in enumerate(PHONOLOGICAL_FEATURES)}
NUM_FEATURES = len(PHONOLOGICAL_FEATURES)

# -----------------------------------------------------------------------------
# Output node layout (paper Section 3.3):
#   nodes 0..34   -> +att for features 0..34
#   nodes 35..69  -> -att for features 0..34
#   node  70      -> shared blank
# Both constants are computed from NUM_FEATURES so the layout cannot drift
# away from the feature list.
# -----------------------------------------------------------------------------
NUM_OUTPUT_NODES = 2 * NUM_FEATURES + 1  # 35 (+att) + 35 (-att) + 1 (blank) = 71
BLANK_IDX = 2 * NUM_FEATURES  # 70: the blank is the last output node
def feature_idx_to_pos_node(feat_idx: int) -> int:
    """Map a feature index to the output node that represents its +att label.

    The +att node of a feature has the same index as the feature itself, so
    the value is passed through unchanged.
    """
    pos_node = feat_idx
    return pos_node
def feature_idx_to_neg_node(feat_idx: int) -> int:
    """Map a feature index to the output node that represents its -att label.

    The -att nodes follow the +att nodes, so the result is the feature index
    shifted by NUM_FEATURES.
    """
    offset = NUM_FEATURES
    return feat_idx + offset
# -----------------------------------------------------------------------------
# CMU 39-phoneme set (TIMIT 61->39 reduced set used in the paper).
# The order below defines PHONEME_TO_IDX and is not strictly alphabetical
# ("ao" follows "ay"), so it is reproduced exactly.
# -----------------------------------------------------------------------------
CMU_39_PHONEMES = (
    "aa ae ah aw ay ao "
    "b ch d dh eh "
    "er ey f g hh "
    "ih iy jh k l "
    "m n ng ow oy "
    "p r s sh t "
    "th uh uw v w "
    "y z zh"
).split()

# Phoneme symbol -> position in CMU_39_PHONEMES.
PHONEME_TO_IDX = dict(zip(CMU_39_PHONEMES, range(len(CMU_39_PHONEMES))))
NUM_PHONEMES = len(CMU_39_PHONEMES)  # 39
# -----------------------------------------------------------------------------
# Phoneme -> phonological feature binary vector
# Each phoneme maps to a dict {feature_name: True/False}.
# Derived from standard phonological feature charts (Chomsky & Halle 1968,
# as referenced in the paper).
# -----------------------------------------------------------------------------
def _p(features_present: list[str]) -> dict[str, bool]:
    """Build a complete feature dict from the list of features that are present.

    Args:
        features_present: Names of the features that are present (+att) for
            a phoneme. Every name must appear in PHONOLOGICAL_FEATURES.

    Returns:
        A dict with one entry per name in PHONOLOGICAL_FEATURES, in that
        order: True for the listed features and False for all others.

    Raises:
        ValueError: If a listed name is not in PHONOLOGICAL_FEATURES. Without
            this check a misspelled name would be dropped silently and the
            phoneme would get a wrong feature vector.
    """
    present = set(features_present)
    unknown = present.difference(PHONOLOGICAL_FEATURES)
    if unknown:
        raise ValueError(f"Unknown phonological feature(s): {sorted(unknown)}")
    return {f: (f in present) for f in PHONOLOGICAL_FEATURES}
# Phoneme symbol -> full feature dict (one bool per phonological feature).
# Each entry lists only the features that are present; _p() fills in the rest
# as False. Every vowel carries "continuant".
PHONEME_FEATURES: dict[str, dict[str, bool]] = {
    # -- Stops ----------------------------------------------------------------
    "p": _p(["consonant", "stop", "labial", "anterior", "bilabial"]),
    "b": _p(["consonant", "stop", "labial", "anterior", "bilabial",
             "voiced"]),
    "t": _p(["consonant", "stop", "alveolar", "anterior", "coronal"]),
    "d": _p(["consonant", "stop", "alveolar", "anterior", "coronal",
             "voiced"]),
    "k": _p(["consonant", "stop", "velar", "posterior", "dorsal"]),
    "g": _p(["consonant", "stop", "velar", "posterior", "dorsal",
             "voiced"]),
    # -- Fricatives -----------------------------------------------------------
    "f": _p(["consonant", "fricative", "continuant", "labial", "anterior"]),
    "v": _p(["consonant", "fricative", "continuant", "labial", "anterior",
             "voiced"]),
    "th": _p(["consonant", "fricative", "continuant", "dental", "anterior",
              "coronal"]),
    "dh": _p(["consonant", "fricative", "continuant", "dental", "anterior",
              "coronal", "voiced"]),
    "s": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
             "coronal"]),
    "z": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
             "coronal", "voiced"]),
    "sh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
              "coronal"]),
    "zh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
              "coronal", "voiced"]),
    "hh": _p(["consonant", "fricative", "continuant", "glottal", "posterior",
              "dorsal"]),
    # -- Affricates -----------------------------------------------------------
    "ch": _p(["consonant", "affricate", "palatal", "posterior", "coronal"]),
    "jh": _p(["consonant", "affricate", "palatal", "posterior", "coronal",
              "voiced"]),
    # -- Nasals ---------------------------------------------------------------
    "m": _p(["consonant", "sonorant", "nasal", "continuant", "labial",
             "anterior", "bilabial", "voiced"]),
    "n": _p(["consonant", "sonorant", "nasal", "continuant", "alveolar",
             "anterior", "coronal", "voiced"]),
    "ng": _p(["consonant", "sonorant", "nasal", "continuant", "velar",
              "posterior", "dorsal", "voiced"]),
    # -- Liquids --------------------------------------------------------------
    "l": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
             "alveolar", "anterior", "coronal", "voiced"]),
    "r": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
             "alveolar", "anterior", "retroflex", "coronal", "voiced"]),
    # -- Semivowels (Glides) --------------------------------------------------
    "w": _p(["sonorant", "approximant", "semivowel", "continuant", "labial",
             "high", "anterior", "bilabial", "round", "voiced"]),
    "y": _p(["sonorant", "approximant", "semivowel", "continuant", "palatal",
             "high", "posterior", "coronal", "voiced"]),
    # -- Short Monophthong Vowels ---------------------------------------------
    "ih": _p(["sonorant", "vowel", "continuant", "high", "front",
              "short", "monophthong", "voiced"]),
    # "continuant" added: it was missing here although all other vowels have it.
    "eh": _p(["sonorant", "vowel", "continuant", "mid", "front",
              "short", "monophthong", "voiced"]),
    # NOTE(review): "ae" is marked "long" although it is listed in the
    # short-vowel group - confirm against the paper's feature table.
    "ae": _p(["sonorant", "vowel", "continuant", "low", "front",
              "long", "monophthong", "voiced"]),
    "ah": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "short", "monophthong", "voiced"]),
    "uh": _p(["sonorant", "vowel", "continuant", "high", "back",
              "short", "monophthong", "round", "voiced"]),
    # -- Long Monophthong Vowels ----------------------------------------------
    "iy": _p(["sonorant", "vowel", "continuant", "high", "front",
              "long", "monophthong", "voiced"]),
    "aa": _p(["sonorant", "vowel", "continuant", "low", "back",
              "long", "monophthong", "voiced"]),
    "ao": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "long", "monophthong", "round", "voiced"]),
    # NOTE(review): "er" is marked "short" although it is listed in the
    # long-vowel group - confirm against the paper's feature table.
    "er": _p(["sonorant", "vowel", "continuant", "mid", "central",
              "retroflex", "short", "monophthong", "voiced"]),
    "uw": _p(["sonorant", "vowel", "continuant", "high", "back",
              "long", "monophthong", "round", "voiced"]),
    # -- Diphthongs -----------------------------------------------------------
    "ey": _p(["sonorant", "vowel", "continuant", "mid", "front",
              "long", "diphthong", "voiced"]),
    "aw": _p(["sonorant", "vowel", "continuant", "low", "central",
              "long", "diphthong", "round", "voiced"]),
    # "continuant" added: it was missing here although all other vowels have it.
    "ay": _p(["sonorant", "vowel", "continuant", "low", "central",
              "long", "diphthong", "voiced"]),
    "oy": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "long", "diphthong", "round", "voiced"]),
    "ow": _p(["sonorant", "vowel", "continuant", "mid", "central",
              "long", "diphthong", "round", "voiced"]),
    # -- Silence --------------------------------------------------------------
    # Paper: "All silence labels were further removed leaving silence frames
    # to be handled by the blank label."
    "sil": _p([]),  # all features absent; treated as blank during training
}
# Import-time consistency check: the feature table must define exactly the 39
# CMU phonemes plus "sil". "sil" is deliberately extra - it is a fallback/blank
# placeholder rather than a speech target, so it appears in PHONEME_FEATURES
# but not in CMU_39_PHONEMES.
_expected = {*CMU_39_PHONEMES, "sil"}
assert _expected == set(PHONEME_FEATURES), (
    f"Missing from PHONEME_FEATURES : {_expected - set(PHONEME_FEATURES.keys())}\n"
    f"Unexpected in PHONEME_FEATURES: {set(PHONEME_FEATURES.keys()) - _expected}"
)
assert NUM_PHONEMES == 39, f"Expected 39 phonemes, got {NUM_PHONEMES}"
def phoneme_to_feature_vector(phoneme: str) -> list[bool]:
    """Look up a phoneme and return its features as an ordered list of bools.

    The list has one entry per name in PHONOLOGICAL_FEATURES, in that order.
    A phoneme that is not a key of PHONEME_FEATURES falls back to the "sil"
    entry, i.e. every feature is False.
    """
    key = phoneme if phoneme in PHONEME_FEATURES else "sil"
    flags = PHONEME_FEATURES[key]
    return [flags[name] for name in PHONOLOGICAL_FEATURES]
def phoneme_sequence_to_feature_sequences(
    phonemes: list[str],
) -> list[list[int]]:
    """
    Turn a phoneme sequence into one binary label sequence per feature.

    Returns:
        feature_seqs: list of NUM_FEATURES lists; list i holds, for each
                      phoneme position, 1 (+att) if feature i is present
                      and 0 (-att) otherwise. An empty input yields
                      NUM_FEATURES empty lists.
    """
    # One feature vector per phoneme, computed once up front.
    vectors = [phoneme_to_feature_vector(ph) for ph in phonemes]
    # Transpose: outer index is the feature, inner index the phoneme position.
    return [
        [1 if vec[feat_idx] else 0 for vec in vectors]
        for feat_idx in range(NUM_FEATURES)
    ]
def feature_sequences_to_ctc_labels(
    feature_seqs: list[list[int]],
) -> list[list[int]]:
    """
    Translate per-feature binary sequences (0/1) into CTC output-node indices.

    For the sequence at position i (feature index i):
        - a value equal to 1 (+att) -> feature_idx_to_pos_node(i)
        - any other value (-att)    -> feature_idx_to_neg_node(i)

    Returns:
        ctc_labels: one list of node indices (int) per input sequence,
                    in the same order and with the same lengths.
    """
    return [
        [
            feature_idx_to_pos_node(idx) if flag == 1
            else feature_idx_to_neg_node(idx)
            for flag in flags
        ]
        for idx, flags in enumerate(feature_seqs)
    ]