Pronunciation-Coach / phonological_features.py
heldtomaturity's picture
initial app deploy
0515ef3
"""
phonological_features.py
========================
Defines the 35 phonological features from Table 1 of Shahin et al. (2025)
and provides the phoneme-to-feature mapping for the 39-phoneme CMU set.
Feature categories (paper Table 1):
Manners: consonant, sonorant, fricative, nasal, stop, approximant,
affricate, liquid, vowel, semivowel, continuant
Places: alveolar, palatal, dental, glottal, labial, velar, mid, high,
low, front, back, central, anterior, posterior, retroflex,
bilabial, coronal, dorsal
Others: long, short, monophthong, diphthong, round, voiced
The model output has 71 nodes: 35 (+att) + 35 (-att) + 1 (shared blank).
"""
# ─────────────────────────────────────────────────────────────────────────────
# Canonical, fixed ordering of the 35 phonological features (paper Table 1).
# A feature's position in this list is its feature index everywhere else.
# ─────────────────────────────────────────────────────────────────────────────
PHONOLOGICAL_FEATURES = (
    # Manner features (11)
    [
        "consonant", "sonorant", "fricative", "nasal", "stop",
        "approximant", "affricate", "liquid", "vowel", "semivowel",
        "continuant",
    ]
    # Place features (18)
    + [
        "alveolar", "palatal", "dental", "glottal", "labial", "velar",
        "mid", "high", "low", "front", "back", "central",
        "anterior", "posterior", "retroflex", "bilabial", "coronal",
        "dorsal",
    ]
    # Remaining features (6)
    + [
        "long", "short", "monophthong", "diphthong", "round", "voiced",
    ]
)
assert len(PHONOLOGICAL_FEATURES) == 35, "Must have exactly 35 features"

# Reverse lookup: feature name -> index into PHONOLOGICAL_FEATURES.
FEATURE_TO_IDX = dict(
    zip(PHONOLOGICAL_FEATURES, range(len(PHONOLOGICAL_FEATURES)))
)
NUM_FEATURES = len(PHONOLOGICAL_FEATURES)
# ─────────────────────────────────────────────────────────────────────────────
# Output node layout (paper Section 3.3):
#   nodes 0..34  → +att for features 0..34
#   nodes 35..69 → -att for features 0..34
#   node 70      → shared blank
# ─────────────────────────────────────────────────────────────────────────────
BLANK_IDX = 70  # the shared blank is the last output node
NUM_OUTPUT_NODES = BLANK_IDX + 1  # 35 (+att) + 35 (-att) + 1 (blank) = 71
def feature_idx_to_pos_node(feat_idx: int) -> int:
    """Map a feature index to the output node that signals +att.

    The +att nodes come first in the output layout and follow feature order,
    so the node index is the feature index itself.
    """
    pos_node = feat_idx
    return pos_node
def feature_idx_to_neg_node(feat_idx: int) -> int:
    """Map a feature index to the output node that signals -att.

    The -att nodes follow the block of +att nodes, so the result is the
    feature index shifted by NUM_FEATURES.
    """
    return NUM_FEATURES + feat_idx
# ─────────────────────────────────────────────────────────────────────────────
# CMU 39-phoneme set (TIMIT 61→39 reduced set used in the paper)
# The order below defines PHONEME_TO_IDX; note that "ao" sits after "ay"
# rather than in alphabetical position, and must stay there.
# ─────────────────────────────────────────────────────────────────────────────
CMU_39_PHONEMES = (
    "aa ae ah aw ay ao "
    "b ch d dh eh "
    "er ey f g hh "
    "ih iy jh k l "
    "m n ng ow oy "
    "p r s sh t "
    "th uh uw v w "
    "y z zh"
).split()
NUM_PHONEMES = len(CMU_39_PHONEMES)  # 39

# Reverse lookup: phoneme symbol -> index into CMU_39_PHONEMES.
PHONEME_TO_IDX = dict(zip(CMU_39_PHONEMES, range(NUM_PHONEMES)))
# ─────────────────────────────────────────────────────────────────────────────
# Phoneme → phonological feature binary vector
# Each phoneme maps to a dict {feature_name: True/False}.
# Derived from standard phonological feature charts (Chomsky & Halle 1968,
# as referenced in the paper).
# ─────────────────────────────────────────────────────────────────────────────
def _p(features_present: list[str]) -> dict[str, bool]:
    """Build a complete feature dict from the features a phoneme carries.

    Args:
        features_present: Names of the features that are present (+att) for
            the phoneme. Every name must appear in PHONOLOGICAL_FEATURES.

    Returns:
        A dict with one key per entry of PHONOLOGICAL_FEATURES, in that
        order, whose value is True when the feature is listed in
        ``features_present`` and False otherwise.

    Raises:
        ValueError: If ``features_present`` contains a name that is not in
            PHONOLOGICAL_FEATURES. Without this check a misspelled feature
            would be dropped silently and the phoneme would get a wrong,
            but valid-looking, feature vector.
    """
    present = set(features_present)
    unknown = present.difference(PHONOLOGICAL_FEATURES)
    if unknown:
        raise ValueError(
            f"Unknown phonological feature(s): {sorted(unknown)}"
        )
    return {feat: (feat in present) for feat in PHONOLOGICAL_FEATURES}
# Phoneme symbol -> {feature_name: bool} for every entry of
# PHONOLOGICAL_FEATURES. Only the features listed for a phoneme are True.
# All vowels and diphthongs are marked "continuant".
PHONEME_FEATURES: dict[str, dict[str, bool]] = {
    # ── Stops ──────────────────────────────────────────────────────────────
    "p": _p(["consonant", "stop", "labial", "anterior", "bilabial"]),
    "b": _p(["consonant", "stop", "labial", "anterior", "bilabial",
             "voiced"]),
    "t": _p(["consonant", "stop", "alveolar", "anterior", "coronal"]),
    "d": _p(["consonant", "stop", "alveolar", "anterior", "coronal",
             "voiced"]),
    "k": _p(["consonant", "stop", "velar", "posterior", "dorsal"]),
    "g": _p(["consonant", "stop", "velar", "posterior", "dorsal",
             "voiced"]),
    # ── Fricatives ─────────────────────────────────────────────────────────
    "f": _p(["consonant", "fricative", "continuant", "labial", "anterior"]),
    "v": _p(["consonant", "fricative", "continuant", "labial", "anterior",
             "voiced"]),
    "th": _p(["consonant", "fricative", "continuant", "dental", "anterior",
              "coronal"]),
    "dh": _p(["consonant", "fricative", "continuant", "dental", "anterior",
              "coronal", "voiced"]),
    "s": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
             "coronal"]),
    "z": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
             "coronal", "voiced"]),
    "sh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
              "coronal"]),
    "zh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
              "coronal", "voiced"]),
    "hh": _p(["consonant", "fricative", "continuant", "glottal", "posterior",
              "dorsal"]),
    # ── Affricates ─────────────────────────────────────────────────────────
    "ch": _p(["consonant", "affricate", "palatal", "posterior", "coronal"]),
    "jh": _p(["consonant", "affricate", "palatal", "posterior", "coronal",
              "voiced"]),
    # ── Nasals ─────────────────────────────────────────────────────────────
    "m": _p(["consonant", "sonorant", "nasal", "continuant", "labial",
             "anterior", "bilabial", "voiced"]),
    "n": _p(["consonant", "sonorant", "nasal", "continuant", "alveolar",
             "anterior", "coronal", "voiced"]),
    "ng": _p(["consonant", "sonorant", "nasal", "continuant", "velar",
              "posterior", "dorsal", "voiced"]),
    # ── Liquids ────────────────────────────────────────────────────────────
    "l": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
             "alveolar", "anterior", "coronal", "voiced"]),
    "r": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
             "alveolar", "anterior", "retroflex", "coronal", "voiced"]),
    # ── Semivowels (Glides) ────────────────────────────────────────────────
    "w": _p(["sonorant", "approximant", "semivowel", "continuant", "labial",
             "high", "anterior", "bilabial", "round", "voiced"]),
    "y": _p(["sonorant", "approximant", "semivowel", "continuant", "palatal",
             "high", "posterior", "coronal", "voiced"]),
    # ── Short Monophthong Vowels ───────────────────────────────────────────
    "ih": _p(["sonorant", "vowel", "continuant", "high", "front",
              "short", "monophthong", "voiced"]),
    "eh": _p(["sonorant", "vowel", "continuant", "mid", "front",
              "short", "monophthong", "voiced"]),
    # NOTE(review): "ae" is tagged "long" although it is listed under the
    # short-vowel heading — confirm against the paper's mapping table.
    "ae": _p(["sonorant", "vowel", "continuant", "low", "front",
              "long", "monophthong", "voiced"]),
    "ah": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "short", "monophthong", "voiced"]),
    "uh": _p(["sonorant", "vowel", "continuant", "high", "back",
              "short", "monophthong", "round", "voiced"]),
    # ── Long Monophthong Vowels ────────────────────────────────────────────
    "iy": _p(["sonorant", "vowel", "continuant", "high", "front",
              "long", "monophthong", "voiced"]),
    "aa": _p(["sonorant", "vowel", "continuant", "low", "back",
              "long", "monophthong", "voiced"]),
    "ao": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "long", "monophthong", "round", "voiced"]),
    # NOTE(review): "er" is tagged "short" although it is listed under the
    # long-vowel heading — confirm against the paper's mapping table.
    "er": _p(["sonorant", "vowel", "continuant", "mid", "central",
              "retroflex", "short", "monophthong", "voiced"]),
    "uw": _p(["sonorant", "vowel", "continuant", "high", "back",
              "long", "monophthong", "round", "voiced"]),
    # ── Diphthongs ─────────────────────────────────────────────────────────
    "ey": _p(["sonorant", "vowel", "continuant", "mid", "front",
              "long", "diphthong", "voiced"]),
    "aw": _p(["sonorant", "vowel", "continuant", "low", "central",
              "long", "diphthong", "round", "voiced"]),
    "ay": _p(["sonorant", "vowel", "continuant", "low", "central",
              "long", "diphthong", "voiced"]),
    "oy": _p(["sonorant", "vowel", "continuant", "mid", "back",
              "long", "diphthong", "round", "voiced"]),
    "ow": _p(["sonorant", "vowel", "continuant", "mid", "central",
              "long", "diphthong", "round", "voiced"]),
    # ── Silence ────────────────────────────────────────────────────────────
    # Paper: "All silence labels were further removed leaving silence frames
    # to be handled by the blank label."
    "sil": _p([]),  # all features absent; treated as blank during training
}
# Import-time sanity check: PHONEME_FEATURES must define exactly the 39 CMU
# phonemes plus "sil".
# "sil" is intentionally extra — it is a fallback/blank placeholder, not a
# speech target, so it lives in PHONEME_FEATURES but not in CMU_39_PHONEMES.
_expected = {*CMU_39_PHONEMES, "sil"}
assert _expected == set(PHONEME_FEATURES.keys()), (
    f"Missing from PHONEME_FEATURES : {_expected - set(PHONEME_FEATURES.keys())}\n"
    f"Unexpected in PHONEME_FEATURES: {set(PHONEME_FEATURES.keys()) - _expected}"
)
assert NUM_PHONEMES == 39, f"Expected 39 phonemes, got {NUM_PHONEMES}"
def phoneme_to_feature_vector(phoneme: str) -> list[bool]:
    """Return the 35-element binary feature vector for a phoneme.

    The label is first looked up exactly as given. If that fails and the
    label is a string, it is retried after trimming whitespace, removing a
    trailing ARPAbet stress digit (0/1/2) and lower-casing, so CMUdict-style
    labels such as "AH0" or "SH" resolve to "ah" / "sh" instead of silently
    becoming silence.

    Args:
        phoneme: Phoneme label, e.g. "ah", "AH0" or "sil".

    Returns:
        A list of booleans ordered like PHONOLOGICAL_FEATURES. Labels that
        cannot be resolved fall back to the all-False "sil" vector.
    """
    feat_dict = PHONEME_FEATURES.get(phoneme)
    if feat_dict is None and isinstance(phoneme, str):
        # Exact lookup missed: retry with the label in the table's own
        # convention (lower-case, no stress marker).
        normalized = phoneme.strip().rstrip("012").lower()
        feat_dict = PHONEME_FEATURES.get(normalized)
    if feat_dict is None:
        # Unknown label: keep the established fallback to silence.
        feat_dict = PHONEME_FEATURES["sil"]
    return [feat_dict[f] for f in PHONOLOGICAL_FEATURES]
def phoneme_sequence_to_feature_sequences(
    phonemes: list[str],
) -> list[list[int]]:
    """
    Turn one phoneme sequence into N=35 parallel binary label sequences.

    Each phoneme is expanded to its feature vector, and the result is
    regrouped so that there is one sequence per feature rather than one
    vector per phoneme.

    Returns:
        feature_seqs: list of 35 lists; list ``i`` holds, for every phoneme
                      position, 1 (+att) if feature ``i`` is present and
                      0 (-att) otherwise. An empty input yields 35 empty
                      lists.
    """
    vectors = [phoneme_to_feature_vector(ph) for ph in phonemes]
    return [
        [1 if vec[feat_idx] else 0 for vec in vectors]
        for feat_idx in range(NUM_FEATURES)
    ]
def feature_sequences_to_ctc_labels(
    feature_seqs: list[list[int]],
) -> list[list[int]]:
    """
    Translate per-feature binary sequences (0/1) into CTC node indices.

    For the sequence at position i:
      - a value equal to 1 (+att) becomes feature_idx_to_pos_node(i)
      - any other value (-att) becomes feature_idx_to_neg_node(i)

    Returns:
        ctc_labels: one list of node indices (int) per input sequence, in
                    the same order and with the same lengths as the input.
    """
    ctc_labels = []
    for feat_idx, seq in enumerate(feature_seqs):
        # Both node indices depend only on the feature, so resolve them once.
        pos_node = feature_idx_to_pos_node(feat_idx)
        neg_node = feature_idx_to_neg_node(feat_idx)
        ctc_labels.append(
            [pos_node if val == 1 else neg_node for val in seq]
        )
    return ctc_labels