Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

File size: 13,359 Bytes

0515ef3

"""
phonological_features.py
========================
Defines the 35 phonological features from Table 1 of Shahin et al. (2025)
and provides the phoneme-to-feature mapping for the 39-phoneme CMU set.

Feature categories (paper Table 1):
  Manners: consonant, sonorant, fricative, nasal, stop, approximant,
           affricate, liquid, vowel, semivowel, continuant
  Places:  alveolar, palatal, dental, glottal, labial, velar, mid, high,
           low, front, back, central, anterior, posterior, retroflex,
           bilabial, coronal, dorsal
  Others:  long, short, monophthong, diphthong, round, voiced

The model output has 71 nodes: 35 (+att) + 35 (-att) + 1 (shared blank).
"""

# ─────────────────────────────────────────────────────────────────────────────
# The 35 phonological features (paper Table 1), in a fixed canonical order
# ─────────────────────────────────────────────────────────────────────────────
PHONOLOGICAL_FEATURES = [
    # Manners (11)
    "consonant", "sonorant", "fricative", "nasal", "stop",
    "approximant", "affricate", "liquid", "vowel", "semivowel", "continuant",
    # Places (18)
    "alveolar", "palatal", "dental", "glottal", "labial", "velar",
    "mid", "high", "low", "front", "back", "central",
    "anterior", "posterior", "retroflex", "bilabial", "coronal", "dorsal",
    # Others (6)
    "long", "short", "monophthong", "diphthong", "round", "voiced",
]
assert len(PHONOLOGICAL_FEATURES) == 35, "Must have exactly 35 features"

FEATURE_TO_IDX = {feat: i for i, feat in enumerate(PHONOLOGICAL_FEATURES)}
NUM_FEATURES = len(PHONOLOGICAL_FEATURES)

# ─────────────────────────────────────────────────────────────────────────────
# Output node layout (paper Section 3.3):
#   nodes 0..34        → +att for features 0..34
#   nodes 35..69       → -att for features 0..34
#   node  70           → shared blank
# ─────────────────────────────────────────────────────────────────────────────
NUM_OUTPUT_NODES = 71   # 35 + 35 + 1
BLANK_IDX = 70

def feature_idx_to_pos_node(feat_idx: int) -> int:
    """Return output node index for +att of a given feature."""
    return feat_idx

def feature_idx_to_neg_node(feat_idx: int) -> int:
    """Return output node index for -att of a given feature."""
    return feat_idx + NUM_FEATURES


# ─────────────────────────────────────────────────────────────────────────────
# CMU 39-phoneme set  (TIMIT 61→39 reduced set used in the paper)
# ─────────────────────────────────────────────────────────────────────────────
CMU_39_PHONEMES = [
    "aa", "ae", "ah", "aw", "ay","ao",
    "b",  "ch", "d",  "dh", "eh",
    "er", "ey", "f",  "g",  "hh",
    "ih", "iy", "jh", "k",  "l",
    "m",  "n",  "ng", "ow", "oy",
    "p",  "r",  "s",  "sh", "t",
    "th", "uh", "uw", "v",  "w",
    "y",  "z",  "zh",
]
PHONEME_TO_IDX = {p: i for i, p in enumerate(CMU_39_PHONEMES)}
NUM_PHONEMES = len(CMU_39_PHONEMES)   # 39


# ─────────────────────────────────────────────────────────────────────────────
# Phoneme → phonological feature binary vector
# Each phoneme maps to a dict {feature_name: True/False}.
# Derived from standard phonological feature charts (Chomsky & Halle 1968,
# as referenced in the paper).
# ─────────────────────────────────────────────────────────────────────────────
def _p(features_present: list[str]) -> dict[str, bool]:
    """Helper: build feature dict from list of present features."""
    return {f: (f in features_present) for f in PHONOLOGICAL_FEATURES}


PHONEME_FEATURES: dict[str, dict[str, bool]] = {
    # ── Stops ──────────────────────────────────────────────────────────────
    "p":  _p(["consonant", "stop", "labial", "anterior", "bilabial"]),
    "b":  _p(["consonant", "stop", "labial", "anterior", "bilabial",
               "voiced"]),
    "t":  _p(["consonant", "stop", "alveolar", "anterior", "coronal"]),
    "d":  _p(["consonant", "stop", "alveolar", "anterior", "coronal",
               "voiced"]),
    "k":  _p(["consonant", "stop", "velar", "posterior", "dorsal"]),
    "g":  _p(["consonant", "stop", "velar", "posterior", "dorsal",
               "voiced"]),
 
    # ── Fricatives ─────────────────────────────────────────────────────────
    
    "f":  _p(["consonant", "fricative", "continuant", "labial", "anterior"]),
    "v":  _p(["consonant", "fricative", "continuant", "labial", "anterior", "voiced"]),
    "th": _p(["consonant", "fricative", "continuant", "dental", "anterior",
               "coronal"]),
    "dh": _p(["consonant", "fricative", "continuant", "dental", "anterior",
               "coronal", "voiced"]),
    "s":  _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
               "coronal"]),
    "z":  _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
               "coronal", "voiced"]),

    "sh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
               "coronal"]),

    "zh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
               "coronal", "voiced"]),
    
    "hh": _p(["consonant", "fricative", "continuant", "glottal", "posterior",
               "dorsal"]),
 
    # ── Affricates ─────────────────────────────────────────────────────────
    
    "ch": _p(["consonant", "affricate", "palatal", "posterior", "coronal"]),
    "jh": _p(["consonant", "affricate", "palatal", "posterior", "coronal",
               "voiced"]),
 
    # ── Nasals ─────────────────────────────────────────────────────────────
    
    "m":  _p(["consonant", "sonorant", "nasal", "continuant", "labial",
               "anterior", "bilabial", "voiced"]),
    "n":  _p(["consonant", "sonorant", "nasal", "continuant", "alveolar",
               "anterior", "coronal", "voiced"]),
    "ng": _p(["consonant", "sonorant", "nasal", "continuant", "velar",
               "posterior", "dorsal", "voiced"]),
 
    # ── Liquids ────────────────────────────────────────────────────────────
    "l":  _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
               "alveolar", "anterior", "coronal", "voiced"]),
    
    "r":  _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
               "alveolar", "anterior", "retroflex", "coronal", "voiced"]),
 
    # ── Semivowels (Glides) ────────────────────────────────────────────────

    "w":  _p(["sonorant", "approximant", "semivowel", "continuant", "labial",
               "high", "anterior", "bilabial", "round", "voiced"]),
    "y":  _p(["sonorant", "approximant", "semivowel", "continuant", "palatal",
               "high", "posterior", "coronal", "voiced"]),
 
    # ── Short Monophthong Vowels ───────────────────────────────────────────
    "ih": _p(["sonorant", "vowel", "continuant", "high", "front",
               "short", "monophthong", "voiced"]),

    "eh": _p(["sonorant", "vowel", "mid", "front",
               "short", "monophthong", "voiced"]),

    "ae": _p(["sonorant", "vowel", "continuant", "low", "front",
               "long", "monophthong", "voiced"]),

    "ah": _p(["sonorant", "vowel", "continuant", "mid", "back",
               "short", "monophthong", "voiced"]),

    "uh": _p(["sonorant", "vowel", "continuant", "high", "back",
               "short", "monophthong", "round", "voiced"]),
 
    # ── Long Monophthong Vowels ────────────────────────────────────────────
    "iy": _p(["sonorant", "vowel", "continuant", "high", "front",
               "long", "monophthong", "voiced"]),
    "aa": _p(["sonorant", "vowel", "continuant", "low", "back",
               "long", "monophthong", "voiced"]),
    "ao": _p(["sonorant", "vowel", "continuant", "mid", "back",
               "long", "monophthong", "round", "voiced"]),
    "er": _p(["sonorant", "vowel", "continuant", "mid", "central",
               "retroflex", "short", "monophthong", "voiced"]),
    "uw": _p(["sonorant", "vowel", "continuant", "high", "back",
               "long", "monophthong", "round", "voiced"]),
 
    # ── Diphthongs ─────────────────────────────────────────────────────────
    "ey": _p(["sonorant", "vowel", "continuant", "mid", "front",
               "long", "diphthong", "voiced"]),

    "aw": _p(["sonorant", "vowel", "continuant", "low", "central",
               "long", "diphthong", "round", "voiced"]),

    "ay": _p(["sonorant", "vowel", "low", "central",
               "long", "diphthong", "voiced"]),

    "oy": _p(["sonorant", "vowel", "continuant", "mid", "back",
               "long", "diphthong", "round", "voiced"]),

    "ow": _p(["sonorant", "vowel", "continuant", "mid", "central",
               "long", "diphthong", "round", "voiced"]),
 
    # ── Silence ────────────────────────────────────────────────────────────
    # Paper: "All silence labels were further removed leaving silence frames
    # to be handled by the blank label."
    "sil": _p([]),   # all features absent; treated as blank during training
}

# Verify all 39 phonemes are covered.
# "sil" is intentionally extra — it is a fallback/blank placeholder, not a
# speech target, so it lives in PHONEME_FEATURES but not in CMU_39_PHONEMES.
_expected = set(CMU_39_PHONEMES) | {"sil"}
assert set(PHONEME_FEATURES.keys()) == _expected, (
    f"Missing from PHONEME_FEATURES : {_expected - set(PHONEME_FEATURES.keys())}\n"
    f"Unexpected in PHONEME_FEATURES: {set(PHONEME_FEATURES.keys()) - _expected}"
)
assert NUM_PHONEMES == 39, f"Expected 39 phonemes, got {NUM_PHONEMES}"


def phoneme_to_feature_vector(phoneme: str) -> list[bool]:
    """Return a binary list of length 35 for a given phoneme."""
    feat_dict = PHONEME_FEATURES.get(phoneme, PHONEME_FEATURES["sil"])
    return [feat_dict[f] for f in PHONOLOGICAL_FEATURES]


def phoneme_sequence_to_feature_sequences(
    phonemes: list[str],
) -> list[list[int]]:
    """
    Convert a phoneme sequence to N=35 binary label sequences.

    Returns:
        feature_seqs: list of 35 lists, each containing +att(1) or -att(0)
                      integers for each phoneme position.
    """
    feature_seqs = [[] for _ in range(NUM_FEATURES)]
    for ph in phonemes:
        vec = phoneme_to_feature_vector(ph)
        for feat_idx, present in enumerate(vec):
            feature_seqs[feat_idx].append(1 if present else 0)
    return feature_seqs


def feature_sequences_to_ctc_labels(
    feature_seqs: list[list[int]],
) -> list[list[int]]:
    """
    Convert binary feature sequences (0/1) to CTC label indices.

    For category i:
      - +att  →  node index i          (feature_idx_to_pos_node)
      - -att  →  node index i + 35     (feature_idx_to_neg_node)

    Returns:
        ctc_labels: list of 35 lists of node indices (int)
    """
    ctc_labels = []
    for feat_idx, seq in enumerate(feature_seqs):
        label_seq = []
        for val in seq:
            if val == 1:
                label_seq.append(feature_idx_to_pos_node(feat_idx))
            else:
                label_seq.append(feature_idx_to_neg_node(feat_idx))
        ctc_labels.append(label_seq)
    return ctc_labels