Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

App Files Files Community

Pronunciation-Coach / phonological_features.py

heldtomaturity

initial app deploy

0515ef3 12 days ago

raw

history blame contribute delete

13.4 kB

	"""
	phonological_features.py
	========================
	Defines the 35 phonological features from Table 1 of Shahin et al. (2025)
	and provides the phoneme-to-feature mapping for the 39-phoneme CMU set.

	Feature categories (paper Table 1):
	Manners: consonant, sonorant, fricative, nasal, stop, approximant,
	affricate, liquid, vowel, semivowel, continuant
	Places: alveolar, palatal, dental, glottal, labial, velar, mid, high,
	low, front, back, central, anterior, posterior, retroflex,
	bilabial, coronal, dorsal
	Others: long, short, monophthong, diphthong, round, voiced

	The model output has 71 nodes: 35 (+att) + 35 (-att) + 1 (shared blank).
	"""

	# ─────────────────────────────────────────────────────────────────────────────
	# The 35 phonological features (paper Table 1), in a fixed canonical order
	# ─────────────────────────────────────────────────────────────────────────────
	PHONOLOGICAL_FEATURES = [
	# Manners (11)
	"consonant", "sonorant", "fricative", "nasal", "stop",
	"approximant", "affricate", "liquid", "vowel", "semivowel", "continuant",
	# Places (18)
	"alveolar", "palatal", "dental", "glottal", "labial", "velar",
	"mid", "high", "low", "front", "back", "central",
	"anterior", "posterior", "retroflex", "bilabial", "coronal", "dorsal",
	# Others (6)
	"long", "short", "monophthong", "diphthong", "round", "voiced",
	]
	assert len(PHONOLOGICAL_FEATURES) == 35, "Must have exactly 35 features"

	FEATURE_TO_IDX = {feat: i for i, feat in enumerate(PHONOLOGICAL_FEATURES)}
	NUM_FEATURES = len(PHONOLOGICAL_FEATURES)

	# ─────────────────────────────────────────────────────────────────────────────
	# Output node layout (paper Section 3.3):
	# nodes 0..34 → +att for features 0..34
	# nodes 35..69 → -att for features 0..34
	# node 70 → shared blank
	# ─────────────────────────────────────────────────────────────────────────────
	NUM_OUTPUT_NODES = 71 # 35 + 35 + 1
	BLANK_IDX = 70

	def feature_idx_to_pos_node(feat_idx: int) -> int:
	"""Return output node index for +att of a given feature."""
	return feat_idx

	def feature_idx_to_neg_node(feat_idx: int) -> int:
	"""Return output node index for -att of a given feature."""
	return feat_idx + NUM_FEATURES


	# ─────────────────────────────────────────────────────────────────────────────
	# CMU 39-phoneme set (TIMIT 61→39 reduced set used in the paper)
	# ─────────────────────────────────────────────────────────────────────────────
	CMU_39_PHONEMES = [
	"aa", "ae", "ah", "aw", "ay","ao",
	"b", "ch", "d", "dh", "eh",
	"er", "ey", "f", "g", "hh",
	"ih", "iy", "jh", "k", "l",
	"m", "n", "ng", "ow", "oy",
	"p", "r", "s", "sh", "t",
	"th", "uh", "uw", "v", "w",
	"y", "z", "zh",
	]
	PHONEME_TO_IDX = {p: i for i, p in enumerate(CMU_39_PHONEMES)}
	NUM_PHONEMES = len(CMU_39_PHONEMES) # 39


	# ─────────────────────────────────────────────────────────────────────────────
	# Phoneme → phonological feature binary vector
	# Each phoneme maps to a dict {feature_name: True/False}.
	# Derived from standard phonological feature charts (Chomsky & Halle 1968,
	# as referenced in the paper).
	# ─────────────────────────────────────────────────────────────────────────────
	def _p(features_present: list[str]) -> dict[str, bool]:
	"""Helper: build feature dict from list of present features."""
	return {f: (f in features_present) for f in PHONOLOGICAL_FEATURES}


	PHONEME_FEATURES: dict[str, dict[str, bool]] = {
	# ── Stops ──────────────────────────────────────────────────────────────
	"p": _p(["consonant", "stop", "labial", "anterior", "bilabial"]),
	"b": _p(["consonant", "stop", "labial", "anterior", "bilabial",
	"voiced"]),
	"t": _p(["consonant", "stop", "alveolar", "anterior", "coronal"]),
	"d": _p(["consonant", "stop", "alveolar", "anterior", "coronal",
	"voiced"]),
	"k": _p(["consonant", "stop", "velar", "posterior", "dorsal"]),
	"g": _p(["consonant", "stop", "velar", "posterior", "dorsal",
	"voiced"]),

	# ── Fricatives ─────────────────────────────────────────────────────────

	"f": _p(["consonant", "fricative", "continuant", "labial", "anterior"]),
	"v": _p(["consonant", "fricative", "continuant", "labial", "anterior", "voiced"]),
	"th": _p(["consonant", "fricative", "continuant", "dental", "anterior",
	"coronal"]),
	"dh": _p(["consonant", "fricative", "continuant", "dental", "anterior",
	"coronal", "voiced"]),
	"s": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
	"coronal"]),
	"z": _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
	"coronal", "voiced"]),

	"sh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
	"coronal"]),

	"zh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
	"coronal", "voiced"]),

	"hh": _p(["consonant", "fricative", "continuant", "glottal", "posterior",
	"dorsal"]),

	# ── Affricates ─────────────────────────────────────────────────────────

	"ch": _p(["consonant", "affricate", "palatal", "posterior", "coronal"]),
	"jh": _p(["consonant", "affricate", "palatal", "posterior", "coronal",
	"voiced"]),

	# ── Nasals ─────────────────────────────────────────────────────────────

	"m": _p(["consonant", "sonorant", "nasal", "continuant", "labial",
	"anterior", "bilabial", "voiced"]),
	"n": _p(["consonant", "sonorant", "nasal", "continuant", "alveolar",
	"anterior", "coronal", "voiced"]),
	"ng": _p(["consonant", "sonorant", "nasal", "continuant", "velar",
	"posterior", "dorsal", "voiced"]),

	# ── Liquids ────────────────────────────────────────────────────────────
	"l": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
	"alveolar", "anterior", "coronal", "voiced"]),

	"r": _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
	"alveolar", "anterior", "retroflex", "coronal", "voiced"]),

	# ── Semivowels (Glides) ────────────────────────────────────────────────

	"w": _p(["sonorant", "approximant", "semivowel", "continuant", "labial",
	"high", "anterior", "bilabial", "round", "voiced"]),
	"y": _p(["sonorant", "approximant", "semivowel", "continuant", "palatal",
	"high", "posterior", "coronal", "voiced"]),

	# ── Short Monophthong Vowels ───────────────────────────────────────────
	"ih": _p(["sonorant", "vowel", "continuant", "high", "front",
	"short", "monophthong", "voiced"]),

	"eh": _p(["sonorant", "vowel", "mid", "front",
	"short", "monophthong", "voiced"]),

	"ae": _p(["sonorant", "vowel", "continuant", "low", "front",
	"long", "monophthong", "voiced"]),

	"ah": _p(["sonorant", "vowel", "continuant", "mid", "back",
	"short", "monophthong", "voiced"]),

	"uh": _p(["sonorant", "vowel", "continuant", "high", "back",
	"short", "monophthong", "round", "voiced"]),

	# ── Long Monophthong Vowels ────────────────────────────────────────────
	"iy": _p(["sonorant", "vowel", "continuant", "high", "front",
	"long", "monophthong", "voiced"]),
	"aa": _p(["sonorant", "vowel", "continuant", "low", "back",
	"long", "monophthong", "voiced"]),
	"ao": _p(["sonorant", "vowel", "continuant", "mid", "back",
	"long", "monophthong", "round", "voiced"]),
	"er": _p(["sonorant", "vowel", "continuant", "mid", "central",
	"retroflex", "short", "monophthong", "voiced"]),
	"uw": _p(["sonorant", "vowel", "continuant", "high", "back",
	"long", "monophthong", "round", "voiced"]),

	# ── Diphthongs ─────────────────────────────────────────────────────────
	"ey": _p(["sonorant", "vowel", "continuant", "mid", "front",
	"long", "diphthong", "voiced"]),

	"aw": _p(["sonorant", "vowel", "continuant", "low", "central",
	"long", "diphthong", "round", "voiced"]),

	"ay": _p(["sonorant", "vowel", "low", "central",
	"long", "diphthong", "voiced"]),

	"oy": _p(["sonorant", "vowel", "continuant", "mid", "back",
	"long", "diphthong", "round", "voiced"]),

	"ow": _p(["sonorant", "vowel", "continuant", "mid", "central",
	"long", "diphthong", "round", "voiced"]),

	# ── Silence ────────────────────────────────────────────────────────────
	# Paper: "All silence labels were further removed leaving silence frames
	# to be handled by the blank label."
	"sil": _p([]), # all features absent; treated as blank during training
	}

	# Verify all 39 phonemes are covered.
	# "sil" is intentionally extra — it is a fallback/blank placeholder, not a
	# speech target, so it lives in PHONEME_FEATURES but not in CMU_39_PHONEMES.
	_expected = set(CMU_39_PHONEMES) \| {"sil"}
	assert set(PHONEME_FEATURES.keys()) == _expected, (
	f"Missing from PHONEME_FEATURES : {_expected - set(PHONEME_FEATURES.keys())}\n"
	f"Unexpected in PHONEME_FEATURES: {set(PHONEME_FEATURES.keys()) - _expected}"
	)
	assert NUM_PHONEMES == 39, f"Expected 39 phonemes, got {NUM_PHONEMES}"


	def phoneme_to_feature_vector(phoneme: str) -> list[bool]:
	"""Return a binary list of length 35 for a given phoneme."""
	feat_dict = PHONEME_FEATURES.get(phoneme, PHONEME_FEATURES["sil"])
	return [feat_dict[f] for f in PHONOLOGICAL_FEATURES]


	def phoneme_sequence_to_feature_sequences(
	phonemes: list[str],
	) -> list[list[int]]:
	"""
	Convert a phoneme sequence to N=35 binary label sequences.

	Returns:
	feature_seqs: list of 35 lists, each containing +att(1) or -att(0)
	integers for each phoneme position.
	"""
	feature_seqs = [[] for _ in range(NUM_FEATURES)]
	for ph in phonemes:
	vec = phoneme_to_feature_vector(ph)
	for feat_idx, present in enumerate(vec):
	feature_seqs[feat_idx].append(1 if present else 0)
	return feature_seqs


	def feature_sequences_to_ctc_labels(
	feature_seqs: list[list[int]],
	) -> list[list[int]]:
	"""
	Convert binary feature sequences (0/1) to CTC label indices.

	For category i:
	- +att → node index i (feature_idx_to_pos_node)
	- -att → node index i + 35 (feature_idx_to_neg_node)

	Returns:
	ctc_labels: list of 35 lists of node indices (int)
	"""
	ctc_labels = []
	for feat_idx, seq in enumerate(feature_seqs):
	label_seq = []
	for val in seq:
	if val == 1:
	label_seq.append(feature_idx_to_pos_node(feat_idx))
	else:
	label_seq.append(feature_idx_to_neg_node(feat_idx))
	ctc_labels.append(label_seq)
	return ctc_labels