Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

App Files Files Community

heldtomaturity committed on 12 days ago

Commit

0515ef3

1 Parent(s): c35de4f

initial app deploy

Browse files

Files changed (6) hide show

app.py +287 -0
feedback_generator.py +690 -0
mdd_engine.py +534 -0
phonological_features.py +253 -0
requirements.txt +17 -0
wav2vec2_phonological.py +267 -0

app.py ADDED Viewed

	@@ -0,0 +1,287 @@

+"""
+Mispronunciation Detection & Diagnosis — HuggingFace Space
+===========================================================
+Wires together:
+  1. PhonologicalWav2Vec2  (your best_model.pt, loaded once at cold start)
+  2. MDD engine            (per-feature NW alignment → errors + score)
+  3. Feedback generator    (rule engine + optional LLM rewriter)
+Environment variables to set in Space → Settings → Variables and secrets:
+  HF_TOKEN          (secret)   — read token for your private model repo
+  HF_MODEL_REPO     (variable) — e.g. "Backlighteu/phonological-mdd"
+  HF_MODEL_FILENAME (variable) — e.g. "best_model.pt"  (default)
+"""
+import os
+import json
+import torch
+import numpy as np
+import gradio as gr
+import librosa
+from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import Wav2Vec2FeatureExtractor
+from wav2vec2_phonological import PhonologicalWav2Vec2
+from mdd_engine import run_mdd
+from feedback_generator import generate_feedback
+from phonological_features import (
+    CMU_39_PHONEMES,
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# 1.  Model — loaded once at cold start, reused for every request
+# ─────────────────────────────────────────────────────────────────────────────
+# Lazily-populated singletons: both start as None and are filled by load_model().
+_model = None
+_feature_extractor = None
+# Use the GPU when torch reports one is available, otherwise run on CPU.
+_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Base checkpoint name; load_model() passes it to both the model constructor
+# and the feature extractor.
+PRETRAINED_BASE = "facebook/wav2vec2-large-robust"
+# Deployment settings read from the environment, with fallbacks when unset.
+MODEL_REPO      = os.environ.get("HF_MODEL_REPO",    "Backlighteu/phonological-mdd")
+MODEL_FILENAME  = os.environ.get("HF_MODEL_FILENAME", "best_model.pt")
+HF_TOKEN        = os.environ.get("HF_TOKEN",          None)
+def load_model():
+    global _model, _feature_extractor
+    if _model is not None:
+        return
+    # Download entire repo into ./model_cache once, then load from disk.
+    # hf_hub_download checks cache first — no re-download if already present.
+    print(f"[startup] Caching {MODEL_REPO} to ./model_cache ...")
+    snapshot_download(
+        repo_id=MODEL_REPO,
+        token=HF_TOKEN,
+        local_dir="./model_cache",
+    )
+    weights_path = "./model_cache/best_model.pt"
+    print(f"[startup] Loading weights from {weights_path}")
+    model = PhonologicalWav2Vec2(
+        pretrained_model_name=PRETRAINED_BASE,
+        num_output_nodes=71,
+        freeze_cnn_encoder=True,
+    )
+    state_dict = torch.load(weights_path, map_location=_device)
+    model.load_state_dict(state_dict)
+    model.to(_device)
+    model.eval()
+    _model = model
+    print(f"[startup] Model ready on {_device}.")
+    print(f"[startup] Loading feature extractor from '{PRETRAINED_BASE}' ...")
+    _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)
+    print("[startup] Feature extractor ready.")
+# ─────────────────────────────────────────────────────────────────────────────
+# 2.  Audio → decoded feature sequences
+# ─────────────────────────────────────────────────────────────────────────────
+TARGET_SR = 16_000
+def decode_audio(audio_path: str) -> list:
+    """
+    Load audio, run the phonological model, return CTC-decoded feature seqs.
+    Returns
+    -------
+    actual_feature_seqs : list of 35 lists of int (0 or 1)
+        CTC-decoded +att / -att sequence for each of the 35 features.
+    """
+    load_model()
+    waveform, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
+    waveform = waveform.astype(np.float32)
+    inputs = _feature_extractor(
+        waveform,
+        sampling_rate=TARGET_SR,
+        return_tensors="pt",
+        padding=True,
+    )
+    input_values   = inputs.input_values.to(_device)
+    attention_mask = inputs.get("attention_mask")
+    if attention_mask is not None:
+        attention_mask = attention_mask.to(_device)
+    with torch.no_grad():
+        logits, output_lengths = _model(
+            input_values,
+            attention_mask,
+            apply_spec_augment=False,
+        )
+    # model.decode() returns list[B][35][list[bool]]  — True=+att, False=-att
+    decoded_batch = _model.decode(logits, output_lengths)
+    decoded_35 = decoded_batch[0]   # [35][list[bool]]
+    # Convert bool → int (1/0)
+    actual_feature_seqs = [
+        [1 if v else 0 for v in feat_seq]
+        for feat_seq in decoded_35
+    ]
+    return actual_feature_seqs
+# ─────────────────────────────────────────────────────────────────────────────
+# 3.  Text → canonical phoneme sequence
+# ─────────────────────────────────────────────────────────────────────────────
+_VALID_PHONEMES = set(CMU_39_PHONEMES) | {"sil"}
+def parse_phoneme_input(text: str) -> list:
+    """
+    Accept space-separated CMU ARPAbet tokens typed by the user.
+    Unknown tokens are skipped with a warning.
+    """
+    tokens = text.lower().split()
+    valid, skipped = [], []
+    for t in tokens:
+        if t in _VALID_PHONEMES:
+            valid.append(t)
+        else:
+            skipped.append(t)
+    if skipped:
+        print(f"[warning] Unrecognised tokens skipped: {skipped}")
+    return valid if valid else ["sil"]
+# ─────────────────────────────────────────────────────────────────────────────
+# 4.  Gradio processing function
+# ─────────────────────────────────────────────────────────────────────────────
+def process(audio_input, script_text, use_llm, max_issues):
+    if audio_input is None:
+        return "Please record or upload audio first.", "", "{}"
+    script_text = script_text.strip()
+    if not script_text:
+        return (
+            "Please type the target sentence as ARPAbet phoneme tokens.\n"
+            "Example: `dh ae k ae t` for 'the cat'",
+            "", "{}",
+        )
+    try:
+        actual_feature_seqs = decode_audio(audio_input)
+    except Exception as e:
+        return f"Audio processing error: {e}", "", "{}"
+    target_phonemes = parse_phoneme_input(script_text)
+    try:
+        result = run_mdd(
+            actual_feature_seqs=actual_feature_seqs,
+            target_phonemes=target_phonemes,
+        )
+    except Exception as e:
+        return f"MDD engine error: {e}", "", "{}"
+    feedback_dict = generate_feedback(
+        result,
+        use_llm=use_llm,
+        max_issues=int(max_issues),
+    )
+    score = feedback_dict["score"]
+    main_feedback = (
+        f"**Pronunciation Score: {score}/100**\n\n"
+        + feedback_dict["final_feedback"]
+    )
+    detail_lines = ["### Per-phoneme detail\n"]
+    for e in feedback_dict["error_summary"]:
+        deletion_tag = " *(deleted)*" if e.get("is_deletion") else ""
+        detail_lines.append(
+            f"- **/{e['target']}/** (pos {e['position']}){deletion_tag}: "
+            f"severity=`{e['severity']}`, accuracy={e['accuracy']:.0%}\n"
+            f"  - Missing: {', '.join(e['missing_features']) or '—'}\n"
+            f"  - Extra:   {', '.join(e['extra_features'])   or '—'}"
+        )
+    if not feedback_dict["error_summary"]:
+        detail_lines.append("No feature-level errors detected — great pronunciation!")
+    detail_text = "\n".join(detail_lines)
+    json_output = json.dumps({
+        "score":                feedback_dict["score"],
+        "deletion_count":       result.deletion_count,
+        "insertion_count":      result.insertion_count,
+        "feature_error_counts": feedback_dict["feature_error_counts"],
+        "rules_triggered":      feedback_dict["rules_triggered"],
+        "target_phonemes":      target_phonemes,
+        "actual_seq_lengths":   [len(s) for s in actual_feature_seqs],
+    }, indent=2)
+    return main_feedback, detail_text, json_output
+# ─────────────────────────────────────────────────────────────────────────────
+# 5.  Gradio UI
+# ─────────────────────────────────────────────────────────────────────────────
+VALID_PHONEME_LIST = ", ".join(sorted(CMU_39_PHONEMES))
+with gr.Blocks(title="Pronunciation Coach", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Pronunciation Coach
+        Speak a sentence, type what you meant to say as **ARPAbet phoneme tokens**,
+        and get phonological-feature-level feedback with articulation tips.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Your speech",
+            )
+            script_input = gr.Textbox(
+                label="Target sentence — space-separated ARPAbet tokens",
+                placeholder="e.g.  dh ae k ae t   (= 'the cat')",
+                lines=2,
+            )
+            with gr.Accordion("Valid phoneme tokens", open=False):
+                gr.Markdown(f"`{VALID_PHONEME_LIST}`")
+            with gr.Row():
+                use_llm    = gr.Checkbox(value=False, label="LLM feedback rewriter")
+                max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues shown")
+            submit_btn = gr.Button("Analyse", variant="primary")
+        with gr.Column(scale=2):
+            feedback_out = gr.Markdown(label="Coaching feedback")
+            with gr.Accordion("Per-phoneme detail", open=False):
+                detail_out = gr.Markdown()
+            with gr.Accordion("Raw JSON (developers)", open=False):
+                json_out = gr.Code(language="json")
+    submit_btn.click(
+        fn=process,
+        inputs=[audio_input, script_input, use_llm, max_issues],
+        outputs=[feedback_out, detail_out, json_out],
+    )
+    gr.Markdown(
+        """
+        ---
+        **How to enter the target sentence:**
+        Convert your sentence to ARPAbet using the
+        [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict)
+        then paste the space-separated tokens here.
+        Example: *"the cat sat"* → `dh ax k ae t s ae t`
+        """
+    )
+if __name__ == "__main__":
+    demo.launch()

feedback_generator.py ADDED Viewed

	@@ -0,0 +1,690 @@

+"""
+Feedback Generator
+==================
+Two-layer system:
+  Layer 1 — Rule engine: maps specific feature errors to expert articulatory cues
+  Layer 2 — LLM rewriter: takes rule outputs and rewrites them into natural,
+             encouraging coach-like language via a lightweight local model
+             (or cloud fallback).
+The rule templates are the ground truth; the LLM only adds warmth and fluency.
+"""
+from __future__ import annotations
+import os
+import json
+import textwrap
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple
+from mdd_engine import PhonemeError, MDDResult, FEATURE_NAMES
+# ──────────────────────────────────────────────
+# 1.  Articulatory feedback rule bank
+# ──────────────────────────────────────────────
+# Each rule = {features, direction, tip, drill, self_check}
+# direction: "missing" | "extra" | "both"
+# "features" is a list: a rule is indexed under every feature name it lists.
+# The rules below only use "missing" and "extra"; "both" is accepted by the
+# matcher but not used by any current entry.
+FEATURE_RULES: List[Dict] = [
+    # ── VOICING (Others group) ────────────────────────────────────────────
+    {
+        "features": ["voiced"],
+        "direction": "missing",
+        "tip": (
+            "Your vocal cords are not vibrating when they should be. "
+            "Place two fingers lightly on your throat (the Adam's apple area). "
+            "Now say the sound — if you feel vibration, you've got it. "
+            "Try humming first ('mmm'), then slide into the target sound."
+        ),
+        "drill": "Practice pairs: /f/ → /v/, /s/ → /z/, /p/ → /b/. "
+                 "Feel the buzz turn on for the second sound each time.",
+        "self_check": "Put your hand on your throat. You should feel a gentle buzz.",
+    },
+    {
+        "features": ["voiced"],
+        "direction": "extra",
+        "tip": (
+            "You are voicing a sound that should be voiceless — your vocal cords "
+            "are buzzing when they should be still. "
+            "Whisper the sound first to train your cords to stay quiet, "
+            "then gradually add breath pressure without the buzz."
+        ),
+        "drill": "Whisper-shout drill: whisper /p/, /t/, /k/, /f/, /s/ ten times.",
+        "self_check": "Put your hand on your throat. It should feel still, no vibration.",
+    },
+    # ── MANNER: STOP ─────────────────────────────────────────────────────
+    {
+        "features": ["stop"],
+        "direction": "missing",
+        "tip": (
+            "This sound needs a full closure in your mouth — air must be completely "
+            "blocked and then released in a burst. "
+            "Your tongue or lips are not making a tight enough seal, letting air trickle "
+            "through instead of building up pressure."
+        ),
+        "drill": "Tap your fingers on the desk for each stop: /p/ – /t/ – /k/. "
+                 "Feel the 'pop' as pressure releases each time.",
+        "self_check": "Before the release, you should feel air pressure building behind the closure.",
+    },
+    {
+        "features": ["stop"],
+        "direction": "extra",
+        "tip": (
+            "You are closing your airway completely when the sound should be continuous. "
+            "Relax the articulators and keep a small opening so air can flow through "
+            "without a burst."
+        ),
+        "drill": "Say /s/ and /f/ — feel the continuous uninterrupted airflow, no pop.",
+        "self_check": "You should hear no 'pop' or sudden release — just steady air.",
+    },
+    # ── MANNER: FRICATIVE ────────────────────────────────────────────────
+    {
+        "features": ["fricative"],
+        "direction": "missing",
+        "tip": (
+            "This sound requires turbulent airflow — a hissing or buzzing quality. "
+            "Narrow the passage between your tongue (or lips) and the articulators just enough "
+            "that the air becomes turbulent. Too wide gives a vowel; full closure gives a stop."
+        ),
+        "drill": "Hold /s/, /f/, /sh/ for three full seconds each. Feel the continuous friction.",
+        "self_check": "You should hear a clear hissing or buzzing sound throughout, not silence or a pop.",
+    },
+    # ── MANNER: NASAL ─────────────────────────────────────────────────────
+    {
+        "features": ["nasal"],
+        "direction": "missing",
+        "tip": (
+            "This sound requires airflow through your nose. "
+            "Pinch your nostrils closed — if the sound changes dramatically, "
+            "you were accidentally blocking nasal airflow. "
+            "Let air flow freely through your nose as you make the sound."
+        ),
+        "drill": "Alternate: hum 'mmm' (nasal), then 'bbb' (not nasal). Feel the difference.",
+        "self_check": "Pinch your nose lightly — a nasal sound will feel 'stuffed up' when blocked.",
+    },
+    {
+        "features": ["nasal"],
+        "direction": "extra",
+        "tip": (
+            "Your sound has unwanted nasality — air is leaking through your nose. "
+            "Practice lifting the soft palate by saying 'uh-oh' firmly, then keep that "
+            "lifted feeling while producing the target sound."
+        ),
+        "drill": "Say 'back — bank', 'bad — band'. The first word of each pair is not nasal.",
+        "self_check": "Hold a mirror under your nose — it should not fog up.",
+    },
+    # ── MANNER: AFFRICATE ────────────────────────────────────────────────
+    {
+        "features": ["affricate"],
+        "direction": "missing",
+        "tip": (
+            "An affricate starts with a complete closure (like a stop) then releases "
+            "into a fricative — think of /ch/ in 'church' or /jh/ in 'judge'. "
+            "You are either skipping the closure or the friction release. "
+            "Make sure you feel both: a tight seal followed by a hissing release."
+        ),
+        "drill": "Say 'ch-ch-ch' rapidly, feeling the tap-and-hiss for each one.",
+        "self_check": "You should feel a brief closure then turbulent airflow — two phases in one sound.",
+    },
+    # ── MANNER: APPROXIMANT / LIQUID (one rule shared by both features) ──
+    {
+        "features": ["approximant", "liquid"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/l/, /r/, /w/, /y/) needs your articulators to approach each other "
+            "closely without fully touching or creating friction. "
+            "Relax the contact — you may be pressing too hard and creating a stop, "
+            "or not shaping your mouth precisely enough."
+        ),
+        "drill": "Say 'la-la-la' for /l/ and 'ra-ra-ra' for /r/ slowly, keeping the tongue light.",
+        "self_check": "There should be no pop and no hiss — just a smooth, resonant glide.",
+    },
+    # ── MANNER: CONTINUANT ───────────────────────────────────────────────
+    {
+        "features": ["continuant"],
+        "direction": "missing",
+        "tip": (
+            "This sound should have continuous, uninterrupted airflow — it is not a stop. "
+            "Keep your airway open and let air flow through for the full duration of the sound."
+        ),
+        "drill": "Sustain /s/, /m/, /l/ or /v/ for three seconds without any interruption.",
+        "self_check": "You should be able to hold the sound indefinitely without cutting off air.",
+    },
+    # ── PLACE: BILABIAL ──────────────────────────────────────────────────
+    {
+        "features": ["bilabial"],
+        "direction": "missing",
+        "tip": (
+            "This sound needs both lips pressed firmly together (/p/, /b/, /m/). "
+            "You may be making it with only one lip or further back in the mouth. "
+            "Press your lips together completely before releasing."
+        ),
+        "drill": "Say 'pa-ba-ma' ten times, exaggerating full lip closure each time.",
+        "self_check": "Watch yourself in a mirror — both lips should close completely.",
+    },
+    # ── PLACE: LABIAL (bilabial or labiodental) ─────────────────────────
+    {
+        "features": ["labial"],
+        "direction": "missing",
+        "tip": (
+            "This sound needs your lips to be active — either both lips together (bilabial: /p/, /b/, /m/) "
+            "or upper teeth touching the lower lip (labiodental: /f/, /v/). "
+            "You may be making the sound too far back with the tongue."
+        ),
+        "drill": "Exaggerate lip contact. Say 'pop', 'bob', 'mom', 'five', 'very' in front of a mirror.",
+        "self_check": "Watch yourself in a mirror — you should see clear lip movement.",
+    },
+    # ── PLACE: DENTAL ────────────────────────────────────────────────────
+    {
+        "features": ["dental"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/th/, /dh/) requires your tongue tip to be right at or between your teeth. "
+            "Stick your tongue tip just between your upper and lower front teeth "
+            "and let air flow over it."
+        ),
+        "drill": "Say 'think' and 'this' slowly, deliberately placing your tongue between your teeth each time.",
+        "self_check": "You should feel your tongue tip touching the edges of your front teeth.",
+    },
+    # ── PLACE: ALVEOLAR ──────────────────────────────────────────────────
+    {
+        "features": ["alveolar"],
+        "direction": "missing",
+        "tip": (
+            "Your tongue tip needs to touch the alveolar ridge — the hard bump just behind "
+            "your upper front teeth. "
+            "This is the target for /t/, /d/, /n/, /s/, /z/, /l/. "
+            "You may be placing your tongue too far back or too far forward."
+        ),
+        "drill": "Touch the ridge behind your upper teeth with your tongue tip and feel it. "
+                 "Now tap /t/ ten times, always returning to that exact spot.",
+        "self_check": "Is your tongue tip touching the hard ridge — not the teeth and not the palate?",
+    },
+    # ── PLACE: PALATAL ────────────────────────────────────────────────────
+    {
+        "features": ["palatal"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/sh/, /zh/, /ch/, /jh/, /y/) is made with the tongue body raised "
+            "toward the hard palate — the hard, bony roof just behind the alveolar ridge. "
+            "Move your tongue further back from the teeth and arch it upward."
+        ),
+        "drill": "Say 'she', 'measure', 'church' — feel your tongue body rise toward the hard palate.",
+        "self_check": "You should feel your tongue broadly touching or approaching the middle of the roof.",
+    },
+    # ── PLACE: VELAR ──────────────────────────────────────────────────────
+    {
+        "features": ["velar"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/k/, /g/, /ng/) is made at the back of your mouth, with the back of your tongue "
+            "touching the soft palate (velum). "
+            "Try gargling — that back-of-tongue raised position is exactly what you need."
+        ),
+        "drill": "Say 'king', 'ring', 'sing' — focus on the back-of-tongue closure each time.",
+        "self_check": "You should feel the back of your tongue lift and meet the soft palate.",
+    },
+    # ── PLACE: GLOTTAL ────────────────────────────────────────────────────
+    {
+        "features": ["glottal"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/hh/) is made deep in the throat at the vocal folds. "
+            "Think of fogging up a mirror — breathe out gently with a completely open throat. "
+            "No tongue or lip constriction should be involved."
+        ),
+        "drill": "Say 'hi', 'hat', 'hot' — the /h/ should feel like a breath, not a friction sound.",
+        "self_check": "Place a hand on your throat — you should feel warmth from breath, not a hiss.",
+    },
+    # ── PLACE: RETROFLEX ─────────────────────────────────────────────────
+    {
+        "features": ["retroflex"],
+        "direction": "missing",
+        "tip": (
+            "This sound (/r/ in English, /er/) requires your tongue tip to curl back toward "
+            "the back of the alveolar ridge without touching anything, or to bunch up in the "
+            "center of your mouth. "
+            "Say 'uh' then slowly curl your tongue tip upward and backward."
+        ),
+        "drill": "Practice: 'uh' → curl tongue → 'er'. Hold 'er' for three seconds.",
+        "self_check": "Your tongue tip should point upward or backward but NOT touch the roof.",
+    },
+    # ── PLACE: CORONAL ───────────────────────────────────────────────────
+    {
+        "features": ["coronal"],
+        "direction": "missing",
+        "tip": (
+            "Coronal sounds are made with the front part (blade or tip) of the tongue — "
+            "this covers /t/, /d/, /s/, /z/, /n/, /l/, /sh/, /th/, and /r/. "
+            "Make sure your tongue front is active and positioned correctly for this sound."
+        ),
+        "drill": "Say 'tip', 'dip', 'sip', 'nip' — feel the tongue tip or blade doing the work.",
+        "self_check": "Is your tongue front — tip or blade — the part making contact?",
+    },
+    # ── PLACE: DORSAL ────────────────────────────────────────────────────
+    {
+        "features": ["dorsal"],
+        "direction": "missing",
+        "tip": (
+            "Dorsal sounds (/k/, /g/, /ng/, /w/, /y/) involve the back (body or root) of the tongue. "
+            "Your tongue body needs to arch toward the velum or palate. "
+            "You may be using your tongue tip when the back of the tongue should lead."
+        ),
+        "drill": "Say 'key', 'go', 'sing' — feel the back hump of your tongue rise each time.",
+        "self_check": "The front of your tongue should be relaxed; the back should be doing the work.",
+    },
+    # ── VOWEL HEIGHT ──────────────────────────────────────────────────────
+    {
+        "features": ["high"],
+        "direction": "missing",
+        "tip": (
+            "This vowel needs your tongue to be high in your mouth. "
+            "Think of 'ee' in 'feet' or 'oo' in 'food' — the tongue is raised close to the palate. "
+            "Raise your tongue toward the roof of your mouth as you say the vowel."
+        ),
+        "drill": "Slide from 'ah' (low, jaw open) → 'ee' (high, jaw nearly closed) and feel the tongue rise.",
+        "self_check": "Your jaw should be mostly closed; the tongue should be near the roof.",
+    },
+    {
+        "features": ["mid"],
+        "direction": "missing",
+        "tip": (
+            "This vowel needs a mid-height tongue position — halfway between fully raised and fully lowered. "
+            "Think of 'eh' in 'bed' or 'oh' in 'boat'. "
+            "Relax your jaw to a half-open position."
+        ),
+        "drill": "Slide 'ee' (high) → 'eh' (mid) → 'ah' (low) and stop at the middle position.",
+        "self_check": "Your jaw should be half open — neither clenched nor dropped wide.",
+    },
+    {
+        "features": ["low"],
+        "direction": "missing",
+        "tip": (
+            "This vowel needs your tongue to drop down and your jaw to open wide. "
+            "Think of 'ah' in 'father' or 'ae' in 'cat' — the tongue is flat and low. "
+            "Let your jaw drop and your tongue rest at the bottom of your mouth."
+        ),
+        "drill": "Say 'ah' like a doctor's exam — exaggerate the open jaw and flat tongue.",
+        "self_check": "Your jaw should be open wide; your tongue should feel flat at the bottom.",
+    },
+    # ── VOWEL BACKNESS ───────────────────────────────────────────────────
+    {
+        "features": ["front"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should be made with your tongue pushed toward the front of your mouth. "
+            "Smile slightly — this naturally pulls the tongue body forward."
+        ),
+        "drill": "Say 'ee – ay – eh' and feel your tongue staying at the front for all three.",
+        "self_check": "You should feel tension or contact toward the front of your mouth.",
+    },
+    {
+        "features": ["back"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should be made with your tongue retracted toward the back of your mouth. "
+            "Round your lips slightly and pull your tongue body backward as you say the vowel."
+        ),
+        "drill": "Say 'oo – oh – aw' — feel your tongue pulling back and the lips rounding each time.",
+        "self_check": "You should feel the back of your tongue arch upward and backward.",
+    },
+    {
+        "features": ["central"],
+        "direction": "missing",
+        "tip": (
+            "This vowel (like the schwa /ə/ in 'about') should be made with a completely neutral, "
+            "centered tongue — not pushed forward or pulled back. "
+            "Relax all tension in your jaw, lips, and tongue."
+        ),
+        "drill": "Say 'uh' with a completely relaxed, drooping jaw and limp tongue.",
+        "self_check": "Your mouth should feel effortless, tongue neither front nor back.",
+    },
+    # ── LIP ROUNDING (Others group: 'round') ─────────────────────────────
+    {
+        "features": ["round"],
+        "direction": "missing",
+        "tip": (
+            "This sound requires rounded, protruded lips — like you are blowing out a candle. "
+            "Form an 'oo' shape with your lips before and during the sound."
+        ),
+        "drill": "Exaggerate lip rounding: say 'oo – oh – aw' with very pursed lips.",
+        "self_check": "Look in a mirror — your lips should form a clear circle or oval.",
+    },
+    {
+        "features": ["round"],
+        "direction": "extra",
+        "tip": (
+            "You are rounding your lips when they should be spread or neutral. "
+            "Spread your lips into a slight smile and keep them flat as you say the sound."
+        ),
+        "drill": "Say 'ee – ih – eh' with a relaxed smile — no lip rounding at all.",
+        "self_check": "Your lips should be flat or slightly spread, not puckered.",
+    },
+    # ── VOWEL LENGTH (Others group: 'long' / 'short') ─────────────────────
+    {
+        "features": ["long"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should be noticeably longer in duration. "
+            "English long vowels (/iy/, /uw/, /aa/, /ao/, /ae/, /er/) are roughly twice "
+            "as long as their short counterparts. Stretch it out."
+        ),
+        "drill": "Say 'beat' and hold the vowel: 'beeeeat'. Then compare with the short 'bit'.",
+        "self_check": "Record yourself — the vowel should sound stretched, not clipped.",
+    },
+    {
+        "features": ["short"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should be brief and clipped. "
+            "Short vowels (/ih/, /eh/, /ah/, /uh/) are reduced in duration. "
+            "Don't let the vowel linger — move quickly to the next sound."
+        ),
+        "drill": "Say 'bit', 'bet', 'but', 'book' — snap off each vowel quickly.",
+        "self_check": "The vowel should feel brief. If you can hold it comfortably, it's too long.",
+    },
+    # ── VOWEL TYPE (Others group: 'monophthong' / 'diphthong') ──────────
+    {
+        "features": ["monophthong"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should be pure and steady — your tongue and lips should hold the same "
+            "position throughout. You may be letting the vowel glide (diphthongize). "
+            "Keep your tongue and jaw completely still from start to finish."
+        ),
+        "drill": "Hold /aa/, /iy/, or /uw/ for three seconds without any movement.",
+        "self_check": "The vowel quality should be identical at the beginning and end — no glide.",
+    },
+    {
+        "features": ["diphthong"],
+        "direction": "missing",
+        "tip": (
+            "This vowel should glide from one position to another — it is a diphthong. "
+            "English diphthongs like /ay/ ('bite'), /aw/ ('bout'), /oy/ ('boy'), "
+            "/ey/ ('bait'), /ow/ ('boat') have a clear movement. "
+            "Let your tongue and jaw glide smoothly to the second target."
+        ),
+        "drill": "Say 'buy – bow – boy – bay – boat' slowly and feel the glide in each vowel.",
+        "self_check": "The vowel should sound like it is moving, not fixed in one place.",
+    },
+]
+# Build a fast lookup: feature → list of applicable rules
+_RULE_INDEX: Dict[str, List[Dict]] = {}
+for rule in FEATURE_RULES:
+    for feat in rule["features"]:
+        _RULE_INDEX.setdefault(feat, []).append(rule)
+# ──────────────────────────────────────────────
+# 2.  Rule matcher
+# ──────────────────────────────────────────────
+@dataclass
+class RuleFeedback:
+    # One deduplicated piece of advice built by match_rules() from a rule entry.
+    feature: str         # feature name that triggered the rule
+    direction: str       # "missing" | "extra"
+    tip: str             # copied from the rule's "tip"
+    drill: str           # copied from the rule's "drill"
+    self_check: str      # copied from the rule's "self_check"
+    count: int = 1       # starts at 1; match_rules bumps it on each repeat hit
+def match_rules(errors: List[PhonemeError]) -> List[RuleFeedback]:
+    """
+    Collect the feedback rules triggered by a list of phoneme errors.
+    Each (feature, direction) pair yields at most one RuleFeedback: the first
+    matching rule supplies the texts, and every further match only bumps its
+    ``count``. A rule matches when its "direction" equals the error direction
+    or is "both".
+    Returns the collected feedback ordered by ``count``, highest first; pairs
+    with equal counts keep the order in which they were first triggered.
+    """
+    hits: Dict[Tuple[str, str], RuleFeedback] = {}
+    for err in errors:
+        # Missing features are walked before extra ones for every error, which
+        # fixes the first-seen order used as the tie-break of the final sort.
+        for direction, feature_names in (
+            ("missing", err.missing_features),
+            ("extra", err.extra_features),
+        ):
+            for name in feature_names:
+                for rule in _RULE_INDEX.get(name, []):
+                    if rule["direction"] not in (direction, "both"):
+                        continue
+                    key = (name, direction)
+                    known = hits.get(key)
+                    if known is not None:
+                        known.count += 1
+                        continue
+                    hits[key] = RuleFeedback(
+                        feature=name,
+                        direction=direction,
+                        tip=rule["tip"],
+                        drill=rule["drill"],
+                        self_check=rule["self_check"],
+                    )
+    # Most frequently triggered feedback first (stable sort).
+    return sorted(hits.values(), key=lambda fb: -fb.count)
+# ──────────────────────────────────────────────
+# 3.  Template-based fallback feedback (no LLM needed)
+# ──────────────────────────────────────────────
+def format_feedback_template(
+    result: MDDResult,
+    rules: List[RuleFeedback],
+    max_issues: int = 3,
+) -> str:
+    """
+    Render rule-based feedback as structured plain text (no LLM involved).
+    The text opens with a headline picked by score band (>= 85, >= 65, >= 45,
+    otherwise). With no rules it closes with a "no errors" note; otherwise it
+    reports the number of erroneous phonemes and lists up to ``max_issues``
+    issues, each with its tip, drill and self-check.
+    """
+    score = result.utterance_score
+    # Headline for the score band
+    if score >= 85:
+        headline = f"🎉 Great pronunciation! Score: {score:.0f}/100"
+    elif score >= 65:
+        headline = f"👍 Good effort! Score: {score:.0f}/100 — a few things to polish."
+    elif score >= 45:
+        headline = f"📚 Score: {score:.0f}/100 — let's work on some key areas."
+    else:
+        headline = f"💪 Score: {score:.0f}/100 — keep practicing, you'll get there!"
+    if not rules:
+        return "\n".join(
+            [headline, "\nNo significant feature errors detected. Well done!"]
+        )
+    parts = [
+        headline,
+        f"\nI found {len(result.errors)} phoneme(s) that need attention.\n",
+    ]
+    for number, rule in enumerate(rules[:max_issues], start=1):
+        direction_word = "missing" if rule.direction == "missing" else "extra"
+        parts += [
+            f"— Issue {number}: [{rule.feature}] feature {direction_word}",
+            f"  💡 {rule.tip}",
+            f"  🏋️  Drill: {rule.drill}",
+            f"  ✅ Self-check: {rule.self_check}\n",
+        ]
+    return "\n".join(parts)
+# ──────────────────────────────────────────────
+# 4.  LLM-enhanced feedback
+# ──────────────────────────────────────────────
+LLM_SYSTEM_PROMPT = """You are a warm, encouraging English pronunciation coach.
+Your student just attempted to say a sentence and you've identified specific
+phonological feature errors. Your task is to rewrite the structured feedback
+into a single natural, conversational coaching response.
+Rules:
+- Keep ALL the articulatory tips and self-checks intact — do not omit or soften them.
+- Write as if speaking to the student directly.
+- Be encouraging but honest.
+- Limit response to 200 words maximum.
+- Do not add new advice not present in the structured feedback.
+- Start with a brief overall assessment, then naturally weave in the tips.
+- End with one motivating sentence.
+"""  # system message sent to both the local model and the cloud API in generate_llm_feedback()
+# Process-wide cache of loaded local LLMs, keyed by model name, so the
+# tokenizer and weights are loaded once instead of on every feedback request.
+_LOCAL_LLM_CACHE: Dict[str, tuple] = {}
+def _load_local_llm(model_name: str) -> tuple:
+    """Return a (tokenizer, model) pair for ``model_name``, loading it on first use."""
+    if model_name not in _LOCAL_LLM_CACHE:
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+        import torch
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto",
+        )
+        # Only successful loads are cached; a failed load raises and is simply
+        # retried on the next call.
+        _LOCAL_LLM_CACHE[model_name] = (tokenizer, model)
+    return _LOCAL_LLM_CACHE[model_name]
+def generate_llm_feedback(
+    structured_feedback: str,
+    score: float,
+    model_name: str = "Qwen/Qwen2.5-0.5B-Instruct",  # lightweight default
+    use_cloud_fallback: bool = True,
+) -> str:
+    """
+    Rewrites structured feedback into natural coaching language.
+    Tries (in order):
+    1. Local transformers model (loaded once per process and then reused)
+    2. Cloud LLM API (if use_cloud_fallback=True and API key set)
+    3. Returns structured_feedback unchanged as graceful degradation
+    Parameters
+    ----------
+    structured_feedback : template feedback text to be rephrased.
+    score : utterance score (0-100) quoted in the prompt.
+    model_name : Hugging Face model id of the local rewriter.
+    use_cloud_fallback : whether the OpenAI-compatible API may be tried when
+        the local model fails. The key is read from OPENAI_API_KEY or
+        LLM_API_KEY, the endpoint from LLM_API_BASE, the model from LLM_MODEL.
+    Returns
+    -------
+    The rewritten feedback, or ``structured_feedback`` itself when no backend
+    produced a non-empty answer. Backend failures are logged with print() and
+    never raised to the caller.
+    """
+    prompt = f"""Here is structured pronunciation feedback for a student who scored {score:.0f}/100:
+{structured_feedback}
+Please rewrite this as a warm, natural coaching response."""
+    # --- Try local model first ---
+    try:
+        import torch
+        tokenizer, model = _load_local_llm(model_name)
+        messages = [
+            {"role": "system", "content": LLM_SYSTEM_PROMPT},
+            {"role": "user", "content": prompt},
+        ]
+        text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = tokenizer([text], return_tensors="pt").to(model.device)
+        with torch.no_grad():
+            output = model.generate(
+                **inputs,
+                max_new_tokens=256,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+            )
+        # Decode only the newly generated tokens, not the echoed prompt.
+        response = tokenizer.decode(
+            output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
+        ).strip()
+        if response:
+            return response
+        # An empty generation is useless to the student; fall through.
+        print("[Local LLM] Empty response, trying fallback")
+    except Exception as local_err:
+        # Best effort by design: any failure just moves on to the next backend.
+        print(f"[Local LLM] Not available: {local_err}")
+    # --- Cloud fallback (OpenAI-compatible API) ---
+    if use_cloud_fallback:
+        api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("LLM_API_KEY")
+        api_base = os.environ.get("LLM_API_BASE", "https://api.openai.com/v1")
+        cloud_model = os.environ.get("LLM_MODEL", "gpt-4o-mini")
+        if api_key:
+            try:
+                import httpx
+                headers = {
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                }
+                body = {
+                    "model": cloud_model,
+                    "messages": [
+                        {"role": "system", "content": LLM_SYSTEM_PROMPT},
+                        {"role": "user", "content": prompt},
+                    ],
+                    "max_tokens": 300,
+                    "temperature": 0.7,
+                }
+                # Tolerate a configured base URL that ends with a slash.
+                url = f"{api_base.rstrip('/')}/chat/completions"
+                r = httpx.post(url, json=body, headers=headers, timeout=15)
+                r.raise_for_status()
+                content = r.json()["choices"][0]["message"]["content"].strip()
+                if content:
+                    return content
+                print("[Cloud LLM] Empty response, using template feedback")
+            except Exception as cloud_err:
+                print(f"[Cloud LLM] Failed: {cloud_err}")
+    # --- Graceful degradation ---
+    return structured_feedback
+# ──────────────────────────────────────────────
+# 5.  Main feedback pipeline
+# ──────────────────────────────────────────────
+def generate_feedback(
+    result: MDDResult,
+    use_llm: bool = True,
+    max_issues: int = 3,
+) -> Dict:
+    """
+    Full feedback pipeline. Returns a dict with keys:
+      score, template_feedback, final_feedback, error_summary,
+      feature_error_counts, rules_triggered
+    Parameters
+    ----------
+    result : MDD result to explain.
+    use_llm : when True and at least one rule was triggered, the template text
+        is passed through generate_llm_feedback(); otherwise the template text
+        is used as the final feedback.
+    max_issues : maximum number of issues rendered in the template text.
+    """
+    rules = match_rules(result.errors)
+    template_fb = format_feedback_template(result, rules, max_issues)
+    if use_llm and rules:
+        final_fb = generate_llm_feedback(template_fb, result.utterance_score)
+    else:
+        final_fb = template_fb
+    error_summary = [
+        {
+            "position": e.position,
+            "target": e.target_phoneme,
+            # The feature-level engine does not always identify which phoneme
+            # was produced; report None rather than raising AttributeError
+            # when the error object carries no such attribute.
+            "produced": getattr(e, "produced_phoneme", None),
+            "missing_features": e.missing_features,
+            "extra_features": e.extra_features,
+            "accuracy": round(e.feature_accuracy, 3),
+            "severity": e.severity,
+        }
+        for e in result.errors
+    ]
+    return {
+        "score": round(result.utterance_score, 1),
+        "template_feedback": template_fb,
+        "final_feedback": final_fb,
+        "error_summary": error_summary,
+        "feature_error_counts": result.feature_error_counts,
+        "rules_triggered": [
+            {
+                "feature": r.feature,
+                "direction": r.direction,
+                "occurrences": r.count,
+            }
+            for r in rules
+        ],
+    }

mdd_engine.py ADDED Viewed

	@@ -0,0 +1,534 @@

+"""
+MDD Engine — Mispronunciation Detection and Diagnosis
+=====================================================
+Architecture (Shahin et al. 2025)
+----------------------------------
+Your model runs 35 independent CTC decoders, one per phonological feature.
+Each decoder outputs a sequence of +att(1) / -att(0) labels, with blanks
+already removed and runs collapsed — so the output length reflects the number
+of detected phoneme-level events, NOT audio frames.
+The canonical target comes from the user's typed sentence:
+    sentence → G2P (CMU ARPAbet) → phoneme_sequence_to_feature_sequences()
+    → 35 binary label sequences of length T (number of target phonemes)
+The problem: the actual decoded sequence per feature may have a DIFFERENT
+length than T, because the student may have:
+    - deleted phonemes  (actual shorter than target)
+    - inserted extras   (actual longer than target)
+    - substituted       (same length, wrong labels)
+Solution: Needleman-Wunsch (global sequence alignment) per feature
+------------------------------------------------------------------
+For each of the 35 features we run a global pairwise alignment between the
+target binary sequence and the actual binary sequence. This gives us an
+explicit alignment path with match / mismatch / insertion / deletion ops.
+We then aggregate across all 35 features to get, per target phoneme position:
+    - which actual position it maps to (or DELETION if no match)
+    - which features are missing (+att in target, -att or gap in actual)
+    - which features are extra   (-att in target, +att in actual)
+    - a weighted feature accuracy score
+This is the standard approach in phonological MDD literature when no frame-
+level forced alignment is available (see e.g. Lee & Glass 2015, Leung et al.
+2019, and the feature-based MDD track of the AIP challenge).
+Input/output contract
+---------------------
+  actual_feature_seqs : list[list[int]]   — 35 lists, each decoded CTC output
+                                            Values: 1 (+att) or 0 (-att)
+                                            Lengths may differ across features
+                                            and from the canonical length T
+  target_phonemes     : list[str]         — CMU ARPAbet phoneme sequence from
+                                            the user's typed sentence, length T
+Output: MDDResult (see dataclass below)
+"""
+from __future__ import annotations
+import numpy as np
+from dataclasses import dataclass, field
+from typing import List, Dict, Tuple, Optional
+from phonological_features import (
+    PHONOLOGICAL_FEATURES,
+    phoneme_sequence_to_feature_sequences,
+    phoneme_to_feature_vector,
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# 1.  Feature schema & weights
+# ─────────────────────────────────────────────────────────────────────────────
+FEATURE_NAMES: List[str] = PHONOLOGICAL_FEATURES   # 35 features, canonical order
+NUM_FEATURES = len(FEATURE_NAMES)                   # 35
+assert NUM_FEATURES == 35
+F2I: Dict[str, int] = {f: i for i, f in enumerate(FEATURE_NAMES)}   # feature name → index in FEATURE_NAMES
+# Perceptual salience weights — higher = more important mismatch; entry i weights FEATURE_NAMES[i].
+# Manner errors (wrong sound class) are most disruptive.
+# Voicing errors are highly salient in English.
+# Place errors matter but less so than manner.
+# Length/type distinctions are least salient in L2 MDD.
+FEATURE_WEIGHTS: np.ndarray = np.array([
+    # Manners (11): consonant sonorant fricative nasal stop
+    2.0, 1.5, 1.8, 2.0, 2.0,
+    # approximant affricate liquid vowel semivowel continuant
+    1.5, 1.8, 1.5, 2.0, 1.5, 1.2,
+    # Places (18): alveolar palatal dental glottal labial velar
+    1.5, 1.4, 1.3, 1.2, 1.5, 1.5,
+    # mid high low front back central
+    1.8, 1.8, 1.8, 1.6, 1.6, 1.2,
+    # anterior posterior retroflex bilabial coronal dorsal
+    1.3, 1.3, 1.3, 1.4, 1.3, 1.3,
+    # Others (6): long short monophthong diphthong round voiced
+    1.0, 1.0, 1.2, 1.2, 1.2, 2.5,
+], dtype=np.float32)
+assert len(FEATURE_WEIGHTS) == 35
+# Alignment op codes (stored in AlignedPosition.op)
+MATCH    =  0   # every aligned feature bit equals the target bit
+MISMATCH =  1   # aligned to an actual event, but at least one feature bit differs
+DELETE   =  2   # target has event, actual has gap (deletion error)
+INSERT   =  3   # actual has event, target has gap; never assigned by _align_all_features, insertions are only counted
+# NW scoring scheme
+MATCH_SCORE    =  2   # reward when the target and actual labels agree
+MISMATCH_SCORE = -1   # cost when two aligned labels differ
+GAP_PENALTY    = -2   # penalises deletions and insertions equally
+# ─────────────────────────────────────────────────────────────────────────────
+# 2.  Data classes
+# ─────────────────────────────────────────────────────────────────────────────
+@dataclass
+class AlignedPosition:
+    """One position in the target sequence after multi-feature alignment."""
+    target_idx:   int             # index in target phoneme sequence
+    actual_idx:   Optional[int]   # plurality-voted index in actual sequence, None = deletion
+    op:           int             # MATCH / MISMATCH / DELETE as set by _align_all_features
+    target_bits:  List[int]       # canonical feature vector (35 bits)
+    actual_bits:  List[int]       # observed feature vector (35 bits, all 0 if deleted)
+    missing_features: List[str]   # +att in target, -att or gap in actual
+    extra_features:   List[str]   # -att in target, +att in actual
+    feature_accuracy: float       # weighted share of features whose target bit equals the actual bit, 0-1
+@dataclass
+class PhonemeError:
+    """One mispronounced phoneme with its full feature-level diagnosis."""
+    position:         int         # index in target sequence
+    target_phoneme:   str         # ARPAbet label from typed sentence
+    missing_features: List[str]   # features the student failed to produce
+    extra_features:   List[str]   # features the student added erroneously
+    is_deletion:      bool        # student dropped this phoneme entirely
+    feature_accuracy: float       # 0-1, copied from the matching AlignedPosition
+    severity:         str         # "mild" | "moderate" | "severe" (deletions are always "severe")
+@dataclass
+class MDDResult:
+    utterance_score:     float              # 0-100
+    phoneme_scores:      List[float]        # per target phoneme, 0-1
+    errors:              List[PhonemeError]   # only target positions with a feature error or a deletion
+    aligned_positions:   List[AlignedPosition]   # one entry per target phoneme, in target order
+    feature_error_counts: Dict[str, int]   # aggregated across all phonemes, most frequent first
+    deletion_count:      int   # target phonemes with no aligned actual event
+    insertion_count:     int   # actual indices not mapped to any target phoneme
+# ─────────────────────────────────────────────────────────────────────────────
+# 3.  Needleman-Wunsch per-feature aligner
+# ─────────────────────────────────────────────────────────────────────────────
+def _nw_align(target_seq: List[int],
+              actual_seq: List[int]) -> List[Tuple[Optional[int], Optional[int]]]:
+    """
+    Global sequence alignment (Needleman-Wunsch) for two binary label sequences.
+    Returns a list of (target_idx, actual_idx) pairs where:
+        (i, j)      → match or mismatch at target[i], actual[j]
+        (i, None)   → deletion: target[i] has no corresponding actual event
+        (None, j)   → insertion: actual[j] has no corresponding target event
+    Binary values: 1 = +att, 0 = -att
+    """
+    T = len(target_seq)
+    A = len(actual_seq)
+    # Fill score matrix: score[i, j] is the best score for target[:i] vs actual[:j]
+    score = np.zeros((T + 1, A + 1), dtype=np.float32)
+    score[0, :] = np.arange(A + 1) * GAP_PENALTY   # first row: j leading insertions
+    score[:, 0] = np.arange(T + 1) * GAP_PENALTY   # first column: i leading deletions
+    for i in range(1, T + 1):
+        for j in range(1, A + 1):
+            s = MATCH_SCORE if target_seq[i-1] == actual_seq[j-1] else MISMATCH_SCORE
+            score[i, j] = max(
+                score[i-1, j-1] + s,   # match/mismatch
+                score[i-1, j]   + GAP_PENALTY,   # deletion
+                score[i,   j-1] + GAP_PENALTY,   # insertion
+            )
+    # Traceback from cell (T, A); on ties the diagonal move wins, then deletion, then insertion
+    path: List[Tuple[Optional[int], Optional[int]]] = []
+    i, j = T, A
+    while i > 0 or j > 0:
+        if i > 0 and j > 0:
+            s = MATCH_SCORE if target_seq[i-1] == actual_seq[j-1] else MISMATCH_SCORE
+            if score[i, j] == score[i-1, j-1] + s:   # cell was reached diagonally
+                path.append((i-1, j-1))
+                i -= 1; j -= 1
+                continue
+        if i > 0 and score[i, j] == score[i-1, j] + GAP_PENALTY:
+            path.append((i-1, None))   # deletion
+            i -= 1
+        else:
+            path.append((None, j-1))   # insertion
+            j -= 1
+    path.reverse()   # pairs were collected end-to-start
+    return path
+# ─────────────────────────────────────────────────────────────────────────────
+# 4.  Multi-feature alignment aggregator
+# ─────────────────────────────────────────────────────────────────────────────
+def _align_all_features(
+    target_feat_seqs: List[List[int]],   # 35 lists, each length T
+    actual_feat_seqs: List[List[int]],   # 35 lists, each possibly != T
+    T: int,                              # number of target phonemes
+) -> List[AlignedPosition]:
+    """
+    Align every feature sequence with NW and merge the results per target
+    phoneme position.
+    Strategy
+    --------
+    Each of the 35 features is aligned on its own, so each feature casts one
+    vote per target position: the actual index it was aligned to, or a gap.
+    A position is a deletion when there is no non-gap vote or more than half
+    of the votes are gaps; otherwise the most frequent actual index wins
+    (first seen wins a tie).
+    For a non-deleted position every feature contributes the actual label it
+    was itself aligned to (0 when that feature voted for a gap), whether or
+    not it agrees with the winning index.
+    Returns one AlignedPosition per target position, in target order.
+    """
+    gap_vote_limit = 0.5   # more than this share of gap votes → DELETE
+    feature_ids = range(NUM_FEATURES)
+    # ballots[t]   : actual indices (None = gap) voted for target position t
+    # mapping[f][t]: actual index feature f aligned to target position t
+    ballots: List[List[Optional[int]]] = [[] for _ in range(T)]
+    mapping: List[Dict[int, Optional[int]]] = [{} for _ in feature_ids]
+    for f in feature_ids:
+        for tgt_i, act_i in _nw_align(target_feat_seqs[f], actual_feat_seqs[f]):
+            if tgt_i is not None:   # pure insertions carry no target position
+                ballots[tgt_i].append(act_i)
+                mapping[f][tgt_i] = act_i
+    # Denominator of the weighted accuracy; identical for every position.
+    weight_sum = float(FEATURE_WEIGHTS.sum())
+    result: List[AlignedPosition] = []
+    for t in range(T):
+        ballot = ballots[t]
+        hits = [a for a in ballot if a is not None]
+        gap_share = (len(ballot) - len(hits)) / max(len(ballot), 1)
+        winner: Optional[int] = None
+        if hits and gap_share <= gap_vote_limit:
+            tally: Dict[int, int] = {}
+            for a in hits:
+                tally[a] = tally.get(a, 0) + 1
+            winner = max(tally, key=tally.__getitem__)
+        want = [target_feat_seqs[f][t] for f in feature_ids]
+        if winner is None:
+            got = [0] * NUM_FEATURES
+            kind = DELETE
+        else:
+            got = []
+            for f in feature_ids:
+                own = mapping[f].get(t)
+                usable = own is not None and own < len(actual_feat_seqs[f])
+                got.append(actual_feat_seqs[f][own] if usable else 0)
+            kind = MATCH if want == got else MISMATCH
+        lost = [FEATURE_NAMES[f] for f in feature_ids
+                if want[f] == 1 and got[f] == 0]
+        added = [FEATURE_NAMES[f] for f in feature_ids
+                 if want[f] == 0 and got[f] == 1]
+        # Weighted share of features whose target and actual bits agree.
+        agreed = sum(
+            FEATURE_WEIGHTS[f] for f in feature_ids if want[f] == got[f]
+        )
+        result.append(AlignedPosition(
+            target_idx=t,
+            actual_idx=winner,
+            op=kind,
+            target_bits=want,
+            actual_bits=got,
+            missing_features=lost,
+            extra_features=added,
+            feature_accuracy=float(agreed / weight_sum),
+        ))
+    return result
+# ─────────────────────────────────────────────────────────────────────────────
+# 5.  Insertion detector
+# ─────────────────────────────────────────────────────────────────────────────
+def _count_insertions(
+    actual_feat_seqs: List[List[int]],
+    actual_len: int,
+    aligned: List[AlignedPosition],
+) -> int:
+    """
+    Count the actual indices in ``range(actual_len)`` that no aligned target
+    position was mapped to; these are reported as insertions.
+    ``actual_feat_seqs`` is accepted for signature compatibility but not read.
+    """
+    matched = {ap.actual_idx for ap in aligned if ap.actual_idx is not None}
+    return sum(1 for idx in range(actual_len) if idx not in matched)
+# ─────────────────────────────────────────────────────────────────────────────
+# 6.  Severity classifier
+# ─────────────────────────────────────────────────────────────────────────────
+# Minimum weighted feature accuracy required for each severity label;
+# anything below the "moderate" cut-off is "severe".
+_SEV = {"mild": 0.85, "moderate": 0.65}
+def _severity(accuracy: float, is_deletion: bool) -> str:
+    """
+    Map a phoneme's feature accuracy to "mild", "moderate" or "severe".
+    A deleted phoneme is always "severe", whatever its accuracy.
+    """
+    if not is_deletion:
+        for label in ("mild", "moderate"):
+            if accuracy >= _SEV[label]:
+                return label
+    return "severe"
+# ─────────────────────────────────────────────────────────────────────────────
+# 7.  Scorer
+# ─────────────────────────────────────────────────────────────────────────────
+def _score_utterance(aligned: List[AlignedPosition]) -> Tuple[float, List[float]]:
+    """
+    Score an aligned utterance.
+    Per-phoneme score: weighted feature accuracy (0-1).
+    Deletions score 0. Their stored feature accuracy is not used, because an
+    all-zero actual vector still "matches" every -att target feature and would
+    give a dropped phoneme a high score.
+    Utterance score: plain mean of the per-phoneme scores, scaled to 0-100.
+    Returns
+    -------
+    (utterance_score, phoneme_scores); (0.0, []) for an empty alignment.
+    """
+    if not aligned:
+        # np.mean of an empty list would yield NaN plus a RuntimeWarning.
+        return 0.0, []
+    phoneme_scores = [
+        # actual_idx is None exactly for positions resolved as deletions.
+        0.0 if ap.actual_idx is None else float(ap.feature_accuracy)
+        for ap in aligned
+    ]
+    utterance_score = float(np.mean(phoneme_scores)) * 100.0
+    return utterance_score, phoneme_scores
+# ─────────────────────────────────────────────────────────────────────────────
+# 8.  Error list builder
+# ─────────────────────────────────────────────────────────────────────────────
+def _build_errors(
+    aligned: List[AlignedPosition],
+    target_phonemes: List[str],
+) -> List[PhonemeError]:
+    """
+    Turn every imperfect aligned position into a PhonemeError.
+    A position is skipped only when its op is MATCH and it has neither
+    missing nor extra features; all others, deletions included, are reported
+    in target order.
+    """
+    def _is_clean(ap: AlignedPosition) -> bool:
+        return ap.op == MATCH and not (ap.missing_features or ap.extra_features)
+    report: List[PhonemeError] = []
+    for ap in aligned:
+        if _is_clean(ap):
+            continue
+        dropped = ap.op == DELETE
+        report.append(PhonemeError(
+            position=ap.target_idx,
+            target_phoneme=target_phonemes[ap.target_idx],
+            missing_features=ap.missing_features,
+            extra_features=ap.extra_features,
+            is_deletion=dropped,
+            feature_accuracy=ap.feature_accuracy,
+            severity=_severity(ap.feature_accuracy, dropped),
+        ))
+    return report
+# ─────────────────────────────────────────────────────────────────────────────
+# 9.  Aggregate feature error counts
+# ─────────────────────────────────────────────────────────────────────────────
+def _aggregate(errors: List[PhonemeError]) -> Dict[str, int]:
+    """
+    Count how often each feature appears as missing or extra across all
+    errors. The returned dict is ordered by count, highest first; features
+    with equal counts keep their first-seen order.
+    """
+    tally: Dict[str, int] = {}
+    for err in errors:
+        for name in (*err.missing_features, *err.extra_features):
+            tally[name] = tally.get(name, 0) + 1
+    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
+    return dict(ranked)
+# ─────────────────────────────────────────────────────────────────────────────
+# 10.  Public entry point
+# ─────────────────────────────────────────────────────────────────────────────
+def run_mdd(
+    actual_feature_seqs: List[List[int]],
+    target_phonemes: List[str],
+) -> MDDResult:
+    """
+    Run the complete MDD pipeline for a CTC phonological-feature model.
+    Parameters
+    ----------
+    actual_feature_seqs : list of 35 lists of int (0 or 1)
+        CTC-decoded model output, AFTER blank removal and run-length
+        collapsing: one +att/−att sequence per feature. The lists may differ
+        in length from each other and from len(target_phonemes). Their order
+        must match PHONOLOGICAL_FEATURES / FEATURE_NAMES.
+        For model logits of shape (T_audio, 71) the node layout is:
+            nodes 0-34  = +att for features 0-34
+            nodes 35-69 = -att for features 0-34
+            node  70    = blank
+        so the decoded sequence of feature i holds 1 where its +att node
+        fired and 0 where its -att node fired, blanks removed.
+    target_phonemes : list of str
+        CMU ARPAbet phoneme sequence of the typed sentence, e.g. from g2p_en:
+            from g2p_en import G2p
+            target_phonemes = G2p()(sentence)
+    Returns
+    -------
+    MDDResult
+    """
+    assert len(actual_feature_seqs) == 35, \
+        f"Expected 35 feature sequences, got {len(actual_feature_seqs)}"
+    assert len(target_phonemes) > 0, "target_phonemes must not be empty"
+    n_targets = len(target_phonemes)
+    # Canonical 35 x n_targets feature sequences derived from the phoneme labels
+    canonical: List[List[int]] = phoneme_sequence_to_feature_sequences(
+        target_phonemes
+    )
+    # Longest decoded sequence; the index range checked for insertions
+    longest_actual = max(map(len, actual_feature_seqs), default=0)
+    # 1. Per-feature NW alignment merged per target position
+    aligned = _align_all_features(canonical, actual_feature_seqs, n_targets)
+    # 2. Structural errors
+    n_deleted = len([ap for ap in aligned if ap.op == DELETE])
+    n_inserted = _count_insertions(actual_feature_seqs, longest_actual, aligned)
+    # 3. Scores
+    utt_score, phoneme_scores = _score_utterance(aligned)
+    # 4. Error list and 5. per-feature error counts
+    errors = _build_errors(aligned, target_phonemes)
+    per_feature_counts = _aggregate(errors)
+    return MDDResult(
+        utterance_score=utt_score,
+        phoneme_scores=phoneme_scores,
+        errors=errors,
+        aligned_positions=aligned,
+        feature_error_counts=per_feature_counts,
+        deletion_count=n_deleted,
+        insertion_count=n_inserted,
+    )
+# ─────────────────────────────────────────────────────────────────────────────
+# 11.  CTC decode helper  (use this on raw model logits)
+# ─────────────────────────────────────────────────────────────────────────────
+def ctc_decode_feature_seqs(
+    logits: np.ndarray,          # (T_audio, 71)  — raw model output per frame
+    blank_idx: int = 70,
+) -> List[List[int]]:
+    """
+    Greedy CTC decode for a phonological feature model with 71 output nodes.
+    A frame counts as blank when ``blank_idx`` is the argmax over all of its
+    nodes; blank frames emit nothing and end the current run. On every other
+    frame each of the 35 features independently emits 1 when its +att node
+    (index feat_i) is >= its -att node (index feat_i + 35), else 0, and
+    consecutive identical labels are collapsed into one.
+    Parameters
+    ----------
+    logits : np.ndarray (T_audio, 71)
+        Raw model output before softmax. Probabilities may be passed instead;
+        only comparisons and argmax are used.
+    blank_idx : int
+        Index of the shared blank node (default 70).
+    Returns
+    -------
+    List of 35 lists of int (0 or 1), CTC-decoded.
+    """
+    n_frames = logits.shape[0]
+    # The blank decision does not depend on the feature, so take it once per
+    # frame instead of once per frame and feature.
+    is_blank = [int(np.argmax(logits[t])) == blank_idx for t in range(n_frames)]
+    decoded: List[List[int]] = []
+    for feat_i in range(35):
+        plus_col, minus_col = feat_i, feat_i + 35   # +att / -att node indices
+        seq: List[int] = []
+        last = None
+        for t in range(n_frames):
+            if is_blank[t]:
+                last = None   # a blank separates runs, so a repeat is emitted again
+                continue
+            row = logits[t]
+            current = 1 if row[plus_col] >= row[minus_col] else 0
+            if current != last:   # run-length collapse
+                seq.append(current)
+                last = current
+        decoded.append(seq)
+    return decoded

phonological_features.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+phonological_features.py
+========================
+Defines the 35 phonological features from Table 1 of Shahin et al. (2025)
+and provides the phoneme-to-feature mapping for the 39-phoneme CMU set.
+Feature categories (paper Table 1):
+  Manners: consonant, sonorant, fricative, nasal, stop, approximant,
+           affricate, liquid, vowel, semivowel, continuant
+  Places:  alveolar, palatal, dental, glottal, labial, velar, mid, high,
+           low, front, back, central, anterior, posterior, retroflex,
+           bilabial, coronal, dorsal
+  Others:  long, short, monophthong, diphthong, round, voiced
+The model output has 71 nodes: 35 (+att) + 35 (-att) + 1 (shared blank).
+"""
+# ─────────────────────────────────────────────────────────────────────────────
+# The 35 phonological features (paper Table 1), in a fixed canonical order
+# ─────────────────────────────────────────────────────────────────────────────
+PHONOLOGICAL_FEATURES = [
+    # Manners (11)
+    "consonant", "sonorant", "fricative", "nasal", "stop",
+    "approximant", "affricate", "liquid", "vowel", "semivowel", "continuant",
+    # Places (18)
+    "alveolar", "palatal", "dental", "glottal", "labial", "velar",
+    "mid", "high", "low", "front", "back", "central",
+    "anterior", "posterior", "retroflex", "bilabial", "coronal", "dorsal",
+    # Others (6)
+    "long", "short", "monophthong", "diphthong", "round", "voiced",
+]
+assert len(PHONOLOGICAL_FEATURES) == 35, "Must have exactly 35 features"
+FEATURE_TO_IDX = {feat: i for i, feat in enumerate(PHONOLOGICAL_FEATURES)}   # feature name → position in the canonical order
+NUM_FEATURES = len(PHONOLOGICAL_FEATURES)   # 35, guaranteed by the assert above
+# ─────────────────────────────────────────────────────────────────────────────
+# Output node layout (paper Section 3.3):
+#   nodes 0..34        → +att for features 0..34
+#   nodes 35..69       → -att for features 0..34
+#   node  70           → shared blank
+# ─────────────────────────────────────────────────────────────────────────────
+NUM_OUTPUT_NODES = 71   # 35 (+att) + 35 (-att) + 1 (blank)
+BLANK_IDX = 70   # index of the shared blank node, the last of the 71
+def feature_idx_to_pos_node(feat_idx: int) -> int:
+    """Return output node index for +att of a given feature."""
+    return feat_idx   # +att nodes sit at 0..34, i.e. at the feature index itself
+def feature_idx_to_neg_node(feat_idx: int) -> int:
+    """Return output node index for -att of a given feature."""
+    return feat_idx + NUM_FEATURES   # -att nodes follow the 35 +att nodes (35..69)
+# ─────────────────────────────────────────────────────────────────────────────
+# CMU 39-phoneme set  (TIMIT 61→39 reduced set used in the paper)
+# ─────────────────────────────────────────────────────────────────────────────
+CMU_39_PHONEMES = [
+    "aa", "ae", "ah", "aw", "ay","ao",
+    "b",  "ch", "d",  "dh", "eh",
+    "er", "ey", "f",  "g",  "hh",
+    "ih", "iy", "jh", "k",  "l",
+    "m",  "n",  "ng", "ow", "oy",
+    "p",  "r",  "s",  "sh", "t",
+    "th", "uh", "uw", "v",  "w",
+    "y",  "z",  "zh",
+]
+PHONEME_TO_IDX = {p: i for i, p in enumerate(CMU_39_PHONEMES)}   # lowercase phoneme label → position in CMU_39_PHONEMES
+NUM_PHONEMES = len(CMU_39_PHONEMES)   # 39
+# ─────────────────────────────────────────────────────────────────────────────
+# Phoneme → phonological feature binary vector
+# Each phoneme maps to a dict {feature_name: True/False}.
+# Derived from standard phonological feature charts (Chomsky & Halle 1968,
+# as referenced in the paper).
+# ─────────────────────────────────────────────────────────────────────────────
+def _p(features_present: list[str]) -> dict[str, bool]:
+    """Helper: build feature dict from list of present features."""
+    return {f: (f in features_present) for f in PHONOLOGICAL_FEATURES}
+# Canonical feature table: maps each phoneme symbol to a dict that covers
+# every entry of PHONOLOGICAL_FEATURES, True for the features listed in the
+# `_p([...])` call and False for all others.
+# NOTE(review): these values are presumably the training targets of the
+# deployed checkpoint — confirm against the training code before changing any
+# entry, even the ones flagged below.
+PHONEME_FEATURES: dict[str, dict[str, bool]] = {
+    # ── Stops ──────────────────────────────────────────────────────────────
+    "p":  _p(["consonant", "stop", "labial", "anterior", "bilabial"]),
+    "b":  _p(["consonant", "stop", "labial", "anterior", "bilabial",
+               "voiced"]),
+    "t":  _p(["consonant", "stop", "alveolar", "anterior", "coronal"]),
+    "d":  _p(["consonant", "stop", "alveolar", "anterior", "coronal",
+               "voiced"]),
+    "k":  _p(["consonant", "stop", "velar", "posterior", "dorsal"]),
+    "g":  _p(["consonant", "stop", "velar", "posterior", "dorsal",
+               "voiced"]),
+    # ── Fricatives ─────────────────────────────────────────────────────────
+    "f":  _p(["consonant", "fricative", "continuant", "labial", "anterior"]),
+    "v":  _p(["consonant", "fricative", "continuant", "labial", "anterior", "voiced"]),
+    "th": _p(["consonant", "fricative", "continuant", "dental", "anterior",
+               "coronal"]),
+    "dh": _p(["consonant", "fricative", "continuant", "dental", "anterior",
+               "coronal", "voiced"]),
+    "s":  _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
+               "coronal"]),
+    "z":  _p(["consonant", "fricative", "continuant", "alveolar", "anterior",
+               "coronal", "voiced"]),
+    "sh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
+               "coronal"]),
+    "zh": _p(["consonant", "fricative", "continuant", "palatal", "posterior",
+               "coronal", "voiced"]),
+    "hh": _p(["consonant", "fricative", "continuant", "glottal", "posterior",
+               "dorsal"]),
+    # ── Affricates ─────────────────────────────────────────────────────────
+    "ch": _p(["consonant", "affricate", "palatal", "posterior", "coronal"]),
+    "jh": _p(["consonant", "affricate", "palatal", "posterior", "coronal",
+               "voiced"]),
+    # ── Nasals ─────────────────────────────────────────────────────────────
+    "m":  _p(["consonant", "sonorant", "nasal", "continuant", "labial",
+               "anterior", "bilabial", "voiced"]),
+    "n":  _p(["consonant", "sonorant", "nasal", "continuant", "alveolar",
+               "anterior", "coronal", "voiced"]),
+    "ng": _p(["consonant", "sonorant", "nasal", "continuant", "velar",
+               "posterior", "dorsal", "voiced"]),
+    # ── Liquids ────────────────────────────────────────────────────────────
+    "l":  _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
+               "alveolar", "anterior", "coronal", "voiced"]),
+    "r":  _p(["consonant", "sonorant", "approximant", "liquid", "continuant",
+               "alveolar", "anterior", "retroflex", "coronal", "voiced"]),
+    # ── Semivowels (Glides) ────────────────────────────────────────────────
+    # Unlike the groups above, the glides are not tagged "consonant".
+    "w":  _p(["sonorant", "approximant", "semivowel", "continuant", "labial",
+               "high", "anterior", "bilabial", "round", "voiced"]),
+    "y":  _p(["sonorant", "approximant", "semivowel", "continuant", "palatal",
+               "high", "posterior", "coronal", "voiced"]),
+    # ── Short Monophthong Vowels ───────────────────────────────────────────
+    "ih": _p(["sonorant", "vowel", "continuant", "high", "front",
+               "short", "monophthong", "voiced"]),
+    # NOTE(review): "eh" has no "continuant", unlike every other vowel in this
+    # table except "ay" — confirm whether this is intentional.
+    "eh": _p(["sonorant", "vowel", "mid", "front",
+               "short", "monophthong", "voiced"]),
+    # NOTE(review): "ae" sits under the short-vowel heading but is tagged
+    # "long" — confirm which one is intended.
+    "ae": _p(["sonorant", "vowel", "continuant", "low", "front",
+               "long", "monophthong", "voiced"]),
+    "ah": _p(["sonorant", "vowel", "continuant", "mid", "back",
+               "short", "monophthong", "voiced"]),
+    "uh": _p(["sonorant", "vowel", "continuant", "high", "back",
+               "short", "monophthong", "round", "voiced"]),
+    # ── Long Monophthong Vowels ────────────────────────────────────────────
+    "iy": _p(["sonorant", "vowel", "continuant", "high", "front",
+               "long", "monophthong", "voiced"]),
+    "aa": _p(["sonorant", "vowel", "continuant", "low", "back",
+               "long", "monophthong", "voiced"]),
+    "ao": _p(["sonorant", "vowel", "continuant", "mid", "back",
+               "long", "monophthong", "round", "voiced"]),
+    # NOTE(review): "er" sits under the long-vowel heading but is tagged
+    # "short" — confirm which one is intended.
+    "er": _p(["sonorant", "vowel", "continuant", "mid", "central",
+               "retroflex", "short", "monophthong", "voiced"]),
+    "uw": _p(["sonorant", "vowel", "continuant", "high", "back",
+               "long", "monophthong", "round", "voiced"]),
+    # ── Diphthongs ─────────────────────────────────────────────────────────
+    "ey": _p(["sonorant", "vowel", "continuant", "mid", "front",
+               "long", "diphthong", "voiced"]),
+    "aw": _p(["sonorant", "vowel", "continuant", "low", "central",
+               "long", "diphthong", "round", "voiced"]),
+    # NOTE(review): "ay" has no "continuant", unlike the other diphthongs —
+    # confirm whether this is intentional.
+    "ay": _p(["sonorant", "vowel", "low", "central",
+               "long", "diphthong", "voiced"]),
+    "oy": _p(["sonorant", "vowel", "continuant", "mid", "back",
+               "long", "diphthong", "round", "voiced"]),
+    "ow": _p(["sonorant", "vowel", "continuant", "mid", "central",
+               "long", "diphthong", "round", "voiced"]),
+    # ── Silence ────────────────────────────────────────────────────────────
+    # Paper: "All silence labels were further removed leaving silence frames
+    # to be handled by the blank label."
+    "sil": _p([]),   # all features absent; treated as blank during training
+}
+# Verify all 39 phonemes are covered.
+# "sil" is intentionally extra — it is a fallback/blank placeholder, not a
+# speech target, so it lives in PHONEME_FEATURES but not in CMU_39_PHONEMES.
+_expected = set(CMU_39_PHONEMES) | {"sil"}
+assert set(PHONEME_FEATURES.keys()) == _expected, (
+    f"Missing from PHONEME_FEATURES : {_expected - set(PHONEME_FEATURES.keys())}\n"
+    f"Unexpected in PHONEME_FEATURES: {set(PHONEME_FEATURES.keys()) - _expected}"
+)
+assert NUM_PHONEMES == 39, f"Expected 39 phonemes, got {NUM_PHONEMES}"
+def phoneme_to_feature_vector(phoneme: str) -> list[bool]:
+    """Return a binary list of length 35 for a given phoneme."""
+    feat_dict = PHONEME_FEATURES.get(phoneme, PHONEME_FEATURES["sil"])
+    return [feat_dict[f] for f in PHONOLOGICAL_FEATURES]
+def phoneme_sequence_to_feature_sequences(
+    phonemes: list[str],
+) -> list[list[int]]:
+    """
+    Convert a phoneme sequence to N=35 binary label sequences.
+    Returns:
+        feature_seqs: list of 35 lists, each containing +att(1) or -att(0)
+                      integers for each phoneme position.
+    """
+    feature_seqs = [[] for _ in range(NUM_FEATURES)]
+    for ph in phonemes:
+        vec = phoneme_to_feature_vector(ph)
+        for feat_idx, present in enumerate(vec):
+            feature_seqs[feat_idx].append(1 if present else 0)
+    return feature_seqs
+def feature_sequences_to_ctc_labels(
+    feature_seqs: list[list[int]],
+) -> list[list[int]]:
+    """
+    Convert binary feature sequences (0/1) to CTC label indices.
+    For category i:
+      - +att  →  node index i          (feature_idx_to_pos_node)
+      - -att  →  node index i + 35     (feature_idx_to_neg_node)
+    Returns:
+        ctc_labels: list of 35 lists of node indices (int)
+    """
+    ctc_labels = []
+    for feat_idx, seq in enumerate(feature_seqs):
+        label_seq = []
+        for val in seq:
+            if val == 1:
+                label_seq.append(feature_idx_to_pos_node(feat_idx))
+            else:
+                label_seq.append(feature_idx_to_neg_node(feat_idx))
+        ctc_labels.append(label_seq)
+    return ctc_labels

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# Core
+gradio>=4.0.0
+numpy>=1.24.0
+scipy>=1.10.0
+# Model
+torch>=2.0.0
+transformers>=4.40.0
+huggingface_hub>=0.20.0
+# Audio
+librosa>=0.10.0
+soundfile>=0.12.0
+# Optional LLM rewriter
+accelerate>=0.27.0
+httpx>=0.25.0

wav2vec2_phonological.py ADDED Viewed

	@@ -0,0 +1,267 @@

+"""
+wav2vec2_phonological.py
+========================
+Phonological feature detection model strictly following Figure 2 of
+Shahin et al. (Speech Communication, 2025).
+Architecture (Fig. 2):
+    Raw Speech
+        │
+        ▼
+    wav2vec2.0 (pre-trained, CNN encoder frozen)
+        ├─ CNN Feature Extractor   [FROZEN]
+        └─ Transformer             [FINE-TUNED]
+        │
+        ▼
+    Linear Layer  (hidden_size → 71 nodes)
+        │
+        ▼
+    SCTC-SB Loss (during training)
+    OR
+    argmax per category (during inference)
+Output nodes:
+    0..34   → +att_i  (presence of feature i)
+    35..69  → -att_i  (absence of feature i)
+    70      → shared blank
+"""
+import torch
+import torch.nn as nn
+from transformers import Wav2Vec2Model
+from typing import Optional
+from phonological_features import NUM_FEATURES, NUM_OUTPUT_NODES, BLANK_IDX
+class PhonologicalWav2Vec2(nn.Module):
+    """
+    wav2vec2-based phonological feature detection model.
+    Args:
+        pretrained_model_name (str): HuggingFace model ID.
+            Paper uses 'facebook/wav2vec2-large-robust' (best performing).
+        num_output_nodes (int): 71 = 35(+att) + 35(-att) + 1(blank).
+        freeze_cnn_encoder (bool): Whether to freeze CNN feature extractor.
+            Paper Section 5.2: "its parameters were fixed during fine-tuning".
+    """
+    def __init__(
+        self,
+        pretrained_model_name: str = "facebook/wav2vec2-large-robust",
+        num_output_nodes: int = NUM_OUTPUT_NODES,
+        freeze_cnn_encoder: bool = True,
+    ):
+        super().__init__()
+        self.num_output_nodes = num_output_nodes
+        self.num_features = NUM_FEATURES
+        self.blank_idx = BLANK_IDX
+        # ── Load pre-trained wav2vec2 ─────────────────────────────────────
+        print(f"[PhonologicalWav2Vec2] Loading '{pretrained_model_name}' ...")
+        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
+        # ── Freeze CNN encoder (feature extractor) ────────────────────────
+        # Paper: "Except for the CNN encoder layer, the whole network was
+        # then fine-tuned"
+        if freeze_cnn_encoder:
+            self.wav2vec2.feature_extractor._freeze_parameters()
+            print("[PhonologicalWav2Vec2] CNN encoder FROZEN.")
+        # ── Linear projection head (Fig. 2) ──────────────────────────────
+        # "A linear layer was added on top of the transformer module with
+        #  number of nodes equals to the number of target phonological-features"
+        hidden_size = self.wav2vec2.config.hidden_size
+        self.classifier = nn.Linear(hidden_size, num_output_nodes)
+        print(f"[PhonologicalWav2Vec2] hidden_size={hidden_size}, "
+              f"output_nodes={num_output_nodes}")
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        apply_spec_augment: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass.
+        Paper Section 5.2: "SpecAugment was applied to the output of the
+        CNN encoder to add more variations to the training data."
+        Wav2Vec2Model natively applies SpecAugment between the CNN encoder
+        and the transformer when mask_time_indices is provided.
+        Returns:
+            logits        : (B, T_frames, 71)  raw logits
+            output_lengths: (B,)  number of valid frames per batch item
+        """
+        # Build mask_time_indices for SpecAugment if training
+        mask_time_indices = None
+        if apply_spec_augment and self.training:
+            # Compute output frame lengths to know valid T per item
+            if attention_mask is not None:
+                feat_lengths = self._get_feat_extract_output_lengths(
+                    attention_mask.sum(dim=1)
+                )
+            else:
+                B, T_audio = input_values.shape
+                ones = torch.ones(B, dtype=torch.long, device=input_values.device) * T_audio
+                feat_lengths = self._get_feat_extract_output_lengths(ones)
+            B = input_values.shape[0]
+            T = int(feat_lengths.max().item())
+            # Build boolean mask: mask up to 10% of valid frames per utterance
+            mask_time_indices = torch.zeros(B, T, dtype=torch.bool,
+                                            device=input_values.device)
+            t_len = max(1, int(T * 0.10))
+            for b in range(B):
+                valid = int(feat_lengths[b].item())
+                if valid > t_len:
+                    t0 = torch.randint(0, valid - t_len, (1,)).item()
+                    mask_time_indices[b, t0:t0 + t_len] = True
+        outputs = self.wav2vec2(
+            input_values=input_values,
+            attention_mask=attention_mask,
+            mask_time_indices=mask_time_indices,
+            output_hidden_states=False,
+        )
+        hidden_states = outputs.last_hidden_state  # (B, T, 1024)
+        logits = self.classifier(hidden_states)
+        if attention_mask is not None:
+            output_lengths = self._get_feat_extract_output_lengths(
+                attention_mask.sum(dim=1)
+            )
+        else:
+            B, T_audio = input_values.shape
+            ones = torch.ones(B, dtype=torch.long, device=input_values.device) * T_audio
+            output_lengths = self._get_feat_extract_output_lengths(ones)
+        return logits, output_lengths
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: torch.Tensor
+    ) -> torch.Tensor:
+        return self.wav2vec2._get_feat_extract_output_lengths(input_lengths)
+    @torch.no_grad()
+    def decode(
+        self,
+        logits: torch.Tensor,                          # (B, T, 71) or (T, 71) for single item
+        output_lengths: Optional[torch.Tensor] = None, # (B,) valid frame counts
+    ) -> list[list[list[int]]]:
+        """
+        Greedy CTC decoding per category.
+        For each feature category i, applies argmax over the 3-node slice
+        [pos_i, neg_i, blank] and collapses repeated labels + blanks.
+        Paper Section 3.3, Eq. 7:
+            h_i(x) = argmax_j y^t_{i,j}
+        Args:
+            logits        : (B, T, 71) raw model logits
+            output_lengths: (B,) number of valid (non-padded) frames per item.
+                            If None, all T frames are used (may include padding noise).
+        Returns:
+            decoded: [B][35]  list of decoded label sequences
+                     Each label sequence contains +att(True) or -att(False)
+        """
+        if logits.dim() == 2:
+            logits = logits.unsqueeze(0)
+        B, T, _ = logits.shape
+        decoded_batch = []
+        for b in range(B):
+            valid_T = T if output_lengths is None else int(output_lengths[b].item())
+            decoded_features = []
+            for feat_idx in range(self.num_features):
+                pos_node = feat_idx
+                neg_node = feat_idx + self.num_features
+                # Extract 3-node slice over valid frames only: (valid_T, 3)
+                cat_logits = torch.stack([
+                    logits[b, :valid_T, pos_node],
+                    logits[b, :valid_T, neg_node],
+                    logits[b, :valid_T, self.blank_idx],
+                ], dim=-1)  # (valid_T, 3)
+                # Argmax: 0=+att, 1=-att, 2=blank
+                preds = cat_logits.argmax(dim=-1)  # (valid_T,)
+                # CTC collapse: remove blanks and repeated labels
+                collapsed = []
+                prev = -1
+                for p in preds.tolist():
+                    if p == 2:       # blank
+                        prev = -1
+                        continue
+                    if p != prev:
+                        collapsed.append(p == 0)   # True=+att, False=-att
+                        prev = p
+                decoded_features.append(collapsed)
+            decoded_batch.append(decoded_features)
+        return decoded_batch
+    def count_parameters(self) -> dict:
+        """Count trainable vs frozen parameters."""
+        total = sum(p.numel() for p in self.parameters())
+        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        frozen = total - trainable
+        return {"total": total, "trainable": trainable, "frozen": frozen}
+# ─────────────────────────────────────────────────────────────────────────────
+# Phoneme-level baseline model (for comparison, paper Section 3)
+# Same architecture but with 40 output nodes (39 phonemes + 1 blank)
+# and standard CTC loss
+# ─────────────────────────────────────────────────────────────────────────────
+class PhonemeLevelWav2Vec2(nn.Module):
+    """
+    Phoneme-level MDD baseline (paper Section 3, Fig. 1 top branch).
+    Uses standard CTC with 39 phonemes + blank.
+    """
+    def __init__(
+        self,
+        pretrained_model_name: str = "facebook/wav2vec2-large-robust",
+        num_phonemes: int = 39,
+        freeze_cnn_encoder: bool = True,
+    ):
+        super().__init__()
+        self.num_phonemes = num_phonemes
+        self.blank_idx = num_phonemes   # index 39 = blank
+        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
+        if freeze_cnn_encoder:
+            self.wav2vec2.feature_extractor._freeze_parameters()
+        hidden_size = self.wav2vec2.config.hidden_size
+        # 40 nodes: 39 phonemes + 1 blank
+        self.classifier = nn.Linear(hidden_size, num_phonemes + 1)
+    def forward(self, input_values, attention_mask=None):
+        outputs = self.wav2vec2(
+            input_values=input_values,
+            attention_mask=attention_mask,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.classifier(hidden_states)
+        if attention_mask is not None:
+            output_lengths = self.wav2vec2._get_feat_extract_output_lengths(
+                attention_mask.sum(dim=1))
+        else:
+            B, T = input_values.shape
+            ones = torch.ones(B, dtype=torch.long, device=input_values.device) * T
+            output_lengths = self.wav2vec2._get_feat_extract_output_lengths(ones)
+        return logits, output_lengths