Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

App Files Community

heldtomaturity committed on 17 days ago

Commit

770a612

1 Parent(s): 0515ef3

add automatic G2P - users type normal English now

Browse files

Files changed (2) hide show

app.py +99 -107
requirements.txt +1 -7

app.py CHANGED Viewed

@@ -3,21 +3,24 @@ Mispronunciation Detection & Diagnosis — HuggingFace Space
 ===========================================================
 Wires together:
   1. PhonologicalWav2Vec2  (your best_model.pt, loaded once at cold start)
-  2. MDD engine            (per-feature NW alignment → errors + score)
-  3. Feedback generator    (rule engine + optional LLM rewriter)
-Environment variables to set in Space → Settings → Variables and secrets:
   HF_TOKEN          (secret)   — read token for your private model repo
   HF_MODEL_REPO     (variable) — e.g. "Backlighteu/phonological-mdd"
-  HF_MODEL_FILENAME (variable) — e.g. "best_model.pt"  (default)
 """
 import os
 import json
 import torch
 import numpy as np
 import gradio as gr
 import librosa
 from huggingface_hub import hf_hub_download, snapshot_download
 from transformers import Wav2Vec2FeatureExtractor
@@ -25,12 +28,10 @@ from transformers import Wav2Vec2FeatureExtractor
 from wav2vec2_phonological import PhonologicalWav2Vec2
 from mdd_engine import run_mdd
 from feedback_generator import generate_feedback
-from phonological_features import (
-    CMU_39_PHONEMES,
-)
 # ─────────────────────────────────────────────────────────────────────────────
-# 1.  Model — loaded once at cold start, reused for every request
 # ─────────────────────────────────────────────────────────────────────────────
 _model = None
@@ -45,12 +46,9 @@ HF_TOKEN        = os.environ.get("HF_TOKEN",          None)
 def load_model():
     global _model, _feature_extractor
     if _model is not None:
         return
-    # Download entire repo into ./model_cache once, then load from disk.
-    # hf_hub_download checks cache first — no re-download if already present.
     print(f"[startup] Caching {MODEL_REPO} to ./model_cache ...")
     snapshot_download(
         repo_id=MODEL_REPO,
@@ -65,7 +63,6 @@ def load_model():
         num_output_nodes=71,
         freeze_cnn_encoder=True,
     )
     state_dict = torch.load(weights_path, map_location=_device)
     model.load_state_dict(state_dict)
     model.to(_device)
@@ -73,29 +70,59 @@ def load_model():
     _model = model
     print(f"[startup] Model ready on {_device}.")
-    print(f"[startup] Loading feature extractor from '{PRETRAINED_BASE}' ...")
     _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)
     print("[startup] Feature extractor ready.")
 # ─────────────────────────────────────────────────────────────────────────────
-# 2.  Audio → decoded feature sequences
 # ─────────────────────────────────────────────────────────────────────────────
-TARGET_SR = 16_000
-def decode_audio(audio_path: str) -> list:
-    """
-    Load audio, run the phonological model, return CTC-decoded feature seqs.
-    Returns
-    -------
-    actual_feature_seqs : list of 35 lists of int (0 or 1)
-        CTC-decoded +att / -att sequence for each of the 35 features.
     """
     load_model()
-    waveform, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
     waveform = waveform.astype(np.float32)
     inputs = _feature_extractor(
@@ -112,154 +139,122 @@ def decode_audio(audio_path: str) -> list:
     with torch.no_grad():
         logits, output_lengths = _model(
-            input_values,
-            attention_mask,
-            apply_spec_augment=False,
         )
-    # model.decode() returns list[B][35][list[bool]]  — True=+att, False=-att
-    decoded_batch = _model.decode(logits, output_lengths)
-    decoded_35 = decoded_batch[0]   # [35][list[bool]]
-    # Convert bool → int (1/0)
-    actual_feature_seqs = [
-        [1 if v else 0 for v in feat_seq]
-        for feat_seq in decoded_35
-    ]
-    return actual_feature_seqs
-# ─────────────────────────────────────────────────────────────────────────────
-# 3.  Text → canonical phoneme sequence
-# ─────────────────────────────────────────────────────────────────────────────
-_VALID_PHONEMES = set(CMU_39_PHONEMES) | {"sil"}
-def parse_phoneme_input(text: str) -> list:
-    """
-    Accept space-separated CMU ARPAbet tokens typed by the user.
-    Unknown tokens are skipped with a warning.
-    """
-    tokens = text.lower().split()
-    valid, skipped = [], []
-    for t in tokens:
-        if t in _VALID_PHONEMES:
-            valid.append(t)
-        else:
-            skipped.append(t)
-    if skipped:
-        print(f"[warning] Unrecognised tokens skipped: {skipped}")
-    return valid if valid else ["sil"]
 # ─────────────────────────────────────────────────────────────────────────────
 # 4.  Gradio processing function
 # ─────────────────────────────────────────────────────────────────────────────
-def process(audio_input, script_text, use_llm, max_issues):
     if audio_input is None:
-        return "Please record or upload audio first.", "", "{}"
-    script_text = script_text.strip()
-    if not script_text:
         return (
-            "Please type the target sentence as ARPAbet phoneme tokens.\n"
-            "Example: `dh ae k ae t` for 'the cat'",
-            "", "{}",
         )
     try:
         actual_feature_seqs = decode_audio(audio_input)
     except Exception as e:
-        return f"Audio processing error: {e}", "", "{}"
-    target_phonemes = parse_phoneme_input(script_text)
     try:
         result = run_mdd(
             actual_feature_seqs=actual_feature_seqs,
             target_phonemes=target_phonemes,
         )
     except Exception as e:
-        return f"MDD engine error: {e}", "", "{}"
-    feedback_dict = generate_feedback(
-        result,
-        use_llm=use_llm,
-        max_issues=int(max_issues),
-    )
     score = feedback_dict["score"]
     main_feedback = (
-        f"**Pronunciation Score: {score}/100**\n\n"
         + feedback_dict["final_feedback"]
     )
-    detail_lines = ["### Per-phoneme detail\n"]
     for e in feedback_dict["error_summary"]:
-        deletion_tag = " *(deleted)*" if e.get("is_deletion") else ""
         detail_lines.append(
-            f"- **/{e['target']}/** (pos {e['position']}){deletion_tag}: "
             f"severity=`{e['severity']}`, accuracy={e['accuracy']:.0%}\n"
             f"  - Missing: {', '.join(e['missing_features']) or '—'}\n"
             f"  - Extra:   {', '.join(e['extra_features'])   or '—'}"
         )
     if not feedback_dict["error_summary"]:
-        detail_lines.append("No feature-level errors detected — great pronunciation!")
-    detail_text = "\n".join(detail_lines)
     json_output = json.dumps({
         "score":                feedback_dict["score"],
         "deletion_count":       result.deletion_count,
         "insertion_count":      result.insertion_count,
         "feature_error_counts": feedback_dict["feature_error_counts"],
-        "rules_triggered":      feedback_dict["rules_triggered"],
-        "target_phonemes":      target_phonemes,
         "actual_seq_lengths":   [len(s) for s in actual_feature_seqs],
     }, indent=2)
-    return main_feedback, detail_text, json_output
 # ─────────────────────────────────────────────────────────────────────────────
 # 5.  Gradio UI
 # ─────────────────────────────────────────────────────────────────────────────
-VALID_PHONEME_LIST = ", ".join(sorted(CMU_39_PHONEMES))
 with gr.Blocks(title="Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # Pronunciation Coach
-        Speak a sentence, type what you meant to say as **ARPAbet phoneme tokens**,
         and get phonological-feature-level feedback with articulation tips.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
-                label="Your speech",
-            )
-            script_input = gr.Textbox(
-                label="Target sentence — space-separated ARPAbet tokens",
-                placeholder="e.g.  dh ae k ae t   (= 'the cat')",
-                lines=2,
             )
-            with gr.Accordion("Valid phoneme tokens", open=False):
-                gr.Markdown(f"`{VALID_PHONEME_LIST}`")
             with gr.Row():
                 use_llm    = gr.Checkbox(value=False, label="LLM feedback rewriter")
                 max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues shown")
             submit_btn = gr.Button("Analyse", variant="primary")
         with gr.Column(scale=2):
-            feedback_out = gr.Markdown(label="Coaching feedback")
             with gr.Accordion("Per-phoneme detail", open=False):
                 detail_out = gr.Markdown()
             with gr.Accordion("Raw JSON (developers)", open=False):
@@ -267,18 +262,15 @@ with gr.Blocks(title="Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     submit_btn.click(
         fn=process,
-        inputs=[audio_input, script_input, use_llm, max_issues],
-        outputs=[feedback_out, detail_out, json_out],
     )
     gr.Markdown(
         """
         ---
-        **How to enter the target sentence:**
-        Convert your sentence to ARPAbet using the
-        [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict)
-        then paste the space-separated tokens here.
-        Example: *"the cat sat"* → `dh ax k ae t s ae t`
         """
     )

 ===========================================================
 Wires together:
   1. PhonologicalWav2Vec2  (your best_model.pt, loaded once at cold start)
+  2. G2P                   (user types normal English → auto-converted to ARPAbet)
+  3. MDD engine            (per-feature NW alignment → errors + score)
+  4. Feedback generator    (rule engine + optional LLM rewriter)
+Environment variables (Space → Settings → Variables and secrets):
   HF_TOKEN          (secret)   — read token for your private model repo
   HF_MODEL_REPO     (variable) — e.g. "Backlighteu/phonological-mdd"
+  HF_MODEL_FILENAME (variable) — e.g. "best_model.pt"
 """
 import os
+import re
 import json
 import torch
 import numpy as np
 import gradio as gr
 import librosa
+import pronouncing
 from huggingface_hub import hf_hub_download, snapshot_download
 from transformers import Wav2Vec2FeatureExtractor
 from wav2vec2_phonological import PhonologicalWav2Vec2
 from mdd_engine import run_mdd
 from feedback_generator import generate_feedback
+from phonological_features import CMU_39_PHONEMES
 # ─────────────────────────────────────────────────────────────────────────────
+# 1.  Model — loaded once, reused for every request
 # ─────────────────────────────────────────────────────────────────────────────
 _model = None
 def load_model():
     global _model, _feature_extractor
     if _model is not None:
         return
     print(f"[startup] Caching {MODEL_REPO} to ./model_cache ...")
     snapshot_download(
         repo_id=MODEL_REPO,
         num_output_nodes=71,
         freeze_cnn_encoder=True,
     )
     state_dict = torch.load(weights_path, map_location=_device)
     model.load_state_dict(state_dict)
     model.to(_device)
     _model = model
     print(f"[startup] Model ready on {_device}.")
+    print(f"[startup] Loading feature extractor ...")
     _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)
     print("[startup] Feature extractor ready.")
 # ─────────────────────────────────────────────────────────────────────────────
+# 2.  G2P — normal English words → CMU-39 ARPAbet phonemes
 # ─────────────────────────────────────────────────────────────────────────────
+_CMU_39 = set(CMU_39_PHONEMES)
_STRESS_DIGITS = re.compile(r"[0-9]")


def _word_to_phonemes(word: str) -> list[str] | None:
    """Convert one English word to stress-free, lower-case CMU-39 phonemes.

    The lower-cased word is looked up with ``pronouncing.phones_for_word``
    and the first pronunciation returned is used.  Stress digits
    (e.g. ``AH0`` -> ``ah``) are removed, and any symbol that is not a
    member of ``_CMU_39`` is dropped.

    Parameters
    ----------
    word : str
        A single word; case is ignored.

    Returns
    -------
    list[str] | None
        ``None`` when the lookup yields no pronunciation at all; otherwise
        the filtered phoneme list (which may be empty if every symbol was
        filtered out).
    """
    pronunciations = pronouncing.phones_for_word(word.lower())
    if not pronunciations:
        return None

    # Only the first listed pronunciation is used.
    # Normalise each symbol exactly once, then filter on the normalised form.
    normalised = (
        _STRESS_DIGITS.sub("", symbol).lower()
        for symbol in pronunciations[0].split()
    )
    return [phoneme for phoneme in normalised if phoneme in _CMU_39]
def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
    """Convert a plain English sentence to a flat CMU-39 phoneme list.

    Tokenisation rules:
      * Apostrophes inside a word are kept, so contractions and possessives
        ("don't", "it's") are looked up in their dictionary spelling.
      * Every other non-letter character (digits, commas, hyphens, ...) acts
        as a word separator, so "well-known" or "cat,sat" are not fused into
        a single unknown token.
      * If the apostrophe spelling has no pronunciation, the spelling with
        apostrophes removed is tried as a fallback.

    Parameters
    ----------
    sentence : str
        Free-form English text typed by the user.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(phonemes, unknown_words)``.  Words for which no phonemes could be
        produced are skipped and reported in ``unknown_words``.
    """
    # Treat the typographic apostrophe like the ASCII one before filtering.
    cleaned = sentence.replace("\u2019", "'")
    # Anything that is not a letter, apostrophe or whitespace separates words.
    cleaned = re.sub(r"[^a-zA-Z'\s]", " ", cleaned)

    all_phonemes: list[str] = []
    unknown: list[str] = []
    for token in cleaned.split():
        # Drop quote marks wrapped around a word ('hello' -> hello).
        word = token.strip("'")
        if not word:
            continue

        phones = _word_to_phonemes(word)
        if not phones and "'" in word:
            # Fall back to the apostrophe-less spelling.
            phones = _word_to_phonemes(word.replace("'", ""))

        if phones:
            all_phonemes.extend(phones)
        else:
            unknown.append(word)
    return all_phonemes, unknown
+# ─────────────────────────────────────────────────────────────────────────────
+# 3.  Audio → decoded feature sequences
+# ─────────────────────────────────────────────────────────────────────────────
+TARGET_SR = 16_000
+def decode_audio(audio_path: str) -> list[list[int]]:
     load_model()
+    waveform, _ = librosa.load(audio_path, sr=TARGET_SR, mono=True)
     waveform = waveform.astype(np.float32)
     inputs = _feature_extractor(
     with torch.no_grad():
         logits, output_lengths = _model(
+            input_values, attention_mask, apply_spec_augment=False
         )
+    # decode() returns list[B][35][list[bool]]
+    decoded_35 = _model.decode(logits, output_lengths)[0]
+    return [[1 if v else 0 for v in seq] for seq in decoded_35]
 # ─────────────────────────────────────────────────────────────────────────────
 # 4.  Gradio processing function
 # ─────────────────────────────────────────────────────────────────────────────
def process(audio_input, sentence_text, use_llm, max_issues):
    """Run the full pronunciation-coaching pipeline for one Gradio request.

    Stages: input validation -> G2P (``sentence_to_phonemes``) -> audio
    inference (``decode_audio``) -> MDD (``run_mdd``) -> feedback
    (``generate_feedback``).  A failure in any stage is reported to the user
    as a message in the first output instead of raising.

    Parameters
    ----------
    audio_input
        Value of the audio component; ``None`` when nothing was recorded or
        uploaded.  Passed unchanged to ``decode_audio``.
    sentence_text
        The sentence the user intended to say, in plain English.
    use_llm
        Forwarded to ``generate_feedback`` as ``use_llm``.
    max_issues
        Forwarded to ``generate_feedback`` as ``max_issues`` after ``int()``.

    Returns
    -------
    tuple[str, str, str, str]
        ``(main_feedback, phoneme_display, detail_text, json_output)``.
        On any error the last three items are ``""``, ``""`` and ``"{}"``.
    """
    if audio_input is None:
        return "Please record or upload audio first.", "", "", "{}"

    # The textbox value may arrive as None (e.g. from an API client).
    sentence_text = (sentence_text or "").strip()
    if not sentence_text:
        return "Please type the sentence you want to practise.", "", "", "{}"

    # G2P conversion
    target_phonemes, unknown_words = sentence_to_phonemes(sentence_text)
    if not target_phonemes:
        return (
            "Could not convert the sentence to phonemes. "
            "Please use common English words.",
            "", "", "{}",
        )

    phoneme_display = " ".join(target_phonemes)
    unknown_msg = ""
    if unknown_words:
        unknown_msg = f"\n\n⚠️ Words not found in dictionary (skipped): *{', '.join(unknown_words)}*"

    # Audio inference
    try:
        actual_feature_seqs = decode_audio(audio_input)
    except Exception as e:
        return f"Audio processing error: {e}", "", "", "{}"

    # MDD
    try:
        result = run_mdd(
            actual_feature_seqs=actual_feature_seqs,
            target_phonemes=target_phonemes,
        )
    except Exception as e:
        return f"MDD engine error: {e}", "", "", "{}"

    # Feedback — guarded like the stages above so a failure becomes a message.
    try:
        feedback_dict = generate_feedback(
            result, use_llm=use_llm, max_issues=int(max_issues)
        )
    except Exception as e:
        return f"Feedback generation error: {e}", "", "", "{}"

    score = feedback_dict["score"]
    main_feedback = (
        f"**Score: {score}/100**{unknown_msg}\n\n"
        + feedback_dict["final_feedback"]
    )

    # Per-phoneme detail
    detail_lines = ["### Per-phoneme breakdown\n"]
    for e in feedback_dict["error_summary"]:
        del_tag = " *(deleted)*" if e.get("is_deletion") else ""
        detail_lines.append(
            f"- **/{e['target']}/** (position {e['position']}){del_tag}: "
            f"severity=`{e['severity']}`, accuracy={e['accuracy']:.0%}\n"
            f"  - Missing: {', '.join(e['missing_features']) or '—'}\n"
            f"  - Extra:   {', '.join(e['extra_features'])   or '—'}"
        )
    if not feedback_dict["error_summary"]:
        detail_lines.append("✅ No errors detected — great pronunciation!")

    json_output = json.dumps({
        "score":                feedback_dict["score"],
        "target_phonemes":      target_phonemes,
        "deletion_count":       result.deletion_count,
        "insertion_count":      result.insertion_count,
        "feature_error_counts": feedback_dict["feature_error_counts"],
        "actual_seq_lengths":   [len(s) for s in actual_feature_seqs],
    }, indent=2)

    return main_feedback, phoneme_display, "\n".join(detail_lines), json_output
 # ─────────────────────────────────────────────────────────────────────────────
 # 5.  Gradio UI
 # ─────────────────────────────────────────────────────────────────────────────
 with gr.Blocks(title="Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # 🗣️ Pronunciation Coach
+        Type a sentence in plain English, record yourself saying it,
         and get phonological-feature-level feedback with articulation tips.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
+            sentence_input = gr.Textbox(
+                label="Sentence to practise",
+                placeholder="e.g.  The cat sat on the mat",
+                lines=2,
+            )
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
+                label="Your speech — record or upload",
             )
             with gr.Row():
                 use_llm    = gr.Checkbox(value=False, label="LLM feedback rewriter")
                 max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues shown")
             submit_btn = gr.Button("Analyse", variant="primary")
         with gr.Column(scale=2):
+            feedback_out  = gr.Markdown(label="Coaching feedback")
+            phoneme_out   = gr.Textbox(label="Auto-detected phonemes", interactive=False)
             with gr.Accordion("Per-phoneme detail", open=False):
                 detail_out = gr.Markdown()
             with gr.Accordion("Raw JSON (developers)", open=False):
     submit_btn.click(
         fn=process,
+        inputs=[audio_input, sentence_input, use_llm, max_issues],
+        outputs=[feedback_out, phoneme_out, detail_out, json_out],
     )
     gr.Markdown(
         """
         ---
+        Just type any English sentence and hit **Analyse** — the app converts
+        it to phonemes automatically using the CMU Pronouncing Dictionary.
         """
     )

requirements.txt CHANGED Viewed

@@ -1,17 +1,11 @@
-# Core
 gradio>=4.0.0
 numpy>=1.24.0
 scipy>=1.10.0
-# Model
 torch>=2.0.0
 transformers>=4.40.0
 huggingface_hub>=0.20.0
-# Audio
 librosa>=0.10.0
 soundfile>=0.12.0
-# Optional LLM rewriter
 accelerate>=0.27.0
 httpx>=0.25.0

 gradio>=4.0.0
 numpy>=1.24.0
 scipy>=1.10.0
 torch>=2.0.0
 transformers>=4.40.0
 huggingface_hub>=0.20.0
 librosa>=0.10.0
 soundfile>=0.12.0
 accelerate>=0.27.0
 httpx>=0.25.0
+pronouncing>=0.2.0