Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

App Files Community

heldtomaturity committed 26 days ago

Commit

9aa0b19

1 Parent(s): 770a612

fix produced_phoneme AttributeError

Browse files

Files changed (2) hide show

app.py +65 -148
feedback_generator.py +1 -1

app.py CHANGED Viewed

@@ -1,16 +1,11 @@
 """
-Mispronunciation Detection & Diagnosis — HuggingFace Space
-===========================================================
-Wires together:
-  1. PhonologicalWav2Vec2  (your best_model.pt, loaded once at cold start)
-  2. G2P                   (user types normal English → auto-converted to ARPAbet)
-  3. MDD engine            (per-feature NW alignment → errors + score)
-  4. Feedback generator    (rule engine + optional LLM rewriter)
-Environment variables (Space → Settings → Variables and secrets):
-  HF_TOKEN          (secret)   — read token for your private model repo
-  HF_MODEL_REPO     (variable) — e.g. "Backlighteu/phonological-mdd"
-  HF_MODEL_FILENAME (variable) — e.g. "best_model.pt"
 """
 import os
@@ -22,7 +17,7 @@ import gradio as gr
 import librosa
 import pronouncing
-from huggingface_hub import hf_hub_download, snapshot_download
 from transformers import Wav2Vec2FeatureExtractor
 from wav2vec2_phonological import PhonologicalWav2Vec2
@@ -31,7 +26,7 @@ from feedback_generator import generate_feedback
 from phonological_features import CMU_39_PHONEMES
 # ─────────────────────────────────────────────────────────────────────────────
-# 1.  Model — loaded once, reused for every request
 # ─────────────────────────────────────────────────────────────────────────────
 _model = None
@@ -49,231 +44,153 @@ def load_model():
     if _model is not None:
         return
-    print(f"[startup] Caching {MODEL_REPO} to ./model_cache ...")
-    snapshot_download(
-        repo_id=MODEL_REPO,
-        token=HF_TOKEN,
-        local_dir="./model_cache",
-    )
-    weights_path = "./model_cache/best_model.pt"
-    print(f"[startup] Loading weights from {weights_path}")
     model = PhonologicalWav2Vec2(
         pretrained_model_name=PRETRAINED_BASE,
         num_output_nodes=71,
         freeze_cnn_encoder=True,
     )
-    state_dict = torch.load(weights_path, map_location=_device)
     model.load_state_dict(state_dict)
     model.to(_device)
     model.eval()
     _model = model
-    print(f"[startup] Model ready on {_device}.")
-    print(f"[startup] Loading feature extractor ...")
     _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)
-    print("[startup] Feature extractor ready.")
 # ─────────────────────────────────────────────────────────────────────────────
-# 2.  G2P — normal English words → CMU-39 ARPAbet phonemes
 # ─────────────────────────────────────────────────────────────────────────────
 _CMU_39 = set(CMU_39_PHONEMES)
-def _word_to_phonemes(word: str) -> list[str] | None:
-    """Convert one word to CMU-39 phonemes using the bundled CMU dict."""
-    results = pronouncing.phones_for_word(word.lower())
-    if not results:
-        return None
-    phones = results[0].split()   # take first (most common) pronunciation
-    return [
-        re.sub(r"[0-9]", "", p).lower()   # strip stress digits
-        for p in phones
-        if re.sub(r"[0-9]", "", p).lower() in _CMU_39
-    ]
 def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
-    """
-    Convert a plain English sentence to a CMU-39 phoneme list.
-    Returns (phonemes, unknown_words).
-    Unknown words (not in CMU dict) are skipped and reported separately.
-    """
     words = re.sub(r"[^a-zA-Z\s]", "", sentence).split()
-    all_phonemes, unknown = [], []
     for word in words:
-        phones = _word_to_phonemes(word)
-        if phones:
-            all_phonemes.extend(phones)
         else:
             unknown.append(word)
-    return all_phonemes, unknown
 # ─────────────────────────────────────────────────────────────────────────────
-# 3.  Audio → decoded feature sequences
 # ─────────────────────────────────────────────────────────────────────────────
-TARGET_SR = 16_000
 def decode_audio(audio_path: str) -> list[list[int]]:
     load_model()
-    waveform, _ = librosa.load(audio_path, sr=TARGET_SR, mono=True)
-    waveform = waveform.astype(np.float32)
     inputs = _feature_extractor(
-        waveform,
-        sampling_rate=TARGET_SR,
-        return_tensors="pt",
-        padding=True,
     )
     input_values   = inputs.input_values.to(_device)
     attention_mask = inputs.get("attention_mask")
     if attention_mask is not None:
         attention_mask = attention_mask.to(_device)
     with torch.no_grad():
-        logits, output_lengths = _model(
-            input_values, attention_mask, apply_spec_augment=False
-        )
-    # decode() returns list[B][35][list[bool]]
     decoded_35 = _model.decode(logits, output_lengths)[0]
     return [[1 if v else 0 for v in seq] for seq in decoded_35]
 # ─────────────────────────────────────────────────────────────────────────────
-# 4.  Gradio processing function
 # ─────────────────────────────────────────────────────────────────────────────
-def process(audio_input, sentence_text, use_llm, max_issues):
     if audio_input is None:
-        return "Please record or upload audio first.", "", "", "{}"
-    sentence_text = sentence_text.strip()
-    if not sentence_text:
-        return "Please type the sentence you want to practise.", "", "", "{}"
-    # G2P conversion
-    target_phonemes, unknown_words = sentence_to_phonemes(sentence_text)
     if not target_phonemes:
-        return (
-            "Could not convert the sentence to phonemes. "
-            "Please use common English words.",
-            "", "", "{}",
-        )
-    phoneme_display = " ".join(target_phonemes)
-    unknown_msg = ""
-    if unknown_words:
-        unknown_msg = f"\n\n⚠️ Words not found in dictionary (skipped): *{', '.join(unknown_words)}*"
-    # Audio inference
     try:
         actual_feature_seqs = decode_audio(audio_input)
     except Exception as e:
-        return f"Audio processing error: {e}", "", "", "{}"
     # MDD
     try:
-        result = run_mdd(
-            actual_feature_seqs=actual_feature_seqs,
-            target_phonemes=target_phonemes,
-        )
     except Exception as e:
-        return f"MDD engine error: {e}", "", "", "{}"
     # Feedback
-    feedback_dict = generate_feedback(result, use_llm=use_llm, max_issues=int(max_issues))
     score = feedback_dict["score"]
-    main_feedback = (
-        f"**Score: {score}/100**{unknown_msg}\n\n"
-        + feedback_dict["final_feedback"]
-    )
-    # Per-phoneme detail
-    detail_lines = ["### Per-phoneme breakdown\n"]
     for e in feedback_dict["error_summary"]:
-        del_tag = " *(deleted)*" if e.get("is_deletion") else ""
-        detail_lines.append(
-            f"- **/{e['target']}/** (position {e['position']}){del_tag}: "
-            f"severity=`{e['severity']}`, accuracy={e['accuracy']:.0%}\n"
-            f"  - Missing: {', '.join(e['missing_features']) or '—'}\n"
-            f"  - Extra:   {', '.join(e['extra_features'])   or '—'}"
         )
-    if not feedback_dict["error_summary"]:
-        detail_lines.append("✅ No errors detected — great pronunciation!")
-    json_output = json.dumps({
-        "score":                feedback_dict["score"],
-        "target_phonemes":      target_phonemes,
-        "deletion_count":       result.deletion_count,
-        "insertion_count":      result.insertion_count,
-        "feature_error_counts": feedback_dict["feature_error_counts"],
-        "actual_seq_lengths":   [len(s) for s in actual_feature_seqs],
-    }, indent=2)
-    return main_feedback, phoneme_display, "\n".join(detail_lines), json_output
 # ─────────────────────────────────────────────────────────────────────────────
-# 5.  Gradio UI
 # ─────────────────────────────────────────────────────────────────────────────
-with gr.Blocks(title="Pronunciation Coach", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # 🗣️ Pronunciation Coach
-        Type a sentence in plain English, record yourself saying it,
-        and get phonological-feature-level feedback with articulation tips.
-        """
-    )
     with gr.Row():
         with gr.Column(scale=1):
             sentence_input = gr.Textbox(
                 label="Sentence to practise",
-                placeholder="e.g.  The cat sat on the mat",
                 lines=2,
             )
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
-                label="Your speech — record or upload",
             )
-            with gr.Row():
-                use_llm    = gr.Checkbox(value=False, label="LLM feedback rewriter")
-                max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues shown")
             submit_btn = gr.Button("Analyse", variant="primary")
         with gr.Column(scale=2):
-            feedback_out  = gr.Markdown(label="Coaching feedback")
-            phoneme_out   = gr.Textbox(label="Auto-detected phonemes", interactive=False)
             with gr.Accordion("Per-phoneme detail", open=False):
                 detail_out = gr.Markdown()
-            with gr.Accordion("Raw JSON (developers)", open=False):
-                json_out = gr.Code(language="json")
     submit_btn.click(
         fn=process,
-        inputs=[audio_input, sentence_input, use_llm, max_issues],
-        outputs=[feedback_out, phoneme_out, detail_out, json_out],
-    )
-    gr.Markdown(
-        """
-        ---
-        Just type any English sentence and hit **Analyse** — the app converts
-        it to phonemes automatically using the CMU Pronouncing Dictionary.
-        """
     )
 if __name__ == "__main__":
-    demo.launch()

 """
+Pronunciation Coach — HuggingFace Space
+========================================
+1. User types a normal English sentence
+2. User records themselves saying it
+3. App runs phonological model → 35 CTC feature sequences
+4. MDD engine aligns them against canonical sequences → errors + score
+5. Feedback generator returns coaching tips
 """
 import os
 import librosa
 import pronouncing
+from huggingface_hub import snapshot_download
 from transformers import Wav2Vec2FeatureExtractor
 from wav2vec2_phonological import PhonologicalWav2Vec2
 from phonological_features import CMU_39_PHONEMES
 # ─────────────────────────────────────────────────────────────────────────────
+# Model globals
 # ─────────────────────────────────────────────────────────────────────────────
 _model = None
     if _model is not None:
         return
+    print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...")
+    snapshot_download(repo_id=MODEL_REPO, token=HF_TOKEN, local_dir="./model_cache")
     model = PhonologicalWav2Vec2(
         pretrained_model_name=PRETRAINED_BASE,
         num_output_nodes=71,
         freeze_cnn_encoder=True,
     )
+    state_dict = torch.load("./model_cache/best_model.pt", map_location=_device)
     model.load_state_dict(state_dict)
     model.to(_device)
     model.eval()
     _model = model
     _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)
+    print(f"[startup] Ready on {_device}.")
 # ─────────────────────────────────────────────────────────────────────────────
+# G2P — plain English → CMU-39 phonemes
 # ─────────────────────────────────────────────────────────────────────────────
 _CMU_39 = set(CMU_39_PHONEMES)
 def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
     words = re.sub(r"[^a-zA-Z\s]", "", sentence).split()
+    phonemes, unknown = [], []
     for word in words:
+        results = pronouncing.phones_for_word(word.lower())
+        if results:
+            for p in results[0].split():
+                p = re.sub(r"[0-9]", "", p).lower()
+                if p in _CMU_39:
+                    phonemes.append(p)
         else:
             unknown.append(word)
+    return phonemes, unknown
 # ─────────────────────────────────────────────────────────────────────────────
+# Audio inference
 # ─────────────────────────────────────────────────────────────────────────────
 def decode_audio(audio_path: str) -> list[list[int]]:
     load_model()
+    waveform, _ = librosa.load(audio_path, sr=16000, mono=True)
     inputs = _feature_extractor(
+        waveform.astype(np.float32), sampling_rate=16000,
+        return_tensors="pt", padding=True,
     )
     input_values   = inputs.input_values.to(_device)
     attention_mask = inputs.get("attention_mask")
     if attention_mask is not None:
         attention_mask = attention_mask.to(_device)
     with torch.no_grad():
+        logits, output_lengths = _model(input_values, attention_mask,
+                                        apply_spec_augment=False)
     decoded_35 = _model.decode(logits, output_lengths)[0]
     return [[1 if v else 0 for v in seq] for seq in decoded_35]
 # ─────────────────────────────────────────────────────────────────────────────
+# Main handler
 # ─────────────────────────────────────────────────────────────────────────────
+def process(audio_input, sentence_text, max_issues):
     if audio_input is None:
+        return "⚠️ Please record or upload audio.", ""
+    if not sentence_text.strip():
+        return "⚠️ Please type the sentence you want to practise.", ""
+    # G2P
+    target_phonemes, unknown = sentence_to_phonemes(sentence_text.strip())
     if not target_phonemes:
+        return "⚠️ Could not convert sentence to phonemes. Try simpler English words.", ""
+    # Model inference
     try:
         actual_feature_seqs = decode_audio(audio_input)
     except Exception as e:
+        return f"❌ Audio error: {e}", ""
     # MDD
     try:
+        result = run_mdd(actual_feature_seqs=actual_feature_seqs,
+                         target_phonemes=target_phonemes)
     except Exception as e:
+        return f"❌ MDD error: {e}", ""
     # Feedback
+    feedback_dict = generate_feedback(result, use_llm=False, max_issues=int(max_issues))
     score = feedback_dict["score"]
+    main_out = f"**Score: {score}/100**\n\n" + feedback_dict["final_feedback"]
+    if unknown:
+        main_out += f"\n\n⚠️ Words not in dictionary (skipped): *{', '.join(unknown)}*"
+    # Detail
+    lines = []
     for e in feedback_dict["error_summary"]:
+        tag = " *(deleted)*" if e.get("is_deletion") else ""
+        lines.append(
+            f"**/{e['target']}/** pos {e['position']}{tag} — "
+            f"{e['severity']}, {e['accuracy']:.0%} accurate  \n"
+            f"Missing: {', '.join(e['missing_features']) or '—'} | "
+            f"Extra: {', '.join(e['extra_features']) or '—'}"
         )
+    detail_out = "\n\n".join(lines) if lines else "✅ No errors detected!"
+    return main_out, detail_out
 # ─────────────────────────────────────────────────────────────────────────────
+# Gradio UI — clean and simple
 # ─────────────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="Pronunciation Coach") as demo:
+    gr.Markdown("# 🗣️ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")
     with gr.Row():
         with gr.Column(scale=1):
             sentence_input = gr.Textbox(
                 label="Sentence to practise",
+                placeholder="The cat sat on the mat",
                 lines=2,
             )
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
+                label="Your speech",
             )
+            max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues to show")
             submit_btn = gr.Button("Analyse", variant="primary")
         with gr.Column(scale=2):
+            feedback_out = gr.Markdown(label="Feedback")
             with gr.Accordion("Per-phoneme detail", open=False):
                 detail_out = gr.Markdown()
     submit_btn.click(
         fn=process,
+        inputs=[audio_input, sentence_input, max_issues],
+        outputs=[feedback_out, detail_out],
     )
 if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Soft())

feedback_generator.py CHANGED Viewed

@@ -664,7 +664,7 @@ def generate_feedback(
         {
             "position": e.position,
             "target": e.target_phoneme,
-            "produced": e.produced_phoneme,
             "missing_features": e.missing_features,
             "extra_features": e.extra_features,
             "accuracy": round(e.feature_accuracy, 3),

         {
             "position": e.position,
             "target": e.target_phoneme,
+            "is_deletion": e.is_deletion,
             "missing_features": e.missing_features,
             "extra_features": e.extra_features,
             "accuracy": round(e.feature_accuracy, 3),