""" Pronunciation Coach — HuggingFace Space ======================================== 1. User types a normal English sentence 2. User records themselves saying it 3. App runs phonological model → 35 CTC feature sequences 4. MDD engine aligns them against canonical sequences → errors + score 5. Feedback generator returns coaching tips """ import os import re import json import torch import numpy as np import gradio as gr import librosa import pronouncing from huggingface_hub import snapshot_download from transformers import Wav2Vec2FeatureExtractor from wav2vec2_phonological import PhonologicalWav2Vec2 from mdd_engine import run_mdd from feedback_generator import generate_feedback from phonological_features import CMU_39_PHONEMES # ───────────────────────────────────────────────────────────────────────────── # Model globals # ───────────────────────────────────────────────────────────────────────────── _model = None _feature_extractor = None _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") PRETRAINED_BASE = "facebook/wav2vec2-large-robust" MODEL_REPO = os.environ.get("HF_MODEL_REPO", "Backlighteu/phonological-mdd") MODEL_FILENAME = os.environ.get("HF_MODEL_FILENAME", "best_model.pt") HF_TOKEN = os.environ.get("HF_TOKEN", None) def load_model(): global _model, _feature_extractor if _model is not None: return print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...") snapshot_download(repo_id=MODEL_REPO, token=HF_TOKEN, local_dir="./model_cache") model = PhonologicalWav2Vec2( pretrained_model_name=PRETRAINED_BASE, num_output_nodes=71, freeze_cnn_encoder=True, ) state_dict = torch.load("./model_cache/best_model.pt", map_location=_device) model.load_state_dict(state_dict) model.to(_device) model.eval() _model = model _feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE) print(f"[startup] Ready on {_device}.") # ───────────────────────────────────────────────────────────────────────────── # G2P — plain English → CMU-39 
# phonemes
# ─────────────────────────────────────────────────────────────────────────────

_CMU_39 = set(CMU_39_PHONEMES)

# Typographic apostrophes are mapped to the ASCII one used by the dictionary.
_APOSTROPHE_VARIANTS = str.maketrans({"\u2019": "'", "\u2018": "'"})
# Anything that is not a letter, apostrophe or whitespace separates words.
_WORD_SEPARATORS = re.compile(r"[^a-zA-Z'\s]")
# Stress markers appended to vowels in CMU pronunciations (e.g. "AH0").
_STRESS_DIGITS = re.compile(r"[0-9]")


def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
    """Convert an English sentence into a flat list of phoneme symbols.

    Each word is looked up with ``pronouncing.phones_for_word`` and the first
    pronunciation returned is used.  Stress digits are removed, symbols are
    lower-cased, and symbols that are not in ``CMU_39_PHONEMES`` are dropped.

    Apostrophes inside words are kept ("don't", "it's") so contractions can
    be found in the dictionary; every other non-letter character (hyphens,
    dashes, digits, punctuation) acts as a word separator.

    Args:
        sentence: Free text typed by the user.

    Returns:
        ``(phonemes, unknown)``: the phoneme symbols for all words that were
        found, in order, and the words that had no dictionary entry.
    """
    normalised = sentence.translate(_APOSTROPHE_VARIANTS)
    tokens = _WORD_SEPARATORS.sub(" ", normalised).split()

    phonemes, unknown = [], []
    for token in tokens:
        # Leading/trailing apostrophes are quotation marks, not part of the word.
        word = token.strip("'")
        if not word:
            continue
        pronunciations = pronouncing.phones_for_word(word.lower())
        if not pronunciations:
            unknown.append(word)
            continue
        for phone in pronunciations[0].split():
            symbol = _STRESS_DIGITS.sub("", phone).lower()
            if symbol in _CMU_39:
                phonemes.append(symbol)
    return phonemes, unknown


# ─────────────────────────────────────────────────────────────────────────────
# Audio inference
# ─────────────────────────────────────────────────────────────────────────────

# Audio is resampled to this rate on load and declared to the feature extractor.
_SAMPLE_RATE = 16000


def decode_audio(audio_path: str) -> list[list[int]]:
    """Run the phonological model on an audio file.

    The file is loaded as mono audio at 16 kHz, passed through the feature
    extractor and the model, and the first item of the model's decoded batch
    is returned with every value mapped to ``1`` (truthy) or ``0`` (falsy).

    Args:
        audio_path: Path to the recording or uploaded file.

    Returns:
        One list of 0/1 ints per decoded sequence (the module docstring
        describes these as 35 CTC feature sequences).

    Raises:
        ValueError: if the file contains no audio samples.
    """
    load_model()

    waveform, _ = librosa.load(audio_path, sr=_SAMPLE_RATE, mono=True)
    if waveform.size == 0:
        # Fail with a readable message instead of an opaque model error.
        raise ValueError("the recording contains no audio samples")

    inputs = _feature_extractor(
        waveform.astype(np.float32),
        sampling_rate=_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    input_values = inputs.input_values.to(_device)
    # The extractor may or may not return a mask; pass None through if absent.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(_device)

    with torch.no_grad():
        logits, output_lengths = _model(input_values, attention_mask, apply_spec_augment=False)
        # Only one utterance was submitted, so take batch item 0.
        decoded_35 = _model.decode(logits, output_lengths)[0]

    return [[1 if v else 0 for v in seq] for seq in decoded_35]


# ─────────────────────────────────────────────────────────────────────────────
# Main handler
# ─────────────────────────────────────────────────────────────────────────────

def process(audio_input, sentence_text, max_issues):
    """Gradio click handler: score a recording against the typed sentence.

    Pipeline: sentence → phonemes, audio → decoded feature sequences,
    ``run_mdd`` → result, ``generate_feedback`` → score and coaching text.
    Failures in any stage are reported as a message in the first output
    instead of being raised, so the UI always gets something to display.

    Args:
        audio_input: File path supplied by the audio component, or ``None``
            when nothing was recorded or uploaded.
        sentence_text: The sentence the user intended to say.
        max_issues: Maximum number of issues to include (slider value).

    Returns:
        ``(main_markdown, detail_markdown)`` for the two Markdown outputs.
    """
    if audio_input is None:
        return "⚠️ Please record or upload audio.", ""
    sentence = (sentence_text or "").strip()
    if not sentence:
        return "⚠️ Please type the sentence you want to practise.", ""

    # G2P
    target_phonemes, unknown = sentence_to_phonemes(sentence)
    if not target_phonemes:
        return "⚠️ Could not convert sentence to phonemes. Try simpler English words.", ""

    # Model inference
    try:
        actual_feature_seqs = decode_audio(audio_input)
    except Exception as e:
        return f"❌ Audio error: {e}", ""

    # MDD
    try:
        result = run_mdd(actual_feature_seqs=actual_feature_seqs, target_phonemes=target_phonemes)
    except Exception as e:
        return f"❌ MDD error: {e}", ""

    # Feedback — guarded like the stages above so a failure is shown to the user.
    try:
        feedback_dict = generate_feedback(result, use_llm=False, max_issues=int(max_issues))
    except Exception as e:
        return f"❌ Feedback error: {e}", ""

    score = feedback_dict["score"]
    main_out = f"**Score: {score}/100**\n\n" + feedback_dict["final_feedback"]
    if unknown:
        main_out += f"\n\n⚠️ Words not in dictionary (skipped): *{', '.join(unknown)}*"

    # Detail
    lines = []
    for e in feedback_dict["error_summary"]:
        tag = " *(deleted)*" if e.get("is_deletion") else ""
        lines.append(
            # Two trailing spaces before "\n" form a Markdown hard line break.
            f"**/{e['target']}/** pos {e['position']}{tag} — "
            f"{e['severity']}, {e['accuracy']:.0%} accurate  \n"
            f"Missing: {', '.join(e['missing_features']) or '—'} | "
            f"Extra: {', '.join(e['extra_features']) or '—'}"
        )
    detail_out = "\n\n".join(lines) if lines else "✅ No errors detected!"

    return main_out, detail_out


# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — clean and simple
# ─────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="Pronunciation Coach") as demo:
    gr.Markdown("# 🗣️ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            sentence_input = gr.Textbox(
                label="Sentence to practise",
                placeholder="The cat sat on the mat",
                lines=2,
            )
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Your speech",
            )
            max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues to show")
            submit_btn = gr.Button("Analyse", variant="primary")

        # Right column: results.
        with gr.Column(scale=2):
            feedback_out = gr.Markdown(label="Feedback")
            with gr.Accordion("Per-phoneme detail", open=False):
                detail_out = gr.Markdown()

    submit_btn.click(
        fn=process,
        inputs=[audio_input, sentence_input, max_issues],
        outputs=[feedback_out, detail_out],
    )

if __name__ == "__main__":
    # NOTE(review): passing `theme` to launch() needs a Gradio release whose
    # launch() accepts it — confirm against the pinned Gradio version.
    demo.launch(theme=gr.themes.Soft())