Spaces:
Sleeping
Sleeping
| """ | |
| Pronunciation Coach — HuggingFace Space | |
| ======================================== | |
| 1. User types a normal English sentence | |
| 2. User records themselves saying it | |
| 3. App runs phonological model → 35 CTC feature sequences | |
| 4. MDD engine aligns them against canonical sequences → errors + score | |
| 5. Feedback generator returns coaching tips | |
| """ | |
| import os | |
| import re | |
| import json | |
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| import librosa | |
| import pronouncing | |
| from huggingface_hub import snapshot_download | |
| from transformers import Wav2Vec2FeatureExtractor | |
| from wav2vec2_phonological import PhonologicalWav2Vec2 | |
| from mdd_engine import run_mdd | |
| from feedback_generator import generate_feedback | |
| from phonological_features import CMU_39_PHONEMES | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Model globals | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Lazily-initialised singletons; load_model() fills them in on first use.
_model = None
_feature_extractor = None

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Base checkpoint used both for the model backbone and the feature extractor.
PRETRAINED_BASE = "facebook/wav2vec2-large-robust"

# Fine-tuned weights location; each value can be overridden via environment.
MODEL_REPO = os.getenv("HF_MODEL_REPO", "Backlighteu/phonological-mdd")
MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_model.pt")
HF_TOKEN = os.getenv("HF_TOKEN")
def load_model():
    """Load the phonological model and feature extractor once per process.

    Downloads the ``MODEL_REPO`` snapshot into ``./model_cache``, builds a
    ``PhonologicalWav2Vec2`` on top of ``PRETRAINED_BASE``, loads the
    ``MODEL_FILENAME`` checkpoint into it, and publishes the result through
    the module globals ``_model`` and ``_feature_extractor``.  Once both are
    set, further calls return immediately.

    Raises:
        FileNotFoundError: if ``MODEL_FILENAME`` is not present in the
            downloaded snapshot.
    """
    global _model, _feature_extractor
    if _model is not None and _feature_extractor is not None:
        return

    print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...")
    snapshot_dir = snapshot_download(
        repo_id=MODEL_REPO, token=HF_TOKEN, local_dir="./model_cache"
    )

    # Honour the configured filename instead of a hard-coded "best_model.pt",
    # so the HF_MODEL_FILENAME override applies to loading as well as logging.
    checkpoint_path = os.path.join(snapshot_dir, MODEL_FILENAME)
    if not os.path.isfile(checkpoint_path):
        raise FileNotFoundError(
            f"Checkpoint {MODEL_FILENAME!r} not found in {MODEL_REPO} "
            f"(looked in {snapshot_dir})"
        )

    model = PhonologicalWav2Vec2(
        pretrained_model_name=PRETRAINED_BASE,
        # Must match the head size of the checkpoint being loaded.
        # Presumably 2 x 35 features + 1 CTC blank -- TODO confirm.
        num_output_nodes=71,
        freeze_cnn_encoder=True,
    )
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code.  If the checkpoint is a plain state dict, consider passing
    # weights_only=True, and only load from repositories you trust.
    state_dict = torch.load(checkpoint_path, map_location=_device)
    model.load_state_dict(state_dict)
    model.to(_device)
    model.eval()

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)

    # Publish both globals only after everything succeeded; otherwise a
    # failure while loading the feature extractor would leave _model set and
    # make later calls skip initialisation with _feature_extractor still None.
    _model = model
    _feature_extractor = feature_extractor
    print(f"[startup] Ready on {_device}.")
| # ───────────────────────────────────────────────────────────────────────────── | |
| # G2P — plain English → CMU-39 phonemes | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Set view of the CMU-39 phoneme inventory, used for fast membership tests
# when filtering dictionary pronunciations in sentence_to_phonemes().
_CMU_39 = set(CMU_39_PHONEMES)
def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
    """Convert an English sentence into a flat phoneme sequence.

    Words are looked up with ``pronouncing.phones_for_word``; the first
    listed pronunciation is used, stress digits are stripped, and symbols are
    lower-cased.  Symbols that are not in ``_CMU_39`` are dropped.

    Args:
        sentence: Free-form text typed by the user.

    Returns:
        ``(phonemes, unknown)`` where ``phonemes`` is the concatenated phoneme
        list for every word that was found, and ``unknown`` lists the words
        (in their original casing) that had no dictionary entry.
    """
    # Treat the typographic apostrophe like the ASCII one so contractions
    # pasted from word processors still match dictionary entries.
    normalised = sentence.replace("\u2019", "'")

    # A word is a run of letters, optionally joined by internal apostrophes
    # ("don't", "o'clock").  Everything else -- digits, punctuation, hyphens --
    # separates words instead of being deleted, so "well-known" yields two
    # words rather than the non-word "wellknown".
    words = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)*", normalised)

    phonemes: list[str] = []
    unknown: list[str] = []
    for word in words:
        pronunciations = pronouncing.phones_for_word(word.lower())
        if not pronunciations:
            unknown.append(word)
            continue
        # Only the first listed pronunciation is used.
        for phone in pronunciations[0].split():
            symbol = phone.rstrip("0123456789").lower()  # drop stress marker
            if symbol in _CMU_39:
                phonemes.append(symbol)
    return phonemes, unknown
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Audio inference | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def decode_audio(audio_path: str) -> list[list[int]]:
    """Run the model on an audio file and return its decoded feature sequences.

    The file is loaded as mono audio resampled to 16 kHz, passed through the
    feature extractor and the model, and decoded with ``_model.decode``.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The first batch item of ``_model.decode(...)`` with every value
        coerced to the integers 0 or 1.  Per the module docstring these are
        the 35 CTC feature sequences -- TODO confirm against the model code.

    Raises:
        ValueError: if the file contains no audio samples.
    """
    sample_rate = 16000

    load_model()
    waveform, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
    # Fail early with a readable message; an empty waveform would otherwise
    # surface as an opaque tensor-shape error from inside the model.
    if waveform.size == 0:
        raise ValueError("Recording contains no audio samples.")

    inputs = _feature_extractor(
        waveform.astype(np.float32),
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = inputs.input_values.to(_device)
    # The extractor may not return a mask; pass None through in that case.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(_device)

    with torch.no_grad():
        logits, output_lengths = _model(
            input_values, attention_mask, apply_spec_augment=False
        )
    # Single-file batch, so only the first decoded item is relevant.
    decoded_35 = _model.decode(logits, output_lengths)[0]
    return [[1 if value else 0 for value in seq] for seq in decoded_35]
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Main handler | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def _format_error_detail(errors):
    """Render per-phoneme error records as Markdown paragraphs."""
    paragraphs = []
    for err in errors:
        tag = " *(deleted)*" if err.get("is_deletion") else ""
        paragraphs.append(
            f"**/{err['target']}/** pos {err['position']}{tag} β "
            # Two trailing spaces before the newline form a Markdown hard line
            # break; a single space would merge both lines into one.
            f"{err['severity']}, {err['accuracy']:.0%} accurate  \n"
            f"Missing: {', '.join(err['missing_features']) or 'β'} | "
            f"Extra: {', '.join(err['extra_features']) or 'β'}"
        )
    return "\n\n".join(paragraphs)


def process(audio_input, sentence_text, max_issues):
    """Handle one "Analyse" click: G2P, model inference, MDD, feedback.

    Args:
        audio_input: Path of the recorded/uploaded audio, or ``None`` when the
            user supplied nothing.
        sentence_text: The sentence the user intended to say; ``None`` and
            blank strings are both treated as missing.
        max_issues: Maximum number of issues to include in the feedback
            (converted with ``int``).

    Returns:
        ``(main_markdown, detail_markdown)``.  On a validation or processing
        failure the first element is a user-facing message and the second is
        the empty string.
    """
    # NOTE(review): the symbols at the start of the user-facing messages below
    # (and the separators in the detail lines) look mis-encoded -- probably
    # emoji/dashes read with the wrong charset.  They are reproduced unchanged
    # here; confirm the file is saved and served as UTF-8.
    if audio_input is None:
        return "β οΈ Please record or upload audio.", ""
    # Guard against None as well as blank text so a missing value yields the
    # friendly message instead of an AttributeError on .strip().
    sentence = (sentence_text or "").strip()
    if not sentence:
        return "β οΈ Please type the sentence you want to practise.", ""

    # G2P
    target_phonemes, unknown = sentence_to_phonemes(sentence)
    if not target_phonemes:
        return "β οΈ Could not convert sentence to phonemes. Try simpler English words.", ""

    # Model inference
    try:
        actual_feature_seqs = decode_audio(audio_input)
    except Exception as exc:  # UI boundary: report instead of crashing
        return f"β Audio error: {exc}", ""

    # MDD
    try:
        result = run_mdd(
            actual_feature_seqs=actual_feature_seqs,
            target_phonemes=target_phonemes,
        )
    except Exception as exc:  # UI boundary: report instead of crashing
        return f"β MDD error: {exc}", ""

    # Feedback
    feedback_dict = generate_feedback(result, use_llm=False, max_issues=int(max_issues))
    score = feedback_dict["score"]
    main_out = f"**Score: {score}/100**\n\n" + feedback_dict["final_feedback"]
    if unknown:
        # List each skipped word once, in first-seen order.
        skipped = ", ".join(dict.fromkeys(unknown))
        main_out += f"\n\nβ οΈ Words not in dictionary (skipped): *{skipped}*"

    # Detail
    detail_out = _format_error_detail(feedback_dict["error_summary"])
    if not detail_out:
        detail_out = "β No errors detected!"
    return main_out, detail_out
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Gradio UI — clean and simple | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Declarative UI definition. Components are laid out in the order they are
# created inside the nested context managers, so statement order matters here.
with gr.Blocks(title="Pronunciation Coach") as demo:
    # Page heading and one-line usage hint.
    gr.Markdown("# π£οΈ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")
    with gr.Row():
        # First column (scale=1): everything the user provides.
        with gr.Column(scale=1):
            sentence_input = gr.Textbox(
                label="Sentence to practise",
                placeholder="The cat sat on the mat",
                lines=2,
            )
            # type="filepath" makes Gradio pass a file path to the handler,
            # which decode_audio() then opens with librosa.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Your speech",
            )
            # Forwarded to generate_feedback(max_issues=...) by process().
            max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues to show")
            submit_btn = gr.Button("Analyse", variant="primary")
        # Second column (scale=2): results.
        with gr.Column(scale=2):
            feedback_out = gr.Markdown(label="Feedback")
            # Per-phoneme breakdown is collapsed by default.
            with gr.Accordion("Per-phoneme detail", open=False):
                detail_out = gr.Markdown()
    # The inputs list must match the parameter order of
    # process(audio_input, sentence_text, max_issues); the two outputs receive
    # the (main, detail) tuple it returns.
    submit_btn.click(
        fn=process,
        inputs=[audio_input, sentence_input, max_issues],
        outputs=[feedback_out, detail_out],
    )
# Start the Gradio server only when this file is executed as a script.
# NOTE(review): theme= is passed to launch() here; some Gradio releases accept
# it only on gr.Blocks(...) -- confirm against the pinned Gradio version.
if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())