heldtomaturity's picture
fix produced_phoneme AttributeError
9aa0b19
"""
Pronunciation Coach — HuggingFace Space
========================================
1. User types a normal English sentence
2. User records themselves saying it
3. App runs phonological model → 35 CTC feature sequences
4. MDD engine aligns them against canonical sequences → errors + score
5. Feedback generator returns coaching tips
"""
import os
import re
import json
import torch
import numpy as np
import gradio as gr
import librosa
import pronouncing
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2FeatureExtractor
from wav2vec2_phonological import PhonologicalWav2Vec2
from mdd_engine import run_mdd
from feedback_generator import generate_feedback
from phonological_features import CMU_39_PHONEMES
# ─────────────────────────────────────────────────────────────────────────────
# Model globals
# ─────────────────────────────────────────────────────────────────────────────
# Lazily-populated singletons; load_model() assigns them on first use.
_model = None
_feature_extractor = None

# Run on the GPU when torch can see one, otherwise on the CPU.
_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Base checkpoint name, used for both the model backbone and the feature
# extractor.
PRETRAINED_BASE = "facebook/wav2vec2-large-robust"

# Hub location of the fine-tuned weights. Overridable through the
# HF_MODEL_REPO / HF_MODEL_FILENAME environment variables.
MODEL_REPO = os.getenv("HF_MODEL_REPO", "Backlighteu/phonological-mdd")
MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_model.pt")

# Access token handed to the Hub download; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
def load_model():
    """Download the fine-tuned checkpoint and initialise the model once.

    Populates the module-level ``_model`` and ``_feature_extractor``. Once a
    load has succeeded, later calls return immediately.
    """
    global _model, _feature_extractor
    if _model is not None:
        return

    cache_dir = "./model_cache"
    print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...")
    snapshot_download(repo_id=MODEL_REPO, token=HF_TOKEN, local_dir=cache_dir)

    # Honour HF_MODEL_FILENAME rather than a hard-coded "best_model.pt", so
    # the file that is loaded is the one announced in the log line above.
    checkpoint_path = os.path.join(cache_dir, MODEL_FILENAME)

    model = PhonologicalWav2Vec2(
        pretrained_model_name=PRETRAINED_BASE,
        num_output_nodes=71,
        freeze_cnn_encoder=True,
    )
    # NOTE(review): torch.load unpickles the checkpoint; only point
    # HF_MODEL_REPO at repositories you trust, and consider
    # weights_only=True if the installed torch version supports it.
    state_dict = torch.load(checkpoint_path, map_location=_device)
    model.load_state_dict(state_dict)
    model.to(_device)
    model.eval()

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)

    # Publish both globals only after every step has succeeded. If the
    # extractor failed after _model had already been set, later calls would
    # take the early return above and leave _feature_extractor as None.
    _model = model
    _feature_extractor = feature_extractor
    print(f"[startup] Ready on {_device}.")
# ─────────────────────────────────────────────────────────────────────────────
# G2P — plain English → CMU-39 phonemes
# ─────────────────────────────────────────────────────────────────────────────
# Set copy of the CMU-39 inventory, used for membership tests in
# sentence_to_phonemes().
_CMU_39 = set(CMU_39_PHONEMES)
def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
    """Convert an English sentence into a flat list of phoneme symbols.

    Each word is looked up with ``pronouncing.phones_for_word``; the first
    pronunciation returned is used. Stress digits are stripped, symbols are
    lower-cased, and symbols that are not in ``_CMU_39`` are dropped.

    Args:
        sentence: Free-form English text.

    Returns:
        A ``(phonemes, unknown)`` pair: the phoneme symbols for every word
        that was found, in sentence order, and the words for which the
        lookup returned no pronunciation.
    """
    # Map the typographic apostrophe to the ASCII one so that contractions
    # typed on phones/word processors are tokenised like plain "don't".
    normalised = sentence.replace("\u2019", "'")

    # Keep apostrophes inside a word ("don't", "cat's"); every other
    # non-letter acts as a separator. Deleting punctuation outright would
    # glue "well-known" into "wellknown" and turn "don't" into "dont",
    # both of which then fail the dictionary lookup.
    words = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)*", normalised)

    phonemes, unknown = [], []
    for word in words:
        pronunciations = pronouncing.phones_for_word(word.lower())
        if not pronunciations:
            unknown.append(word)
            continue
        for symbol in pronunciations[0].split():
            # Stress markers are trailing digits, e.g. "AH0" -> "ah".
            symbol = symbol.rstrip("0123456789").lower()
            if symbol in _CMU_39:
                phonemes.append(symbol)
    return phonemes, unknown
# ─────────────────────────────────────────────────────────────────────────────
# Audio inference
# ─────────────────────────────────────────────────────────────────────────────
def decode_audio(audio_path: str) -> list[list[int]]:
    """Run the phonological model on a single recording.

    Args:
        audio_path: Path of an audio file that ``librosa.load`` can read.

    Returns:
        The first batch item produced by ``_model.decode``, with every
        value mapped to 1 when truthy and 0 otherwise.
    """
    load_model()

    # Audio is resampled to 16 kHz mono on load; the feature extractor is
    # told the same rate.
    target_sr = 16000
    samples, _ = librosa.load(audio_path, sr=target_sr, mono=True)
    features = _feature_extractor(
        samples.astype(np.float32),
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True,
    )

    signal = features.input_values.to(_device)
    # The extractor may not return a mask; pass None through in that case.
    mask = features.get("attention_mask")
    mask = mask.to(_device) if mask is not None else None

    with torch.no_grad():
        logits, output_lengths = _model(signal, mask, apply_spec_augment=False)
        batch_decoded = _model.decode(logits, output_lengths)

    first_item = batch_decoded[0]
    return [[int(bool(value)) for value in row] for row in first_item]
# ─────────────────────────────────────────────────────────────────────────────
# Main handler
# ─────────────────────────────────────────────────────────────────────────────
def _format_error_detail(err: dict) -> str:
    """Render one ``error_summary`` entry as a Markdown snippet."""
    tag = " *(deleted)*" if err.get("is_deletion") else ""
    missing = ", ".join(err["missing_features"]) or "—"
    extra = ", ".join(err["extra_features"]) or "—"
    return (
        f"**/{err['target']}/** pos {err['position']}{tag} — "
        # Two spaces before the newline make a Markdown hard line break.
        f"{err['severity']}, {err['accuracy']:.0%} accurate  \n"
        f"Missing: {missing} | Extra: {extra}"
    )


def process(audio_input, sentence_text, max_issues):
    """Gradio handler: score a recording against the typed sentence.

    Args:
        audio_input: Path of the recorded/uploaded audio, or None when the
            user supplied nothing.
        sentence_text: The sentence the user intended to say.
        max_issues: Maximum number of issues to include in the feedback;
            converted with ``int()`` before use.

    Returns:
        A ``(main_markdown, detail_markdown)`` pair. When validation or a
        pipeline stage fails, the first element carries the message and the
        second is an empty string.
    """
    if audio_input is None:
        return "⚠️ Please record or upload audio.", ""
    # Treat a missing value like an empty one instead of raising on .strip().
    sentence = (sentence_text or "").strip()
    if not sentence:
        return "⚠️ Please type the sentence you want to practise.", ""

    # G2P
    target_phonemes, unknown = sentence_to_phonemes(sentence)
    if not target_phonemes:
        return "⚠️ Could not convert sentence to phonemes. Try simpler English words.", ""

    # Model inference
    try:
        actual_feature_seqs = decode_audio(audio_input)
    except Exception as exc:
        return f"❌ Audio error: {exc}", ""

    # MDD
    try:
        result = run_mdd(actual_feature_seqs=actual_feature_seqs,
                         target_phonemes=target_phonemes)
    except Exception as exc:
        return f"❌ MDD error: {exc}", ""

    # Feedback: guarded like the two stages above so that a failure is
    # reported in the UI rather than escaping the handler.
    try:
        feedback_dict = generate_feedback(
            result, use_llm=False, max_issues=int(max_issues)
        )
    except Exception as exc:
        return f"❌ Feedback error: {exc}", ""

    score = feedback_dict["score"]
    main_out = f"**Score: {score}/100**\n\n" + feedback_dict["final_feedback"]
    if unknown:
        main_out += f"\n\n⚠️ Words not in dictionary (skipped): *{', '.join(unknown)}*"

    # Per-phoneme detail
    details = [_format_error_detail(err) for err in feedback_dict["error_summary"]]
    detail_out = "\n\n".join(details) if details else "✅ No errors detected!"
    return main_out, detail_out
# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — clean and simple
# ─────────────────────────────────────────────────────────────────────────────
# Two-column layout: inputs on the left, feedback on the right.
with gr.Blocks(title="Pronunciation Coach") as demo:
    gr.Markdown("# 🗣️ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")
    with gr.Row():
        with gr.Column(scale=1):
            sentence_input = gr.Textbox(
                label="Sentence to practise",
                placeholder="The cat sat on the mat",
                lines=2,
            )
            # type="filepath": process() receives a path on disk (or None).
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Your speech",
            )
            max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues to show")
            submit_btn = gr.Button("Analyse", variant="primary")
        with gr.Column(scale=2):
            feedback_out = gr.Markdown(label="Feedback")
            with gr.Accordion("Per-phoneme detail", open=False):
                detail_out = gr.Markdown()
    # Input order must match the parameter order of process().
    submit_btn.click(
        fn=process,
        inputs=[audio_input, sentence_input, max_issues],
        outputs=[feedback_out, detail_out],
    )

if __name__ == "__main__":
    # NOTE(review): launch() accepting `theme` looks like the Gradio 6 API;
    # on older releases the theme is passed to gr.Blocks(...) instead.
    # Confirm against the pinned Gradio version.
    demo.launch(theme=gr.themes.Soft())