Spaces:

Backlighteu
/

Pronunciation-Coach

Sleeping

App Files Files Community

Pronunciation-Coach / app.py

heldtomaturity

fix produced_phoneme AttributeError

9aa0b19 12 days ago

raw

history blame contribute delete

8.44 kB

	"""
	Pronunciation Coach — HuggingFace Space
	========================================
	1. User types a normal English sentence
	2. User records themselves saying it
	3. App runs phonological model → 35 CTC feature sequences
	4. MDD engine aligns them against canonical sequences → errors + score
	5. Feedback generator returns coaching tips
	"""

	import os
	import re
	import json
	import torch
	import numpy as np
	import gradio as gr
	import librosa
	import pronouncing

	from huggingface_hub import snapshot_download
	from transformers import Wav2Vec2FeatureExtractor

	from wav2vec2_phonological import PhonologicalWav2Vec2
	from mdd_engine import run_mdd
	from feedback_generator import generate_feedback
	from phonological_features import CMU_39_PHONEMES

	# ─────────────────────────────────────────────────────────────────────────────
	# Model globals
	# ─────────────────────────────────────────────────────────────────────────────

	# Lazily initialised singletons: load_model() assigns both on first use.
	_model = None
	_feature_extractor = None
	# Run on the GPU when one is visible to torch, otherwise on the CPU.
	_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Base checkpoint used for the model backbone and the feature extractor.
	PRETRAINED_BASE = "facebook/wav2vec2-large-robust"
	# Hub repo / file holding the fine-tuned weights; overridable via env vars.
	MODEL_REPO = os.getenv("HF_MODEL_REPO", "Backlighteu/phonological-mdd")
	MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_model.pt")
	# Optional access token forwarded to snapshot_download (None when unset).
	HF_TOKEN = os.getenv("HF_TOKEN")


	def load_model():
	    """Download the checkpoint and build the model and feature extractor once.

	    On success the module-level ``_model`` and ``_feature_extractor`` are set
	    together; later calls return immediately. If any step raises, neither
	    global is assigned, so the next call retries from scratch instead of
	    leaving a model without its feature extractor.
	    """
	    global _model, _feature_extractor
	    if _model is not None:
	        return

	    cache_dir = "./model_cache"
	    print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...")
	    snapshot_download(repo_id=MODEL_REPO, token=HF_TOKEN, local_dir=cache_dir)

	    model = PhonologicalWav2Vec2(
	        pretrained_model_name=PRETRAINED_BASE,
	        num_output_nodes=71,
	        freeze_cnn_encoder=True,
	    )
	    # Honour HF_MODEL_FILENAME rather than a hard-coded "best_model.pt", so the
	    # file that is loaded is the one named in the startup message.
	    checkpoint_path = os.path.join(cache_dir, MODEL_FILENAME)
	    # NOTE(review): torch.load unpickles the checkpoint; only point MODEL_REPO
	    # at trusted repositories (consider weights_only=True if the file allows).
	    state_dict = torch.load(checkpoint_path, map_location=_device)
	    model.load_state_dict(state_dict)
	    model.to(_device)
	    model.eval()

	    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)

	    # Publish both globals only after every step succeeded.
	    _feature_extractor = feature_extractor
	    _model = model
	    print(f"[startup] Ready on {_device}.")


	# ─────────────────────────────────────────────────────────────────────────────
	# G2P — plain English → CMU-39 phonemes
	# ─────────────────────────────────────────────────────────────────────────────

	# Set form of the phoneme inventory, used for membership tests during G2P.
	_CMU_39 = {*CMU_39_PHONEMES}


	def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
	    """Convert an English sentence into a flat phoneme list.

	    Each word is looked up with ``pronouncing.phones_for_word`` and the first
	    pronunciation is used. Stress digits are removed and the phone is
	    lower-cased; phones that are not in ``_CMU_39`` are dropped.

	    NOTE(review): the membership test is done on lower-cased phones, so this
	    assumes ``CMU_39_PHONEMES`` holds lower-case symbols — confirm.

	    Args:
	        sentence: Free-form text typed by the user.

	    Returns:
	        ``(phonemes, unknown)`` where ``phonemes`` is the concatenated phone
	        sequence of all recognised words and ``unknown`` lists the words for
	        which no pronunciation was found (original casing preserved).
	    """
	    # Map typographic apostrophes to ASCII so contractions can be looked up.
	    normalised = sentence.replace("\u2019", "'").replace("\u2018", "'")
	    # Replace every other non-letter with a space (not ""), so "cat,dog" and
	    # "well-known" split into separate words instead of fusing into one.
	    words = re.sub(r"[^a-zA-Z'\s]", " ", normalised).split()

	    phonemes, unknown = [], []
	    for raw in words:
	        # Surrounding apostrophes are quotation marks, not part of the word.
	        word = raw.strip("'")
	        if not word:
	            continue
	        results = pronouncing.phones_for_word(word.lower())
	        if not results:
	            unknown.append(word)
	            continue
	        for p in results[0].split():
	            p = re.sub(r"[0-9]", "", p).lower()
	            if p in _CMU_39:
	                phonemes.append(p)
	    return phonemes, unknown


	# ─────────────────────────────────────────────────────────────────────────────
	# Audio inference
	# ─────────────────────────────────────────────────────────────────────────────

	def decode_audio(audio_path: str) -> list[list[int]]:
	    """Run the phonological model on an audio file.

	    The file is loaded with librosa as 16 kHz mono, passed through the
	    feature extractor and the model, and the first batch item returned by
	    ``_model.decode`` is converted to nested lists of 0/1 integers.

	    NOTE(review): presumably one inner list per phonological feature (the
	    module docstring mentions 35) — confirm against ``_model.decode``.

	    Args:
	        audio_path: Path of the audio file to analyse.

	    Returns:
	        The decoded sequences with every element coerced to ``0`` or ``1``.

	    Raises:
	        ValueError: If the audio file contains no samples.
	    """
	    load_model()
	    waveform, _ = librosa.load(audio_path, sr=16000, mono=True)
	    # Fail early with a readable message instead of an opaque model error.
	    if waveform.size == 0:
	        raise ValueError("the recording is empty — please record again")

	    inputs = _feature_extractor(
	        waveform.astype(np.float32),
	        sampling_rate=16000,
	        return_tensors="pt",
	        padding=True,
	    )
	    input_values = inputs.input_values.to(_device)
	    # The extractor may not return a mask; forward None in that case.
	    attention_mask = inputs.get("attention_mask")
	    if attention_mask is not None:
	        attention_mask = attention_mask.to(_device)

	    with torch.no_grad():
	        logits, output_lengths = _model(
	            input_values, attention_mask, apply_spec_augment=False
	        )

	    # Single utterance per call, so only batch item 0 is relevant.
	    decoded_35 = _model.decode(logits, output_lengths)[0]
	    return [[1 if v else 0 for v in seq] for seq in decoded_35]


	# ─────────────────────────────────────────────────────────────────────────────
	# Main handler
	# ─────────────────────────────────────────────────────────────────────────────

	def process(audio_input, sentence_text, max_issues):
	    """Gradio click handler: analyse a recording against the typed sentence.

	    Pipeline: validate inputs → sentence_to_phonemes → decode_audio →
	    run_mdd → generate_feedback → format two Markdown strings. Failures in
	    any stage are reported as a message in the first output rather than
	    raised.

	    Args:
	        audio_input: Path to the recorded/uploaded audio, or None.
	        sentence_text: The sentence the user intended to say.
	        max_issues: Maximum number of issues to show (converted with int()).

	    Returns:
	        ``(main_markdown, detail_markdown)``; the second item is ``""``
	        whenever the first carries a warning or error message.
	    """
	    if audio_input is None:
	        return "⚠️ Please record or upload audio.", ""
	    # Treat a missing textbox value the same as an empty one.
	    sentence = (sentence_text or "").strip()
	    if not sentence:
	        return "⚠️ Please type the sentence you want to practise.", ""

	    # G2P
	    target_phonemes, unknown = sentence_to_phonemes(sentence)
	    if not target_phonemes:
	        return "⚠️ Could not convert sentence to phonemes. Try simpler English words.", ""

	    # Model inference
	    try:
	        actual_feature_seqs = decode_audio(audio_input)
	    except Exception as e:
	        return f"❌ Audio error: {e}", ""

	    # MDD
	    try:
	        result = run_mdd(actual_feature_seqs=actual_feature_seqs,
	                         target_phonemes=target_phonemes)
	    except Exception as e:
	        return f"❌ MDD error: {e}", ""

	    # Feedback + formatting, guarded like the stages above so a malformed
	    # feedback payload becomes a message instead of an unhandled exception.
	    try:
	        feedback_dict = generate_feedback(
	            result, use_llm=False, max_issues=int(max_issues)
	        )

	        score = feedback_dict["score"]
	        main_out = f"Score: {score}/100\n\n" + feedback_dict["final_feedback"]
	        if unknown:
	            main_out += f"\n\n⚠️ Words not in dictionary (skipped): {', '.join(unknown)}"

	        # Detail: one Markdown paragraph per reported error.
	        lines = []
	        for err in feedback_dict["error_summary"]:
	            tag = " (deleted)" if err.get("is_deletion") else ""
	            lines.append(
	                f"/{err['target']}/ pos {err['position']}{tag} — "
	                # Two trailing spaces = Markdown hard line break, so the
	                # Missing/Extra summary starts on its own line.
	                f"{err['severity']}, {err['accuracy']:.0%} accurate  \n"
	                # "\\|" emits a backslash-escaped pipe (valid escape sequence).
	                f"Missing: {', '.join(err['missing_features']) or '—'} \\| "
	                f"Extra: {', '.join(err['extra_features']) or '—'}"
	            )
	    except Exception as e:
	        return f"❌ Feedback error: {e}", ""

	    detail_out = "\n\n".join(lines) if lines else "✅ No errors detected!"
	    return main_out, detail_out


	# ─────────────────────────────────────────────────────────────────────────────
	# Gradio UI — clean and simple
	# ─────────────────────────────────────────────────────────────────────────────

	with gr.Blocks(title="Pronunciation Coach") as demo:
	    # Page heading with one-line usage instructions.
	    gr.Markdown("# 🗣️ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")

	    with gr.Row():
	        # Narrow column: everything the user supplies.
	        with gr.Column(scale=1):
	            sentence_input = gr.Textbox(
	                lines=2,
	                label="Sentence to practise",
	                placeholder="The cat sat on the mat",
	            )
	            audio_input = gr.Audio(
	                label="Your speech",
	                type="filepath",
	                sources=["microphone", "upload"],
	            )
	            max_issues = gr.Slider(
	                minimum=1,
	                maximum=5,
	                step=1,
	                value=3,
	                label="Max issues to show",
	            )
	            submit_btn = gr.Button("Analyse", variant="primary")

	        # Wide column: headline feedback plus a collapsed detail section.
	        with gr.Column(scale=2):
	            feedback_out = gr.Markdown(label="Feedback")
	            with gr.Accordion("Per-phoneme detail", open=False):
	                detail_out = gr.Markdown()

	    # Input order matches process(audio_input, sentence_text, max_issues);
	    # its two return values fill the two Markdown outputs.
	    submit_btn.click(
	        process,
	        [audio_input, sentence_input, max_issues],
	        [feedback_out, detail_out],
	    )

	# Start the server only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(theme=gr.themes.Soft())