import numpy as np


def extract_phonemes(text):
    """Convert *text* to a phoneme string.

    Uses the espeak backend of ``phonemizer`` with US English rules and
    stripped word-boundary markers.

    Args:
        text: Plain text to transcribe into phonemes.

    Returns:
        The phoneme string produced by ``phonemizer.phonemize``.
    """
    # Lazy import: phonemizer is a heavy optional dependency, and callers that
    # never request a reference-text comparison should not pay for it at
    # module load time (mirrors the existing deferred `import torch` pattern).
    from phonemizer import phonemize

    return phonemize(text, language='en-us', backend='espeak', strip=True)


def analyze_audio_phonetically(audio_path, reference_text=None, wav2vec_processor=None, wav2vec_model=None):
    """Perform phonetic analysis of the audio compared to reference text (optional, only if local model loaded).

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        reference_text: Optional text whose phonemes are included for comparison.
        wav2vec_processor: Optional HF processor; analysis is skipped if absent.
        wav2vec_model: Optional HF CTC model; analysis is skipped if absent.

    Returns:
        A dict with at least ``"detected_phonemes"``; when *reference_text* is
        given it also contains ``"reference_phonemes"`` and ``"analysis"``.
    """
    # Graceful degradation: without a local model pair, report unavailability
    # instead of raising.
    if not wav2vec_processor or not wav2vec_model:
        return {"detected_phonemes": "[Phoneme analysis not available]"}

    # Deferred imports: only needed when a model is actually supplied.
    import librosa
    import torch

    # 16 kHz is the sample rate wav2vec2-style models are trained on.
    audio, _sr = librosa.load(audio_path, sr=16000)
    inputs = wav2vec_processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        # Pass all processor outputs (input_values and, when the processor
        # produces one, attention_mask) — consistent with
        # extract_pronunciation_embedding below, and avoids silently dropping
        # the mask for padded batches.
        logits = wav2vec_model(**inputs).logits

    # Greedy CTC decode: most-likely token per frame, then collapse via the
    # processor's tokenizer.
    predicted_ids = torch.argmax(logits, dim=-1)
    phoneme_sequence = wav2vec_processor.batch_decode(predicted_ids)[0]

    result = {"detected_phonemes": phoneme_sequence}
    if reference_text:
        result["reference_phonemes"] = extract_phonemes(reference_text)
        # NOTE(review): placeholder — no actual alignment/scoring is done yet.
        result["analysis"] = "Phoneme comparison would be performed here"
    return result


def extract_pronunciation_embedding(audio_path, hubert_processor=None, hubert_model=None):
    """Extract pronunciation embedding for comparison purposes (optional, only if local model loaded).

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        hubert_processor: Optional HF feature extractor; skipped if absent.
        hubert_model: Optional HF HuBERT-style model; skipped if absent.

    Returns:
        A numpy array of shape (1, hidden_size) — the mean of the model's last
        hidden state over time — or ``None`` when no local model is available.
    """
    if not hubert_model or not hubert_processor:
        return None

    # Deferred imports: only needed when a model is actually supplied.
    import librosa
    import torch

    audio, _sr = librosa.load(audio_path, sr=16000)
    inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        outputs = hubert_model(**inputs)
        # Mean-pool across the time axis to get one fixed-size utterance vector.
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    return embedding