# Communication-coach / phonetics.py
# BeastGokul's picture
# Upload 4 files
# 19b6d6d verified
import numpy as np
import librosa
from phonemizer import phonemize
def extract_phonemes(text):
    """Convert text to its phoneme transcription.

    The text is handed to ``phonemize`` with ``language='en-us'``,
    ``backend='espeak'`` and ``strip=True``.

    Args:
        text: The text to transcribe.

    Returns:
        The value returned by ``phonemize`` for ``text``. A blank
        (empty or whitespace-only) string yields ``""`` without calling
        the backend.
    """
    # Nothing to transcribe: short-circuit so blank input does not require
    # the espeak backend to be initialised at all.
    if isinstance(text, str) and not text.strip():
        return ""
    return phonemize(text, language='en-us', backend='espeak', strip=True)
def analyze_audio_phonetically(audio_path, reference_text=None, wav2vec_processor=None, wav2vec_model=None):
    """Perform phonetic analysis of an audio file, optionally against reference text.

    The analysis is optional: it only runs when both a processor and a model
    are supplied. Otherwise a placeholder result is returned and neither the
    audio file nor torch is touched.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        reference_text: Optional text whose phonemes are added to the result.
        wav2vec_processor: Callable processor that prepares the audio for the
            model and provides ``batch_decode``.
        wav2vec_model: Model called on the processed input values; its output
            must expose ``.logits``.

    Returns:
        A dict with key ``"detected_phonemes"``. When ``reference_text`` is
        truthy it also contains ``"reference_phonemes"`` and a placeholder
        ``"analysis"`` string (no actual comparison is performed).
    """
    if not wav2vec_processor or not wav2vec_model:
        return {"detected_phonemes": "[Phoneme analysis not available]"}

    # Imported lazily, after the guard, so the fallback above keeps working
    # in environments where torch is not installed.
    import torch

    sample_rate = 16000
    audio, _ = librosa.load(audio_path, sr=sample_rate)
    inputs = wav2vec_processor(audio, sampling_rate=sample_rate, return_tensors="pt")

    # The processor builds its tensors on the CPU; move them next to the
    # model's weights when the model reports a device, otherwise a model
    # placed on an accelerator fails with a device-mismatch error.
    input_values = inputs.input_values
    device = getattr(wav2vec_model, "device", None)
    if device is not None:
        input_values = input_values.to(device)

    with torch.no_grad():
        logits = wav2vec_model(input_values).logits

    # Pick the highest-scoring token id per frame, then decode the first
    # (and only) item of the batch.
    predicted_ids = torch.argmax(logits, dim=-1).cpu()
    phoneme_sequence = wav2vec_processor.batch_decode(predicted_ids)[0]

    result = {"detected_phonemes": phoneme_sequence}
    if reference_text:
        result["reference_phonemes"] = extract_phonemes(reference_text)
        result["analysis"] = "Phoneme comparison would be performed here"
    return result
def extract_pronunciation_embedding(audio_path, hubert_processor=None, hubert_model=None):
    """Extract a pronunciation embedding from an audio file for comparison purposes.

    The extraction is optional: it only runs when both a processor and a
    model are supplied.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        hubert_processor: Callable processor that prepares the audio for the
            model.
        hubert_model: Model called with the processed inputs; its output must
            expose ``.last_hidden_state``.

    Returns:
        ``None`` when the model or the processor is missing. Otherwise a
        NumPy array holding ``last_hidden_state`` averaged over dimension 1
        (presumably the time axis, giving one vector per batch item --
        TODO confirm against the model in use).
    """
    if not hubert_model or not hubert_processor:
        return None

    # Imported lazily, after the guard, so the ``None`` fallback keeps
    # working in environments where torch is not installed.
    import torch

    sample_rate = 16000
    audio, _ = librosa.load(audio_path, sr=sample_rate)
    inputs = hubert_processor(audio, sampling_rate=sample_rate, return_tensors="pt")

    # The result is explicitly brought back with ``.cpu()`` below, so the
    # model may live on an accelerator; move the inputs to the same device
    # to avoid a device-mismatch error.
    device = getattr(hubert_model, "device", None)
    if device is not None:
        inputs = {
            key: value.to(device) if hasattr(value, "to") else value
            for key, value in inputs.items()
        }

    with torch.no_grad():
        outputs = hubert_model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()