import numpy as np


def extract_phonemes(text):
    """Convert *text* to a phoneme string.

    Uses the espeak backend of ``phonemizer`` with US English rules and
    stripped word-boundary markers.

    Args:
        text: Plain text to transcribe into phonemes.

    Returns:
        The phoneme string produced by ``phonemizer.phonemize``.
    """
    # Lazy import: phonemizer is a heavy optional dependency, and callers that
    # never request a reference-text comparison should not pay for it at
    # module load time (mirrors the existing deferred `import torch` pattern).
    from phonemizer import phonemize

    return phonemize(text, language='en-us', backend='espeak', strip=True)


def analyze_audio_phonetically(audio_path, reference_text=None, wav2vec_processor=None, wav2vec_model=None):
    """Perform phonetic analysis of the audio compared to reference text (optional, only if local model loaded).

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        reference_text: Optional text whose phonemes are included for comparison.
        wav2vec_processor: Optional HF processor; analysis is skipped if absent.
        wav2vec_model: Optional HF CTC model; analysis is skipped if absent.

    Returns:
        A dict with at least ``"detected_phonemes"``; when *reference_text* is
        given it also contains ``"reference_phonemes"`` and ``"analysis"``.
    """
    # Graceful degradation: without a local model pair, report unavailability
    # instead of raising.
    if not wav2vec_processor or not wav2vec_model:
        return {"detected_phonemes": "[Phoneme analysis not available]"}

    # Deferred imports: only needed when a model is actually supplied.
    import librosa
    import torch

    # 16 kHz is the sample rate wav2vec2-style models are trained on.
    audio, _sr = librosa.load(audio_path, sr=16000)
    inputs = wav2vec_processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        # Pass all processor outputs (input_values and, when the processor
        # produces one, attention_mask) — consistent with
        # extract_pronunciation_embedding below, and avoids silently dropping
        # the mask for padded batches.
        logits = wav2vec_model(**inputs).logits

    # Greedy CTC decode: most-likely token per frame, then collapse via the
    # processor's tokenizer.
    predicted_ids = torch.argmax(logits, dim=-1)
    phoneme_sequence = wav2vec_processor.batch_decode(predicted_ids)[0]

    result = {"detected_phonemes": phoneme_sequence}
    if reference_text:
        result["reference_phonemes"] = extract_phonemes(reference_text)
        # NOTE(review): placeholder — no actual alignment/scoring is done yet.
        result["analysis"] = "Phoneme comparison would be performed here"
    return result


def extract_pronunciation_embedding(audio_path, hubert_processor=None, hubert_model=None):
    """Extract pronunciation embedding for comparison purposes (optional, only if local model loaded).

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        hubert_processor: Optional HF feature extractor; skipped if absent.
        hubert_model: Optional HF HuBERT-style model; skipped if absent.

    Returns:
        A numpy array of shape (1, hidden_size) — the mean of the model's last
        hidden state over time — or ``None`` when no local model is available.
    """
    if not hubert_model or not hubert_processor:
        return None

    # Deferred imports: only needed when a model is actually supplied.
    import librosa
    import torch

    audio, _sr = librosa.load(audio_path, sr=16000)
    inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        outputs = hubert_model(**inputs)
        # Mean-pool across the time axis to get one fixed-size utterance vector.
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    return embedding