import whisper
import librosa
import numpy as np
import tensorflow_hub as hub

# Load ASR: the Whisper "small" checkpoint, loaded once at import time and
# used by speech_to_text().
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification (fetched from TensorFlow Hub at import
# time and used by detect_sound()).
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
# NOTE(review): class_map_path() appears to return the *location* of the
# class-map CSV, so class_map would hold that path (bytes after .numpy()),
# not a sequence of class names -- confirm against the YAMNet TF Hub docs.
class_map = yamnet.class_map_path().numpy()

# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    """Map the mean of ``activation`` to a coarse emotion label.

    Args:
        activation: Any object exposing a ``.mean()`` method (for example a
            NumPy array).

    Returns:
        ``"Happy / Excited"`` when the mean is above 0.3,
        ``"Sad / Depressed"`` when it is below -0.3, and ``"Neutral"``
        otherwise (both thresholds are exclusive).
    """
    average = activation.mean()

    label = "Neutral"
    if average > 0.3:
        label = "Happy / Excited"
    elif average < -0.3:
        label = "Sad / Depressed"
    return label


def speech_to_text(audio):
    """Transcribe ``audio`` with the module-level Whisper model.

    Args:
        audio: Passed unchanged to ``asr_model.transcribe`` (presumably a
            path to an audio file, as that is what analyze_audio supplies).

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    return asr_model.transcribe(audio)["text"]


def _yamnet_class_names():
    """Return YAMNet's class display names, ordered by class index.

    ``yamnet.class_map_path()`` gives the location of a CSV file (with a
    ``display_name`` column), not the names themselves, so the file has to be
    parsed before a score index can be turned into a label.
    """
    import csv  # local import keeps this block self-contained

    csv_path = yamnet.class_map_path().numpy()
    if isinstance(csv_path, bytes):
        csv_path = csv_path.decode("utf-8")
    with open(csv_path, newline="", encoding="utf-8") as handle:
        return [row["display_name"] for row in csv.DictReader(handle)]


def detect_sound(audio):
    """Classify the dominant sound event in an audio file with YAMNet.

    Args:
        audio: Audio source accepted by ``librosa.load`` (e.g. a file path).

    Returns:
        A ``(label, confidence)`` tuple: ``label`` is the display name of the
        class with the highest score averaged over all frames, and
        ``confidence`` is that averaged score (a NumPy scalar).
    """
    # Resample to 16 kHz. The waveform is passed to YAMNet as the 1-D array
    # librosa returns: per the YAMNet docs the model takes a 1-D waveform,
    # so it must not be reshaped into a (1, n) batch.
    waveform, _ = librosa.load(audio, sr=16000)
    scores, _, _ = yamnet(waveform)

    # scores has one row per frame; average over frames to get one score per
    # class for the whole clip.
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = int(np.argmax(mean_scores))

    # The CSV is small, so re-reading it per call is negligible next to
    # model inference.
    class_names = _yamnet_class_names()
    return class_names[top_idx], mean_scores[top_idx]


def analyze_audio(audio_file):
    """Run transcription and sound-event detection on one audio file.

    Args:
        audio_file: Audio source forwarded to both ``speech_to_text`` and
            ``detect_sound``.

    Returns:
        A dict with the keys ``"transcription"``, ``"sound_event"``,
        ``"sound_confidence"`` (a Python float), ``"emotion"`` and
        ``"speakers"``. The last two are fixed placeholder strings; no
        emotion or speaker analysis is performed here.
    """
    transcript = speech_to_text(audio_file)
    label, score = detect_sound(audio_file)

    return {
        "transcription": transcript,
        "sound_event": label,
        "sound_confidence": float(score),
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }