"""Audio analysis helpers: speech transcription (Whisper) and sound-event
classification (YAMNet)."""

import csv

import librosa
import numpy as np
import tensorflow_hub as hub
import whisper

# Load ASR
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")


def _load_class_names(csv_path):
    """Return the YAMNet class display names, in class-index order.

    Args:
        csv_path: Location of the class-map CSV as ``str`` or ``bytes``
            (``class_map_path().numpy()`` yields ``bytes``).

    Returns:
        A list of ``str`` labels taken from the CSV's ``display_name`` column.
    """
    if isinstance(csv_path, bytes):
        csv_path = csv_path.decode("utf-8")
    with open(csv_path, newline="", encoding="utf-8") as handle:
        return [row["display_name"] for row in csv.DictReader(handle)]


# ``class_map_path()`` is the *path* of the label CSV, not the labels
# themselves; parse it once so a score index maps straight to a class name.
class_map = _load_class_names(yamnet.class_map_path().numpy())


# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    """Map the mean of an activation array to a coarse emotion label.

    Args:
        activation: Object exposing ``.mean()`` (e.g. a NumPy array).
            Presumably a YAMNet embedding, per the comment above; no caller
            in this file passes one.

    Returns:
        ``"Happy / Excited"`` when the mean is above 0.3,
        ``"Sad / Depressed"`` when it is below -0.3, otherwise ``"Neutral"``.
    """
    mean_val = activation.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    if mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"


def speech_to_text(audio):
    """Transcribe ``audio`` with the module-level Whisper model.

    Args:
        audio: Forwarded unchanged to ``asr_model.transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    result = asr_model.transcribe(audio)
    return result["text"]


def detect_sound(audio):
    """Classify the dominant sound event in an audio file with YAMNet.

    Args:
        audio: Forwarded to ``librosa.load``, which resamples to 16 kHz.

    Returns:
        A ``(label, confidence)`` tuple: the class name with the highest
        clip-averaged score, and that score as a NumPy scalar.
    """
    # YAMNet takes a 1-D float32 waveform; librosa.load already returns one
    # (mono by default), so it must not be reshaped into a (1, N) batch.
    waveform, _sr = librosa.load(audio, sr=16000)
    scores, _embeddings, _spectrogram = yamnet(waveform)

    # Scores come back per frame; average over frames for a clip-level score.
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = int(np.argmax(mean_scores))
    return class_map[top_idx], mean_scores[top_idx]


def analyze_audio(audio_file):
    """Build a summary dict for one audio file.

    Args:
        audio_file: Passed to both ``speech_to_text`` and ``detect_sound``.

    Returns:
        A dict with keys ``"transcription"``, ``"sound_event"``,
        ``"sound_confidence"`` (a plain ``float``), ``"emotion"`` and
        ``"speakers"``. The last two are fixed placeholder strings.
    """
    event, confidence = detect_sound(audio_file)
    return {
        "transcription": speech_to_text(audio_file),
        "sound_event": event,
        "sound_confidence": float(confidence),
        # Placeholders: estimate_emotion is not wired in, and no speaker
        # diarization is performed here.
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }