# Audio analysis pipeline: Whisper ASR + YAMNet sound-event classification.
import csv

import librosa
import numpy as np
import tensorflow_hub as hub
import whisper
# --- Module-level model setup (runs once at import time) ---

# Whisper "small" ASR model.
# NOTE(review): downloads weights on first use — confirm acceptable at import time.
asr_model = whisper.load_model("small")

# YAMNet audio-event classifier from TF-Hub.
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
# NOTE(review): this is the *path* (bytes) to the model's class-map CSV,
# not the class names themselves — callers must parse the CSV to get names.
class_map = yamnet.class_map_path().numpy()
| # Simple Emotion Estimator (from YAMNet embedding) | |
def estimate_emotion(activation):
    """Map a mean activation value to a coarse emotion label.

    Thresholds: mean > 0.3 -> "Happy / Excited";
    mean < -0.3 -> "Sad / Depressed"; otherwise "Neutral".
    The branches are mutually exclusive, so evaluation order is free.
    """
    avg = float(activation.mean())
    if avg < -0.3:
        return "Sad / Depressed"
    return "Happy / Excited" if avg > 0.3 else "Neutral"
def speech_to_text(audio):
    """Transcribe an audio file with the module-level Whisper model.

    Returns the "text" field of the Whisper transcription result.
    """
    return asr_model.transcribe(audio)["text"]
def detect_sound(audio):
    """Classify the dominant sound event in an audio file with YAMNet.

    Parameters
    ----------
    audio : str or path-like
        Path to an audio file readable by librosa.

    Returns
    -------
    tuple
        (display name of the top-scoring class, its mean score).
    """
    # YAMNet expects a mono rank-1 float waveform at 16 kHz.
    waveform, _sr = librosa.load(audio, sr=16000)
    # Bug fix: the TF-Hub YAMNet signature takes a 1-D waveform; the old
    # reshape(1, -1) fed it a rank-2 batch it does not accept.
    scores, _embeddings, _ = yamnet(waveform)

    # Average per-frame class scores over time, then pick the best class.
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = int(np.argmax(mean_scores))

    # Bug fix: `class_map` holds the *path* to the class-map CSV, not the
    # class names — indexing it returned a single byte of the path string.
    with open(class_map.decode("utf-8"), newline="") as f:
        class_names = [row["display_name"] for row in csv.DictReader(f)]
    return class_names[top_idx], mean_scores.max()
def analyze_audio(audio_file):
    """Run the full pipeline on one audio file and summarize the results.

    Combines transcription (Whisper) and sound-event detection (YAMNet)
    into a single summary dict. Emotion and speaker fields are fixed
    placeholders in this version.
    """
    transcription = speech_to_text(audio_file)
    event, confidence = detect_sound(audio_file)
    return {
        "transcription": transcription,
        "sound_event": event,
        "sound_confidence": float(confidence),
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }