Spaces:
Sleeping
Sleeping
import csv
import os
import warnings

# Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs.
# NOTE: These env vars must be set before TensorFlow initializes, so they are
# configured here, ahead of the third-party imports below. `setdefault` keeps
# any value the caller has already exported in the environment.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # hide INFO/WARNING logs
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")  # disable oneDNN custom ops
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")  # don't try to use CUDA GPUs

# Suppress specific library warnings that are expected in this setup. The
# filters are installed before the libraries are imported so that they are
# already active for anything raised while those packages load.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="FP16 is not supported on CPU; using FP32 instead",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="librosa",
)

# Third-party imports deliberately follow the environment setup above (hence
# the E402 suppressions). Their relative order is unchanged.
import whisper  # noqa: E402
import librosa  # noqa: E402
import numpy as np  # noqa: E402
import tensorflow as tf  # noqa: E402
import tensorflow_hub as hub  # noqa: E402
# Load ASR
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")


def _read_display_names(csv_path):
    """Return the ``display_name`` column of the CSV at *csv_path*, in row order."""
    with tf.io.gfile.GFile(csv_path) as handle:
        return [row["display_name"] for row in csv.DictReader(handle)]


# The class-map path comes back from the model as a tensor value; normalise it
# to ``str`` when it is delivered as bytes.
class_map_path = yamnet.class_map_path().numpy()
if isinstance(class_map_path, bytes):
    class_map_path = class_map_path.decode("utf-8")

# Parse YAMNet class map CSV to get human-readable labels
yamnet_labels = _read_display_names(class_map_path)
# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    """Map the mean of an activation array to a coarse emotion label.

    Args:
        activation: Array-like of numbers (NumPy array, nested list, ...).
            It is coerced with ``np.asarray`` so plain sequences work too.

    Returns:
        ``"Happy / Excited"`` when the overall mean is above 0.3,
        ``"Sad / Depressed"`` when it is below -0.3, and ``"Neutral"``
        otherwise (including for empty input).
    """
    values = np.asarray(activation)
    if values.size == 0:
        # The mean of an empty array is NaN and triggers a RuntimeWarning;
        # report the fallback label explicitly instead.
        return "Neutral"

    mean_val = values.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    if mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"
def speech_to_text(audio):
    """Transcribe *audio* with the module-level Whisper model.

    Args:
        audio: Input forwarded unchanged to ``asr_model.transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    # fp16=False forces FP32 on CPU, avoiding the FP16 warning and keeping
    # the call compatible with CPU-only environments.
    return asr_model.transcribe(audio, fp16=False)["text"]
def detect_sound(audio):
    """Classify the dominant sound in *audio* with YAMNet.

    Args:
        audio: Input forwarded unchanged to ``librosa.load``.

    Returns:
        A ``(label, score)`` tuple: the display name of the class with the
        highest score averaged over axis 0 of the model's score output
        (``"Unknown"`` if the index falls outside the label list), and that
        averaged score as a Python ``float``.
    """
    # YAMNet expects a mono, 16 kHz, 1-D float32 waveform.
    samples, _ = librosa.load(audio, sr=16000, mono=True)
    samples = samples.astype(np.float32)

    # Only the first of the model's three outputs (the scores) is needed here.
    frame_scores, _, _ = yamnet(samples)
    class_means = frame_scores.numpy().mean(axis=0)
    best = int(class_means.argmax())

    # Look up the human-readable class label from YAMNet's CSV class map.
    if 0 <= best < len(yamnet_labels):
        label = yamnet_labels[best]
    else:
        label = "Unknown"
    return label, float(class_means.max())
def analyze_audio(audio_file):
    """Run transcription and sound-event detection on *audio_file*.

    Args:
        audio_file: Input passed to both ``speech_to_text`` and
            ``detect_sound``.

    Returns:
        A dict with the keys ``transcription``, ``sound_event``,
        ``sound_confidence``, ``emotion`` and ``speakers``. The last two are
        fixed placeholder strings; no emotion or speaker analysis is run.
    """
    # Transcription runs first, then sound classification.
    transcript = speech_to_text(audio_file)
    event, confidence = detect_sound(audio_file)
    return {
        "transcription": transcript,
        "sound_event": event,
        "sound_confidence": float(confidence),
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }