"""Audio analysis pipeline (CPU-only, no Hugging Face).

Combines Whisper (speech-to-text) with YAMNet (sound-event classification)
to produce a per-file summary dict. All models run on CPU; environment
variables below silence GPU/oneDNN probing before TensorFlow initializes.
"""

import csv
import os
import warnings

# Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs.
# NOTE: These env vars must be set before TensorFlow fully initializes, which is
# why the third-party imports below deliberately come AFTER this section.
# Setting them here greatly reduces, but may not completely remove, startup
# logs on some platforms.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")   # hide INFO/WARNING logs
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")  # disable oneDNN custom ops
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")  # don't try to use CUDA GPUs

import librosa
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import whisper

# Suppress specific library warnings that are expected in this setup.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="FP16 is not supported on CPU; using FP32 instead",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="librosa",
)

# Load ASR
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
class_map_path = yamnet.class_map_path().numpy()
if isinstance(class_map_path, bytes):
    class_map_path = class_map_path.decode("utf-8")

# Parse YAMNet class map CSV to get human-readable labels
with tf.io.gfile.GFile(class_map_path) as f:
    reader = csv.DictReader(f)
    yamnet_labels = [row["display_name"] for row in reader]


def estimate_emotion(activation):
    """Crude valence heuristic from a mean activation value.

    NOTE(review): this helper is currently dead code — `analyze_audio`
    hard-codes its "emotion" field instead of calling it. Kept as-is;
    wire it to the YAMNet embeddings (or remove it) deliberately.

    Args:
        activation: array-like with a ``.mean()`` method (e.g. a NumPy
            array of embedding values).

    Returns:
        One of "Happy / Excited", "Sad / Depressed", or "Neutral".
    """
    mean_val = activation.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    elif mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"


def speech_to_text(audio):
    """Transcribe an audio file with Whisper and return the text.

    Args:
        audio: path to an audio file (any format Whisper/ffmpeg accepts).

    Returns:
        The transcribed text string.
    """
    # Force FP32 on CPU to avoid FP16 warnings and ensure compatibility
    result = asr_model.transcribe(audio, fp16=False)
    return result["text"]


def detect_sound(audio):
    """Classify the dominant sound event in an audio file via YAMNet.

    Args:
        audio: path to an audio file readable by librosa.

    Returns:
        Tuple of (label, confidence): the human-readable class name for
        the highest mean score across frames, and that score as a float.
    """
    # Load mono waveform at 16 kHz as 1D float32 array, as expected by YAMNet
    waveform, sr = librosa.load(audio, sr=16000, mono=True)
    waveform = waveform.astype(np.float32)
    scores, embeddings, _ = yamnet(waveform)
    # Average per-frame scores over time, then take the top class.
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = int(np.argmax(mean_scores))
    # Look up human-readable class label from YAMNet's CSV class map
    label = yamnet_labels[top_idx] if 0 <= top_idx < len(yamnet_labels) else "Unknown"
    return label, float(mean_scores.max())


def analyze_audio(audio_file):
    """Run the full pipeline on one file and return a summary dict.

    Args:
        audio_file: path to the audio file to analyze.

    Returns:
        Dict with keys "transcription", "sound_event", "sound_confidence",
        "emotion", and "speakers". Emotion and speaker diarization are
        placeholders in this HF-free build.
    """
    summary = {}
    summary["transcription"] = speech_to_text(audio_file)
    event, confidence = detect_sound(audio_file)
    summary["sound_event"] = event
    # detect_sound already returns a plain float; kept as float() for safety
    # against future changes to its return type.
    summary["sound_confidence"] = float(confidence)
    summary["emotion"] = "Neutral (approx)"
    summary["speakers"] = "Not available in HF-free version"
    return summary