Spaces:
Sleeping
Sleeping
File size: 2,761 Bytes
4b25dd0 5a416b3 ba773e9 5a416b3 ba773e9 5a416b3 ba773e9 4b25dd0 ba773e9 4b25dd0 5a416b3 ba773e9 5a416b3 4b25dd0 5a416b3 98a399f 5a416b3 4b25dd0 ba773e9 4b25dd0 5a416b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os
import warnings
import whisper
import librosa
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import csv
# --- Environment configuration (must run before TensorFlow fully initializes) ---
# Quieting TF logs and pinning execution to CPU reduces noisy startup output
# and avoids failed GPU probes on CPU-only hosts. setdefault() respects any
# values the user already exported.
for _var, _value in (
    ("TF_CPP_MIN_LOG_LEVEL", "2"),   # hide INFO/WARNING logs
    ("TF_ENABLE_ONEDNN_OPTS", "0"),  # disable oneDNN custom ops
    ("CUDA_VISIBLE_DEVICES", "-1"),  # never attempt CUDA GPUs
):
    os.environ.setdefault(_var, _value)

# Silence warnings that are expected and harmless in this CPU-only setup.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="FP16 is not supported on CPU; using FP32 instead",
)
warnings.filterwarnings("ignore", category=FutureWarning, module="librosa")
# --- Model loading (runs once at import time) ---
# Whisper "small" handles speech-to-text.
asr_model = whisper.load_model("small")

# YAMNet provides general-purpose audio event classification.
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

# The class-map path may come back as bytes depending on the TF version.
_raw_class_map_path = yamnet.class_map_path().numpy()
class_map_path = (
    _raw_class_map_path.decode("utf-8")
    if isinstance(_raw_class_map_path, bytes)
    else _raw_class_map_path
)

# Pull human-readable class labels out of YAMNet's bundled CSV class map.
with tf.io.gfile.GFile(class_map_path) as _class_map_file:
    yamnet_labels = [row["display_name"] for row in csv.DictReader(_class_map_file)]
# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    """Map the mean activation of a YAMNet embedding to a coarse emotion label.

    Heuristic thresholds: strongly positive mean -> excited, strongly
    negative mean -> sad, anything in between -> neutral.
    """
    avg = activation.mean()
    if avg < -0.3:
        return "Sad / Depressed"
    if avg > 0.3:
        return "Happy / Excited"
    return "Neutral"
def speech_to_text(audio):
    """Transcribe an audio file with Whisper and return the recognized text."""
    # fp16=False forces FP32 inference, sidestepping the FP16-on-CPU warning.
    return asr_model.transcribe(audio, fp16=False)["text"]
def detect_sound(audio):
    """Classify the dominant sound event in an audio file using YAMNet.

    Returns a ``(label, confidence)`` tuple, where confidence is the
    highest frame-averaged class score.
    """
    # YAMNet expects a 1-D float32 waveform sampled at 16 kHz.
    samples, _rate = librosa.load(audio, sr=16000, mono=True)
    scores, _embeddings, _ = yamnet(samples.astype(np.float32))
    # Average the per-frame scores over time, then pick the top class.
    avg_scores = scores.numpy().mean(axis=0)
    best = int(np.argmax(avg_scores))
    # Guard against an index that falls outside the parsed class map.
    if 0 <= best < len(yamnet_labels):
        label = yamnet_labels[best]
    else:
        label = "Unknown"
    return label, float(avg_scores.max())
def analyze_audio(audio_file):
    """Run transcription and sound classification on a single audio file.

    Returns a dict with keys: ``transcription``, ``sound_event``,
    ``sound_confidence``, ``emotion``, ``speakers``.
    """
    # Transcribe first, then classify — same file feeds both models.
    transcription = speech_to_text(audio_file)
    event, confidence = detect_sound(audio_file)
    return {
        "transcription": transcription,
        "sound_event": event,
        "sound_confidence": float(confidence),
        # Emotion estimation is a rough placeholder in this build.
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }
|