Spaces:

Rivalcoder
/

Lite

Sleeping

File size: 2,761 Bytes

4b25dd0
 
 
5a416b3
 
 
ba773e9
5a416b3
ba773e9
5a416b3
ba773e9
 
 
4b25dd0
 
 
 
ba773e9
4b25dd0
 
 
 
 
 
 
 
 
 
 
5a416b3
 
 
 
 
ba773e9
 
 
 
 
 
 
 
5a416b3
 
 
 
 
 
 
 
 
 
 
 
4b25dd0
 
5a416b3
 
 
 
98a399f
 
 
5a416b3
 
4b25dd0
ba773e9
 
4b25dd0
5a416b3

import csv
import os
import warnings

# Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs.
# NOTE: These env vars are read while TensorFlow (and the CUDA runtime) start up,
# so they are set here, BEFORE the third-party imports below. Setting them after
# `import tensorflow` is too late for anything read at import time.
# `setdefault` keeps any value the caller already exported in the environment.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # hide INFO/WARNING logs
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")  # disable oneDNN custom ops
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")  # don't try to use CUDA GPUs

# Third-party imports deliberately follow the environment setup above, hence the
# E402 ("module level import not at top of file") suppressions.
import whisper  # noqa: E402
import librosa  # noqa: E402
import numpy as np  # noqa: E402
import tensorflow as tf  # noqa: E402
import tensorflow_hub as hub  # noqa: E402

# Suppress specific library warnings that are expected in this setup.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="FP16 is not supported on CPU; using FP32 instead",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="librosa",
)

# Load the Whisper "small" speech-recognition model once, at import time, so
# every call to speech_to_text() reuses the same instance.
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification (also once, at import time).
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
# The model exposes the location of its class-map CSV; .numpy() converts the
# returned value to a plain Python/NumPy object.
class_map_path = yamnet.class_map_path().numpy()
# Normalise the path to `str` when it comes back as raw bytes.
if isinstance(class_map_path, bytes):
    class_map_path = class_map_path.decode("utf-8")

# Parse YAMNet class map CSV to get human-readable labels.
# The list keeps CSV row order; detect_sound() indexes it with the argmax of the
# model scores, so row order is presumed to match the model's class indices.
with tf.io.gfile.GFile(class_map_path) as f:
    reader = csv.DictReader(f)
    yamnet_labels = [row["display_name"] for row in reader]

# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    """Map the mean of an activation array to a coarse emotion label.

    Args:
        activation: Any object exposing ``.mean()`` (e.g. a NumPy array).

    Returns:
        ``"Happy / Excited"`` when the mean is above 0.3,
        ``"Sad / Depressed"`` when it is below -0.3, otherwise ``"Neutral"``.
        Means of exactly 0.3 or -0.3 count as neutral.
    """
    level = activation.mean()
    if level > 0.3:
        return "Happy / Excited"
    # Not clearly positive: either clearly negative or somewhere in between.
    return "Sad / Depressed" if level < -0.3 else "Neutral"


def speech_to_text(audio):
    """Transcribe ``audio`` with the module-level Whisper model.

    Args:
        audio: Input forwarded unchanged to ``asr_model.transcribe``
            (analyze_audio passes its ``audio_file`` argument here).

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    # fp16=False forces FP32, which avoids the FP16 warning on CPU and keeps
    # the call compatible with CPU-only hosts.
    return asr_model.transcribe(audio, fp16=False)["text"]


def detect_sound(audio):
    """Return the top YAMNet sound class for ``audio`` and its mean score.

    Args:
        audio: Input forwarded to ``librosa.load``.

    Returns:
        A ``(label, score)`` tuple: the display name of the class with the
        highest score averaged over axis 0 of the model output (or
        ``"Unknown"`` if the index falls outside the label list), and that
        highest averaged score as a Python ``float``.
    """
    # YAMNet expects a mono, 16 kHz, 1-D float32 waveform.
    samples, _ = librosa.load(audio, sr=16000, mono=True)
    samples = samples.astype(np.float32)

    # Only the scores output is used; the other two outputs are discarded.
    class_scores, _, _ = yamnet(samples)
    averaged = np.mean(class_scores.numpy(), axis=0)
    best = int(np.argmax(averaged))

    # Look up the human-readable class name from YAMNet's CSV class map.
    if 0 <= best < len(yamnet_labels):
        name = yamnet_labels[best]
    else:
        name = "Unknown"
    return name, float(averaged.max())


def analyze_audio(audio_file):
    """Run transcription and sound-event detection on one audio file.

    Args:
        audio_file: Input passed to both ``speech_to_text`` and
            ``detect_sound``.

    Returns:
        A dict with the keys ``"transcription"``, ``"sound_event"``,
        ``"sound_confidence"``, ``"emotion"`` and ``"speakers"``. The last
        two are fixed placeholder strings; no emotion or speaker analysis
        is performed here.
    """
    # Transcription runs first, then sound classification, on the same input.
    transcript = speech_to_text(audio_file)
    label, score = detect_sound(audio_file)

    return {
        "transcription": transcript,
        "sound_event": label,
        "sound_confidence": float(score),
        "emotion": "Neutral (approx)",
        "speakers": "Not available in HF-free version",
    }