import csv
import os
import warnings

import whisper
import librosa
import numpy as np
import tensorflow_hub as hub

# Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # hide INFO/WARNING logs
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")  # disable oneDNN custom ops
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")  # don't try to use CUDA GPUs

# Suppress specific library warnings that are expected in this setup
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="FP16 is not supported on CPU; using FP32 instead",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="librosa",
)

# Load ASR
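# ("small" balances speed and accuracy on CPU; "tiny" or "base" are faster
# drop-in alternatives.)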
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
# class_map_path() points at a CSV of class names (it is not the names
# themselves), so parse it once into an indexable list of labels.
class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
with open(class_map_path, newline="") as f:
    class_names = [row["display_name"] for row in csv.DictReader(f)]

# Crude emotion heuristic: thresholds the mean of a YAMNet embedding.
# A placeholder, not a trained emotion classifier.
def estimate_emotion(activation):
    mean_val = activation.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    elif mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"


def speech_to_text(audio):
    # Force FP32 on CPU to avoid FP16 warnings and ensure compatibility
    result = asr_model.transcribe(audio, fp16=False)
    return result["text"]


def detect_sound(audio):
    # Load mono waveform at 16 kHz as 1D float32 array, as expected by YAMNet
    waveform, _ = librosa.load(audio, sr=16000, mono=True)
    waveform = waveform.astype(np.float32)
    scores, embeddings, _ = yamnet(waveform)
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = int(np.argmax(mean_scores))
    label = class_names[top_idx]
    # Average the per-frame embeddings so callers can reuse them, e.g. for
    # the emotion heuristic above.
    mean_embedding = embeddings.numpy().mean(axis=0)
    return label, float(mean_scores[top_idx]), mean_embedding


def analyze_audio(audio_file):
    summary = {}

    summary["transcription"] = speech_to_text(audio_file)

    event, confidence, embedding = detect_sound(audio_file)
    summary["sound_event"] = event
    summary["sound_confidence"] = confidence

    summary["emotion"] = estimate_emotion(embedding)

    summary["speakers"] = "Not available in HF-free version"

    return summary
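

# Minimal usage sketch: run the full pipeline on a local file. "sample.wav"
# is a placeholder path; any format librosa/ffmpeg can decode should work.
if __name__ == "__main__":
    report = analyze_audio("sample.wav")
    for key, value in report.items():
        print(f"{key}: {value}")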