Rivalcoder committed on
Commit
4b25dd0
·
1 Parent(s): 98a399f
Files changed (1) hide show
  1. alm_pipeline.py +31 -3
alm_pipeline.py CHANGED
@@ -1,8 +1,28 @@
 
 
 
1
  import whisper
2
  import librosa
3
  import numpy as np
4
  import tensorflow_hub as hub
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # Load ASR
7
  asr_model = whisper.load_model("small")
8
 
@@ -21,7 +41,8 @@ def estimate_emotion(activation):
21
 
22
 
23
  def speech_to_text(audio):
24
- result = asr_model.transcribe(audio)
 
25
  return result["text"]
26
 
27
 
@@ -31,8 +52,15 @@ def detect_sound(audio):
31
  waveform = waveform.astype(np.float32)
32
  scores, embeddings, _ = yamnet(waveform)
33
  mean_scores = np.mean(scores.numpy(), axis=0)
34
- top_idx = np.argmax(mean_scores)
35
- return class_map[top_idx].decode("utf-8"), mean_scores.max()
 
 
 
 
 
 
 
36
 
37
 
38
  def analyze_audio(audio_file):
 
1
+ import os
2
+ import warnings
3
+
4
  import whisper
5
  import librosa
6
  import numpy as np
7
  import tensorflow_hub as hub
8
 
9
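  # Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs (NOTE(review): these are set after the tensorflow_hub import above; confirm they still take effect, or move them before that import)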
10
+ os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2") # hide INFO/WARNING logs
11
+ os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0") # disable oneDNN custom ops
12
+ os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1") # don't try to use CUDA GPUs
13
+
14
+ # Suppress specific library warnings that are expected in this setup
15
+ warnings.filterwarnings(
16
+ "ignore",
17
+ category=UserWarning,
18
+ message="FP16 is not supported on CPU; using FP32 instead",
19
+ )
20
+ warnings.filterwarnings(
21
+ "ignore",
22
+ category=FutureWarning,
23
+ module="librosa",
24
+ )
25
+
26
  # Load ASR
27
  asr_model = whisper.load_model("small")
28
 
 
41
 
42
 
43
  def speech_to_text(audio):
44
+ # Force FP32 on CPU to avoid FP16 warnings and ensure compatibility
45
+ result = asr_model.transcribe(audio, fp16=False)
46
  return result["text"]
47
 
48
 
 
52
  waveform = waveform.astype(np.float32)
53
  scores, embeddings, _ = yamnet(waveform)
54
  mean_scores = np.mean(scores.numpy(), axis=0)
55
+ top_idx = int(np.argmax(mean_scores))
56
+ # class_map may contain integers or byte strings depending on TF Hub version;
57
+ # convert robustly to a human-readable label.
58
+ label = class_map[top_idx]
59
+ if isinstance(label, bytes):
60
+ label = label.decode("utf-8")
61
+ else:
62
+ label = str(label)
63
+ return label, float(mean_scores.max())
64
 
65
 
66
  def analyze_audio(audio_file):