kamcio1989 committed on
Commit
e6df1f2
·
verified ·
1 Parent(s): 99be848

Upload utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. utils.py +110 -0
utils.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import av
2
+ import numpy as np
3
+ import threading
4
+ import time
5
+ from collections import deque
6
+ import io
7
+ import pydub
8
+
9
class AudioProcessor(av.AudioProcessorBase):
    """Energy-based voice-activity detector that buffers one spoken utterance.

    ``recv`` is called with one audio frame at a time. As soon as a frame's
    RMS energy exceeds ``threshold`` the frame (and every following frame) is
    buffered. Once ``silence_duration`` seconds pass without another loud
    frame, the buffer is flagged as ready and can be collected with
    ``get_audio_frames``.

    All mutable state is read and written while holding ``self.lock``.
    """

    # NOTE(review): ``AudioProcessorBase`` is normally exported by
    # ``streamlit_webrtc``, not by PyAV -- confirm that ``av.AudioProcessorBase``
    # actually resolves in the deployment environment.

    def __init__(self):
        self.threshold = 2000  # RMS energy above which a frame counts as speech
        self.silence_duration = 1.5  # seconds of silence that end an utterance

        self.frames = []  # frames buffered for the current utterance
        self.is_speaking = False  # True while the latest frame was loud
        self.silence_start_time = None  # time.monotonic() when silence began
        self.speech_detected_in_session = False  # an utterance is in progress
        self.ready_to_process = False  # a finished utterance awaits pickup
        self.lock = threading.Lock()

        self.sample_rate = 48000  # overwritten by each frame seen in recv()

    def set_thresholds(self, energy, silence_dur):
        """Update the RMS energy threshold and the end-of-speech silence length.

        Args:
            energy: New value for ``threshold``.
            silence_dur: New value for ``silence_duration``, in seconds.
        """
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Run voice-activity detection on *frame* and buffer it if needed.

        Args:
            frame: The incoming audio frame.

        Returns:
            Always ``None``; no audio is sent back (SENDONLY mode).
        """
        # to_ndarray() yields a 2-D (planes, samples) array, so averaging along
        # axis 1 would collapse the time axis and leave only the DC offset.
        # Measure RMS over every sample instead; the float64 cast keeps the
        # squares of int16 samples from wrapping around.
        # NOTE(review): the default threshold presumes int16-scaled samples --
        # float formats in [-1, 1] would never exceed it; confirm frame format.
        samples = frame.to_ndarray().astype(np.float64)
        rms = float(np.sqrt(np.mean(np.square(samples)))) if samples.size else 0.0

        # Monotonic clock: elapsed silence must not be affected by wall-clock
        # adjustments.
        now = time.monotonic()

        with self.lock:
            self.sample_rate = frame.sample_rate

            # Voice activity detection
            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                if self.speech_detected_in_session and self.silence_start_time is None:
                    self.silence_start_time = now

            # Keep every frame from the first loud one until the utterance ends,
            # trailing silence included.
            if self.speech_detected_in_session:
                self.frames.append(frame)

            # End the utterance once the silence has lasted long enough.
            if (
                self.speech_detected_in_session
                and self.silence_start_time is not None
                and (now - self.silence_start_time) > self.silence_duration
            ):
                self.ready_to_process = True
                self.speech_detected_in_session = False  # reset for next turn
                self.silence_start_time = None

        return None  # nothing is returned to the browser (SENDONLY mode)

    def has_audio_frame(self):
        """Return True when a finished utterance is waiting to be collected."""
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        """Hand over the buffered utterance and reset the buffer.

        Returns:
            The list of buffered frames, or ``None`` when no finished
            utterance is available.
        """
        with self.lock:
            if not self.ready_to_process:
                return None

            # Give the caller the buffer and start a fresh one.
            data, self.frames = self.frames, []
            self.ready_to_process = False
            return data
81
+
82
def _pcm_segment(chunks, params):
    """Build one pydub segment from interleaved PCM chunks sharing *params*."""
    sample_width, frame_rate, channels = params
    return pydub.AudioSegment(
        data=b"".join(chunks),
        sample_width=sample_width,
        frame_rate=frame_rate,
        channels=channels,
    )


def save_audio_to_bytes(frames):
    """Convert a list of av.AudioFrames to a WAV byte buffer.

    Args:
        frames: Sequence of audio frames exposing ``to_ndarray()``,
            ``sample_rate`` and ``layout.channels``.

    Returns:
        The WAV file contents as ``bytes``, or ``None`` when *frames* is
        empty or ``None``.
    """
    if not frames:
        return None

    audio_segment = pydub.AudioSegment.empty()
    chunks = []  # interleaved PCM byte chunks that all share `params`
    params = None  # (sample_width, frame_rate, channels) of `chunks`

    for frame in frames:
        samples = frame.to_ndarray()

        # pydub only understands integer PCM: a float frame passed with
        # sample_width=4 would be misread as int32, so rescale it to int16.
        # NOTE(review): assumes float samples are normalised to [-1, 1].
        if np.issubdtype(samples.dtype, np.floating):
            samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

        frame_params = (
            samples.dtype.itemsize,
            frame.sample_rate,
            len(frame.layout.channels),
        )

        # Frames are joined at byte level only while their format is the same;
        # on a format change the run so far is flushed and pydub reconciles
        # the differing segments when they are concatenated.
        if chunks and frame_params != params:
            audio_segment += _pcm_segment(chunks, params)
            chunks = []
        params = frame_params

        # The array is (planes, samples). Serialising the transpose yields
        # interleaved samples for planar layouts and leaves packed layouts
        # (a single plane) unchanged.
        chunks.append(samples.T.tobytes())

    if chunks:
        audio_segment += _pcm_segment(chunks, params)

    # Export to WAV
    output = io.BytesIO()
    audio_segment.export(output, format="wav")
    return output.getvalue()