import io
import threading
import time

import av
import numpy as np
import pydub
from streamlit_webrtc import AudioProcessorBase


class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.threshold = 2000  # RMS energy threshold
        self.silence_duration = 1.5  # Seconds of silence that mark the end of speech
        self.frames = []
        self.is_speaking = False
        self.silence_start_time = None
        self.speech_detected_in_session = False
        self.ready_to_process = False
        self.lock = threading.Lock()
        self.sample_rate = 48000  # WebRTC's usual default

    def set_thresholds(self, energy, silence_dur):
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur
| def recv(self, frame: av.AudioFrame) -> av.AudioFrame: | |
| # Convert to numpy array to analyze energy | |
| raw_samples = frame.to_ndarray() | |
| # Calculate energy (RMS) | |
| # Handle stereo/mono | |
| if raw_samples.ndim > 1: | |
| raw_samples = np.mean(raw_samples, axis=1) | |
| rms = np.sqrt(np.mean(raw_samples**2)) | |
| with self.lock: | |
| self.sample_rate = frame.sample_rate | |
| # Voice Activity Detection Logic | |
| if rms > self.threshold: | |
| self.is_speaking = True | |
| self.speech_detected_in_session = True | |
| self.silence_start_time = None | |
| else: | |
| self.is_speaking = False | |
| if self.speech_detected_in_session and self.silence_start_time is None: | |
| self.silence_start_time = time.time() | |
| # Collect frames if we have detected speech in this session | |
| if self.speech_detected_in_session: | |
| self.frames.append(frame) | |
| # Check if we should stop recording (silence duration exceeded) | |
| if (self.speech_detected_in_session and | |
| self.silence_start_time and | |
| (time.time() - self.silence_start_time) > self.silence_duration): | |
| self.ready_to_process = True | |
| self.speech_detected_in_session = False # Reset for next turn | |
| self.silence_start_time = None | |
| return None # We don't need to return audio to the browser (SENDONLY mode) | |
    def has_audio_frame(self):
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        with self.lock:
            if not self.ready_to_process:
                return None
            # Return a copy of the frames and clear the buffer
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data
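

# Hypothetical wiring sketch (not in the original file): a minimal example of
# how this processor would typically plug into streamlit-webrtc.
# webrtc_streamer, WebRtcMode.SENDONLY, audio_processor_factory, and
# ctx.audio_processor are streamlit-webrtc's public API; the widget key and
# slider ranges below are illustrative assumptions.
def _demo_streamer():
    import streamlit as st
    from streamlit_webrtc import WebRtcMode, webrtc_streamer

    ctx = webrtc_streamer(
        key="speech-vad",  # hypothetical widget key
        mode=WebRtcMode.SENDONLY,  # browser sends audio; nothing is played back
        audio_processor_factory=AudioProcessor,
        media_stream_constraints={"video": False, "audio": True},
    )
    if ctx.audio_processor:
        # Expose the VAD tuning knobs in the UI and push them to the processor.
        energy = st.slider("Energy threshold (RMS)", 500, 10000, 2000)
        silence = st.slider("Silence duration (s)", 0.5, 3.0, 1.5)
        ctx.audio_processor.set_thresholds(energy, silence)
    return ctx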


def save_audio_to_bytes(frames):
    """Converts a list of av.AudioFrame objects to a WAV byte buffer."""
    if not frames:
        return None

    # Use pydub for easy conversion: build one AudioSegment per frame,
    # concatenate them, then export the result as WAV.
    output = io.BytesIO()
    audio_segment = pydub.AudioSegment.empty()
    for frame in frames:
        # Extract the raw PCM bytes. WebRTC frames are usually packed
        # (interleaved) 16-bit PCM, which is what pydub expects; the dtype of
        # to_ndarray() depends on frame.format, not on the channel layout.
        data = frame.to_ndarray().tobytes()
        segment = pydub.AudioSegment(
            data=data,
            sample_width=frame.format.bytes,  # bytes per sample (2 for s16)
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        audio_segment += segment

    # Export to WAV
    audio_segment.export(output, format="wav")
    return output.getvalue()
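

# Hypothetical polling sketch (not in the original file): drain the processor
# once a silence gap has marked the end of an utterance and hand the WAV bytes
# to st.audio, Streamlit's built-in playback widget. The ctx argument is
# assumed to come from the _demo_streamer sketch above; note that in a real
# app this only runs on a Streamlit rerun, so some periodic rerun trigger is
# needed to poll continuously.
def _demo_poll(ctx):
    import streamlit as st

    if ctx.audio_processor and ctx.audio_processor.has_audio_frame():
        frames = ctx.audio_processor.get_audio_frames()
        wav_bytes = save_audio_to_bytes(frames)
        if wav_bytes:
            # Play back the captured utterance; a real app might instead send
            # wav_bytes to a speech-to-text service.
            st.audio(wav_bytes, format="audio/wav")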