import av
import numpy as np
import threading
import time
from collections import deque
import io
import pydub


class AudioProcessor(av.AudioProcessorBase):
    """Voice-activity-detecting audio sink for a WebRTC stream.

    Buffers incoming audio frames once speech (RMS energy above
    ``threshold``) is detected, and flags the buffered utterance as ready
    to process after ``silence_duration`` seconds of continuous silence.

    NOTE(review): ``AudioProcessorBase`` is provided by streamlit-webrtc,
    not by PyAV — confirm the intended base class import.
    """

    def __init__(self):
        self.threshold = 2000            # RMS energy threshold for "speech"
        self.silence_duration = 1.5      # seconds of silence that ends an utterance
        self.frames = []                 # buffered av.AudioFrame objects for the current utterance
        self.is_speaking = False         # True while the current frame is above threshold
        self.silence_start_time = None   # wall-clock time when the current silence began
        self.speech_detected_in_session = False  # speech seen since last utterance was flushed
        self.ready_to_process = False    # True once a complete utterance is buffered
        self.lock = threading.Lock()     # guards all mutable state above
        self.sample_rate = 48000         # default for WebRTC; updated from incoming frames

    def set_thresholds(self, energy, silence_dur):
        """Update the VAD energy threshold and end-of-speech silence duration."""
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Process one incoming audio frame.

        Computes the frame's RMS energy, runs the voice-activity state
        machine, and buffers frames belonging to the current utterance.
        Returns None because the stream is SENDONLY (no audio is echoed
        back to the browser).
        """
        raw_samples = frame.to_ndarray()

        # Flatten all channels into one sample stream and cast to float
        # BEFORE squaring: squaring 16-bit PCM in its integer dtype
        # overflows and corrupts the RMS value. (The previous
        # np.mean(..., axis=1) also reduced along the samples axis —
        # PyAV returns (channels, samples) — leaving RMS computed over
        # per-channel means instead of actual samples.)
        samples = raw_samples.astype(np.float64).ravel()
        rms = float(np.sqrt(np.mean(samples ** 2))) if samples.size else 0.0

        with self.lock:
            self.sample_rate = frame.sample_rate

            # --- Voice Activity Detection state machine ---
            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                # Start the silence clock only once per silent stretch,
                # and only after speech has actually occurred.
                if self.speech_detected_in_session and self.silence_start_time is None:
                    self.silence_start_time = time.time()

            # Buffer frames only after speech has been detected this turn
            # (silent frames inside the utterance are kept too).
            if self.speech_detected_in_session:
                self.frames.append(frame)

            # End of utterance: silence has lasted long enough.
            if (self.speech_detected_in_session and
                    self.silence_start_time and
                    (time.time() - self.silence_start_time) > self.silence_duration):
                self.ready_to_process = True
                self.speech_detected_in_session = False  # reset for next turn
                self.silence_start_time = None

        return None  # SENDONLY mode: nothing is returned to the browser

    def has_audio_frame(self):
        """Return True when a complete utterance is buffered and ready."""
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        """Return the buffered utterance frames and clear the buffer.

        Returns None if no complete utterance is ready yet.
        """
        with self.lock:
            if not self.ready_to_process:
                return None
            # Hand back a copy and reset state for the next utterance.
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data


def save_audio_to_bytes(frames):
    """Convert a list of av.AudioFrame objects to a WAV byte buffer.

    Returns the WAV file contents as bytes, or None if ``frames`` is empty.
    """
    if not frames:
        return None

    # All frames come from a single WebRTC track, so sample width, rate and
    # channel count are constant — take them from the first frame and join
    # the raw PCM in one pass. (Building a pydub segment per frame and
    # concatenating with += is quadratic in the number of frames.)
    # NOTE(review): assumes packed/interleaved PCM (e.g. s16); planar
    # layouts would need interleaving first — confirm the track format.
    first = frames[0]
    pcm = b"".join(f.to_ndarray().tobytes() for f in frames)
    segment = pydub.AudioSegment(
        data=pcm,
        sample_width=first.format.bytes,
        frame_rate=first.sample_rate,
        channels=len(first.layout.channels),
    )

    output = io.BytesIO()
    segment.export(output, format="wav")
    return output.getvalue()