import io
import threading
import time

import av
import numpy as np
import pydub
from streamlit_webrtc import AudioProcessorBase


class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.threshold = 2000        # Energy (RMS) threshold for speech detection
        self.silence_duration = 1.5  # Seconds of silence that mark end of speech
        self.frames = []
        self.is_speaking = False
        self.silence_start_time = None
        self.speech_detected_in_session = False
        self.ready_to_process = False
        self.lock = threading.Lock()
        self.sample_rate = 48000     # WebRTC default; updated per frame in recv()

    def set_thresholds(self, energy, silence_dur):
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur
    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Measure frame energy (RMS). to_ndarray() may return a 2-D array
        # (planar or packed layouts), so flatten across channels, and cast to
        # float before squaring to avoid int16 overflow.
        raw_samples = frame.to_ndarray().astype(np.float32).flatten()
        rms = np.sqrt(np.mean(raw_samples ** 2))

        with self.lock:
            self.sample_rate = frame.sample_rate

            # Voice Activity Detection: speaking while RMS exceeds the threshold
            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                if self.speech_detected_in_session and self.silence_start_time is None:
                    self.silence_start_time = time.time()

            # Buffer frames once speech has been detected in this session
            if self.speech_detected_in_session:
                self.frames.append(frame)

            # End of utterance: silence has lasted longer than silence_duration
            if (self.speech_detected_in_session and
                    self.silence_start_time and
                    (time.time() - self.silence_start_time) > self.silence_duration):
                self.ready_to_process = True
                self.speech_detected_in_session = False  # Reset for the next turn
                self.silence_start_time = None

        return None  # No audio is sent back to the browser (SENDONLY mode)
    def has_audio_frame(self):
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        with self.lock:
            if not self.ready_to_process:
                return None
            # Return a copy of the buffered frames and clear the buffer
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data
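
# --- Usage sketch (not part of the original file) ---
# A minimal sketch of how AudioProcessor could be attached to
# streamlit_webrtc's webrtc_streamer, matching the SENDONLY mode the class
# assumes. The key name and polling pattern below are assumptions for
# illustration; adjust to the installed streamlit_webrtc version.
#
#   from streamlit_webrtc import webrtc_streamer, WebRtcMode
#
#   ctx = webrtc_streamer(
#       key="speech-capture",                       # hypothetical key
#       mode=WebRtcMode.SENDONLY,
#       audio_processor_factory=AudioProcessor,
#       media_stream_constraints={"audio": True, "video": False},
#   )
#   if ctx.audio_processor and ctx.audio_processor.has_audio_frame():
#       wav_bytes = save_audio_to_bytes(ctx.audio_processor.get_audio_frames())
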
def save_audio_to_bytes(frames):
    """Converts a list of av.AudioFrames to a WAV byte buffer."""
    if not frames:
        return None

    # Use pydub to concatenate the frames and encode the result as WAV
    output = io.BytesIO()
    audio_segment = pydub.AudioSegment.empty()
    for frame in frames:
        # Convert the av frame to a pydub segment. The dtype returned by
        # frame.to_ndarray() depends on the frame's sample format; WebRTC
        # audio is usually packed 16-bit PCM (s16), which pydub expects as
        # interleaved raw bytes.
        data = frame.to_ndarray().tobytes()
        segment = pydub.AudioSegment(
            data=data,
            sample_width=frame.format.bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        audio_segment += segment

    # Export to WAV
    audio_segment.export(output, format="wav")
    return output.getvalue()
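

if __name__ == "__main__":
    # Smoke test (an addition, not part of the original file): build a short
    # synthetic tone as av.AudioFrames and round-trip it through
    # save_audio_to_bytes, then check for a RIFF/WAV header. The 20 ms chunk
    # size mirrors what WebRTC typically delivers; the tone parameters are
    # arbitrary.
    sr = 48000
    t = np.arange(sr) / sr
    tone = (0.3 * np.iinfo(np.int16).max * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

    test_frames = []
    for start in range(0, len(tone), 960):  # 960 samples = 20 ms at 48 kHz
        chunk = tone[start:start + 960].reshape(1, -1)  # packed s16 expects shape (1, n)
        frame = av.AudioFrame.from_ndarray(chunk, format="s16", layout="mono")
        frame.sample_rate = sr
        test_frames.append(frame)

    wav_bytes = save_audio_to_bytes(test_frames)
    assert wav_bytes is not None and wav_bytes[:4] == b"RIFF"
    print(f"WAV output: {len(wav_bytes)} bytes")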