import io
import threading
import time

import av
import numpy as np
import pydub
from streamlit_webrtc import AudioProcessorBase


class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.threshold = 2000        # Energy (RMS) threshold for speech detection
        self.silence_duration = 1.5  # Seconds of silence that mark end of speech
        self.frames = []
        self.is_speaking = False
        self.silence_start_time = None
        self.speech_detected_in_session = False
        self.ready_to_process = False
        self.lock = threading.Lock()
        self.sample_rate = 48000     # WebRTC default; updated per frame in recv()

    def set_thresholds(self, energy, silence_dur):
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur
    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Measure frame energy (RMS). to_ndarray() may return a 2-D array
        # (planar or packed layouts), so flatten across channels, and cast to
        # float before squaring to avoid int16 overflow.
        raw_samples = frame.to_ndarray().astype(np.float32).flatten()
        rms = np.sqrt(np.mean(raw_samples ** 2))

        with self.lock:
            self.sample_rate = frame.sample_rate

            # Voice Activity Detection: speaking while RMS exceeds the threshold
            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                if self.speech_detected_in_session and self.silence_start_time is None:
                    self.silence_start_time = time.time()

            # Buffer frames once speech has been detected in this session
            if self.speech_detected_in_session:
                self.frames.append(frame)

            # End of utterance: silence has lasted longer than silence_duration
            if (self.speech_detected_in_session and
                    self.silence_start_time and
                    (time.time() - self.silence_start_time) > self.silence_duration):
                self.ready_to_process = True
                self.speech_detected_in_session = False  # Reset for the next turn
                self.silence_start_time = None

        return None  # No audio is sent back to the browser (SENDONLY mode)
    def has_audio_frame(self):
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        with self.lock:
            if not self.ready_to_process:
                return None
            # Return a copy of the buffered frames and clear the buffer
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data
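
# --- Usage sketch (not part of the original file) ---
# A minimal sketch of how AudioProcessor could be attached to
# streamlit_webrtc's webrtc_streamer, matching the SENDONLY mode the class
# assumes. The key name and polling pattern below are assumptions for
# illustration; adjust to the installed streamlit_webrtc version.
#
#   from streamlit_webrtc import webrtc_streamer, WebRtcMode
#
#   ctx = webrtc_streamer(
#       key="speech-capture",                       # hypothetical key
#       mode=WebRtcMode.SENDONLY,
#       audio_processor_factory=AudioProcessor,
#       media_stream_constraints={"audio": True, "video": False},
#   )
#   if ctx.audio_processor and ctx.audio_processor.has_audio_frame():
#       wav_bytes = save_audio_to_bytes(ctx.audio_processor.get_audio_frames())
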
def save_audio_to_bytes(frames):
    """Converts a list of av.AudioFrames to a WAV byte buffer."""
    if not frames:
        return None

    # Use pydub to concatenate the frames and encode the result as WAV
    output = io.BytesIO()
    audio_segment = pydub.AudioSegment.empty()
    for frame in frames:
        # Convert the av frame to a pydub segment. The dtype returned by
        # frame.to_ndarray() depends on the frame's sample format; WebRTC
        # audio is usually packed 16-bit PCM (s16), which pydub expects as
        # interleaved raw bytes.
        data = frame.to_ndarray().tobytes()
        segment = pydub.AudioSegment(
            data=data,
            sample_width=frame.format.bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        audio_segment += segment

    # Export to WAV
    audio_segment.export(output, format="wav")
    return output.getvalue()
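

if __name__ == "__main__":
    # Smoke test (an addition, not part of the original file): build a short
    # synthetic tone as av.AudioFrames and round-trip it through
    # save_audio_to_bytes, then check for a RIFF/WAV header. The 20 ms chunk
    # size mirrors what WebRTC typically delivers; the tone parameters are
    # arbitrary.
    sr = 48000
    t = np.arange(sr) / sr
    tone = (0.3 * np.iinfo(np.int16).max * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

    test_frames = []
    for start in range(0, len(tone), 960):  # 960 samples = 20 ms at 48 kHz
        chunk = tone[start:start + 960].reshape(1, -1)  # packed s16 expects shape (1, n)
        frame = av.AudioFrame.from_ndarray(chunk, format="s16", layout="mono")
        frame.sample_rate = sr
        test_frames.append(frame)

    wav_bytes = save_audio_to_bytes(test_frames)
    assert wav_bytes is not None and wav_bytes[:4] == b"RIFF"
    print(f"WAV output: {len(wav_bytes)} bytes")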