Spaces:

kamcio1989
/

anycoder-86ef485c

Sleeping

App Files Files Community

kamcio1989 committed on Nov 22, 2025

Commit

e6df1f2

verified ·

1 Parent(s): 99be848

Upload utils.py with huggingface_hub

Browse files

Files changed (1) hide show

utils.py +110 -0

utils.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import av
+import numpy as np
+import threading
+import time
+from collections import deque
+import io
+import pydub
class AudioProcessor(av.AudioProcessorBase):
    """Energy-based voice activity detector that buffers one utterance at a time.

    Every incoming frame is reduced to an RMS energy value and compared with
    ``threshold``. Once a frame exceeds the threshold, frames are buffered
    until ``silence_duration`` seconds pass without another loud frame; the
    buffer is then flagged as ready and can be collected with
    :meth:`get_audio_frames`.

    All mutable state is guarded by ``self.lock`` because ``recv`` and the
    polling methods are expected to run on different threads.

    NOTE(review): ``AudioProcessorBase`` is normally provided by
    ``streamlit_webrtc`` rather than PyAV -- confirm that
    ``av.AudioProcessorBase`` actually resolves in the deployed environment.
    """

    def __init__(self):
        super().__init__()
        # RMS level above which a frame counts as speech. The default is
        # on the scale of 16-bit integer samples; float sample formats
        # would need a much smaller value -- TODO confirm the input format.
        self.threshold = 2000
        # Seconds of continuous silence that mark the end of an utterance.
        self.silence_duration = 1.5
        # Frames collected for the current (or just finished) utterance.
        self.frames = []
        self.is_speaking = False
        # Monotonic timestamp of the first quiet frame after speech, or None.
        self.silence_start_time = None
        self.speech_detected_in_session = False
        self.ready_to_process = False
        self.lock = threading.Lock()
        # Overwritten with the real rate as soon as the first frame arrives.
        self.sample_rate = 48000

    def set_thresholds(self, energy, silence_dur):
        """Update the RMS threshold and the end-of-speech silence duration."""
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur

    @staticmethod
    def _frame_rms(frame) -> float:
        """Return the RMS energy of every sample in *frame* (0.0 if empty)."""
        # PyAV returns a 2-D array (one row per plane), so the RMS must be
        # taken over the whole array; averaging along an axis first would
        # collapse the signal to its DC offset. Converting to float64 before
        # squaring also avoids integer wrap-around on int16 samples.
        samples = np.asarray(frame.to_ndarray(), dtype=np.float64)
        if samples.size == 0:
            return 0.0
        return float(np.sqrt(np.mean(np.square(samples))))

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Run voice activity detection on *frame* and buffer it if needed.

        Returns the frame unchanged so the declared return type is honoured;
        in SENDONLY mode the returned audio is not played back in the browser.
        """
        rms = self._frame_rms(frame)
        # Monotonic clock: elapsed-time checks must not be affected by
        # wall-clock adjustments.
        now = time.monotonic()

        with self.lock:
            self.sample_rate = frame.sample_rate

            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                if (self.speech_detected_in_session
                        and self.silence_start_time is None):
                    self.silence_start_time = now

            if self.speech_detected_in_session:
                # Trailing silence is recorded too, up to the cut-off below.
                self.frames.append(frame)

                silence_expired = (
                    self.silence_start_time is not None
                    and (now - self.silence_start_time) > self.silence_duration
                )
                if silence_expired:
                    self.ready_to_process = True
                    self.speech_detected_in_session = False  # next turn
                    self.silence_start_time = None

        return frame

    def has_audio_frame(self):
        """Return True when a finished utterance is waiting to be collected."""
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        """Return the buffered frames of a finished utterance and reset.

        Returns ``None`` when no utterance is ready; otherwise a list of the
        collected frames. The internal buffer and the ready flag are cleared.
        """
        with self.lock:
            if not self.ready_to_process:
                return None
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data
def _pcm_params(frame):
    """Return (sample_width, frame_rate, channels) describing *frame*."""
    return (frame.format.bytes, frame.sample_rate, len(frame.layout.channels))


def _interleaved_pcm(frame):
    """Return the samples of *frame* as interleaved raw PCM bytes."""
    samples = frame.to_ndarray()
    if frame.format.is_planar:
        # Planar layout is (channels, samples); pydub expects the channels
        # interleaved sample by sample, i.e. the transposed order.
        samples = samples.T
    return samples.tobytes()


def save_audio_to_bytes(frames):
    """Convert a list of av.AudioFrames to a WAV byte buffer.

    Args:
        frames: Sequence of ``av.AudioFrame`` objects, in playback order.

    Returns:
        The encoded WAV file as ``bytes``, or ``None`` when *frames* is
        empty or ``None``.

    NOTE(review): samples are handed to pydub as integer PCM of width
    ``frame.format.bytes``; float sample formats are presumably not
    represented correctly this way -- confirm the incoming format is s16.
    """
    if not frames:
        return None

    from itertools import groupby

    audio_segment = pydub.AudioSegment.empty()
    # Join the raw bytes of each run of identically formatted frames and
    # build one segment per run; appending a segment per frame re-copies the
    # accumulated audio on every iteration.
    for (sample_width, frame_rate, channels), run in groupby(frames, key=_pcm_params):
        audio_segment += pydub.AudioSegment(
            data=b"".join(_interleaved_pcm(frame) for frame in run),
            sample_width=sample_width,
            frame_rate=frame_rate,
            channels=channels,
        )

    output = io.BytesIO()
    audio_segment.export(output, format="wav")
    return output.getvalue()