import av
import numpy as np
import threading
import time
import io
import pydub
from streamlit_webrtc import AudioProcessorBase

class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.threshold = 2000  # Energy threshold
        self.silence_duration = 1.5  # Seconds of silence to mark end of speech
        
        self.frames = []
        self.is_speaking = False
        self.silence_start_time = None
        self.speech_detected_in_session = False
        self.ready_to_process = False
        self.lock = threading.Lock()
        
        self.sample_rate = 48000 # Default for WebRTC usually
        
    def set_thresholds(self, energy, silence_dur):
        with self.lock:
            self.threshold = energy
            self.silence_duration = silence_dur

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Convert to a numpy array to analyze energy.
        # WebRTC audio typically arrives as packed 16-bit PCM with shape
        # (1, samples * channels); flatten across channels and cast to float
        # so squaring the samples cannot overflow int16.
        raw_samples = frame.to_ndarray().astype(np.float32).flatten()

        # Calculate energy (RMS)
        rms = np.sqrt(np.mean(raw_samples ** 2))
        
        with self.lock:
            self.sample_rate = frame.sample_rate
            
            # Voice Activity Detection Logic
            if rms > self.threshold:
                self.is_speaking = True
                self.speech_detected_in_session = True
                self.silence_start_time = None
            else:
                self.is_speaking = False
                if self.speech_detected_in_session and self.silence_start_time is None:
                    self.silence_start_time = time.time()

            # Collect frames if we have detected speech in this session
            if self.speech_detected_in_session:
                self.frames.append(frame)

            # Check if we should stop recording (silence duration exceeded)
            if (self.speech_detected_in_session and 
                self.silence_start_time and 
                (time.time() - self.silence_start_time) > self.silence_duration):
                
                self.ready_to_process = True
                self.speech_detected_in_session = False # Reset for next turn
                self.silence_start_time = None

        # Return the frame unchanged; in SENDONLY mode nothing is played back
        # to the browser, but returning a valid frame keeps the track consumer happy.
        return frame

    def has_audio_frame(self):
        with self.lock:
            return self.ready_to_process

    def get_audio_frames(self):
        with self.lock:
            if not self.ready_to_process:
                return None
            
            # Return copy of frames and clear buffer
            data = self.frames[:]
            self.frames = []
            self.ready_to_process = False
            return data

def save_audio_to_bytes(frames):
    """Converts a list of av.AudioFrames to a WAV byte buffer."""
    if not frames:
        return None

    # Use pydub for easy conversion
    # Combine all frames
    output = io.BytesIO()
    
    # Extract raw PCM data
    # Note: WebRTC frames are usually 16-bit PCM
    audio_segment = pydub.AudioSegment.empty()
    
    for frame in frames:
        # Convert the av frame to a pydub segment.
        # frame.to_ndarray() gives packed 16-bit PCM for typical WebRTC audio,
        # which pydub can consume directly as raw bytes.
        data = frame.to_ndarray().tobytes()
        segment = pydub.AudioSegment(
            data=data, 
            sample_width=frame.format.bytes, 
            frame_rate=frame.sample_rate, 
            channels=len(frame.layout.channels)
        )
        audio_segment += segment
        
    # Export to WAV
    audio_segment.export(output, format="wav")
    return output.getvalue()
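
# Usage sketch (not part of the original module): one way this processor could be
# wired into a Streamlit app with streamlit-webrtc. The widget key, threshold
# values, and polling flow below are illustrative assumptions, not taken from
# the original file; only webrtc_streamer / WebRtcMode / ctx.audio_processor are
# real streamlit-webrtc API.
#
#   from streamlit_webrtc import webrtc_streamer, WebRtcMode
#
#   ctx = webrtc_streamer(
#       key="speech-vad",                      # hypothetical widget key
#       mode=WebRtcMode.SENDONLY,              # browser sends audio, nothing is returned
#       audio_processor_factory=AudioProcessor,
#       media_stream_constraints={"audio": True, "video": False},
#   )
#
#   if ctx.audio_processor:
#       ctx.audio_processor.set_thresholds(energy=2000, silence_dur=1.5)
#       if ctx.audio_processor.has_audio_frame():
#           wav_bytes = save_audio_to_bytes(ctx.audio_processor.get_audio_frames())
#           # ... hand wav_bytes to a speech-to-text backend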