Pranav Mishra committed
Commit df60ee3 · 1 Parent(s): 1772a46

Streamline backend: Remove Whisper/Wav2Vec2 models and dependencies


- Removed whisper_digit_processor.py, wav2vec2_processor.py, and faster_whisper_processor.py
- Removed local_whisper.py and the related VAD utilities
- Simplified requirements_hf.txt: removed transformers, webrtcvad, and the CPU-specific PyTorch pins
- Kept only the 3 core ML models (MFCC, Mel CNN, Raw CNN) plus the external API (see the initialization sketch below)
- Reduced build size and complexity for a reliable HF Spaces deployment
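For orientation, here is a minimal sketch of what processor startup reduces to after this commit. It assumes only what the app.py diff below shows: the four import paths and an is_configured() check on each processor; the registry keys and the logging format are illustrative, not taken from the repo.

# Sketch of the streamlined startup after this commit (assumptions noted above).
from audio_processors.external_api import ExternalAPIProcessor
from audio_processors.ml_mfcc_processor import MLMFCCProcessor
from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor

def initialize_processors() -> dict:
    """Build the processor registry: 3 local ML models + the external API."""
    candidates = {
        'ml_mfcc': MLMFCCProcessor,        # key names are illustrative
        'ml_mel_cnn': MLMelCNNProcessor,
        'ml_raw_cnn': MLRawCNNProcessor,
        'external_api': ExternalAPIProcessor,
    }
    procs = {}
    for name, cls in candidates.items():
        try:
            proc = cls()
            if proc.is_configured():
                procs[name] = proc  # only register processors that came up cleanly
        except Exception as exc:
            print(f"[FAIL] Failed to initialize {name}: {exc}")
    return procs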

app.py CHANGED
@@ -12,9 +12,8 @@ from typing import Dict, Any, Optional
 from dotenv import load_dotenv
 import numpy as np
 
-# Import audio processors (only essential ones for deployment)
+# Import audio processors (only the 3 ML models + external API)
 from audio_processors.external_api import ExternalAPIProcessor
-from audio_processors.whisper_digit_processor import WhisperDigitProcessor
 from audio_processors.ml_mfcc_processor import MLMFCCProcessor
 from audio_processors.ml_mel_cnn_processor import MLMelCNNProcessor
 from audio_processors.ml_raw_cnn_processor import MLRawCNNProcessor
@@ -80,14 +79,7 @@ def initialize_processors():
     except Exception as e:
         app.logger.error(f"[FAIL] Failed to initialize External API: {str(e)}")
 
-    # Whisper digit processor as another fallback
-    try:
-        whisper_processor = WhisperDigitProcessor()
-        if whisper_processor.is_configured():
-            procs['whisper_digit'] = whisper_processor
-            app.logger.info("[OK] Whisper digit processor initialized")
-    except Exception as e:
-        app.logger.error(f"[FAIL] Failed to initialize Whisper: {str(e)}")
+    # Removed whisper processors to reduce dependencies and build size
 
     app.logger.info(f"Processor initialization complete:")
     app.logger.info(f" ML Models loaded: {ml_working_count}/3")
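With the Whisper fallback removed, a request can only be served by the three local models or the external API. A hedged sketch of the resulting fallback chain (the procs registry and the process_audio() interface come from this diff and the deleted files below; the ML key names are assumptions):

def predict_digit(procs: dict, audio_bytes: bytes) -> str:
    """Try the local ML processors first, then the external API.

    Sketch only: key names other than 'external_api' are hypothetical.
    """
    for name in ('ml_mfcc', 'ml_mel_cnn', 'ml_raw_cnn', 'external_api'):
        proc = procs.get(name)
        if proc is None:
            continue  # processor failed to initialize or was not configured
        result = proc.process_audio(audio_bytes)
        if len(result) == 1 and result.isdigit():
            return result  # first clean single-digit answer wins
    return "?"  # mirrors the deleted processors' unknown-digit convention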
audio_processors/faster_whisper_processor.py DELETED
@@ -1,219 +0,0 @@
-"""
-Faster-Whisper processor with built-in VAD (2025 approach)
-More reliable than manual WebRTC VAD + Whisper coordination
-"""
-
-import numpy as np
-import io
-import time
-import logging
-from typing import Dict, Any, Optional
-
-try:
-    from faster_whisper import WhisperModel
-    FASTER_WHISPER_AVAILABLE = True
-except ImportError:
-    FASTER_WHISPER_AVAILABLE = False
-    WhisperModel = None
-
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class FasterWhisperDigitProcessor(AudioProcessor):
-    """
-    Modern 2025 approach using faster-whisper with built-in VAD.
-    Much more reliable than manual WebRTC VAD coordination.
-    """
-
-    def __init__(self):
-        """Initialize faster-whisper processor with built-in VAD."""
-        super().__init__("Faster-Whisper with VAD")
-
-        if not FASTER_WHISPER_AVAILABLE:
-            logger.error("faster-whisper not available. Install with: pip install faster-whisper")
-            self.model = None
-            return
-
-        self.model = None
-        self.device = "cuda" if self._cuda_available() else "cpu"
-
-        # Digit mapping
-        self.digit_map = {
-            "zero": "0", "one": "1", "two": "2", "three": "3",
-            "four": "4", "five": "5", "six": "6", "seven": "7",
-            "eight": "8", "nine": "9",
-            "oh": "0", "o": "0", "for": "4", "fore": "4",
-            "to": "2", "too": "2", "tu": "2", "tree": "3",
-            "free": "3", "ate": "8", "ait": "8"
-        }
-
-        # Statistics
-        self.total_predictions = 0
-        self.successful_predictions = 0
-        self.failed_predictions = 0
-
-        self._initialize_model()
-
-    def _cuda_available(self) -> bool:
-        """Check if CUDA is available."""
-        try:
-            import torch
-            return torch.cuda.is_available()
-        except ImportError:
-            return False
-
-    def _initialize_model(self):
-        """Initialize faster-whisper model with VAD."""
-        if not FASTER_WHISPER_AVAILABLE:
-            return
-
-        try:
-            logger.info("Initializing faster-whisper model with built-in VAD...")
-
-            # Initialize faster-whisper model
-            self.model = WhisperModel(
-                "tiny",  # Use tiny model for speed
-                device=self.device,
-                compute_type="float16" if self.device == "cuda" else "int8"
-            )
-
-            logger.info(f"Faster-Whisper model initialized on {self.device}")
-
-        except Exception as e:
-            logger.error(f"Failed to initialize faster-whisper: {e}")
-            self.model = None
-
-    def is_configured(self) -> bool:
-        """Check if processor is configured."""
-        return self.model is not None and FASTER_WHISPER_AVAILABLE
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio with built-in VAD and return predicted digit.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            str: Predicted digit (0-9) or error message
-        """
-        if not self.is_configured():
-            return "error: Model not configured"
-
-        try:
-            # Convert audio to numpy array
-            audio_array = self._convert_audio_bytes(audio_data)
-            if audio_array is None:
-                return "error: Audio conversion failed"
-
-            # Use faster-whisper with built-in VAD
-            segments, info = self.model.transcribe(
-                audio_array,
-                language="en",
-                # Built-in VAD parameters - much better than manual VAD
-                vad_filter=True,
-                vad_parameters=dict(
-                    min_silence_duration_ms=100,  # 100ms minimum silence
-                    speech_pad_ms=30  # 30ms padding around speech
-                )
-            )
-
-            # Process transcription results
-            transcriptions = []
-            for segment in segments:
-                text = segment.text.strip().lower()
-                if text:
-                    transcriptions.append(text)
-
-            if not transcriptions:
-                return "error: No speech detected"
-
-            # Combine all segments and extract digit
-            full_text = " ".join(transcriptions)
-            digit = self._text_to_digit(full_text)
-
-            logger.debug(f"Faster-Whisper: '{full_text}' -> '{digit}'")
-
-            if digit in "0123456789":
-                self.successful_predictions += 1
-                return digit
-            else:
-                self.failed_predictions += 1
-                return f"unclear: {full_text}"
-
-        except Exception as e:
-            logger.error(f"Faster-Whisper processing failed: {e}")
-            self.failed_predictions += 1
-            return f"error: {str(e)}"
-        finally:
-            self.total_predictions += 1
-
-    def _convert_audio_bytes(self, audio_data: bytes) -> Optional[np.ndarray]:
-        """Convert audio bytes to numpy array for faster-whisper."""
-        try:
-            # Check if it's a WAV file
-            if audio_data.startswith(b'RIFF'):
-                import soundfile as sf
-                audio_buffer = io.BytesIO(audio_data)
-                audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')
-
-                # Convert stereo to mono if needed
-                if len(audio_array.shape) > 1:
-                    audio_array = np.mean(audio_array, axis=1)
-
-                return audio_array
-            else:
-                # Raw PCM data
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                return audio_array / 32768.0
-
-        except Exception as e:
-            logger.error(f"Audio conversion failed: {e}")
-            return None
-
-    def _text_to_digit(self, text: str) -> str:
-        """Convert transcribed text to digit."""
-        text = text.strip().lower()
-
-        # Remove common words
-        text = text.replace("the", "").replace("number", "").replace("digit", "")
-        text = text.strip()
-
-        # Direct mapping
-        if text in self.digit_map:
-            return self.digit_map[text]
-
-        # Word-by-word check
-        for word in text.split():
-            if word in self.digit_map:
-                return self.digit_map[word]
-
-        # Check for digits in text
-        digits = [char for char in text if char.isdigit()]
-        if digits:
-            return digits[0]
-
-        return text
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """Get model information."""
-        return {
-            'model_name': 'faster-whisper-tiny',
-            'model_type': 'Speech-to-Text with VAD',
-            'has_builtin_vad': True,
-            'device': self.device,
-            'available': FASTER_WHISPER_AVAILABLE
-        }
-
-    def get_stats(self) -> Dict[str, Any]:
-        """Get processing statistics."""
-        success_rate = self.successful_predictions / max(1, self.total_predictions)
-
-        return {
-            'total_predictions': self.total_predictions,
-            'successful_predictions': self.successful_predictions,
-            'failed_predictions': self.failed_predictions,
-            'success_rate': round(success_rate, 3),
-            'model_available': self.is_configured()
-        }
audio_processors/local_whisper.py DELETED
@@ -1,158 +0,0 @@
-import logging
-import numpy as np
-from typing import Optional
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class LocalWhisperProcessor(AudioProcessor):
-    """
-    Local Whisper model using transformers pipeline.
-    Fallback when API is unavailable.
-    """
-
-    def __init__(self):
-        super().__init__("Local Whisper (Tiny)")
-        self.pipeline = None
-        self.model_name = "openai/whisper-tiny"
-        self.is_initialized = False
-
-    def _initialize_model(self):
-        """Lazy initialization of the model"""
-        if self.is_initialized:
-            return
-
-        try:
-            logger.info(f"Loading local Whisper model: {self.model_name}")
-
-            from transformers import pipeline
-            import torch
-
-            # Use CPU for compatibility, GPU if available
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-
-            self.pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=self.model_name,
-                device=device,
-                torch_dtype=torch.float32,  # Use float32 to avoid dtype issues
-                return_timestamps=False  # We only need text
-            )
-
-            logger.info(f"Local Whisper model loaded on {device}")
-            self.is_initialized = True
-
-        except ImportError as e:
-            logger.error("transformers library not installed. Run: pip install transformers torch")
-            raise Exception("transformers library required for local processing")
-        except Exception as e:
-            logger.error(f"Failed to load local Whisper model: {str(e)}")
-            raise Exception(f"Local model initialization failed: {str(e)}")
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio using local Whisper model.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            Predicted digit as string ('0'-'9')
-
-        Raises:
-            Exception: If processing fails
-        """
-        try:
-            # Initialize model on first use
-            self._initialize_model()
-
-            # Convert audio bytes to numpy array
-            from utils.audio_utils import audio_to_numpy
-            audio_array, sample_rate = audio_to_numpy(audio_data)
-
-            # Resample to 16kHz if needed (Whisper expects 16kHz)
-            if sample_rate != 16000:
-                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
-                import librosa
-                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
-
-            # Process with pipeline
-            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
-            result = self.pipeline(audio_array)
-
-            if not result or 'text' not in result:
-                logger.error(f"Unexpected pipeline result: {result}")
-                raise Exception("Invalid pipeline output")
-
-            transcribed_text = result['text'].strip().lower()
-            logger.debug(f"Local Whisper transcription: '{transcribed_text}'")
-
-            # Extract digit from transcription
-            predicted_digit = self._extract_digit(transcribed_text)
-
-            if predicted_digit is None:
-                logger.warning(f"No digit found in transcription: '{transcribed_text}'")
-                return "?"
-
-            return predicted_digit
-
-        except Exception as e:
-            logger.error(f"Local Whisper processing failed: {str(e)}")
-            raise Exception(f"Local processing error: {str(e)}")
-
-    def _extract_digit(self, text: str) -> Optional[str]:
-        """
-        Extract digit from transcribed text.
-        Handles both numerical ('1', '2') and word forms ('one', 'two').
-        """
-        import re
-
-        # Word to digit mapping
-        word_to_digit = {
-            'zero': '0', 'oh': '0',
-            'one': '1', 'won': '1',
-            'two': '2', 'to': '2', 'too': '2',
-            'three': '3', 'tree': '3',
-            'four': '4', 'for': '4', 'fore': '4',
-            'five': '5',
-            'six': '6', 'sick': '6',
-            'seven': '7',
-            'eight': '8', 'ate': '8',
-            'nine': '9', 'niner': '9'
-        }
-
-        # First, try to find a direct digit
-        digit_match = re.search(r'\b([0-9])\b', text)
-        if digit_match:
-            return digit_match.group(1)
-
-        # Then try word forms
-        words = text.split()
-        for word in words:
-            clean_word = re.sub(r'[^\w]', '', word.lower())
-            if clean_word in word_to_digit:
-                return word_to_digit[clean_word]
-
-        # Try partial matches for robustness
-        for word, digit in word_to_digit.items():
-            if word in text:
-                return digit
-
-        return None
-
-    def is_configured(self) -> bool:
-        """Check if local model can be initialized."""
-        try:
-            import transformers
-            import torch
-            return True
-        except ImportError:
-            return False
-
-    def test_connection(self) -> bool:
-        """Test local model functionality."""
-        try:
-            self._initialize_model()
-            return True
-        except:
-            return False
audio_processors/wav2vec2_processor.py DELETED
@@ -1,170 +0,0 @@
-import logging
-import numpy as np
-from typing import Optional
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class Wav2Vec2Processor(AudioProcessor):
-    """
-    Wav2Vec2 model processor for speech recognition.
-    Lightweight alternative to Whisper.
-    """
-
-    def __init__(self):
-        super().__init__("Wav2Vec2 (Facebook)")
-        self.processor = None
-        self.model = None
-        self.model_name = "facebook/wav2vec2-base-960h"
-        self.is_initialized = False
-
-    def _initialize_model(self):
-        """Lazy initialization of the model"""
-        if self.is_initialized:
-            return
-
-        try:
-            logger.info(f"Loading Wav2Vec2 model: {self.model_name}")
-
-            from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-            import torch
-
-            # Load processor and model
-            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
-            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
-
-            # Move to GPU if available
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.model = self.model.to(device)
-            self.device = device
-
-            logger.info(f"Wav2Vec2 model loaded on {device}")
-            self.is_initialized = True
-
-        except ImportError as e:
-            logger.error("transformers library not installed. Run: pip install transformers torch")
-            raise Exception("transformers library required for Wav2Vec2 processing")
-        except Exception as e:
-            logger.error(f"Failed to load Wav2Vec2 model: {str(e)}")
-            raise Exception(f"Wav2Vec2 model initialization failed: {str(e)}")
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Process audio using Wav2Vec2 model.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            Predicted digit as string ('0'-'9')
-
-        Raises:
-            Exception: If processing fails
-        """
-        try:
-            # Initialize model on first use
-            self._initialize_model()
-
-            # Convert audio bytes to numpy array
-            from utils.audio_utils import audio_to_numpy
-            audio_array, sample_rate = audio_to_numpy(audio_data)
-
-            # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
-            if sample_rate != 16000:
-                logger.debug(f"Resampling from {sample_rate}Hz to 16kHz")
-                import librosa
-                audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
-
-            logger.debug(f"Processing audio: {len(audio_array)} samples at 16kHz")
-
-            # Process with Wav2Vec2
-            import torch
-
-            # Tokenize audio
-            input_values = self.processor(
-                audio_array,
-                return_tensors="pt",
-                padding="longest",
-                sampling_rate=16000
-            ).input_values.to(self.device)
-
-            # Get logits
-            with torch.no_grad():
-                logits = self.model(input_values).logits
-
-            # Get predicted tokens
-            predicted_ids = torch.argmax(logits, dim=-1)
-
-            # Decode transcription
-            transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip()
-            logger.debug(f"Wav2Vec2 transcription: '{transcription}'")
-
-            # Extract digit from transcription
-            predicted_digit = self._extract_digit(transcription)
-
-            if predicted_digit is None:
-                logger.warning(f"No digit found in transcription: '{transcription}'")
-                return "?"
-
-            return predicted_digit
-
-        except Exception as e:
-            logger.error(f"Wav2Vec2 processing failed: {str(e)}")
-            raise Exception(f"Wav2Vec2 processing error: {str(e)}")
-
-    def _extract_digit(self, text: str) -> Optional[str]:
-        """
-        Extract digit from transcribed text.
-        Handles both numerical ('1', '2') and word forms ('one', 'two').
-        """
-        import re
-
-        # Word to digit mapping
-        word_to_digit = {
-            'zero': '0', 'oh': '0',
-            'one': '1', 'won': '1',
-            'two': '2', 'to': '2', 'too': '2',
-            'three': '3', 'tree': '3',
-            'four': '4', 'for': '4', 'fore': '4', 'full': '4',  # "full" often misheard as "four"
-            'five': '5',
-            'six': '6', 'sick': '6',
-            'seven': '7',
-            'eight': '8', 'ate': '8',
-            'nine': '9', 'niner': '9'
-        }
-
-        # First, try to find a direct digit
-        digit_match = re.search(r'\b([0-9])\b', text)
-        if digit_match:
-            return digit_match.group(1)
-
-        # Then try word forms
-        words = text.split()
-        for word in words:
-            clean_word = re.sub(r'[^\w]', '', word.lower())
-            if clean_word in word_to_digit:
-                return word_to_digit[clean_word]
-
-        # Try partial matches for robustness
-        for word, digit in word_to_digit.items():
-            if word in text:
-                return digit
-
-        return None
-
-    def is_configured(self) -> bool:
-        """Check if Wav2Vec2 model can be initialized."""
-        try:
-            import transformers
-            import torch
-            return True
-        except ImportError:
-            return False
-
-    def test_connection(self) -> bool:
-        """Test Wav2Vec2 model functionality."""
-        try:
-            self._initialize_model()
-            return True
-        except:
-            return False
audio_processors/whisper_digit_processor.py DELETED
@@ -1,429 +0,0 @@
-"""
-Whisper-based digit recognition processor
-Specialized implementation for spoken digit recognition (0-9)
-"""
-
-import numpy as np
-import io
-import time
-import logging
-from typing import Dict, Any, Optional
-import torch
-from transformers import pipeline
-import soundfile as sf
-
-from .base_processor import AudioProcessor
-
-logger = logging.getLogger(__name__)
-
-class WhisperDigitProcessor(AudioProcessor):
-    """
-    Whisper-based digit recognition processor using Hugging Face transformers.
-    Optimized for single digit recognition with mapping from text to numbers.
-    """
-
-    def __init__(self):
-        """Initialize Whisper digit processor with optimized settings."""
-        super().__init__("Whisper Digit Recognition")
-        self.model = None
-        self.device = 0 if torch.cuda.is_available() else -1
-
-        # Digit mapping for text-to-number conversion
-        self.digit_map = {
-            "zero": "0", "one": "1", "two": "2", "three": "3",
-            "four": "4", "five": "5", "six": "6", "seven": "7",
-            "eight": "8", "nine": "9",
-            # Common variations and alternatives
-            "oh": "0", "o": "0",
-            "for": "4", "fore": "4", "to": "2", "too": "2", "tu": "2",
-            "tree": "3", "free": "3", "ate": "8", "ait": "8"
-        }
-
-        # Reverse mapping for validation
-        self.number_words = set(self.digit_map.keys())
-
-        # Statistics tracking
-        self.total_predictions = 0
-        self.successful_predictions = 0
-        self.failed_predictions = 0
-        self.average_inference_time = 0.0
-
-        self._initialize_model()
-
-    def _initialize_model(self):
-        """Initialize the Whisper model with optimal settings for digit recognition."""
-        try:
-            logger.info("Initializing Whisper model for digit recognition...")
-
-            # Use Whisper tiny model for fast inference
-            self.model = pipeline(
-                "automatic-speech-recognition",
-                model="openai/whisper-tiny",
-                device=self.device,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                return_timestamps=False  # We don't need timestamps for single digits
-            )
-
-            logger.info(f"Whisper model initialized successfully on device: {self.device}")
-
-            # Test model with dummy input
-            test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
-            try:
-                test_result = self.model(test_audio)
-                logger.info("Model test successful")
-            except Exception as e:
-                logger.warning(f"Model test failed but model loaded: {e}")
-
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to initialize Whisper model: {e}")
-            return False
-
-    def is_configured(self) -> bool:
-        """Check if the processor is properly configured."""
-        return self.model is not None
-
-    def process_audio(self, audio_data: bytes) -> str:
-        """
-        Predict digit from audio data.
-
-        Args:
-            audio_data: Raw audio bytes (WAV format preferred)
-
-        Returns:
-            str: Predicted digit (0-9) or error message
-        """
-        if not self.is_configured():
-            return "error: Model not configured"
-
-        try:
-            # Convert audio bytes to numpy array
-            audio_array = self._convert_audio_to_array(audio_data)
-
-            if audio_array is None:
-                return "error: Invalid audio format"
-
-            # Ensure proper sample rate and format
-            audio_array = self._preprocess_audio(audio_array)
-
-            # Run Whisper inference
-            result = self.model(audio_array)
-            text = result["text"].strip().lower()
-
-            # Convert text to digit
-            digit = self._text_to_digit(text)
-
-            # Enhanced logging to debug transcription issues
-            logger.info(f"🎤 Whisper transcription: '{text}' -> digit: '{digit}'")
-            logger.info(f"📊 Audio stats: duration={len(audio_array)/16000:.2f}s, samples={len(audio_array)}, max_val={np.max(np.abs(audio_array)):.3f}")
-
-            if digit in "0123456789":
-                self.successful_predictions += 1
-                return digit
-            else:
-                self.failed_predictions += 1
-                return f"unclear: {text}"
-
-        except Exception as e:
-            logger.error(f"Whisper prediction failed: {e}")
-            self.failed_predictions += 1
-            return f"error: {str(e)}"
-        finally:
-            self.total_predictions += 1
-
-    def _convert_audio_to_array(self, audio_data: bytes) -> Optional[np.ndarray]:
-        """
-        Convert audio bytes to numpy array.
-
-        Args:
-            audio_data: Raw audio bytes (could be WAV file or raw PCM from VAD)
-
-        Returns:
-            np.ndarray: Audio samples or None if conversion failed
-        """
-        # First check if this looks like raw PCM data from VAD (no file headers)
-        if len(audio_data) < 100 or not audio_data.startswith(b'RIFF'):
-            # This is likely raw PCM data from WebRTC VAD
-            try:
-                logger.debug("Processing raw PCM data from VAD segment")
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
-                self._original_sample_rate = 16000  # WebRTC VAD uses 16kHz
-                return audio_array
-            except Exception as e:
-                logger.error(f"Failed to process raw PCM data: {e}")
-                return None
-
-        # This looks like a complete audio file (WAV, etc.)
-        try:
-            # Try to read as audio file using soundfile
-            audio_buffer = io.BytesIO(audio_data)
-            audio_array, sample_rate = sf.read(audio_buffer, dtype='float32')
-
-            # Handle stereo to mono conversion
-            if len(audio_array.shape) > 1:
-                audio_array = np.mean(audio_array, axis=1)
-
-            # Store original sample rate for resampling
-            self._original_sample_rate = sample_rate
-
-            logger.debug(f"Successfully loaded audio file: {len(audio_array)} samples at {sample_rate}Hz")
-            return audio_array
-
-        except Exception as e:
-            logger.warning(f"Audio file conversion failed with soundfile: {e}")
-
-            # Final fallback: treat as raw PCM
-            try:
-                logger.debug("Fallback: treating as raw PCM data")
-                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
-                audio_array = audio_array / 32768.0  # Normalize to [-1, 1]
-                self._original_sample_rate = 16000  # Assume 16kHz
-                return audio_array
-            except Exception as e2:
-                logger.error(f"All audio conversion methods failed: {e2}")
-                return None
-
-    def _preprocess_audio(self, audio_array: np.ndarray) -> np.ndarray:
-        """
-        Preprocess audio for optimal Whisper performance.
-
-        Args:
-            audio_array: Raw audio samples
-
-        Returns:
-            np.ndarray: Preprocessed audio
-        """
-        # Resample to 16kHz if needed (Whisper's expected input)
-        target_sample_rate = 16000
-
-        if hasattr(self, '_original_sample_rate') and self._original_sample_rate != target_sample_rate:
-            try:
-                import librosa
-                audio_array = librosa.resample(
-                    audio_array,
-                    orig_sr=self._original_sample_rate,
-                    target_sr=target_sample_rate
-                )
-                logger.debug(f"Resampled audio from {self._original_sample_rate}Hz to {target_sample_rate}Hz")
-            except ImportError:
-                logger.warning("librosa not available for resampling, using original audio")
-            except Exception as e:
-                logger.warning(f"Resampling failed: {e}, using original audio")
-
-        # Trim silence from edges
-        audio_array = self._trim_silence(audio_array)
-
-        # Ensure minimum length (Whisper works better with at least 0.1s)
-        min_samples = int(0.1 * target_sample_rate)
-        if len(audio_array) < min_samples:
-            # Pad with silence
-            padding = min_samples - len(audio_array)
-            audio_array = np.pad(audio_array, (0, padding), mode='constant', constant_values=0)
-
-        # Normalize audio
-        max_val = np.max(np.abs(audio_array))
-        if max_val > 0:
-            audio_array = audio_array / max_val * 0.9  # Prevent clipping
-
-        return audio_array
-
-    def _trim_silence(self, audio_array: np.ndarray, silence_threshold: float = 0.01) -> np.ndarray:
-        """
-        Trim silence from beginning and end of audio.
-
-        Args:
-            audio_array: Audio samples
-            silence_threshold: Threshold for silence detection
-
-        Returns:
-            np.ndarray: Trimmed audio
-        """
-        if len(audio_array) == 0:
-            return audio_array
-
-        # Find non-silent regions
-        energy = audio_array ** 2
-        non_silent = energy > silence_threshold
-
-        if not np.any(non_silent):
-            return audio_array  # All silence, return as is
-
-        # Find first and last non-silent samples
-        first_sound = np.argmax(non_silent)
-        last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1
-
-        # Add small padding
-        padding_samples = int(0.05 * 16000)  # 50ms padding
-        first_sound = max(0, first_sound - padding_samples)
-        last_sound = min(len(audio_array) - 1, last_sound + padding_samples)
-
-        return audio_array[first_sound:last_sound + 1]
-
-    def _text_to_digit(self, text: str) -> str:
-        """
-        Convert transcribed text to digit.
-
-        Args:
-            text: Transcribed text from Whisper
-
-        Returns:
-            str: Digit (0-9) or original text if no match
-        """
-        # Clean the text
-        text = text.strip().lower()
-
-        # Remove common punctuation and extra words
-        text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
-        text = text.replace("the", "").replace("number", "").replace("digit", "")
-        text = text.strip()
-
-        # Try direct mapping
-        if text in self.digit_map:
-            return self.digit_map[text]
-
-        # Try word-by-word mapping for multi-word responses
-        words = text.split()
-        for word in words:
-            if word in self.digit_map:
-                return self.digit_map[word]
-
-        # Check if it's already a digit
-        if len(text) == 1 and text.isdigit():
-            return text
-
-        # Look for digits in the text
-        digits_found = [char for char in text if char.isdigit()]
-        if digits_found:
-            return digits_found[0]  # Return first digit found
-
-        # No clear digit found
-        return text
-
-    def predict_with_timing(self, audio_data: bytes) -> Dict[str, Any]:
-        """
-        Predict digit with detailed timing and confidence metrics.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            dict: Prediction results with timing and metadata
-        """
-        start_time = time.time()
-
-        predicted_digit = self.process_audio(audio_data)
-
-        inference_time = time.time() - start_time
-
-        # Update average inference time
-        if self.total_predictions > 0:
-            self.average_inference_time = (
-                (self.average_inference_time * (self.total_predictions - 1) + inference_time)
-                / self.total_predictions
-            )
-
-        # Determine success status
-        is_successful = predicted_digit in "0123456789"
-        confidence_score = 1.0 if is_successful else 0.0
-
-        # Extract any error information
-        error_info = None
-        if predicted_digit.startswith("error:"):
-            error_info = predicted_digit[6:].strip()
-            predicted_digit = "unknown"
-        elif predicted_digit.startswith("unclear:"):
-            error_info = f"Transcription unclear: {predicted_digit[8:].strip()}"
-            predicted_digit = "unknown"
-
-        result = {
-            'predicted_digit': predicted_digit,
-            'confidence_score': confidence_score,
-            'inference_time': round(inference_time, 4),
-            'success': is_successful,
-            'timestamp': time.time(),
-            'model': 'openai/whisper-tiny',
-            'method': 'whisper_digit'
-        }
-
-        if error_info:
-            result['error'] = error_info
-
-        return result
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the loaded model.
-
-        Returns:
-            dict: Model information
-        """
-        return {
-            'model_name': 'openai/whisper-tiny',
-            'model_type': 'Speech-to-Text (ASR)',
-            'specialized_for': 'Digit Recognition (0-9)',
-            'device': 'GPU' if self.device >= 0 else 'CPU',
-            'torch_device': self.device,
-            'supports_streaming': False,
-            'supported_languages': ['en'],
-            'digit_mappings': len(self.digit_map)
-        }
-
-    def get_stats(self) -> Dict[str, Any]:
-        """
-        Get processor statistics.
-
-        Returns:
-            dict: Performance statistics
-        """
-        success_rate = (
-            self.successful_predictions / max(1, self.total_predictions)
-        )
-
-        return {
-            'total_predictions': self.total_predictions,
-            'successful_predictions': self.successful_predictions,
-            'failed_predictions': self.failed_predictions,
-            'success_rate': round(success_rate, 3),
-            'average_inference_time': round(self.average_inference_time, 4),
-            'model_loaded': self.is_configured()
-        }
-
-    def test_with_sample_audio(self) -> Dict[str, Any]:
-        """
-        Test the processor with generated sample audio.
-
-        Returns:
-            dict: Test results
-        """
-        if not self.is_configured():
-            return {'error': 'Model not configured'}
-
-        try:
-            # Generate simple test audio (1 second of tone)
-            sample_rate = 16000
-            duration = 1.0
-            frequency = 440  # A note
-
-            t = np.linspace(0, duration, int(sample_rate * duration))
-            test_audio = 0.3 * np.sin(2 * np.pi * frequency * t).astype(np.float32)
-
-            # Run prediction
-            start_time = time.time()
-            result = self.model(test_audio)
-            test_time = time.time() - start_time
-
-            return {
-                'test_successful': True,
-                'test_time': round(test_time, 4),
-                'transcription': result.get('text', 'No text'),
-                'model_responsive': True
-            }
-
-        except Exception as e:
-            return {
-                'test_successful': False,
-                'error': str(e),
-                'model_responsive': False
-            }
requirements_hf.txt CHANGED
@@ -1,4 +1,4 @@
-# HF Spaces Requirements - Essential packages only
+# HF Spaces Requirements - Essential packages only (3 ML models only)
 # Core Flask API
 Flask==2.3.3
 Flask-CORS==4.0.0
@@ -12,15 +12,11 @@ scipy==1.11.4
 soundfile==0.12.1
 
 # ML Models - PyTorch (CPU optimized for HF Spaces)
-torch==2.0.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu
-torchaudio==2.0.2+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.0.1
+torchaudio==2.0.2
 
 # Essential ML utilities
 scikit-learn==1.3.2
-transformers==4.35.2
-
-# Audio format handling
-webrtcvad==2.0.10
 
 # Logging and utilities
 tqdm==4.66.1
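A quick way to sanity-check the slimmed dependency set after a Space rebuild is an import smoke test. A sketch, assuming it runs inside the Space's environment; the module list is derived from the pins above (note that Flask-CORS imports as flask_cors and scikit-learn as sklearn), not from anything in the repo:

# Import smoke test for the trimmed requirements_hf.txt (sketch).
import importlib

for module in ("flask", "flask_cors", "numpy", "scipy", "soundfile",
               "torch", "torchaudio", "sklearn", "tqdm"):
    importlib.import_module(module)  # raises ImportError if a pin failed to install
print("core dependencies import OK (transformers/webrtcvad no longer required)")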
utils/enhanced_vad.py DELETED
@@ -1,571 +0,0 @@
-"""
-Enhanced VAD Implementation with ffmpeg support and comprehensive debugging
-"""
-
-import numpy as np
-import logging
-import subprocess
-import tempfile
-import os
-import time
-import wave
-import io
-from pathlib import Path
-from typing import Dict, List, Tuple, Optional, Any
-from threading import Thread, Lock
-import asyncio
-import concurrent.futures
-
-# Try to import WebRTC VAD
-try:
-    import webrtcvad
-    WEBRTC_AVAILABLE = True
-except ImportError:
-    WEBRTC_AVAILABLE = False
-    logging.warning("webrtcvad not available - using fallback VAD implementation")
-
-logger = logging.getLogger(__name__)
-
-class EnhancedVAD:
-    """
-    Enhanced Voice Activity Detection with ffmpeg integration and comprehensive debugging.
-
-    Features:
-    - ffmpeg-based audio preprocessing
-    - Multiple VAD implementations (WebRTC, simple energy-based)
-    - Comprehensive audio validation and debugging
-    - Async audio chunk saving
-    - Real-time performance monitoring
-    """
-
-    def __init__(self,
-                 sample_rate: int = 16000,
-                 frame_duration_ms: int = 30,
-                 aggressiveness: int = 1,
-                 min_speech_duration: float = 0.4,
-                 max_speech_duration: float = 3.0,
-                 silence_threshold: float = 0.01):
-        """
-        Initialize Enhanced VAD.
-
-        Args:
-            sample_rate: Target sample rate (Hz)
-            frame_duration_ms: Frame duration in milliseconds
-            aggressiveness: VAD aggressiveness (0-3)
-            min_speech_duration: Minimum speech segment duration (seconds)
-            max_speech_duration: Maximum speech segment duration (seconds)
-            silence_threshold: Energy threshold for silence detection
-        """
-        self.sample_rate = sample_rate
-        self.frame_duration_ms = frame_duration_ms
-        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
-        self.aggressiveness = aggressiveness
-        self.min_speech_duration = min_speech_duration
-        self.max_speech_duration = max_speech_duration
-        self.silence_threshold = silence_threshold
-
-        # Initialize WebRTC VAD if available
-        self.webrtc_vad = None
-        if WEBRTC_AVAILABLE:
-            try:
-                self.webrtc_vad = webrtcvad.Vad(aggressiveness)
-                logger.info(f"WebRTC VAD initialized (aggressiveness: {aggressiveness})")
-            except Exception as e:
-                logger.error(f"Failed to initialize WebRTC VAD: {e}")
-                self.webrtc_vad = None
-
-        # Check ffmpeg availability
-        self.ffmpeg_available = self._check_ffmpeg_available()
-
-        # Performance tracking
-        self.stats = {
-            'total_chunks_processed': 0,
-            'speech_segments_detected': 0,
-            'processing_time_total': 0.0,
-            'last_processing_time': 0.0,
-            'ffmpeg_conversions': 0,
-            'audio_validation_failures': 0,
-            'webrtc_available': WEBRTC_AVAILABLE and self.webrtc_vad is not None,
-            'ffmpeg_available': self.ffmpeg_available
-        }
-
-        # Async processing
-        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
-        self.save_lock = Lock()
-
-        logger.info(f"Enhanced VAD initialized:")
-        logger.info(f" Sample rate: {sample_rate} Hz")
-        logger.info(f" Frame duration: {frame_duration_ms} ms")
-        logger.info(f" WebRTC VAD: {'Available' if self.webrtc_vad else 'Not available'}")
-        logger.info(f" ffmpeg: {'Available' if self.ffmpeg_available else 'Not available'}")
-
-    def _check_ffmpeg_available(self) -> bool:
-        """Check if ffmpeg is available."""
-        try:
-            result = subprocess.run(['ffmpeg', '-version'],
-                                    capture_output=True, text=True, timeout=5)
-            return result.returncode == 0
-        except Exception:
-            return False
-
-    def preprocess_audio_with_ffmpeg(self, audio_data: bytes) -> Optional[bytes]:
-        """
-        Preprocess audio using ffmpeg for optimal VAD performance.
-
-        Args:
-            audio_data: Raw audio bytes
-
-        Returns:
-            Preprocessed audio bytes or None if processing fails
-        """
-        if not self.ffmpeg_available:
-            logger.debug("ffmpeg not available for audio preprocessing")
-            return None
-
-        temp_input = None
-        temp_output = None
-
-        try:
-            # Create temporary files
-            with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as temp_input:
-                temp_input.write(audio_data)
-                temp_input.flush()
-
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_output:
-                pass
-
-            # ffmpeg command for VAD-optimized preprocessing
-            ffmpeg_cmd = [
-                'ffmpeg',
-                '-i', temp_input.name,
-                '-ar', str(self.sample_rate),  # Resample to target rate
-                '-ac', '1',  # Convert to mono
-                '-acodec', 'pcm_s16le',  # 16-bit PCM
-                '-af', 'highpass=f=80,lowpass=f=8000,dynaudnorm=f=10:g=3',  # Audio filters for speech
-                '-f', 'wav',
-                '-loglevel', 'error',
-                '-y',
-                temp_output.name
-            ]
-
-            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=10)
-
-            if result.returncode == 0:
-                with open(temp_output.name, 'rb') as f:
-                    preprocessed_audio = f.read()
-
-                self.stats['ffmpeg_conversions'] += 1
-                logger.debug(f"ffmpeg preprocessing: {len(audio_data)} -> {len(preprocessed_audio)} bytes")
-                return preprocessed_audio
-            else:
-                logger.error(f"ffmpeg preprocessing failed: {result.stderr}")
-                return None
-
-        except Exception as e:
-            logger.error(f"ffmpeg preprocessing error: {e}")
-            return None
-
-        finally:
-            # Cleanup
-            try:
-                if temp_input and os.path.exists(temp_input.name):
-                    os.unlink(temp_input.name)
-                if temp_output and os.path.exists(temp_output.name):
-                    os.unlink(temp_output.name)
-            except Exception:
-                pass
-
-    def validate_and_debug_audio(self, audio_data: bytes) -> Dict[str, Any]:
-        """
-        Comprehensive audio validation and debugging.
-
-        Args:
-            audio_data: Audio data to validate
-
-        Returns:
-            Validation results and debugging information
-        """
-        debug_info = {
-            'size_bytes': len(audio_data),
-            'valid_wav': False,
-            'sample_rate': None,
-            'channels': None,
-            'duration': 0.0,
-            'energy_level': 0.0,
-            'is_silent': True,
-            'format_detected': 'unknown',
-            'issues': []
-        }
-
-        try:
-            # Check minimum size
-            if len(audio_data) < 44:
-                debug_info['issues'].append(f"Too small: {len(audio_data)} bytes (need ≥44 for WAV)")
-                return debug_info
-
-            # Detect format by header
-            if audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:20]:
-                debug_info['format_detected'] = 'wav'
-            elif audio_data.startswith(b'OggS'):
-                debug_info['format_detected'] = 'ogg'
-            elif audio_data.startswith(b'\x1a\x45\xdf\xa3'):
-                debug_info['format_detected'] = 'webm'
-
-            # Try to parse as WAV
-            try:
-                with wave.open(io.BytesIO(audio_data), 'rb') as wav:
-                    debug_info['valid_wav'] = True
-                    debug_info['sample_rate'] = wav.getframerate()
-                    debug_info['channels'] = wav.getnchannels()
-                    debug_info['duration'] = wav.getnframes() / wav.getframerate()
-
-                    # Read audio samples for analysis
-                    wav.rewind()
-                    frames = wav.readframes(wav.getnframes())
-
-                    if len(frames) > 0:
-                        # Convert to numpy for analysis
-                        audio_array = np.frombuffer(frames, dtype=np.int16)
-
-                        # Calculate energy level
-                        energy = np.sqrt(np.mean(audio_array.astype(np.float32) ** 2))
-                        debug_info['energy_level'] = float(energy)
-                        debug_info['is_silent'] = energy < (self.silence_threshold * 32768)
-
-                        # Check for constant beep (common issue)
-                        if len(audio_array) > 100:
-                            # Check if audio is a constant tone (beep)
-                            diff = np.diff(audio_array)
-                            if np.std(diff) < 100:  # Very low variation
-                                debug_info['issues'].append("Constant tone/beep detected")
-
-                            # Check dynamic range
-                            if np.max(audio_array) - np.min(audio_array) < 1000:
-                                debug_info['issues'].append("Very low dynamic range")
-
-            except Exception as wav_error:
-                debug_info['issues'].append(f"WAV parsing failed: {wav_error}")
-
-            # Additional format-specific checks
-            if debug_info['format_detected'] in ['ogg', 'webm'] and not debug_info['valid_wav']:
-                debug_info['issues'].append("Non-WAV format detected - requires conversion")
-
-            logger.debug(f"Audio validation: {debug_info}")
-
-            if debug_info['issues']:
-                self.stats['audio_validation_failures'] += 1
-                logger.warning(f"Audio validation issues: {debug_info['issues']}")
-
-            return debug_info
-
-        except Exception as e:
-            debug_info['issues'].append(f"Validation error: {str(e)}")
-            logger.error(f"Audio validation failed: {e}")
-            return debug_info
-
-    def detect_speech_segments(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """
-        Detect speech segments using multiple methods.
-
-        Args:
-            audio_data: Input audio data
-
-        Returns:
-            List of (segment_audio, segment_info) tuples
-        """
-        start_time = time.time()
-
-        # Validate and debug audio
-        debug_info = self.validate_and_debug_audio(audio_data)
-
-        segments = []
-
-        try:
-            # Preprocess with ffmpeg if available
-            processed_audio = self.preprocess_audio_with_ffmpeg(audio_data)
-            if processed_audio:
-                working_audio = processed_audio
-                logger.debug("Using ffmpeg-preprocessed audio for VAD")
-            else:
-                working_audio = audio_data
-                logger.debug("Using original audio for VAD")
-
-            # Re-validate processed audio
-            if processed_audio:
-                processed_debug = self.validate_and_debug_audio(processed_audio)
-                logger.debug(f"Processed audio validation: {processed_debug}")
-
-            # Method 1: WebRTC VAD (if available)
-            if self.webrtc_vad and debug_info['valid_wav']:
-                webrtc_segments = self._webrtc_vad_detection(working_audio)
-                segments.extend(webrtc_segments)
-                logger.debug(f"WebRTC VAD found {len(webrtc_segments)} segments")
-
-            # Method 2: Energy-based VAD (fallback)
-            if not segments or debug_info['issues']:
-                energy_segments = self._energy_based_vad(working_audio)
-                segments.extend(energy_segments)
-                logger.debug(f"Energy VAD found {len(energy_segments)} segments")
-
-            # Method 3: Simple duration-based segmentation (last resort)
-            if not segments and len(audio_data) > 8000:  # > 8KB
-                fallback_segment = self._create_fallback_segment(working_audio)
-                if fallback_segment:
-                    segments.append(fallback_segment)
-                    logger.debug("Used fallback segmentation")
-
-            processing_time = time.time() - start_time
-            self.stats['total_chunks_processed'] += 1
-            self.stats['speech_segments_detected'] += len(segments)
-            self.stats['processing_time_total'] += processing_time
-            self.stats['last_processing_time'] = processing_time
-
-            logger.debug(f"VAD processing complete: {len(segments)} segments in {processing_time:.3f}s")
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"Speech segment detection failed: {e}")
-            return []
-
-    def _webrtc_vad_detection(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """WebRTC-based speech detection."""
-        segments = []
-
-        try:
-            frame_size_bytes = self.frame_size * 2  # 16-bit = 2 bytes per sample
-            frames = []
-
-            # Extract frames
-            for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes):
-                frame = audio_data[i:i + frame_size_bytes]
-                if len(frame) == frame_size_bytes:
-                    frames.append(frame)
-
-            if len(frames) < 5:  # Need minimum frames
-                return segments
-
-            # VAD processing
-            speech_frames = []
-            for frame in frames:
-                try:
-                    is_speech = self.webrtc_vad.is_speech(frame, self.sample_rate)
-                    speech_frames.append((frame, is_speech))
-                except Exception as e:
-                    logger.debug(f"WebRTC VAD frame processing failed: {e}")
-                    speech_frames.append((frame, False))
-
-            # Group consecutive speech frames
-            current_segment = []
-            for frame, is_speech in speech_frames:
-                if is_speech:
-                    current_segment.append(frame)
-                else:
-                    if len(current_segment) > 0:
-                        # End of speech segment
-                        segment_audio = b''.join(current_segment)
-                        segment_duration = len(current_segment) * self.frame_duration_ms / 1000
-
-                        if segment_duration >= self.min_speech_duration:
-                            segments.append((segment_audio, {
-                                'duration': segment_duration,
-                                'method': 'webrtc_vad',
-                                'frames': len(current_segment)
-                            }))
-
-                        current_segment = []
-
-            # Handle final segment
-            if current_segment:
-                segment_audio = b''.join(current_segment)
-                segment_duration = len(current_segment) * self.frame_duration_ms / 1000
-
-                if segment_duration >= self.min_speech_duration:
-                    segments.append((segment_audio, {
-                        'duration': segment_duration,
-                        'method': 'webrtc_vad',
-                        'frames': len(current_segment)
-                    }))
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"WebRTC VAD detection failed: {e}")
-            return []
-
-    def _energy_based_vad(self, audio_data: bytes) -> List[Tuple[bytes, Dict[str, Any]]]:
-        """Energy-based speech detection."""
-        segments = []
-
-        try:
-            # Try to parse as WAV or raw PCM
-            try:
-                with wave.open(io.BytesIO(audio_data), 'rb') as wav:
-                    frames = wav.readframes(wav.getnframes())
-                    sample_rate = wav.getframerate()
-            except:
-                # Assume raw 16-bit PCM
-                frames = audio_data
-                sample_rate = self.sample_rate
-
-            if len(frames) < 1000:  # Too short
-                return segments
-
-            # Convert to numpy array
-            audio_samples = np.frombuffer(frames, dtype=np.int16)
-            audio_float = audio_samples.astype(np.float32) / 32768.0
-
-            # Calculate energy in overlapping windows
-            window_size = int(sample_rate * 0.1)  # 100ms windows
-            hop_size = window_size // 2
-
-            energies = []
-            for i in range(0, len(audio_float) - window_size, hop_size):
-                window = audio_float[i:i + window_size]
-                energy = np.sqrt(np.mean(window ** 2))
-                energies.append(energy)
-
-            if len(energies) < 3:
-                return segments
-
-            # Adaptive threshold
-            mean_energy = np.mean(energies)
-            threshold = max(self.silence_threshold, mean_energy * 0.3)
-
-            # Find speech segments
-            if isinstance(energies, (list, np.ndarray)):
-                energies = np.array(energies)  # Ensure it's a numpy array
-            speech_windows = energies > threshold
-
-            # Group consecutive speech windows
-            speech_start = None
-            for i, is_speech in enumerate(speech_windows):
-                if is_speech and speech_start is None:
-                    speech_start = i
-                elif not is_speech and speech_start is not None:
-                    # End of speech
-                    start_sample = speech_start * hop_size
-                    end_sample = min(i * hop_size + window_size, len(audio_samples))
-
-                    segment_samples = audio_samples[start_sample:end_sample]
-                    segment_duration = len(segment_samples) / sample_rate
-
-                    if segment_duration >= self.min_speech_duration:
-                        # Convert back to bytes
-                        segment_audio = segment_samples.tobytes()
-
-                        segments.append((segment_audio, {
-                            'duration': segment_duration,
-                            'method': 'energy_based',
-                            'start_time': start_sample / sample_rate,
-                            'energy_threshold': threshold,
-                            'mean_energy': mean_energy
-                        }))
-
-                    speech_start = None
-
-            return segments
-
-        except Exception as e:
-            logger.error(f"Energy-based VAD failed: {e}")
-            return []
-
-    def _create_fallback_segment(self, audio_data: bytes) -> Optional[Tuple[bytes, Dict[str, Any]]]:
-        """Create a fallback segment when VAD methods fail."""
-        try:
-            # Use the entire audio as a segment if it's reasonable length
-            debug_info = self.validate_and_debug_audio(audio_data)
-
-            if debug_info['duration'] > 0:
-                duration = debug_info['duration']
-            else:
-                # Estimate duration based on size (assume 16-bit, mono, 16kHz)
-                estimated_samples = len(audio_data) // 2
-                duration = estimated_samples / self.sample_rate
-
-            if self.min_speech_duration <= duration <= self.max_speech_duration:
-                return (audio_data, {
-                    'duration': duration,
-                    'method': 'fallback',
-                    'estimated': True,
-                    'issues': debug_info['issues']
-                })
-
-            return None
-
-        except Exception as e:
-            logger.error(f"Fallback segment creation failed: {e}")
-            return None
-
-    async def save_audio_chunk_async(self, audio_data: bytes, session_id: str,
-                                     chunk_type: str = "vad_chunk") -> Optional[str]:
-        """
-        Asynchronously save audio chunk to file.
-
-        Args:
-            audio_data: Audio data to save
-            session_id: Session identifier
-            chunk_type: Type of chunk (for filename)
-
-        Returns:
-            Path to saved file or None if failed
-        """
-        def _save_chunk():
-            try:
-                with self.save_lock:
-                    timestamp = int(time.time() * 1000)
-                    filename = f"{chunk_type}_{session_id}_{timestamp}.wav"
-                    filepath = Path("output") / filename
-
-                    # Ensure output directory exists
-                    filepath.parent.mkdir(exist_ok=True)
-
-                    # Save as WAV file
-                    with open(filepath, 'wb') as f:
-                        f.write(audio_data)
-
-                    logger.debug(f"Saved audio chunk: {filepath}")
-                    return str(filepath)
-
-            except Exception as e:
-                logger.error(f"Failed to save audio chunk: {e}")
-                return None
-
-        # Run in executor to avoid blocking
-        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(self.executor, _save_chunk)
-        return result
-
-    def get_stats(self) -> Dict[str, Any]:
-        """Get comprehensive VAD statistics."""
-        stats = self.stats.copy()
-
-        if stats['total_chunks_processed'] > 0:
-            stats['average_processing_time'] = stats['processing_time_total'] / stats['total_chunks_processed']
-            stats['segments_per_chunk'] = stats['speech_segments_detected'] / stats['total_chunks_processed']
-        else:
-            stats['average_processing_time'] = 0.0
-            stats['segments_per_chunk'] = 0.0
-
-        return stats
-
-    def cleanup(self):
-        """Clean up resources."""
-        if hasattr(self, 'executor'):
-            self.executor.shutdown(wait=True)
-        logger.info("Enhanced VAD cleaned up")
-
-# Convenience function for creating enhanced VAD
-def create_enhanced_vad(config: Optional[Dict[str, Any]] = None) -> EnhancedVAD:
-    """Create enhanced VAD with optional configuration."""
-    if config is None:
-        config = {}
-
-    return EnhancedVAD(
-        sample_rate=config.get('sample_rate', 16000),
-        frame_duration_ms=config.get('frame_duration_ms', 30),
-        aggressiveness=config.get('aggressiveness', 1),
-        min_speech_duration=config.get('min_speech_duration', 0.4),
-        max_speech_duration=config.get('max_speech_duration', 3.0),
-        silence_threshold=config.get('silence_threshold', 0.01)
-    )
utils/session_manager.py DELETED
@@ -1,340 +0,0 @@
1
- """
2
- Session Management for Audio Chunk Storage
3
- Handles session creation, audio chunk saving, and folder organization
4
- """
5
-
6
- import os
7
- import time
8
- import uuid
9
- import logging
10
- import wave
11
- import numpy as np
12
- from typing import Dict, Optional, List
13
- from pathlib import Path
14
- import json
15
- import threading
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
- class SessionManager:
20
- """
21
- Manages audio recording sessions with systematic file storage.
22
- Each session gets a unique ID and folder for organized chunk storage.
23
- """
24
-
25
- def __init__(self, base_output_dir: str = "output"):
26
- """
27
- Initialize session manager.
28
-
29
- Args:
30
- base_output_dir: Base directory for all session outputs
31
- """
32
- self.base_output_dir = Path(base_output_dir)
33
- self.base_output_dir.mkdir(exist_ok=True)
34
-
35
- # Active sessions tracking
36
- self.active_sessions: Dict[str, 'AudioSession'] = {}
37
- self.lock = threading.Lock()
38
-
39
- logger.info(f"Session manager initialized with output directory: {self.base_output_dir}")
40
-
41
- def create_session(self, session_id: Optional[str] = None) -> str:
42
- """
43
- Create a new audio recording session.
44
-
45
- Args:
46
- session_id: Optional custom session ID, otherwise auto-generated
47
-
48
- Returns:
49
- str: Session ID
50
- """
51
- if not session_id:
52
- # Generate session ID with timestamp and short UUID
53
- timestamp = int(time.time())
54
- short_uuid = str(uuid.uuid4())[:8]
55
- session_id = f"session{timestamp}_{short_uuid}"
56
-
57
- with self.lock:
58
- if session_id in self.active_sessions:
59
- logger.warning(f"Session {session_id} already exists, returning existing session")
60
- return session_id
61
-
62
- # Create session object
63
- session = AudioSession(session_id, self.base_output_dir)
64
- self.active_sessions[session_id] = session
65
-
66
- logger.info(f"Created new session: {session_id}")
67
- return session_id
68
-
69
- def get_session(self, session_id: str) -> Optional['AudioSession']:
70
- """Get an existing session by ID."""
71
- with self.lock:
72
- return self.active_sessions.get(session_id)
73
-
74
- def close_session(self, session_id: str) -> bool:
75
- """
76
- Close and finalize a session.
77
-
78
- Args:
79
- session_id: Session to close
80
-
81
- Returns:
82
- bool: True if session was closed successfully
83
- """
84
- with self.lock:
85
- if session_id not in self.active_sessions:
86
- logger.warning(f"Session {session_id} not found")
87
- return False
88
-
89
- session = self.active_sessions[session_id]
90
- session.finalize()
91
- del self.active_sessions[session_id]
92
-
93
- logger.info(f"Closed session: {session_id} ({session.chunk_count} chunks saved)")
94
- return True
95
-
96
- def cleanup_old_sessions(self, max_age_hours: int = 24) -> int:
97
- """
98
- Clean up sessions older than specified hours.
99
-
100
- Args:
101
- max_age_hours: Maximum age in hours before cleanup
102
-
103
- Returns:
104
- int: Number of sessions cleaned up
105
- """
106
- cutoff_time = time.time() - (max_age_hours * 3600)
107
- cleaned_count = 0
108
-
109
- # Find old session folders
110
- for session_dir in self.base_output_dir.iterdir():
111
- if not session_dir.is_dir() or not session_dir.name.startswith('session'):
112
- continue
113
-
114
- try:
115
- # Check if session has a metadata file with creation time
116
- metadata_file = session_dir / "session_info.json"
117
- if metadata_file.exists():
118
- with open(metadata_file, 'r') as f:
119
- metadata = json.load(f)
120
- if metadata.get('created_at', 0) < cutoff_time:
121
- import shutil
122
- shutil.rmtree(session_dir)
123
- cleaned_count += 1
124
- logger.info(f"Cleaned up old session: {session_dir.name}")
125
- else:
126
- # Fallback to directory modification time
127
- if session_dir.stat().st_mtime < cutoff_time:
128
- import shutil
129
- shutil.rmtree(session_dir)
130
- cleaned_count += 1
131
- logger.info(f"Cleaned up old session: {session_dir.name}")
132
-
133
- except Exception as e:
134
- logger.error(f"Error cleaning up session {session_dir.name}: {e}")
135
-
136
- if cleaned_count > 0:
137
- logger.info(f"Cleaned up {cleaned_count} old sessions")
138
-
139
- return cleaned_count
140
-
141
- def get_session_stats(self) -> Dict:
142
- """Get statistics about all sessions."""
143
- with self.lock:
144
- stats = {
145
- 'active_sessions': len(self.active_sessions),
146
- 'total_chunks_active': sum(s.chunk_count for s in self.active_sessions.values()),
147
- 'session_details': {
148
- sid: {
149
- 'chunk_count': session.chunk_count,
150
- 'created_at': session.created_at,
151
- 'folder_path': str(session.session_dir)
152
- }
153
- for sid, session in self.active_sessions.items()
154
- }
155
- }
156
-
157
- # Count total session folders
158
- total_session_dirs = len([
159
- d for d in self.base_output_dir.iterdir()
160
- if d.is_dir() and d.name.startswith('session')
161
- ])
162
- stats['total_session_folders'] = total_session_dirs
163
-
164
- return stats
165
-
166
-
167
- class AudioSession:
168
- """
169
- Represents a single audio recording session with systematic chunk storage.
170
- """
171
-
172
- def __init__(self, session_id: str, base_output_dir: Path):
173
- """
174
- Initialize audio session.
175
-
176
- Args:
177
- session_id: Unique session identifier
178
- base_output_dir: Base directory for output
179
- """
180
- self.session_id = session_id
181
- self.created_at = time.time()
182
- self.chunk_count = 0
183
-
184
- # Create session directory
185
- self.session_dir = base_output_dir / session_id
186
- self.session_dir.mkdir(exist_ok=True)
187
-
188
- # Create subdirectories
189
- self.chunks_dir = self.session_dir / "chunks"
190
- self.chunks_dir.mkdir(exist_ok=True)
191
-
192
- # Session metadata
193
- self.metadata = {
194
- 'session_id': session_id,
195
- 'created_at': self.created_at,
196
- 'created_at_human': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.created_at)),
197
- 'chunk_count': 0,
198
- 'chunks': []
199
- }
200
-
201
- self._save_metadata()
202
- logger.info(f"Session folder created: {self.session_dir}")
203
-
204
- def save_audio_chunk(self, audio_data: bytes, prediction_result: Optional[Dict] = None,
205
- chunk_type: str = "speech") -> str:
206
- """
207
- Save an audio chunk to the session folder.
208
-
209
- Args:
210
- audio_data: Raw audio bytes (WAV format preferred)
211
- prediction_result: Optional prediction results to save alongside
212
- chunk_type: Type of chunk ("speech", "vad_segment", "raw", etc.)
213
-
214
- Returns:
215
- str: Path to saved chunk file
216
- """
217
- self.chunk_count += 1
218
-
219
- # Generate chunk filename
220
- chunk_filename = f"{self.chunk_count:03d}.wav"
221
- chunk_path = self.chunks_dir / chunk_filename
222
-
223
- try:
224
- # Save audio data
225
- if self._is_wav_format(audio_data):
226
- # Already WAV format, save directly
227
- with open(chunk_path, 'wb') as f:
228
- f.write(audio_data)
229
- logger.debug(f"Saved WAV chunk: {chunk_path}")
230
- else:
231
- # Convert raw PCM to WAV
232
- self._save_pcm_as_wav(audio_data, chunk_path)
233
- logger.debug(f"Converted and saved PCM chunk: {chunk_path}")
234
-
235
- # Update metadata
236
- chunk_info = {
237
- 'chunk_id': self.chunk_count,
238
- 'filename': chunk_filename,
239
- 'chunk_type': chunk_type,
240
- 'size_bytes': len(audio_data),
241
- 'saved_at': time.time(),
242
- 'saved_at_human': time.strftime('%Y-%m-%d %H:%M:%S'),
243
- 'audio_format': 'wav' if self._is_wav_format(audio_data) else 'pcm_converted'
244
- }
245
-
246
- # Add prediction results if provided
247
- if prediction_result:
248
- chunk_info['prediction'] = prediction_result
249
-
250
- self.metadata['chunks'].append(chunk_info)
251
- self.metadata['chunk_count'] = self.chunk_count
252
- self._save_metadata()
253
-
254
- logger.info(f"Saved audio chunk {self.chunk_count}: {chunk_path}")
255
- return str(chunk_path)
256
-
257
- except Exception as e:
258
- logger.error(f"Failed to save audio chunk {self.chunk_count}: {e}")
259
- # Rollback chunk count on failure
260
- self.chunk_count -= 1
261
- raise
262
-
263
- def _is_wav_format(self, audio_data: bytes) -> bool:
264
- """Check if audio data is in WAV format."""
265
- return audio_data.startswith(b'RIFF') and b'WAVE' in audio_data[:12]
266
-
267
- def _save_pcm_as_wav(self, pcm_data: bytes, output_path: Path,
268
- sample_rate: int = 16000, channels: int = 1, sample_width: int = 2):
269
- """
270
- Convert raw PCM data to WAV format and save.
271
-
272
- Args:
273
- pcm_data: Raw PCM bytes
274
- output_path: Output WAV file path
275
- sample_rate: Sample rate (default 16kHz for speech)
276
- channels: Number of channels (default mono)
277
- sample_width: Sample width in bytes (default 16-bit)
278
- """
279
- try:
280
- with wave.open(str(output_path), 'wb') as wav_file:
281
- wav_file.setnchannels(channels)
282
- wav_file.setsampwidth(sample_width)
283
- wav_file.setframerate(sample_rate)
284
- wav_file.writeframes(pcm_data)
285
-
286
- except Exception as e:
287
- logger.error(f"PCM to WAV conversion failed: {e}")
288
- # Fallback: save as raw PCM with .pcm extension
289
- raw_path = output_path.with_suffix('.pcm')
290
- with open(raw_path, 'wb') as f:
291
- f.write(pcm_data)
292
- logger.warning(f"Saved as raw PCM instead: {raw_path}")
293
-
294
- def _save_metadata(self):
295
- """Save session metadata to JSON file."""
296
- try:
297
- metadata_path = self.session_dir / "session_info.json"
298
- with open(metadata_path, 'w') as f:
299
- json.dump(self.metadata, f, indent=2, default=str)
300
- except Exception as e:
301
- logger.error(f"Failed to save session metadata: {e}")
302
-
303
- def finalize(self):
304
- """Finalize the session and save final metadata."""
305
- self.metadata['finalized_at'] = time.time()
306
- self.metadata['finalized_at_human'] = time.strftime('%Y-%m-%d %H:%M:%S')
307
- self.metadata['final_chunk_count'] = self.chunk_count
308
- self._save_metadata()
309
-
310
- logger.info(f"📋 Finalized session {self.session_id}: {self.chunk_count} chunks saved")
311
-
312
- def get_chunk_list(self) -> List[str]:
313
- """Get list of all chunk files in order."""
314
- chunk_files = []
315
- for i in range(1, self.chunk_count + 1):
316
- chunk_file = self.chunks_dir / f"{i:03d}.wav"
317
- if chunk_file.exists():
318
- chunk_files.append(str(chunk_file))
319
- else:
320
- # Check for .pcm fallback
321
- pcm_file = self.chunks_dir / f"{i:03d}.pcm"
322
- if pcm_file.exists():
323
- chunk_files.append(str(pcm_file))
324
- return chunk_files
325
-
326
- def get_session_summary(self) -> Dict:
327
- """Get comprehensive session summary."""
328
- return {
329
- 'session_id': self.session_id,
330
- 'created_at': self.created_at,
331
- 'chunk_count': self.chunk_count,
332
- 'session_dir': str(self.session_dir),
333
- 'chunks_dir': str(self.chunks_dir),
334
- 'chunk_files': self.get_chunk_list(),
335
- 'metadata': self.metadata
336
- }
337
-
338
-
339
- # Global session manager instance
340
- session_manager = SessionManager()
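
For reference, a minimal sketch of the removed session API, based on the deleted code above; wav_bytes and the prediction payload are illustrative placeholders:

from utils.session_manager import SessionManager  # removed in this commit

manager = SessionManager(base_output_dir='output')
sid = manager.create_session()                 # e.g. 'session1712345678_ab12cd34'
session = manager.get_session(sid)
wav_bytes = b'\x00' * 3200                     # placeholder; raw PCM is auto-converted to WAV
session.save_audio_chunk(wav_bytes,
                         prediction_result={'digit': 7, 'confidence': 0.93},
                         chunk_type='speech')  # -> output/<sid>/chunks/001.wav
manager.close_session(sid)
manager.cleanup_old_sessions(max_age_hours=24)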
utils/vad.py DELETED
@@ -1,149 +0,0 @@
1
- """
2
- Voice Activity Detection (VAD) for streaming audio processing
3
- Detects speech segments and trims silence
4
- """
5
-
6
- import numpy as np
7
- import logging
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- class VoiceActivityDetector:
12
- """Simple voice activity detector based on energy and zero-crossing rate."""
13
-
14
- def __init__(self):
15
- self.sample_rate = 16000
16
- self.frame_size = 512 # ~32ms frames at 16kHz
17
- self.hop_size = 256 # 50% overlap
18
-
19
- # VAD thresholds
20
- self.energy_threshold = 0.01 # Minimum energy for speech
21
- self.zcr_threshold = 0.3 # Zero crossing rate threshold
22
- self.min_speech_frames = 5 # Minimum frames for speech detection
23
- self.min_silence_frames = 8 # Minimum silence frames to end speech
24
-
25
- # State tracking
26
- self.is_speech_active = False
27
- self.speech_frames = 0
28
- self.silence_frames = 0
29
- self.speech_buffer = []
- self.speech_start_idx = 0  # guard: read by get_current_speech_segment() before any speech is detected
30
-
31
- logger.info("Voice Activity Detector initialized")
32
-
33
- def reset(self):
34
- """Reset VAD state."""
35
- self.is_speech_active = False
36
- self.speech_frames = 0
37
- self.silence_frames = 0
38
- self.speech_buffer = []
- self.speech_start_idx = 0
39
-
40
- def compute_energy(self, frame):
41
- """Compute energy of audio frame."""
42
- return np.mean(frame ** 2)
43
-
44
- def compute_zcr(self, frame):
45
- """Compute zero crossing rate of audio frame."""
46
- zcr = np.sum(np.abs(np.diff(np.sign(frame)))) / (2 * len(frame))
47
- return zcr
48
-
49
- def is_speech_frame(self, frame):
50
- """Determine if frame contains speech."""
51
- energy = self.compute_energy(frame)
52
- zcr = self.compute_zcr(frame)
53
-
54
- # Simple rule: speech has moderate energy and ZCR
55
- has_energy = energy > self.energy_threshold
56
- has_reasonable_zcr = zcr < self.zcr_threshold
57
-
58
- return has_energy and has_reasonable_zcr
59
-
60
- def process_chunk(self, audio_data):
61
- """
62
- Process audio chunk and return speech segments.
63
-
64
- Args:
65
- audio_data: numpy array of audio samples
66
-
67
- Returns:
68
- List of (start_sample, end_sample) tuples for speech segments
69
- """
70
- if len(audio_data) == 0:
71
- return []
72
-
73
- speech_segments = []
74
- num_frames = (len(audio_data) - self.frame_size) // self.hop_size + 1
75
-
76
- for i in range(num_frames):
77
- start_idx = i * self.hop_size
78
- end_idx = start_idx + self.frame_size
79
-
80
- if end_idx > len(audio_data):
81
- break
82
-
83
- frame = audio_data[start_idx:end_idx]
84
- is_speech = self.is_speech_frame(frame)
85
-
86
- if is_speech:
87
- self.speech_frames += 1
88
- self.silence_frames = 0
89
-
90
- if not self.is_speech_active and self.speech_frames >= self.min_speech_frames:
91
- # Speech started
92
- self.is_speech_active = True
93
- self.speech_start_idx = max(0, start_idx - self.min_speech_frames * self.hop_size)
94
- logger.debug(f"Speech started at sample {self.speech_start_idx}")
95
-
96
- else:
97
- self.silence_frames += 1
98
-
99
- if self.is_speech_active and self.silence_frames >= self.min_silence_frames:
100
- # Speech ended
101
- speech_end_idx = start_idx
102
- speech_segments.append((self.speech_start_idx, speech_end_idx))
103
- logger.debug(f"Speech ended at sample {speech_end_idx}")
104
-
105
- # Reset for next speech segment
106
- self.is_speech_active = False
107
- self.speech_frames = 0
108
- self.silence_frames = 0
109
-
110
- return speech_segments
111
-
112
- def extract_speech_segments(self, audio_data, segments):
113
- """Extract speech segments from audio data."""
114
- speech_chunks = []
115
-
116
- for start_idx, end_idx in segments:
117
- if end_idx > start_idx:
118
- segment = audio_data[start_idx:end_idx]
119
- # Trim silence from edges
120
- segment = self.trim_silence(segment)
121
- if len(segment) > self.sample_rate * 0.3: # At least 300ms
122
- speech_chunks.append(segment)
123
-
124
- return speech_chunks
125
-
126
- def trim_silence(self, audio_data, silence_threshold=0.01):
127
- """Trim silence from beginning and end of audio."""
128
- if len(audio_data) == 0:
129
- return audio_data
130
-
131
- # Find first and last non-silent samples
132
- energy = audio_data ** 2
133
- non_silent = energy > silence_threshold
134
-
135
- if not np.any(non_silent):
136
- return audio_data # All silence, return as is
137
-
138
- first_sound = np.argmax(non_silent)
139
- last_sound = len(non_silent) - np.argmax(non_silent[::-1]) - 1
140
-
141
- return audio_data[first_sound:last_sound + 1]
142
-
143
- def get_current_speech_segment(self, audio_data):
144
- """Get current ongoing speech segment if any."""
145
- if self.is_speech_active and len(audio_data) > 0:
146
- current_segment = audio_data[self.speech_start_idx:]
147
- if len(current_segment) > self.sample_rate * 0.5: # At least 500ms
148
- return self.trim_silence(current_segment)
149
- return None
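
A minimal sketch of the removed energy/ZCR detector in use, based on the deleted code above; the zeros array is a placeholder for real 16 kHz float32 audio:

import numpy as np
from utils.vad import VoiceActivityDetector  # removed in this commit

audio = np.zeros(16000, dtype=np.float32)    # placeholder: 1 s of silence
vad = VoiceActivityDetector()
segments = vad.process_chunk(audio)                    # [(start_sample, end_sample), ...]
chunks = vad.extract_speech_segments(audio, segments)  # trimmed segments of at least ~300 ms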
utils/vad_feature_integration.py DELETED
@@ -1,483 +0,0 @@
1
- """
2
- Integration module for WebRTC VAD with MFCC and Spectrogram processors
3
- Combines voice activity detection with real-time feature extraction
4
- """
5
-
6
- import numpy as np
7
- import librosa
8
- import logging
9
- from typing import Dict, List, Optional, Tuple
10
- import time
11
- from collections import deque
12
- import threading
13
- import queue
14
-
15
- from utils.webrtc_vad import WebRTCVADProcessor
16
- from audio_processors.mfcc_processor import MFCCProcessor
17
- from audio_processors.mel_spectrogram import MelSpectrogramProcessor
18
- from audio_processors.raw_spectrogram import RawSpectrogramProcessor
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
- class StreamingFeatureExtractor:
23
- """
24
- Real-time feature extraction with VAD integration.
25
- Combines WebRTC VAD with MFCC, Mel Spectrogram, and Raw Spectrogram processing.
26
- """
27
-
28
- def __init__(self, sample_rate=16000, n_mfcc=13, n_fft=2048, hop_length=512):
29
- """
30
- Initialize streaming feature extractor.
31
-
32
- Args:
33
- sample_rate: Audio sample rate
34
- n_mfcc: Number of MFCC coefficients
35
- n_fft: FFT window size
36
- hop_length: Hop length for STFT
37
- """
38
- self.sample_rate = sample_rate
39
- self.n_mfcc = n_mfcc
40
- self.n_fft = n_fft
41
- self.hop_length = hop_length
42
-
43
- # Initialize VAD processor
44
- self.vad_processor = WebRTCVADProcessor(
45
- aggressiveness=2,
46
- sample_rate=sample_rate,
47
- frame_duration=30
48
- )
49
-
50
- # Initialize feature processors
51
- self.mfcc_processor = MFCCProcessor()
52
- self.mel_processor = MelSpectrogramProcessor()
53
- self.raw_spec_processor = RawSpectrogramProcessor()
54
-
55
- # Buffers for overlapped processing
56
- self.audio_buffer = deque(maxlen=sample_rate * 2) # 2 second buffer
57
- self.feature_buffer = deque(maxlen=100) # Store recent feature vectors
58
-
59
- # Threading for real-time processing
60
- self.processing_queue = queue.Queue(maxsize=100)  # bounded so put_nowait() can raise queue.Full below; cap of 100 is arbitrary
61
- self.feature_queue = queue.Queue(maxsize=100)
62
- self.is_processing = False
63
- self.processing_thread = None
64
-
65
- # Statistics
66
- self.total_chunks_processed = 0
67
- self.features_extracted = 0
68
- self.speech_segments_processed = 0
69
-
70
- logger.info("Streaming Feature Extractor initialized")
71
-
72
- def extract_features_realtime(self, audio_chunk: bytes) -> List[Dict[str, np.ndarray]]:
73
- """
74
- Extract features from streaming audio chunk with VAD.
75
-
76
- Args:
77
- audio_chunk: Raw audio bytes
78
-
79
- Returns:
80
- List[dict]: Extracted feature dictionaries, one per detected speech segment
81
- """
82
- # Process with VAD first
83
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
84
-
85
- features_list = []
86
-
87
- for segment in speech_segments:
88
- # Convert bytes to numpy array
89
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
90
-
91
- # Extract comprehensive features
92
- features = self._compute_streaming_features(audio_array)
93
-
94
- if features:
95
- features_list.append(features)
96
- self.features_extracted += 1
97
-
98
- self.total_chunks_processed += 1
99
-
100
- if speech_segments:
101
- self.speech_segments_processed += len(speech_segments)
102
- logger.debug(f"Extracted features from {len(speech_segments)} speech segments")
103
-
104
- return features_list
105
-
106
- def _compute_streaming_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]:
107
- """
108
- Compute comprehensive feature set optimized for streaming.
109
-
110
- Args:
111
- audio_data: Audio samples as numpy array
112
-
113
- Returns:
114
- dict: Feature dictionary or None if extraction fails
115
- """
116
- try:
117
- if len(audio_data) < self.n_fft:
118
- logger.debug("Audio segment too short for feature extraction")
119
- return None
120
-
121
- features = {}
122
-
123
- # Core MFCC features
124
- mfccs = librosa.feature.mfcc(
125
- y=audio_data,
126
- sr=self.sample_rate,
127
- n_mfcc=self.n_mfcc,
128
- n_fft=self.n_fft,
129
- hop_length=self.hop_length
130
- )
131
-
132
- # Statistical summaries for streaming
133
- features['mfcc_mean'] = np.mean(mfccs, axis=1)
134
- features['mfcc_std'] = np.std(mfccs, axis=1)
135
- features['mfcc_delta'] = np.mean(librosa.feature.delta(mfccs), axis=1)
136
- features['mfcc_delta2'] = np.mean(librosa.feature.delta(mfccs, order=2), axis=1)
137
-
138
- # Spectral features
139
- features['spectral_centroid'] = np.mean(
140
- librosa.feature.spectral_centroid(y=audio_data, sr=self.sample_rate)
141
- )
142
- features['spectral_bandwidth'] = np.mean(
143
- librosa.feature.spectral_bandwidth(y=audio_data, sr=self.sample_rate)
144
- )
145
- features['spectral_rolloff'] = np.mean(
146
- librosa.feature.spectral_rolloff(y=audio_data, sr=self.sample_rate)
147
- )
148
- features['zero_crossing_rate'] = np.mean(
149
- librosa.feature.zero_crossing_rate(audio_data)
150
- )
151
-
152
- # Energy features
153
- features['rms_energy'] = np.mean(librosa.feature.rms(y=audio_data))
154
-
155
- # Mel spectrogram features
156
- mel_spec = librosa.feature.melspectrogram(
157
- y=audio_data,
158
- sr=self.sample_rate,
159
- n_mels=40, # Reduced for streaming
160
- n_fft=self.n_fft,
161
- hop_length=self.hop_length
162
- )
163
- features['mel_spec_mean'] = np.mean(mel_spec, axis=1)
164
- features['mel_spec_std'] = np.std(mel_spec, axis=1)
165
-
166
- # Raw spectrogram features
167
- stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length)
168
- magnitude_spec = np.abs(stft)
169
- features['raw_spec_mean'] = np.mean(magnitude_spec, axis=1)
170
- features['raw_spec_std'] = np.std(magnitude_spec, axis=1)
171
-
172
- # Harmonic and percussive components
173
- harmonic, percussive = librosa.effects.hpss(audio_data)
174
- features['harmonic_ratio'] = np.mean(harmonic ** 2) / (np.mean(audio_data ** 2) + 1e-8)
175
- features['percussive_ratio'] = np.mean(percussive ** 2) / (np.mean(audio_data ** 2) + 1e-8)
176
-
177
- # Tempo and rhythm features (simplified for streaming)
178
- tempo, _ = librosa.beat.beat_track(y=audio_data, sr=self.sample_rate)
179
- features['tempo'] = float(np.atleast_1d(tempo)[0])  # beat_track may return a 1-element array
180
-
181
- # Add metadata
182
- features['_metadata'] = {
183
- 'duration': len(audio_data) / self.sample_rate,
184
- 'sample_rate': self.sample_rate,
185
- 'n_samples': len(audio_data),
186
- 'extraction_timestamp': time.time()
187
- }
188
-
189
- return features
190
-
191
- except Exception as e:
192
- logger.error(f"Feature extraction error: {e}")
193
- return None
194
-
195
- def extract_mfcc_features(self, audio_data: np.ndarray) -> Optional[np.ndarray]:
196
- """
197
- Extract only MFCC features for lightweight processing.
198
-
199
- Args:
200
- audio_data: Audio samples
201
-
202
- Returns:
203
- np.ndarray: MFCC feature vector
204
- """
205
- try:
206
- mfccs = librosa.feature.mfcc(
207
- y=audio_data,
208
- sr=self.sample_rate,
209
- n_mfcc=self.n_mfcc,
210
- n_fft=self.n_fft,
211
- hop_length=self.hop_length
212
- )
213
- return np.mean(mfccs, axis=1)
214
- except Exception as e:
215
- logger.error(f"MFCC extraction error: {e}")
216
- return None
217
-
218
- def extract_spectrogram_features(self, audio_data: np.ndarray) -> Optional[Dict[str, np.ndarray]]:
219
- """
220
- Extract spectrogram-based features.
221
-
222
- Args:
223
- audio_data: Audio samples
224
-
225
- Returns:
226
- dict: Spectrogram features
227
- """
228
- try:
229
- # Mel spectrogram
230
- mel_spec = librosa.feature.melspectrogram(
231
- y=audio_data,
232
- sr=self.sample_rate,
233
- n_mels=80,
234
- n_fft=self.n_fft,
235
- hop_length=self.hop_length
236
- )
237
-
238
- # Raw spectrogram
239
- stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length)
240
- magnitude_spec = np.abs(stft)
241
-
242
- return {
243
- 'mel_spectrogram': mel_spec,
244
- 'mel_spec_db': librosa.power_to_db(mel_spec),
245
- 'raw_spectrogram': magnitude_spec,
246
- 'raw_spec_db': librosa.amplitude_to_db(magnitude_spec)
247
- }
248
- except Exception as e:
249
- logger.error(f"Spectrogram extraction error: {e}")
250
- return None
251
-
252
- def process_with_vad_and_features(self, audio_chunk: bytes, feature_type: str = 'all') -> List[Dict]:
253
- """
254
- Process audio chunk with VAD and extract specified features.
255
-
256
- Args:
257
- audio_chunk: Raw audio bytes
258
- feature_type: Type of features to extract ('mfcc', 'spectrogram', 'all')
259
-
260
- Returns:
261
- List[dict]: Feature results for each speech segment
262
- """
263
- # Get speech segments from VAD
264
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
265
-
266
- results = []
267
-
268
- for i, segment in enumerate(speech_segments):
269
- # Convert to numpy array
270
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
271
-
272
- segment_result = {
273
- 'segment_index': i,
274
- 'segment_duration': len(audio_array) / self.sample_rate,
275
- 'segment_samples': len(audio_array)
276
- }
277
-
278
- # Extract requested features
279
- if feature_type == 'mfcc':
280
- mfcc_features = self.extract_mfcc_features(audio_array)
281
- if mfcc_features is not None:
282
- segment_result['mfcc'] = mfcc_features
283
-
284
- elif feature_type == 'spectrogram':
285
- spec_features = self.extract_spectrogram_features(audio_array)
286
- if spec_features is not None:
287
- segment_result.update(spec_features)
288
-
289
- elif feature_type == 'all':
290
- comprehensive_features = self._compute_streaming_features(audio_array)
291
- if comprehensive_features is not None:
292
- segment_result.update(comprehensive_features)
293
-
294
- results.append(segment_result)
295
-
296
- return results
297
-
298
- def start_streaming_processing(self):
299
- """Start background thread for streaming processing."""
300
- if self.is_processing:
301
- return
302
-
303
- self.is_processing = True
304
- self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True)
305
- self.processing_thread.start()
306
- logger.info("Started streaming feature processing")
307
-
308
- def stop_streaming_processing(self):
309
- """Stop background streaming processing."""
310
- self.is_processing = False
311
- if self.processing_thread:
312
- self.processing_thread.join(timeout=1.0)
313
- logger.info("Stopped streaming feature processing")
314
-
315
- def add_audio_chunk(self, audio_chunk: bytes, feature_type: str = 'all'):
316
- """
317
- Add audio chunk to processing queue.
318
-
319
- Args:
320
- audio_chunk: Raw audio bytes
321
- feature_type: Type of features to extract
322
- """
323
- if self.is_processing:
324
- try:
325
- self.processing_queue.put_nowait((audio_chunk, feature_type))
326
- except queue.Full:
327
- logger.warning("Processing queue full, dropping chunk")
328
-
329
- def get_feature_results(self) -> List[Dict]:
330
- """
331
- Get all available feature extraction results.
332
-
333
- Returns:
334
- List[dict]: Available feature results
335
- """
336
- results = []
337
- try:
338
- while True:
339
- result = self.feature_queue.get_nowait()
340
- results.append(result)
341
- except queue.Empty:
342
- pass
343
- return results
344
-
345
- def _streaming_worker(self):
346
- """Background worker for streaming feature processing."""
347
- while self.is_processing:
348
- try:
349
- # Get audio chunk with timeout
350
- audio_chunk, feature_type = self.processing_queue.get(timeout=0.1)
351
-
352
- # Process chunk
353
- start_time = time.time()
354
- results = self.process_with_vad_and_features(audio_chunk, feature_type)
355
- processing_time = time.time() - start_time
356
-
357
- # Add processing metadata
358
- for result in results:
359
- result['processing_time'] = processing_time
360
- result['timestamp'] = time.time()
361
-
362
- # Add results to output queue
363
- for result in results:
364
- try:
365
- self.feature_queue.put_nowait(result)
366
- except queue.Full:
367
- logger.warning("Feature queue full, dropping result")
368
-
369
- except queue.Empty:
370
- continue
371
- except Exception as e:
372
- logger.error(f"Streaming feature processing error: {e}")
373
-
374
- def get_stats(self) -> Dict:
375
- """
376
- Get feature extraction statistics.
377
-
378
- Returns:
379
- dict: Processing statistics
380
- """
381
- vad_stats = self.vad_processor.get_stats()
382
-
383
- return {
384
- 'total_chunks_processed': self.total_chunks_processed,
385
- 'features_extracted': self.features_extracted,
386
- 'speech_segments_processed': self.speech_segments_processed,
387
- 'vad_stats': vad_stats,
388
- 'is_processing': self.is_processing,
389
- 'queue_sizes': {
390
- 'processing_queue': self.processing_queue.qsize(),
391
- 'feature_queue': self.feature_queue.qsize()
392
- }
393
- }
394
-
395
- def reset_state(self):
396
- """Reset all processing state."""
397
- self.vad_processor.reset_state()
398
- self.audio_buffer.clear()
399
- self.feature_buffer.clear()
400
-
401
- # Clear queues
402
- while not self.processing_queue.empty():
403
- try:
404
- self.processing_queue.get_nowait()
405
- except queue.Empty:
406
- break
407
-
408
- while not self.feature_queue.empty():
409
- try:
410
- self.feature_queue.get_nowait()
411
- except queue.Empty:
412
- break
413
-
414
- logger.info("Feature extractor state reset")
415
-
416
- class VADMFCCProcessor:
417
- """
418
- Simplified VAD + MFCC processor for digit recognition.
419
- Optimized for low-latency real-time processing.
420
- """
421
-
422
- def __init__(self, sample_rate=16000, n_mfcc=13):
423
- """Initialize VAD + MFCC processor."""
424
- self.sample_rate = sample_rate
425
- self.n_mfcc = n_mfcc
426
-
427
- self.vad_processor = WebRTCVADProcessor(
428
- aggressiveness=1, # Less aggressive for better digit detection
429
- sample_rate=sample_rate,
430
- frame_duration=30
431
- )
432
-
433
- self.features_extracted = 0
434
-
435
- logger.info("VAD-MFCC processor initialized")
436
-
437
- def process_audio_for_digit_recognition(self, audio_chunk: bytes) -> List[np.ndarray]:
438
- """
439
- Process audio chunk and extract MFCC features from speech segments.
440
-
441
- Args:
442
- audio_chunk: Raw audio bytes
443
-
444
- Returns:
445
- List[np.ndarray]: MFCC feature vectors for each speech segment
446
- """
447
- # Get speech segments
448
- speech_segments = self.vad_processor.process_audio_chunk(audio_chunk)
449
-
450
- mfcc_features = []
451
-
452
- for segment in speech_segments:
453
- # Convert to numpy array
454
- audio_array = np.frombuffer(segment, dtype=np.int16).astype(np.float32) / 32768.0
455
-
456
- # Extract MFCC features
457
- try:
458
- mfccs = librosa.feature.mfcc(
459
- y=audio_array,
460
- sr=self.sample_rate,
461
- n_mfcc=self.n_mfcc,
462
- n_fft=1024, # Smaller FFT for faster processing
463
- hop_length=256
464
- )
465
-
466
- # Use mean across time for simplicity
467
- mfcc_mean = np.mean(mfccs, axis=1)
468
- mfcc_features.append(mfcc_mean)
469
- self.features_extracted += 1
470
-
471
- except Exception as e:
472
- logger.error(f"MFCC extraction failed: {e}")
473
-
474
- return mfcc_features
475
-
476
- def get_stats(self) -> Dict:
477
- """Get processing statistics."""
478
- vad_stats = self.vad_processor.get_stats()
479
-
480
- return {
481
- 'features_extracted': self.features_extracted,
482
- 'vad_stats': vad_stats
483
- }
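
A minimal sketch of the removed streaming pipeline, based on the deleted code above; pcm_chunk is an illustrative placeholder for 16-bit mono PCM bytes from the client:

from utils.vad_feature_integration import StreamingFeatureExtractor  # removed in this commit

pcm_chunk = b'\x00' * 3200                   # placeholder: 100 ms of silence at 16 kHz
extractor = StreamingFeatureExtractor(sample_rate=16000, n_mfcc=13)
extractor.start_streaming_processing()
extractor.add_audio_chunk(pcm_chunk, feature_type='mfcc')  # queued for the worker thread
results = extractor.get_feature_results()                  # one dict per detected speech segment
extractor.stop_streaming_processing()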
utils/webrtc_vad.py DELETED
@@ -1,442 +0,0 @@
1
- """
2
- WebRTC VAD implementation for streaming audio processing
3
- Provides high-performance voice activity detection with proper audio chunking
4
- """
5
-
6
- import webrtcvad
7
- import collections
8
- import numpy as np
9
- import logging
10
- from typing import List, Tuple, Optional, Generator
11
- import struct
12
- import threading
13
- import queue
14
- import time
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- class WebRTCVADProcessor:
19
- """
20
- WebRTC-based Voice Activity Detection processor for streaming audio.
21
-
22
- Features:
23
- - Real-time VAD processing with WebRTC library
24
- - Proper audio chunking and buffering
25
- - Speech segment detection and extraction
26
- - Thread-safe operation for streaming applications
27
- """
28
-
29
- def __init__(self, aggressiveness=2, sample_rate=16000, frame_duration=30):
30
- """
31
- Initialize WebRTC VAD processor.
32
-
33
- Args:
34
- aggressiveness: VAD aggressiveness mode (0-3, higher = more aggressive)
35
- sample_rate: Audio sample rate (8000, 16000, 32000, or 48000 Hz)
36
- frame_duration: Frame duration in milliseconds (10, 20, or 30 ms)
37
- """
38
- self.vad = webrtcvad.Vad(aggressiveness)
39
- self.sample_rate = sample_rate
40
- self.frame_duration = frame_duration
41
- self.frame_size = int(sample_rate * frame_duration / 1000)
42
-
43
- # Circular buffer for frame management
44
- self.ring_buffer_size = max(10, int(500 / frame_duration)) # ~500ms buffer
45
- self.ring_buffer = collections.deque(maxlen=self.ring_buffer_size)
46
-
47
- # State tracking
48
- self.triggered = False
49
- self.speech_buffer = collections.deque()
50
- self.is_recording = False
51
- self.current_utterance_start = None
52
-
53
- # Configuration parameters
54
- self.silence_threshold = 0.8 # Ratio of silence frames to trigger end
55
- self.speech_threshold = 0.5 # Ratio of speech frames to trigger start
56
- self.min_speech_duration = 0.5 # Minimum speech duration in seconds
57
- self.max_speech_duration = 10.0 # Maximum speech duration in seconds
58
- self.max_silence_duration = 2.0 # Maximum silence before reset
59
-
60
- # Performance tracking
61
- self.total_frames_processed = 0
62
- self.speech_frames_detected = 0
63
- self.segments_extracted = 0
64
-
65
- # Thread-safe queue for streaming chunks
66
- self.audio_queue = queue.Queue(maxsize=100)   # bounded so put_nowait() can raise queue.Full below; cap of 100 is arbitrary
67
- self.output_queue = queue.Queue(maxsize=100)
68
- self.processing = False
69
-
70
- logger.info(f"WebRTC VAD initialized: aggressiveness={aggressiveness}, "
71
- f"sample_rate={sample_rate}Hz, frame_duration={frame_duration}ms")
72
-
73
- def reset_state(self):
74
- """Reset VAD state for new processing session."""
75
- self.triggered = False
76
- self.is_recording = False
77
- self.ring_buffer.clear()
78
- self.speech_buffer.clear()
79
- self.current_utterance_start = None
80
- logger.debug("VAD state reset")
81
-
82
- def convert_audio_to_frames(self, audio_data: bytes) -> Generator[bytes, None, None]:
83
- """
84
- Convert audio data to properly sized frames for WebRTC VAD.
85
-
86
- Args:
87
- audio_data: Raw audio bytes (16-bit PCM)
88
-
89
- Yields:
90
- bytes: Frame data suitable for VAD processing
91
- """
92
- frame_size_bytes = self.frame_size * 2 # 16-bit = 2 bytes per sample
93
-
94
- for i in range(0, len(audio_data) - frame_size_bytes + 1, frame_size_bytes):
95
- frame = audio_data[i:i + frame_size_bytes]
96
- if len(frame) == frame_size_bytes:
97
- yield frame
98
-
99
- def is_speech_frame(self, frame: bytes) -> bool:
100
- """
101
- Determine if a frame contains speech using WebRTC VAD.
102
-
103
- Args:
104
- frame: Audio frame bytes
105
-
106
- Returns:
107
- bool: True if frame contains speech
108
- """
109
- try:
110
- if len(frame) != self.frame_size * 2:
111
- return False
112
- return self.vad.is_speech(frame, self.sample_rate)
113
- except Exception as e:
114
- logger.warning(f"VAD frame analysis failed: {e}")
115
- return False
116
-
117
- def process_audio_chunk(self, audio_data: bytes) -> List[bytes]:
118
- """
119
- Process audio chunk and return complete speech segments.
120
-
121
- Args:
122
- audio_data: Raw audio bytes (16-bit PCM)
123
-
124
- Returns:
125
- List[bytes]: List of detected speech segments
126
- """
127
- speech_segments = []
128
-
129
- for frame in self.convert_audio_to_frames(audio_data):
130
- self.total_frames_processed += 1
131
- is_speech = self.is_speech_frame(frame)
132
-
133
- if is_speech:
134
- self.speech_frames_detected += 1
135
-
136
- # Process frame through VAD collector
137
- collected_audio = self._vad_collector_step(frame, is_speech)
138
-
139
- if collected_audio is not None:
140
- # Complete speech segment detected
141
- speech_segments.append(collected_audio)
142
- self.segments_extracted += 1
143
- logger.debug(f"Speech segment extracted: {len(collected_audio)} bytes")
144
-
145
- return speech_segments
146
-
147
- def _vad_collector_step(self, frame: bytes, is_speech: bool) -> Optional[bytes]:
148
- """
149
- Single step of VAD collection algorithm.
150
-
151
- Args:
152
- frame: Audio frame
153
- is_speech: Whether frame contains speech
154
-
155
- Returns:
156
- bytes: Complete speech segment if detected, None otherwise
157
- """
158
- if not self.triggered:
159
- # Not currently in speech mode
160
- self.ring_buffer.append((frame, is_speech))
161
- num_voiced = sum(1 for f, speech in self.ring_buffer if speech)
162
-
163
- # Check if we should trigger speech detection
164
- if len(self.ring_buffer) == self.ring_buffer.maxlen:
165
- if num_voiced >= self.speech_threshold * self.ring_buffer.maxlen:
166
- self.triggered = True
167
- self.is_recording = True
168
- self.current_utterance_start = time.time()
169
-
170
- # Output buffered frames to start speech segment
171
- self.speech_buffer.clear()
172
- for f, s in self.ring_buffer:
173
- self.speech_buffer.append(f)
174
-
175
- self.ring_buffer.clear()
176
- logger.debug("Speech triggered - starting collection")
177
-
178
- else:
179
- # Currently in speech mode
180
- self.speech_buffer.append(frame)
181
- self.ring_buffer.append((frame, is_speech))
182
-
183
- # Check duration limits
184
- if self.current_utterance_start:
185
- utterance_duration = time.time() - self.current_utterance_start
186
-
187
- if utterance_duration > self.max_speech_duration:
188
- # Force end due to maximum duration
189
- logger.debug("Speech segment ended due to max duration")
190
- return self._finalize_speech_segment()
191
-
192
- # Check for end of speech
193
- if len(self.ring_buffer) == self.ring_buffer.maxlen:
194
- num_unvoiced = sum(1 for f, speech in self.ring_buffer if not speech)
195
-
196
- if num_unvoiced >= self.silence_threshold * self.ring_buffer.maxlen:
197
- # End of speech detected
198
- logger.debug("Speech segment ended due to silence")
199
- return self._finalize_speech_segment()
200
-
201
- return None
202
-
203
- def _finalize_speech_segment(self) -> Optional[bytes]:
204
- """
205
- Finalize and return current speech segment.
206
-
207
- Returns:
208
- bytes: Complete speech segment or None if too short
209
- """
210
- if not self.speech_buffer:
211
- self.triggered = False
212
- self.is_recording = False
213
- return None
214
-
215
- # Calculate duration
216
- total_frames = len(self.speech_buffer)
217
- duration = total_frames * self.frame_duration / 1000.0
218
-
219
- # Enforce an absolute floor of 100 ms on the minimum speech duration
220
- min_duration = max(self.min_speech_duration, 0.1) # At least 100ms
221
-
222
- # Check minimum duration
223
- if duration < min_duration:
224
- logger.debug(f"Speech segment too short: {duration:.2f}s < {min_duration}s")
225
- self.triggered = False
226
- self.is_recording = False
227
- self.speech_buffer.clear()
228
- self.ring_buffer.clear()
229
- return None
230
-
231
- # Create complete audio segment
232
- speech_data = b''.join(self.speech_buffer)
233
-
234
- # Reset state
235
- self.triggered = False
236
- self.is_recording = False
237
- self.speech_buffer.clear()
238
- self.ring_buffer.clear()
239
- self.current_utterance_start = None
240
-
241
- logger.info(f"Speech segment finalized: {duration:.2f}s, {len(speech_data)} bytes")
242
- return speech_data
243
-
244
- def process_numpy_audio(self, audio_array: np.ndarray) -> List[bytes]:
245
- """
246
- Process numpy audio array.
247
-
248
- Args:
249
- audio_array: Audio data as numpy array (float32, -1 to 1 range)
250
-
251
- Returns:
252
- List[bytes]: List of detected speech segments
253
- """
254
- # Convert to 16-bit PCM bytes
255
- if audio_array.dtype != np.int16:
256
- # Normalize and convert to int16
257
- audio_normalized = np.clip(audio_array, -1.0, 1.0)
258
- audio_int16 = (audio_normalized * 32767).astype(np.int16)
259
- else:
260
- audio_int16 = audio_array
261
-
262
- # Convert to bytes
263
- audio_bytes = audio_int16.tobytes()
264
-
265
- return self.process_audio_chunk(audio_bytes)
266
-
267
- def get_current_segment(self) -> Optional[bytes]:
268
- """
269
- Get current ongoing speech segment if any.
270
-
271
- Returns:
272
- bytes: Current speech segment or None
273
- """
274
- if self.is_recording and self.speech_buffer:
275
- current_duration = len(self.speech_buffer) * self.frame_duration / 1000.0
276
- if current_duration >= self.min_speech_duration:
277
- return b''.join(self.speech_buffer)
278
- return None
279
-
280
- def start_streaming_processing(self):
281
- """Start background thread for streaming audio processing."""
282
- if self.processing:
283
- return
284
-
285
- self.processing = True
286
- self.processing_thread = threading.Thread(target=self._streaming_worker, daemon=True)
287
- self.processing_thread.start()
288
- logger.info("Started streaming VAD processing")
289
-
290
- def stop_streaming_processing(self):
291
- """Stop background streaming processing."""
292
- self.processing = False
293
- if hasattr(self, 'processing_thread'):
294
- self.processing_thread.join(timeout=1.0)
295
- logger.info("Stopped streaming VAD processing")
296
-
297
- def add_audio_chunk(self, audio_data: bytes):
298
- """
299
- Add audio chunk to processing queue (thread-safe).
300
-
301
- Args:
302
- audio_data: Raw audio bytes
303
- """
304
- if self.processing:
305
- try:
306
- self.audio_queue.put_nowait(audio_data)
307
- except queue.Full:
308
- logger.warning("Audio queue full, dropping chunk")
309
-
310
- def get_speech_segments(self) -> List[bytes]:
311
- """
312
- Get all available speech segments from processing queue.
313
-
314
- Returns:
315
- List[bytes]: Available speech segments
316
- """
317
- segments = []
318
- try:
319
- while True:
320
- segment = self.output_queue.get_nowait()
321
- segments.append(segment)
322
- except queue.Empty:
323
- pass
324
- return segments
325
-
326
- def _streaming_worker(self):
327
- """Background worker for streaming audio processing."""
328
- while self.processing:
329
- try:
330
- # Get audio chunk with timeout
331
- audio_chunk = self.audio_queue.get(timeout=0.1)
332
-
333
- # Process chunk
334
- segments = self.process_audio_chunk(audio_chunk)
335
-
336
- # Add segments to output queue
337
- for segment in segments:
338
- try:
339
- self.output_queue.put_nowait(segment)
340
- except queue.Full:
341
- logger.warning("Output queue full, dropping segment")
342
-
343
- except queue.Empty:
344
- continue
345
- except Exception as e:
346
- logger.error(f"Streaming processing error: {e}")
347
-
348
- def get_stats(self) -> dict:
349
- """
350
- Get VAD processing statistics.
351
-
352
- Returns:
353
- dict: Processing statistics
354
- """
355
- return {
356
- 'total_frames_processed': self.total_frames_processed,
357
- 'speech_frames_detected': self.speech_frames_detected,
358
- 'segments_extracted': self.segments_extracted,
359
- 'speech_ratio': (
360
- self.speech_frames_detected / max(1, self.total_frames_processed)
361
- ),
362
- 'is_recording': self.is_recording,
363
- 'triggered': self.triggered,
364
- 'buffer_size': len(self.speech_buffer),
365
- 'ring_buffer_size': len(self.ring_buffer),
366
- 'configuration': {
367
- 'sample_rate': self.sample_rate,
368
- 'frame_duration': self.frame_duration,
369
- 'min_speech_duration': self.min_speech_duration,
370
- 'max_speech_duration': self.max_speech_duration
371
- }
372
- }
373
-
374
- class StreamingAudioBuffer:
375
- """
376
- Optimized audio buffer for streaming VAD processing.
377
- Thread-safe with memory pool for high performance.
378
- """
379
-
380
- def __init__(self, sample_rate=16000, max_duration=30):
381
- self.sample_rate = sample_rate
382
- self.max_samples = sample_rate * max_duration
383
-
384
- # Thread-safe circular buffer
385
- self.buffer = collections.deque(maxlen=self.max_samples)
386
- self.buffer_lock = threading.RLock()
387
-
388
- # Performance tracking
389
- self.total_samples_added = 0
390
- self.buffer_overruns = 0
391
-
392
- def add_audio(self, audio_data: np.ndarray):
393
- """
394
- Add audio data to buffer (thread-safe).
395
-
396
- Args:
397
- audio_data: Audio samples as numpy array
398
- """
399
- with self.buffer_lock:
400
- if len(self.buffer) + len(audio_data) > self.max_samples:
401
- self.buffer_overruns += 1
402
- # Remove old samples to make room
403
- samples_to_remove = len(audio_data)
404
- for _ in range(min(samples_to_remove, len(self.buffer))):
405
- self.buffer.popleft()
406
-
407
- self.buffer.extend(audio_data)
408
- self.total_samples_added += len(audio_data)
409
-
410
- def get_recent_audio(self, duration_ms: int = 1000) -> np.ndarray:
411
- """
412
- Get recent audio with specified duration.
413
-
414
- Args:
415
- duration_ms: Duration in milliseconds
416
-
417
- Returns:
418
- np.ndarray: Recent audio samples
419
- """
420
- samples_needed = int(self.sample_rate * duration_ms / 1000)
421
-
422
- with self.buffer_lock:
423
- if len(self.buffer) >= samples_needed:
424
- return np.array(list(self.buffer)[-samples_needed:], dtype=np.float32)
425
- else:
426
- return np.array(list(self.buffer), dtype=np.float32)
427
-
428
- def clear(self):
429
- """Clear buffer contents."""
430
- with self.buffer_lock:
431
- self.buffer.clear()
432
-
433
- def get_stats(self) -> dict:
434
- """Get buffer statistics."""
435
- with self.buffer_lock:
436
- return {
437
- 'buffer_size': len(self.buffer),
438
- 'max_size': self.max_samples,
439
- 'utilization': len(self.buffer) / self.max_samples,
440
- 'total_added': self.total_samples_added,
441
- 'overruns': self.buffer_overruns
442
- }
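
Finally, a minimal sketch of the removed WebRTC VAD wrapper, based on the deleted code above; audio_float32 is a placeholder for real audio in the -1..1 range:

import numpy as np
from utils.webrtc_vad import WebRTCVADProcessor  # removed in this commit

audio_float32 = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence
vad = WebRTCVADProcessor(aggressiveness=2, sample_rate=16000, frame_duration=30)
for segment in vad.process_numpy_audio(audio_float32):
    print(f"utterance: {len(segment)} bytes of 16-bit PCM")
print(vad.get_stats()['speech_ratio'])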