sherif31 committed on
Commit
55131fa
·
1 Parent(s): e7209eb
Files changed (2) hide show
  1. app.py +43 -44
  2. static/app.js +10 -5
app.py CHANGED
@@ -37,7 +37,7 @@ model.eval()
37
  # Load TEN-VAD model
38
  # hop_size=256 (16ms frame), threshold=0.5 (speech probability threshold)
39
  VAD_HOP_SIZE = 256
40
- VAD_THRESHOLD = 0.90
41
 
42
  def create_vad():
43
  """Create a TEN-VAD instance"""
@@ -74,8 +74,8 @@ class AudioProcessor:
74
 
75
  SAMPLE_RATE = 16000
76
  VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
77
- SILENCE_THRESHOLD = 0.3 # seconds
78
- SPEECH_PROB_THRESHOLD = 0.90 # Use higher threshold for more reliable speech detection
79
 
80
  def __init__(self):
81
  # Each processor gets its own VAD instance for independent state
@@ -87,8 +87,8 @@ class AudioProcessor:
87
  self.audio_buffer = [] # For ASR (full audio)
88
  self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
89
  self.is_speaking = False
90
- self.silence_start = None
91
  self.speech_detected = False
 
92
  self.last_result = {"status": "listening", "probability": 0.0}
93
  # Recreate VAD instance to reset internal state
94
  self.vad = create_vad()
@@ -139,48 +139,47 @@ class AudioProcessor:
139
  current_time = time.time()
140
 
141
  # Use probability threshold for more control (speech_flag uses internal threshold)
142
- if speech_prob >= self.SPEECH_PROB_THRESHOLD:
143
- # Speech detected
144
- self.is_speaking = True
145
  self.speech_detected = True
146
- self.silence_start = None
147
- return {"status": "speaking", "probability": speech_prob}
148
- else:
149
- # Silence detected
150
- if self.is_speaking:
151
- # Just stopped speaking
152
- if self.silence_start is None:
153
- self.silence_start = current_time
154
- return {"status": "silence", "probability": speech_prob}
155
-
156
- # Check if silence duration exceeded threshold
157
- silence_duration = current_time - self.silence_start
158
-
159
- if silence_duration >= self.SILENCE_THRESHOLD:
160
- # Trigger ASR inference
161
- # Trigger ASR inference
162
- if self.speech_detected and len(self.audio_buffer) > 0:
163
- transcription_result = self._transcribe()
164
- self.reset()
165
- result = {
166
- "status": "transcription",
167
- "transcription": transcription_result["text"],
168
- "confidence": transcription_result["confidence"],
169
- "token_confidences": transcription_result["tokens"],
170
- "probability": speech_prob
171
- }
172
- print(f"Sending transcription to client: {result}")
173
- return result
174
- else:
175
- # Still accumulating silence
176
- remaining = self.SILENCE_THRESHOLD - silence_duration
177
- return {
178
- "status": "silence",
179
- "probability": speech_prob,
180
- "remaining": round(remaining, 2)
181
- }
182
 
183
- return {"status": "listening", "probability": speech_prob}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  def _transcribe(self) -> dict:
186
  """Run ASR on accumulated audio and return transcription with confidence"""
 
37
  # Load TEN-VAD model
38
  # hop_size=256 (16ms frame), threshold=0.5 (speech probability threshold)
39
  VAD_HOP_SIZE = 256
40
+ VAD_THRESHOLD = 0.95
41
 
42
  def create_vad():
43
  """Create a TEN-VAD instance"""
 
74
 
75
  SAMPLE_RATE = 16000
76
  VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
77
+ SPEECH_ACCUMULATION_DELAY = 0.5 # seconds - wait time after speech detection starts
78
+ SPEECH_PROB_THRESHOLD = 0.95 # Use higher threshold for more reliable speech detection
79
 
80
  def __init__(self):
81
  # Each processor gets its own VAD instance for independent state
 
87
  self.audio_buffer = [] # For ASR (full audio)
88
  self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
89
  self.is_speaking = False
 
90
  self.speech_detected = False
91
+ self.speech_start_time = None # Track when speech first started
92
  self.last_result = {"status": "listening", "probability": 0.0}
93
  # Recreate VAD instance to reset internal state
94
  self.vad = create_vad()
 
139
  current_time = time.time()
140
 
141
  # Use probability threshold for more control (speech_flag uses internal threshold)
142
+ # Track when speech first started for accumulation delay
143
+ if speech_prob >= self.SPEECH_PROB_THRESHOLD and self.speech_start_time is None:
144
+ self.speech_start_time = current_time
145
  self.speech_detected = True
146
+ self.is_speaking = True
147
+
148
+ if self.speech_start_time is not None:
149
+ # Calculate how long since speech first started
150
+ speech_duration = current_time - self.speech_start_time
151
+ accumulation_remaining = max(0, self.SPEECH_ACCUMULATION_DELAY - speech_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
+ # If delay is reached, trigger ASR immediately and reset
154
+ if speech_duration >= self.SPEECH_ACCUMULATION_DELAY:
155
+ if self.speech_detected and len(self.audio_buffer) > 0:
156
+ transcription_result = self._transcribe()
157
+ self.reset()
158
+ if transcription_result["confidence"] >= 0.6 and len(transcription_result["text"]) == 7:
159
+ result = {
160
+ "status": "transcription",
161
+ "transcription": transcription_result["text"],
162
+ "confidence": transcription_result["confidence"],
163
+ "token_confidences": transcription_result["tokens"],
164
+ "probability": speech_prob
165
+ }
166
+ print(f"Sending transcription to client: {result}")
167
+ return result
168
+ else:
169
+ print(f"Suppressed low-confidence transcription: {transcription_result}")
170
+ return {"status": "listening", "probability": speech_prob}
171
+
172
+ # Otherwise, keep accumulating
173
+ status = "speaking" if speech_prob >= self.SPEECH_PROB_THRESHOLD else "waiting"
174
+ return {
175
+ "status": status,
176
+ "probability": speech_prob,
177
+ "accumulating": True,
178
+ "accumulation_remaining": round(accumulation_remaining, 2)
179
+ }
180
+
181
+ # If no speech started yet, just keep listening
182
+ return {"status": "listening", "probability": speech_prob}
183
 
184
  def _transcribe(self) -> dict:
185
  """Run ASR on accumulated audio and return transcription with confidence"""
static/app.js CHANGED
@@ -275,7 +275,7 @@ class AudioRecorder {
275
  }
276
 
277
  handleServerMessage(data) {
278
- const { status, probability, transcription, remaining, confidence, token_confidences } = data;
279
 
280
  // Update probability bar
281
  if (probability !== undefined) {
@@ -285,11 +285,16 @@ class AudioRecorder {
285
  // Update status
286
  switch (status) {
287
  case 'speaking':
288
- this.updateStatus('speaking', 'جاري التحدث...');
 
 
 
 
289
  break;
290
- case 'silence':
291
- const remainingText = remaining ? ` (${remaining}s)` : '';
292
- this.updateStatus('silence', `صمت${remainingText}`);
 
293
  break;
294
  case 'listening':
295
  this.updateStatus('listening', 'في انتظار الكلام...');
 
275
  }
276
 
277
  handleServerMessage(data) {
278
+ const { status, probability, transcription, remaining, confidence, token_confidences, accumulating, accumulation_remaining } = data;
279
 
280
  // Update probability bar
281
  if (probability !== undefined) {
 
285
  // Update status
286
  switch (status) {
287
  case 'speaking':
288
+ if (accumulating && accumulation_remaining > 0) {
289
+ this.updateStatus('speaking', `جاري التحدث... (${accumulation_remaining}s)`);
290
+ } else {
291
+ this.updateStatus('speaking', 'جاري التحدث...');
292
+ }
293
  break;
294
+ case 'waiting':
295
+ // Waiting for speech accumulation delay
296
+ const waitText = accumulation_remaining ? ` (${accumulation_remaining}s)` : '';
297
+ this.updateStatus('speaking', `انتظر لإكمال الكلام${waitText}`);
298
  break;
299
  case 'listening':
300
  this.updateStatus('listening', 'في انتظار الكلام...');