Spaces:
Sleeping
Sleeping
update
Browse files
- app.py +43 -44
- static/app.js +10 -5
app.py
CHANGED
|
@@ -37,7 +37,7 @@ model.eval()
|
|
| 37 |
# Load TEN-VAD model
|
| 38 |
# hop_size=256 (16ms frame), threshold=0.5 (speech probability threshold)
|
| 39 |
VAD_HOP_SIZE = 256
|
| 40 |
-
VAD_THRESHOLD = 0.
|
| 41 |
|
| 42 |
def create_vad():
|
| 43 |
"""Create a TEN-VAD instance"""
|
|
@@ -74,8 +74,8 @@ class AudioProcessor:
|
|
| 74 |
|
| 75 |
SAMPLE_RATE = 16000
|
| 76 |
VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
|
| 77 |
-
|
| 78 |
-
SPEECH_PROB_THRESHOLD = 0.
|
| 79 |
|
| 80 |
def __init__(self):
|
| 81 |
# Each processor gets its own VAD instance for independent state
|
|
@@ -87,8 +87,8 @@ class AudioProcessor:
|
|
| 87 |
self.audio_buffer = [] # For ASR (full audio)
|
| 88 |
self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
|
| 89 |
self.is_speaking = False
|
| 90 |
-
self.silence_start = None
|
| 91 |
self.speech_detected = False
|
|
|
|
| 92 |
self.last_result = {"status": "listening", "probability": 0.0}
|
| 93 |
# Recreate VAD instance to reset internal state
|
| 94 |
self.vad = create_vad()
|
|
@@ -139,48 +139,47 @@ class AudioProcessor:
|
|
| 139 |
current_time = time.time()
|
| 140 |
|
| 141 |
# Use probability threshold for more control (speech_flag uses internal threshold)
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
self.
|
| 145 |
self.speech_detected = True
|
| 146 |
-
self.
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
#
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
if self.silence_start is None:
|
| 153 |
-
self.silence_start = current_time
|
| 154 |
-
return {"status": "silence", "probability": speech_prob}
|
| 155 |
-
|
| 156 |
-
# Check if silence duration exceeded threshold
|
| 157 |
-
silence_duration = current_time - self.silence_start
|
| 158 |
-
|
| 159 |
-
if silence_duration >= self.SILENCE_THRESHOLD:
|
| 160 |
-
# Trigger ASR inference
|
| 161 |
-
# Trigger ASR inference
|
| 162 |
-
if self.speech_detected and len(self.audio_buffer) > 0:
|
| 163 |
-
transcription_result = self._transcribe()
|
| 164 |
-
self.reset()
|
| 165 |
-
result = {
|
| 166 |
-
"status": "transcription",
|
| 167 |
-
"transcription": transcription_result["text"],
|
| 168 |
-
"confidence": transcription_result["confidence"],
|
| 169 |
-
"token_confidences": transcription_result["tokens"],
|
| 170 |
-
"probability": speech_prob
|
| 171 |
-
}
|
| 172 |
-
print(f"Sending transcription to client: {result}")
|
| 173 |
-
return result
|
| 174 |
-
else:
|
| 175 |
-
# Still accumulating silence
|
| 176 |
-
remaining = self.SILENCE_THRESHOLD - silence_duration
|
| 177 |
-
return {
|
| 178 |
-
"status": "silence",
|
| 179 |
-
"probability": speech_prob,
|
| 180 |
-
"remaining": round(remaining, 2)
|
| 181 |
-
}
|
| 182 |
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
def _transcribe(self) -> dict:
|
| 186 |
"""Run ASR on accumulated audio and return transcription with confidence"""
|
|
|
|
| 37 |
# Load TEN-VAD model
|
| 38 |
# hop_size=256 (16ms frame), threshold=0.95 (speech probability threshold)
|
| 39 |
VAD_HOP_SIZE = 256
|
| 40 |
+
VAD_THRESHOLD = 0.95
|
| 41 |
|
| 42 |
def create_vad():
|
| 43 |
"""Create a TEN-VAD instance"""
|
|
|
|
| 74 |
|
| 75 |
SAMPLE_RATE = 16000
|
| 76 |
VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
|
| 77 |
+
SPEECH_ACCUMULATION_DELAY = 0.5 # seconds - wait time after speech detection starts
|
| 78 |
+
SPEECH_PROB_THRESHOLD = 0.95 # Use higher threshold for more reliable speech detection
|
| 79 |
|
| 80 |
def __init__(self):
|
| 81 |
# Each processor gets its own VAD instance for independent state
|
|
|
|
| 87 |
self.audio_buffer = [] # For ASR (full audio)
|
| 88 |
self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
|
| 89 |
self.is_speaking = False
|
|
|
|
| 90 |
self.speech_detected = False
|
| 91 |
+
self.speech_start_time = None # Track when speech first started
|
| 92 |
self.last_result = {"status": "listening", "probability": 0.0}
|
| 93 |
# Recreate VAD instance to reset internal state
|
| 94 |
self.vad = create_vad()
|
|
|
|
| 139 |
current_time = time.time()
|
| 140 |
|
| 141 |
# Use probability threshold for more control (speech_flag uses internal threshold)
|
| 142 |
+
# Track when speech first started for accumulation delay
|
| 143 |
+
if speech_prob >= self.SPEECH_PROB_THRESHOLD and self.speech_start_time is None:
|
| 144 |
+
self.speech_start_time = current_time
|
| 145 |
self.speech_detected = True
|
| 146 |
+
self.is_speaking = True
|
| 147 |
+
|
| 148 |
+
if self.speech_start_time is not None:
|
| 149 |
+
# Calculate how long since speech first started
|
| 150 |
+
speech_duration = current_time - self.speech_start_time
|
| 151 |
+
accumulation_remaining = max(0, self.SPEECH_ACCUMULATION_DELAY - speech_duration)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
+
# If delay is reached, trigger ASR immediately and reset
|
| 154 |
+
if speech_duration >= self.SPEECH_ACCUMULATION_DELAY:
|
| 155 |
+
if self.speech_detected and len(self.audio_buffer) > 0:
|
| 156 |
+
transcription_result = self._transcribe()
|
| 157 |
+
self.reset()
|
| 158 |
+
if transcription_result["confidence"] >= 0.6 and len(transcription_result["text"]) == 7:
|
| 159 |
+
result = {
|
| 160 |
+
"status": "transcription",
|
| 161 |
+
"transcription": transcription_result["text"],
|
| 162 |
+
"confidence": transcription_result["confidence"],
|
| 163 |
+
"token_confidences": transcription_result["tokens"],
|
| 164 |
+
"probability": speech_prob
|
| 165 |
+
}
|
| 166 |
+
print(f"Sending transcription to client: {result}")
|
| 167 |
+
return result
|
| 168 |
+
else:
|
| 169 |
+
print(f"Suppressed low-confidence transcription: {transcription_result}")
|
| 170 |
+
return {"status": "listening", "probability": speech_prob}
|
| 171 |
+
|
| 172 |
+
# Otherwise, keep accumulating
|
| 173 |
+
status = "speaking" if speech_prob >= self.SPEECH_PROB_THRESHOLD else "waiting"
|
| 174 |
+
return {
|
| 175 |
+
"status": status,
|
| 176 |
+
"probability": speech_prob,
|
| 177 |
+
"accumulating": True,
|
| 178 |
+
"accumulation_remaining": round(accumulation_remaining, 2)
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
# If no speech started yet, just keep listening
|
| 182 |
+
return {"status": "listening", "probability": speech_prob}
|
| 183 |
|
| 184 |
def _transcribe(self) -> dict:
|
| 185 |
"""Run ASR on accumulated audio and return transcription with confidence"""
|
static/app.js
CHANGED
|
@@ -275,7 +275,7 @@ class AudioRecorder {
|
|
| 275 |
}
|
| 276 |
|
| 277 |
handleServerMessage(data) {
|
| 278 |
-
const { status, probability, transcription, remaining, confidence, token_confidences } = data;
|
| 279 |
|
| 280 |
// Update probability bar
|
| 281 |
if (probability !== undefined) {
|
|
@@ -285,11 +285,16 @@ class AudioRecorder {
|
|
| 285 |
// Update status
|
| 286 |
switch (status) {
|
| 287 |
case 'speaking':
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
break;
|
| 290 |
-
case '
|
| 291 |
-
|
| 292 |
-
|
|
|
|
| 293 |
break;
|
| 294 |
case 'listening':
|
| 295 |
this.updateStatus('listening', 'في انتظار الكلام...');
|
|
|
|
| 275 |
}
|
| 276 |
|
| 277 |
handleServerMessage(data) {
|
| 278 |
+
const { status, probability, transcription, remaining, confidence, token_confidences, accumulating, accumulation_remaining } = data;
|
| 279 |
|
| 280 |
// Update probability bar
|
| 281 |
if (probability !== undefined) {
|
|
|
|
| 285 |
// Update status
|
| 286 |
switch (status) {
|
| 287 |
case 'speaking':
|
| 288 |
+
if (accumulating && accumulation_remaining > 0) {
|
| 289 |
+
this.updateStatus('speaking', `جاري التحدث... (${accumulation_remaining}s)`);
|
| 290 |
+
} else {
|
| 291 |
+
this.updateStatus('speaking', 'جاري التحدث...');
|
| 292 |
+
}
|
| 293 |
break;
|
| 294 |
+
case 'waiting':
|
| 295 |
+
// Waiting for speech accumulation delay
|
| 296 |
+
const waitText = accumulation_remaining ? ` (${accumulation_remaining}s)` : '';
|
| 297 |
+
this.updateStatus('speaking', `انتظر لإكمال الكلام${waitText}`);
|
| 298 |
break;
|
| 299 |
case 'listening':
|
| 300 |
this.updateStatus('listening', 'في انتظار الكلام...');
|