sherif31 committed on
Commit
55131fa
·
1 Parent(s): e7209eb
Files changed (2) hide show
  1. app.py +43 -44
  2. static/app.js +10 -5
app.py CHANGED
@@ -37,7 +37,7 @@ model.eval()
37
  # Load TEN-VAD model
38
  # hop_size=256 (16ms frame), threshold=0.5 (speech probability threshold)
39
  VAD_HOP_SIZE = 256
40
- VAD_THRESHOLD = 0.90
41
 
42
  def create_vad():
43
  """Create a TEN-VAD instance"""
@@ -74,8 +74,8 @@ class AudioProcessor:
74
 
75
  SAMPLE_RATE = 16000
76
  VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
77
- SILENCE_THRESHOLD = 0.3 # seconds
78
- SPEECH_PROB_THRESHOLD = 0.90 # Use higher threshold for more reliable speech detection
79
 
80
  def __init__(self):
81
  # Each processor gets its own VAD instance for independent state
@@ -87,8 +87,8 @@ class AudioProcessor:
87
  self.audio_buffer = [] # For ASR (full audio)
88
  self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
89
  self.is_speaking = False
90
- self.silence_start = None
91
  self.speech_detected = False
 
92
  self.last_result = {"status": "listening", "probability": 0.0}
93
  # Recreate VAD instance to reset internal state
94
  self.vad = create_vad()
@@ -139,48 +139,47 @@ class AudioProcessor:
139
  current_time = time.time()
140
 
141
  # Use probability threshold for more control (speech_flag uses internal threshold)
142
- if speech_prob >= self.SPEECH_PROB_THRESHOLD:
143
- # Speech detected
144
- self.is_speaking = True
145
  self.speech_detected = True
146
- self.silence_start = None
147
- return {"status": "speaking", "probability": speech_prob}
148
- else:
149
- # Silence detected
150
- if self.is_speaking:
151
- # Just stopped speaking
152
- if self.silence_start is None:
153
- self.silence_start = current_time
154
- return {"status": "silence", "probability": speech_prob}
155
-
156
- # Check if silence duration exceeded threshold
157
- silence_duration = current_time - self.silence_start
158
-
159
- if silence_duration >= self.SILENCE_THRESHOLD:
160
- # Trigger ASR inference
161
- # Trigger ASR inference
162
- if self.speech_detected and len(self.audio_buffer) > 0:
163
- transcription_result = self._transcribe()
164
- self.reset()
165
- result = {
166
- "status": "transcription",
167
- "transcription": transcription_result["text"],
168
- "confidence": transcription_result["confidence"],
169
- "token_confidences": transcription_result["tokens"],
170
- "probability": speech_prob
171
- }
172
- print(f"Sending transcription to client: {result}")
173
- return result
174
- else:
175
- # Still accumulating silence
176
- remaining = self.SILENCE_THRESHOLD - silence_duration
177
- return {
178
- "status": "silence",
179
- "probability": speech_prob,
180
- "remaining": round(remaining, 2)
181
- }
182
 
183
- return {"status": "listening", "probability": speech_prob}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  def _transcribe(self) -> dict:
186
  """Run ASR on accumulated audio and return transcription with confidence"""
 
37
  # Load TEN-VAD model
38
  # hop_size=256 (16ms frame), threshold=0.5 (speech probability threshold)
39
  VAD_HOP_SIZE = 256
40
+ VAD_THRESHOLD = 0.95
41
 
42
  def create_vad():
43
  """Create a TEN-VAD instance"""
 
74
 
75
  SAMPLE_RATE = 16000
76
  VAD_CHUNK_SIZE = VAD_HOP_SIZE # 256 samples (16ms) at 16kHz
77
+ SPEECH_ACCUMULATION_DELAY = 0.5 # seconds - wait time after speech detection starts
78
+ SPEECH_PROB_THRESHOLD = 0.95 # Use higher threshold for more reliable speech detection
79
 
80
  def __init__(self):
81
  # Each processor gets its own VAD instance for independent state
 
87
  self.audio_buffer = [] # For ASR (full audio)
88
  self.pending_samples = np.array([], dtype=np.float32) # Buffer for incomplete VAD chunks
89
  self.is_speaking = False
 
90
  self.speech_detected = False
91
+ self.speech_start_time = None # Track when speech first started
92
  self.last_result = {"status": "listening", "probability": 0.0}
93
  # Recreate VAD instance to reset internal state
94
  self.vad = create_vad()
 
139
  current_time = time.time()
140
 
141
  # Use probability threshold for more control (speech_flag uses internal threshold)
142
+ # Track when speech first started for accumulation delay
143
+ if speech_prob >= self.SPEECH_PROB_THRESHOLD and self.speech_start_time is None:
144
+ self.speech_start_time = current_time
145
  self.speech_detected = True
146
+ self.is_speaking = True
147
+
148
+ if self.speech_start_time is not None:
149
+ # Calculate how long since speech first started
150
+ speech_duration = current_time - self.speech_start_time
151
+ accumulation_remaining = max(0, self.SPEECH_ACCUMULATION_DELAY - speech_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
+ # If delay is reached, trigger ASR immediately and reset
154
+ if speech_duration >= self.SPEECH_ACCUMULATION_DELAY:
155
+ if self.speech_detected and len(self.audio_buffer) > 0:
156
+ transcription_result = self._transcribe()
157
+ self.reset()
158
+ if transcription_result["confidence"] >= 0.6 and len(transcription_result["text"]) == 7:
159
+ result = {
160
+ "status": "transcription",
161
+ "transcription": transcription_result["text"],
162
+ "confidence": transcription_result["confidence"],
163
+ "token_confidences": transcription_result["tokens"],
164
+ "probability": speech_prob
165
+ }
166
+ print(f"Sending transcription to client: {result}")
167
+ return result
168
+ else:
169
+ print(f"Suppressed low-confidence transcription: {transcription_result}")
170
+ return {"status": "listening", "probability": speech_prob}
171
+
172
+ # Otherwise, keep accumulating
173
+ status = "speaking" if speech_prob >= self.SPEECH_PROB_THRESHOLD else "waiting"
174
+ return {
175
+ "status": status,
176
+ "probability": speech_prob,
177
+ "accumulating": True,
178
+ "accumulation_remaining": round(accumulation_remaining, 2)
179
+ }
180
+
181
+ # If no speech started yet, just keep listening
182
+ return {"status": "listening", "probability": speech_prob}
183
 
184
  def _transcribe(self) -> dict:
185
  """Run ASR on accumulated audio and return transcription with confidence"""
static/app.js CHANGED
@@ -275,7 +275,7 @@ class AudioRecorder {
275
  }
276
 
277
  handleServerMessage(data) {
278
- const { status, probability, transcription, remaining, confidence, token_confidences } = data;
279
 
280
  // Update probability bar
281
  if (probability !== undefined) {
@@ -285,11 +285,16 @@ class AudioRecorder {
285
  // Update status
286
  switch (status) {
287
  case 'speaking':
288
- this.updateStatus('speaking', 'جاري التحدث...');
 
 
 
 
289
  break;
290
- case 'silence':
291
- const remainingText = remaining ? ` (${remaining}s)` : '';
292
- this.updateStatus('silence', `صمت${remainingText}`);
 
293
  break;
294
  case 'listening':
295
  this.updateStatus('listening', 'في انتظار الكلام...');
 
275
  }
276
 
277
  handleServerMessage(data) {
278
+ const { status, probability, transcription, remaining, confidence, token_confidences, accumulating, accumulation_remaining } = data;
279
 
280
  // Update probability bar
281
  if (probability !== undefined) {
 
285
  // Update status
286
  switch (status) {
287
  case 'speaking':
288
+ if (accumulating && accumulation_remaining > 0) {
289
+ this.updateStatus('speaking', `جاري التحدث... (${accumulation_remaining}s)`);
290
+ } else {
291
+ this.updateStatus('speaking', 'جاري التحدث...');
292
+ }
293
  break;
294
+ case 'waiting':
295
+ // Waiting for speech accumulation delay
296
+ const waitText = accumulation_remaining ? ` (${accumulation_remaining}s)` : '';
297
+ this.updateStatus('speaking', `انتظر لإكمال الكلام${waitText}`);
298
  break;
299
  case 'listening':
300
  this.updateStatus('listening', 'في انتظار الكلام...');