update
Files changed:
- app.py +58 -12
- static/app.js +72 -7
- static/index.html +93 -60
- static/styles.css +111 -1
- vad/HumAwareVAD.jit +3 -0
app.py
CHANGED
@@ -34,7 +34,7 @@ model.generation_config.begin_suppress_tokens = [220, 50257]
 model.eval()
 
 # Load Silero VAD model
-def load_vad(model_path="vad/
+def load_vad(model_path="vad/HumAwareVAD.jit"):
     vad = torch.jit.load(model_path, map_location="cpu")
     vad.eval()
     return vad
@@ -69,8 +69,8 @@ class AudioProcessor:
 
     SAMPLE_RATE = 16000
     VAD_CHUNK_SIZE = 512  # Silero VAD requires exactly 512 samples at 16kHz
-    SILENCE_THRESHOLD = 0.
-    VAD_THRESHOLD = 0.
+    SILENCE_THRESHOLD = 0.25  # seconds
+    VAD_THRESHOLD = 0.9  # Speech probability threshold
 
     def __init__(self):
        self.reset()
@@ -152,13 +152,16 @@ class AudioProcessor:
             silence_duration = current_time - self.silence_start
 
             if silence_duration >= self.SILENCE_THRESHOLD:
+                # Trigger ASR inference
                 # Trigger ASR inference
                 if self.speech_detected and len(self.audio_buffer) > 0:
-
+                    transcription_result = self._transcribe()
                     self.reset()
                     result = {
                         "status": "transcription",
-                        "transcription":
+                        "transcription": transcription_result["text"],
+                        "confidence": transcription_result["confidence"],
+                        "token_confidences": transcription_result["tokens"],
                         "probability": speech_prob
                     }
                     print(f"Sending transcription to client: {result}")
@@ -174,10 +177,10 @@ class AudioProcessor:
 
         return {"status": "listening", "probability": speech_prob}
 
-    def _transcribe(self) ->
-        """Run ASR on accumulated audio"""
+    def _transcribe(self) -> dict:
+        """Run ASR on accumulated audio and return transcription with confidence"""
         if not self.audio_buffer:
-            return ""
+            return {"text": "", "confidence": 0.0, "tokens": []}
 
         # Concatenate all audio chunks
         audio_data = np.concatenate(self.audio_buffer)
@@ -194,20 +197,63 @@ class AudioProcessor:
         )
         input_features = inputs.input_features.to(device)
 
-        # Generate transcription
+        # Generate transcription with scores
         with torch.no_grad():
-
+            outputs = model.generate(
                 input_features,
                 language="arabic",
-                task="transcribe"
+                task="transcribe",
+                output_scores=True,
+                return_dict_in_generate=True
             )
 
+        predicted_ids = outputs.sequences
+        scores = outputs.scores
+
+        # Decode transcription
         transcription = processor.batch_decode(
             predicted_ids,
             skip_special_tokens=True
         )[0]
 
-
+        # Calculate confidence scores
+        # scores is a tuple of tensors (one for each step), each shape (batch_size, vocab_size)
+        # predicted_ids is (batch_size, seq_len)
+
+        # We need to skip the initial tokens that were not generated (like start token, lang token, etc.)
+        # The number of generated tokens matches the length of scores
+        gen_tokens = predicted_ids[0, -len(scores):]
+
+        token_confidences = []
+        sum_log_prob = 0.0
+
+        for i, (score, token_id) in enumerate(zip(scores, gen_tokens)):
+            # score is (batch_size, vocab_size)
+            # Get softmax probabilities
+            probs = torch.softmax(score[0], dim=-1)
+            prob = probs[token_id].item()
+
+            # Get token text
+            token_text = processor.decode([token_id], skip_special_tokens=True)
+
+            # Skip empty/whitespace-only tokens
+            if token_text.strip():
+                token_confidences.append({
+                    "token": token_text,
+                    "probability": round(prob, 4)
+                })
+                sum_log_prob += np.log(prob + 1e-10)
+
+        # Global confidence (geometric mean of probabilities)
+        avg_confidence = 0.0
+        if len(token_confidences) > 0:
+            avg_confidence = np.exp(sum_log_prob / len(token_confidences))
+
+        return {
+            "text": transcription,
+            "confidence": round(avg_confidence, 4),
+            "tokens": token_confidences
+        }
 
 # ============== WEBSOCKET ENDPOINT ==============
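Note on the confidence metric introduced above: the global score is the geometric mean of the per-token probabilities, exp((1/N) * sum(log p_i)), which penalizes a single low-probability token far more than an arithmetic mean would. Below is a minimal, self-contained sketch of the same computation with dummy logits standing in for real Whisper generate() outputs; the function and variable names here are illustrative, not part of app.py.

import numpy as np
import torch

def geometric_mean_confidence(scores, gen_tokens):
    """scores: tuple of (1, vocab_size) logit tensors, one per decoding step;
    gen_tokens: the token id chosen at each step."""
    sum_log_prob = 0.0
    count = 0
    for score, token_id in zip(scores, gen_tokens):
        probs = torch.softmax(score[0], dim=-1)  # logits -> probabilities
        p = probs[token_id].item()
        sum_log_prob += np.log(p + 1e-10)        # epsilon avoids log(0)
        count += 1
    return float(np.exp(sum_log_prob / count)) if count else 0.0

# Dummy example: 3 decoding steps over a 5-token vocabulary
torch.manual_seed(0)
scores = tuple(torch.randn(1, 5) for _ in range(3))
tokens = [int(torch.argmax(s[0])) for s in scores]
print(geometric_mean_confidence(scores, tokens))

The version in the diff additionally drops whitespace-only tokens before averaging, so formatting tokens do not dilute the score.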
static/app.js
CHANGED
@@ -8,13 +8,15 @@ class AudioRecorder {
     // Audio settings
     this.sampleRate = 16000;
     this.chunkSize = 512; // Samples per chunk
-    this.bufferSize =
+    this.bufferSize = 1024;
 
     // State
     this.isRecording = false;
     this.audioContext = null;
     this.mediaStream = null;
     this.processor = null;
+    this.analyser = null;
+    this.animationId = null;
     this.websocket = null;
 
     // UI elements
@@ -27,6 +29,11 @@ class AudioRecorder {
     this.connectionStatus = document.getElementById('connectionStatus');
     this.transcriptionContent = document.getElementById('transcriptionContent');
     this.transcriptionHistory = document.getElementById('transcriptionHistory');
+
+    this.confidencePanel = document.getElementById('confidencePanel');
+    this.confidenceTableBody = document.getElementById('confidenceTableBody');
+    this.globalConfidence = document.getElementById('globalConfidence');
+
     this.waveformCanvas = document.getElementById('waveformCanvas');
     this.waveformCtx = this.waveformCanvas.getContext('2d');
 
@@ -135,6 +142,12 @@ class AudioRecorder {
       // Create audio processing pipeline
       const source = this.audioContext.createMediaStreamSource(this.mediaStream);
 
+      // Create analyser for smooth visualization
+      this.analyser = this.audioContext.createAnalyser();
+      this.analyser.fftSize = 512; // Controls resolution of data
+      this.analyser.smoothingTimeConstant = 0.5;
+      this.audioDataBuffer = new Float32Array(this.analyser.fftSize);
+
       // Use ScriptProcessorNode for audio processing
       this.processor = this.audioContext.createScriptProcessor(this.bufferSize, 1, 1);
 
@@ -143,21 +156,21 @@ class AudioRecorder {
 
         const inputData = e.inputBuffer.getChannelData(0);
 
-        // Update visualization buffer
-        this.audioDataBuffer = new Float32Array(inputData.slice(0, 128));
-        this.drawWaveform(this.audioDataBuffer);
-
         // Send audio chunks to server
         this.sendAudioChunk(inputData);
       };
 
-      source.connect(this.
+      source.connect(this.analyser);
+      this.analyser.connect(this.processor);
       this.processor.connect(this.audioContext.destination);
 
       // Update UI
       this.isRecording = true;
       this.updateUI('recording');
 
+      // Start visualization loop
+      this.visualize();
+
     } catch (error) {
       console.error('Error starting recording:', error);
       this.updateStatus('listening', 'خطأ في الوصول للميكروفون');
@@ -167,12 +180,22 @@ class AudioRecorder {
   stopRecording() {
     this.isRecording = false;
 
+    if (this.animationId) {
+      cancelAnimationFrame(this.animationId);
+      this.animationId = null;
+    }
+
     // Stop audio processing
     if (this.processor) {
       this.processor.disconnect();
       this.processor = null;
     }
 
+    if (this.analyser) {
+      this.analyser.disconnect();
+      this.analyser = null;
+    }
+
     if (this.audioContext) {
       this.audioContext.close();
       this.audioContext = null;
@@ -193,6 +216,15 @@ class AudioRecorder {
     this.updateUI('stopped');
     this.drawIdleWaveform();
   }
+
+  visualize() {
+    if (!this.isRecording || !this.analyser) return;
+
+    this.analyser.getFloatTimeDomainData(this.audioDataBuffer);
+    this.drawWaveform(this.audioDataBuffer);
+
+    this.animationId = requestAnimationFrame(() => this.visualize());
+  }
 
   async connectWebSocket() {
     return new Promise((resolve, reject) => {
@@ -243,7 +275,7 @@ class AudioRecorder {
   }
 
   handleServerMessage(data) {
-    const { status, probability, transcription, remaining } = data;
+    const { status, probability, transcription, remaining, confidence, token_confidences } = data;
 
     // Update probability bar
     if (probability !== undefined) {
@@ -265,6 +297,9 @@ class AudioRecorder {
       case 'transcription':
        this.updateStatus('listening', 'تم التعرف على الكلام');
        this.showTranscription(transcription);
+        if (token_confidences) {
+          this.updateConfidenceTable(token_confidences, confidence);
+        }
        break;
     }
   }
@@ -300,6 +335,36 @@ class AudioRecorder {
     // Show new transcription
     this.transcriptionContent.innerHTML = `<p class="new">${text}</p>`;
   }
+
+  updateConfidenceTable(tokens, globalConf) {
+    this.confidencePanel.classList.remove('hidden');
+
+    // Update global confidence
+    const percentage = Math.round(globalConf * 100);
+    this.globalConfidence.textContent = `${percentage}%`;
+    this.globalConfidence.className = 'confidence-value';
+
+    if (percentage < 50) this.globalConfidence.classList.add('low');
+    else if (percentage < 80) this.globalConfidence.classList.add('medium');
+
+    // Update table
+    this.confidenceTableBody.innerHTML = '';
+
+    tokens.forEach(tk => {
+      const row = document.createElement('tr');
+      const prob = Math.round(tk.probability * 100);
+
+      let probClass = 'confidence-value';
+      if (prob < 50) probClass += ' low';
+      else if (prob < 80) probClass += ' medium';
+
+      row.innerHTML = `
+        <td>${tk.token}</td>
+        <td class="${probClass}">${prob}%</td>
+      `;
+      this.confidenceTableBody.appendChild(row);
+    });
+  }
 
   updateUI(state) {
     if (state === 'recording') {
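For context on the fields destructured in handleServerMessage above: they mirror the result dict assembled in app.py's silence-trigger branch. A hedged sketch of the server side of that exchange follows; the FastAPI-style send_json call is an assumption here, since the websocket endpoint itself is not shown in this diff.

from fastapi import WebSocket  # assumed framework for the websocket endpoint

async def send_transcription(websocket: WebSocket, result: dict) -> None:
    # result carries the same keys the client destructures:
    # status, transcription, confidence, token_confidences, probability
    await websocket.send_json(result)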
static/index.html
CHANGED
@@ -3,7 +3,7 @@
 <head>
   <meta charset="UTF-8" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-  <title>الصقر
+  <title>الصقر</title>
   <meta
     name="description"
     content="Real-time Arabic speech recognition for car plate numbers using VAD and Whisper ASR"
@@ -51,76 +51,109 @@
 
     <!-- Main content -->
     <main class="main-content">
-      <
-      <div class="
+      <div class="content-wrapper">
+        <div class="center-panel">
+          <!-- Status card -->
+          <div class="status-card glass-card">
+            <div class="status-indicator" id="statusIndicator">
+              <div class="pulse-ring"></div>
+              <div class="status-icon">
+                <svg
+                  viewBox="0 0 24 24"
+                  fill="none"
+                  stroke="currentColor"
+                  stroke-width="2"
+                >
+                  <path
+                    d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
+                  />
+                  <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+                </svg>
+              </div>
+            </div>
+            <p class="status-message" id="statusMessage">اضغط للبدء</p>
+            <div class="probability-bar" id="probabilityBar">
+              <div class="probability-fill" id="probabilityFill"></div>
+            </div>
           </div>
-      </div>
-      <p class="status-message" id="statusMessage">اضغط للبدء</p>
-      <div class="probability-bar" id="probabilityBar">
-        <div class="probability-fill" id="probabilityFill"></div>
-      </div>
-      </div>
 
+          <!-- Microphone button -->
+          <button
+            class="mic-button"
+            id="micButton"
+            aria-label="Start recording"
-        viewBox="0 0 24 24"
-        fill="none"
-        stroke="currentColor"
-        stroke-width="2"
          >
-      <
+            <div class="mic-button-inner">
+              <svg
+                class="mic-icon"
+                viewBox="0 0 24 24"
+                fill="none"
+                stroke="currentColor"
+                stroke-width="2"
+              >
+                <path
+                  d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
+                />
+                <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+                <line x1="12" y1="19" x2="12" y2="23" />
+                <line x1="8" y1="23" x2="16" y2="23" />
+              </svg>
+              <svg
+                class="stop-icon hidden"
+                viewBox="0 0 24 24"
+                fill="currentColor"
+              >
+                <rect x="6" y="6" width="12" height="12" rx="2" />
+              </svg>
+            </div>
+            <div class="mic-ripple"></div>
+          </button>
 
+          <!-- Waveform visualization -->
+          <div class="waveform-container glass-card" id="waveformContainer">
+            <canvas id="waveformCanvas"></canvas>
+          </div>
 
+          <!-- Transcription result -->
+          <div class="transcription-card glass-card" id="transcriptionCard">
+            <h2>النتيجة</h2>
+            <div class="transcription-content" id="transcriptionContent">
+              <p class="placeholder-text">
+                سيظهر النص هنا بعد انتهاء الكلام...
+              </p>
+            </div>
+            <div class="transcription-history" id="transcriptionHistory">
+              <!-- Previous transcriptions will be added here -->
+            </div>
+          </div>
         </div>
+
+        <!-- Confidence Side Panel -->
+        <div class="side-panel glass-card hidden" id="confidencePanel">
+          <h3>تفاصيل الدقة</h3>
+          <div class="confidence-summary">
+            <span>نسبة الثقة العامة:</span>
+            <span id="globalConfidence" class="confidence-value">0%</span>
+          </div>
+          <div class="table-container">
+            <table class="confidence-table">
+              <thead>
+                <tr>
+                  <th>الرمز</th>
+                  <th>الثقة</th>
+                </tr>
+              </thead>
+              <tbody id="confidenceTableBody">
+                <!-- Rows will be added dynamically -->
+              </tbody>
+            </table>
+          </div>
        </div>
       </div>
     </main>
 
     <!-- Footer -->
-    <footer class="footer">
-    </footer>
+    <footer class="footer"></footer>
   </div>
 
   <script src="/static/app.js"></script>
static/styles.css
CHANGED
@@ -81,7 +81,7 @@ body {
   display: flex;
   flex-direction: column;
   padding: var(--spacing-md);
-  max-width:
+  max-width: 1400px;
   margin: 0 auto;
 }
 
@@ -227,11 +227,121 @@
 
 /* Main content */
 .main-content {
+  flex: 1;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  width: 100%;
+}
+
+.content-wrapper {
+  display: flex;
+  flex-direction: row;
+  width: 100%;
+  gap: var(--spacing-lg);
+  justify-content: center;
+  align-items: flex-start;
+  flex-wrap: wrap;
+}
+
+.center-panel {
   flex: 1;
   display: flex;
   flex-direction: column;
   align-items: center;
   gap: var(--spacing-xl);
+  min-width: 300px;
+  max-width: 800px; /* Keep the original limit for the center part */
+}
+
+/* Side Panel & Confidence Table */
+.side-panel {
+  width: 300px;
+  padding: var(--spacing-lg);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-md);
+  max-height: 80vh;
+  position: sticky;
+  top: var(--spacing-md);
+  margin-top: var(--spacing-xl); /* Align with content */
+}
+
+.side-panel h3 {
+  font-size: 1.1rem;
+  font-weight: 600;
+  color: var(--text-primary);
+  margin-bottom: var(--spacing-xs);
+  text-align: center;
+}
+
+.confidence-summary {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: var(--spacing-sm) var(--spacing-md);
+  background: rgba(255, 255, 255, 0.05);
+  border-radius: var(--radius-sm);
+  margin-bottom: var(--spacing-sm);
+}
+
+.confidence-value {
+  font-weight: 700;
+  color: var(--success);
+}
+.confidence-value.low {
+  color: var(--error);
+}
+.confidence-value.medium {
+  color: var(--warning);
+}
+
+.table-container {
+  overflow-y: auto;
+  flex: 1;
+  border-radius: var(--radius-sm);
+  max-height: 400px;
+}
+
+.confidence-table {
+  width: 100%;
+  border-collapse: collapse;
+  font-size: 0.9rem;
+}
+
+.confidence-table th,
+.confidence-table td {
+  padding: var(--spacing-xs) var(--spacing-sm);
+  text-align: right;
+  border-bottom: 1px solid rgba(255, 255, 255, 0.05);
+}
+
+.confidence-table th {
+  color: var(--text-secondary);
+  font-weight: 500;
+  position: sticky;
+  top: 0;
+  background: rgba(0, 0, 0, 0.2);
+  backdrop-filter: blur(5px);
+}
+
+.confidence-table td:last-child {
+  text-align: left;
+  direction: ltr; /* numbers look better LTR */
+  font-variant-numeric: tabular-nums;
+}
+
+@media (max-width: 900px) {
+  .content-wrapper {
+    flex-direction: column;
+    align-items: center;
+  }
+  .side-panel {
+    width: 100%;
+    max-width: 500px;
+    position: static;
+    margin-top: 0;
+  }
 }
 
 /* Status card */
vad/HumAwareVAD.jit
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82630272369a072c4d0ff8d3df7cb7894812fc2007b24b8c1823ec3c61430d47
+size 2271580
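The three lines above are a Git LFS pointer, not the model itself: the actual ~2.2 MB TorchScript file is stored out of band and resolved by git-lfs on checkout. Once present, app.py loads it with torch.jit.load as shown in the diff. Below is a minimal usage sketch; the call signature model(chunk, sample_rate) returning a speech probability is the usual Silero-style interface and is an assumption here, since the inference call itself is not part of this commit.

import torch

SAMPLE_RATE = 16000
VAD_CHUNK_SIZE = 512   # Silero-style VADs expect exactly 512 samples at 16 kHz
VAD_THRESHOLD = 0.9    # speech-probability threshold from this commit

vad = torch.jit.load("vad/HumAwareVAD.jit", map_location="cpu")
vad.eval()

audio = torch.zeros(SAMPLE_RATE)  # one second of silence as dummy input
with torch.no_grad():
    for start in range(0, audio.numel() - VAD_CHUNK_SIZE + 1, VAD_CHUNK_SIZE):
        chunk = audio[start:start + VAD_CHUNK_SIZE]
        speech_prob = vad(chunk, SAMPLE_RATE).item()  # assumed Silero-style interface
        is_speech = speech_prob >= VAD_THRESHOLD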