sherif31 committed on
Commit
4beeb3d
·
1 Parent(s): f1fc6e5
Files changed (5) hide show
  1. app.py +58 -12
  2. static/app.js +72 -7
  3. static/index.html +93 -60
  4. static/styles.css +111 -1
  5. vad/HumAwareVAD.jit +3 -0
app.py CHANGED
@@ -34,7 +34,7 @@ model.generation_config.begin_suppress_tokens = [220, 50257]
34
  model.eval()
35
 
36
  # Load Silero VAD model
37
- def load_vad(model_path="vad/silero_vad.jit"):
38
  vad = torch.jit.load(model_path, map_location="cpu")
39
  vad.eval()
40
  return vad
@@ -69,8 +69,8 @@ class AudioProcessor:
69
 
70
  SAMPLE_RATE = 16000
71
  VAD_CHUNK_SIZE = 512 # Silero VAD requires exactly 512 samples at 16kHz
72
- SILENCE_THRESHOLD = 0.5 # seconds
73
- VAD_THRESHOLD = 0.7 # Speech probability threshold
74
 
75
  def __init__(self):
76
  self.reset()
@@ -152,13 +152,16 @@ class AudioProcessor:
152
  silence_duration = current_time - self.silence_start
153
 
154
  if silence_duration >= self.SILENCE_THRESHOLD:
 
155
  # Trigger ASR inference
156
  if self.speech_detected and len(self.audio_buffer) > 0:
157
- transcription = self._transcribe()
158
  self.reset()
159
  result = {
160
  "status": "transcription",
161
- "transcription": transcription,
 
 
162
  "probability": speech_prob
163
  }
164
  print(f"Sending transcription to client: {result}")
@@ -174,10 +177,10 @@ class AudioProcessor:
174
 
175
  return {"status": "listening", "probability": speech_prob}
176
 
177
- def _transcribe(self) -> str:
178
- """Run ASR on accumulated audio"""
179
  if not self.audio_buffer:
180
- return ""
181
 
182
  # Concatenate all audio chunks
183
  audio_data = np.concatenate(self.audio_buffer)
@@ -194,20 +197,63 @@ class AudioProcessor:
194
  )
195
  input_features = inputs.input_features.to(device)
196
 
197
- # Generate transcription
198
  with torch.no_grad():
199
- predicted_ids = model.generate(
200
  input_features,
201
  language="arabic",
202
- task="transcribe"
 
 
203
  )
204
 
 
 
 
 
205
  transcription = processor.batch_decode(
206
  predicted_ids,
207
  skip_special_tokens=True
208
  )[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- return transcription
 
 
 
 
 
 
 
 
 
211
 
212
 
213
  # ============== WEBSOCKET ENDPOINT ==============
 
34
  model.eval()
35
 
36
  # Load Silero VAD model
37
+ def load_vad(model_path="vad/HumAwareVAD.jit"):
38
  vad = torch.jit.load(model_path, map_location="cpu")
39
  vad.eval()
40
  return vad
 
69
 
70
  SAMPLE_RATE = 16000
71
  VAD_CHUNK_SIZE = 512 # Silero VAD requires exactly 512 samples at 16kHz
72
+ SILENCE_THRESHOLD = 0.25 # seconds
73
+ VAD_THRESHOLD = 0.9 # Speech probability threshold
74
 
75
  def __init__(self):
76
  self.reset()
 
152
  silence_duration = current_time - self.silence_start
153
 
154
  if silence_duration >= self.SILENCE_THRESHOLD:
155
+ # Silence has lasted at least SILENCE_THRESHOLD seconds: treat the utterance as finished
156
  # Trigger ASR inference
157
  if self.speech_detected and len(self.audio_buffer) > 0:
158
+ transcription_result = self._transcribe()
159
  self.reset()
160
  result = {
161
  "status": "transcription",
162
+ "transcription": transcription_result["text"],
163
+ "confidence": transcription_result["confidence"],
164
+ "token_confidences": transcription_result["tokens"],
165
  "probability": speech_prob
166
  }
167
  print(f"Sending transcription to client: {result}")
 
177
 
178
  return {"status": "listening", "probability": speech_prob}
179
 
180
+ def _transcribe(self) -> dict:
181
+ """Run ASR on accumulated audio and return transcription with confidence"""
182
  if not self.audio_buffer:
183
+ return {"text": "", "confidence": 0.0, "tokens": []}
184
 
185
  # Concatenate all audio chunks
186
  audio_data = np.concatenate(self.audio_buffer)
 
197
  )
198
  input_features = inputs.input_features.to(device)
199
 
200
+ # Generate transcription with scores
201
  with torch.no_grad():
202
+ outputs = model.generate(
203
  input_features,
204
  language="arabic",
205
+ task="transcribe",
206
+ output_scores=True,
207
+ return_dict_in_generate=True
208
  )
209
 
210
+ predicted_ids = outputs.sequences
211
+ scores = outputs.scores
212
+
213
+ # Decode transcription
214
  transcription = processor.batch_decode(
215
  predicted_ids,
216
  skip_special_tokens=True
217
  )[0]
218
+
219
+ # Calculate confidence scores
220
+ # scores is a tuple of tensors (one for each step), each shape (batch_size, vocab_size)
221
+ # predicted_ids is (batch_size, seq_len)
222
+
223
+ # We need to skip the initial tokens that were not generated (like start token, lang token, etc.)
224
+ # The number of generated tokens matches the length of scores
225
+ gen_tokens = predicted_ids[0, -len(scores):]
226
+
227
+ token_confidences = []
228
+ sum_log_prob = 0.0
229
+
230
+ for i, (score, token_id) in enumerate(zip(scores, gen_tokens)):
231
+ # score is (batch_size, vocab_size)
232
+ # Get softmax probabilities
233
+ probs = torch.softmax(score[0], dim=-1)
234
+ prob = probs[token_id].item()
235
+
236
+ # Get token text
237
+ token_text = processor.decode([token_id], skip_special_tokens=True)
238
+
239
+ # Keep only tokens whose decoded text is not empty or whitespace-only
240
+ if token_text.strip():
241
+ token_confidences.append({
242
+ "token": token_text,
243
+ "probability": round(prob, 4)
244
+ })
245
+ sum_log_prob += np.log(prob + 1e-10)
246
 
247
+ # Global confidence (geometric mean of probabilities)
248
+ avg_confidence = 0.0
249
+ if len(token_confidences) > 0:
250
+ avg_confidence = np.exp(sum_log_prob / len(token_confidences))
251
+
252
+ return {
253
+ "text": transcription,
254
+ "confidence": round(avg_confidence, 4),
255
+ "tokens": token_confidences
256
+ }
257
 
258
 
259
  # ============== WEBSOCKET ENDPOINT ==============
static/app.js CHANGED
@@ -8,13 +8,15 @@ class AudioRecorder {
8
  // Audio settings
9
  this.sampleRate = 16000;
10
  this.chunkSize = 512; // Samples per chunk
11
- this.bufferSize = 4096;
12
 
13
  // State
14
  this.isRecording = false;
15
  this.audioContext = null;
16
  this.mediaStream = null;
17
  this.processor = null;
 
 
18
  this.websocket = null;
19
 
20
  // UI elements
@@ -27,6 +29,11 @@ class AudioRecorder {
27
  this.connectionStatus = document.getElementById('connectionStatus');
28
  this.transcriptionContent = document.getElementById('transcriptionContent');
29
  this.transcriptionHistory = document.getElementById('transcriptionHistory');
 
 
 
 
 
30
  this.waveformCanvas = document.getElementById('waveformCanvas');
31
  this.waveformCtx = this.waveformCanvas.getContext('2d');
32
 
@@ -135,6 +142,12 @@ class AudioRecorder {
135
  // Create audio processing pipeline
136
  const source = this.audioContext.createMediaStreamSource(this.mediaStream);
137
 
 
 
 
 
 
 
138
  // Use ScriptProcessorNode for audio processing
139
  this.processor = this.audioContext.createScriptProcessor(this.bufferSize, 1, 1);
140
 
@@ -143,21 +156,21 @@ class AudioRecorder {
143
 
144
  const inputData = e.inputBuffer.getChannelData(0);
145
 
146
- // Update visualization buffer
147
- this.audioDataBuffer = new Float32Array(inputData.slice(0, 128));
148
- this.drawWaveform(this.audioDataBuffer);
149
-
150
  // Send audio chunks to server
151
  this.sendAudioChunk(inputData);
152
  };
153
 
154
- source.connect(this.processor);
 
155
  this.processor.connect(this.audioContext.destination);
156
 
157
  // Update UI
158
  this.isRecording = true;
159
  this.updateUI('recording');
160
 
 
 
 
161
  } catch (error) {
162
  console.error('Error starting recording:', error);
163
  this.updateStatus('listening', 'خطأ في الوصول للميكروفون');
@@ -167,12 +180,22 @@ class AudioRecorder {
167
  stopRecording() {
168
  this.isRecording = false;
169
 
 
 
 
 
 
170
  // Stop audio processing
171
  if (this.processor) {
172
  this.processor.disconnect();
173
  this.processor = null;
174
  }
175
 
 
 
 
 
 
176
  if (this.audioContext) {
177
  this.audioContext.close();
178
  this.audioContext = null;
@@ -193,6 +216,15 @@ class AudioRecorder {
193
  this.updateUI('stopped');
194
  this.drawIdleWaveform();
195
  }
 
 
 
 
 
 
 
 
 
196
 
197
  async connectWebSocket() {
198
  return new Promise((resolve, reject) => {
@@ -243,7 +275,7 @@ class AudioRecorder {
243
  }
244
 
245
  handleServerMessage(data) {
246
- const { status, probability, transcription, remaining } = data;
247
 
248
  // Update probability bar
249
  if (probability !== undefined) {
@@ -265,6 +297,9 @@ class AudioRecorder {
265
  case 'transcription':
266
  this.updateStatus('listening', 'تم التعرف على الكلام');
267
  this.showTranscription(transcription);
 
 
 
268
  break;
269
  }
270
  }
@@ -300,6 +335,36 @@ class AudioRecorder {
300
  // Show new transcription
301
  this.transcriptionContent.innerHTML = `<p class="new">${text}</p>`;
302
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  updateUI(state) {
305
  if (state === 'recording') {
 
8
  // Audio settings
9
  this.sampleRate = 16000;
10
  this.chunkSize = 512; // Samples per chunk
11
+ this.bufferSize = 1024;
12
 
13
  // State
14
  this.isRecording = false;
15
  this.audioContext = null;
16
  this.mediaStream = null;
17
  this.processor = null;
18
+ this.analyser = null;
19
+ this.animationId = null;
20
  this.websocket = null;
21
 
22
  // UI elements
 
29
  this.connectionStatus = document.getElementById('connectionStatus');
30
  this.transcriptionContent = document.getElementById('transcriptionContent');
31
  this.transcriptionHistory = document.getElementById('transcriptionHistory');
32
+
33
+ this.confidencePanel = document.getElementById('confidencePanel');
34
+ this.confidenceTableBody = document.getElementById('confidenceTableBody');
35
+ this.globalConfidence = document.getElementById('globalConfidence');
36
+
37
  this.waveformCanvas = document.getElementById('waveformCanvas');
38
  this.waveformCtx = this.waveformCanvas.getContext('2d');
39
 
 
142
  // Create audio processing pipeline
143
  const source = this.audioContext.createMediaStreamSource(this.mediaStream);
144
 
145
+ // Create analyser for smooth visualization
146
+ this.analyser = this.audioContext.createAnalyser();
147
+ this.analyser.fftSize = 512; // Controls resolution of data
148
+ this.analyser.smoothingTimeConstant = 0.5;
149
+ this.audioDataBuffer = new Float32Array(this.analyser.fftSize);
150
+
151
  // Use ScriptProcessorNode for audio processing
152
  this.processor = this.audioContext.createScriptProcessor(this.bufferSize, 1, 1);
153
 
 
156
 
157
  const inputData = e.inputBuffer.getChannelData(0);
158
 
 
 
 
 
159
  // Send audio chunks to server
160
  this.sendAudioChunk(inputData);
161
  };
162
 
163
+ source.connect(this.analyser);
164
+ this.analyser.connect(this.processor);
165
  this.processor.connect(this.audioContext.destination);
166
 
167
  // Update UI
168
  this.isRecording = true;
169
  this.updateUI('recording');
170
 
171
+ // Start visualization loop
172
+ this.visualize();
173
+
174
  } catch (error) {
175
  console.error('Error starting recording:', error);
176
  this.updateStatus('listening', 'خطأ في الوصول للميكروفون');
 
180
  stopRecording() {
181
  this.isRecording = false;
182
 
183
+ if (this.animationId) {
184
+ cancelAnimationFrame(this.animationId);
185
+ this.animationId = null;
186
+ }
187
+
188
  // Stop audio processing
189
  if (this.processor) {
190
  this.processor.disconnect();
191
  this.processor = null;
192
  }
193
 
194
+ if (this.analyser) {
195
+ this.analyser.disconnect();
196
+ this.analyser = null;
197
+ }
198
+
199
  if (this.audioContext) {
200
  this.audioContext.close();
201
  this.audioContext = null;
 
216
  this.updateUI('stopped');
217
  this.drawIdleWaveform();
218
  }
219
+
220
+ visualize() {
221
+ if (!this.isRecording || !this.analyser) return;
222
+
223
+ this.analyser.getFloatTimeDomainData(this.audioDataBuffer);
224
+ this.drawWaveform(this.audioDataBuffer);
225
+
226
+ this.animationId = requestAnimationFrame(() => this.visualize());
227
+ }
228
 
229
  async connectWebSocket() {
230
  return new Promise((resolve, reject) => {
 
275
  }
276
 
277
  handleServerMessage(data) {
278
+ const { status, probability, transcription, remaining, confidence, token_confidences } = data;
279
 
280
  // Update probability bar
281
  if (probability !== undefined) {
 
297
  case 'transcription':
298
  this.updateStatus('listening', 'تم التعرف على الكلام');
299
  this.showTranscription(transcription);
300
+ if (token_confidences) {
301
+ this.updateConfidenceTable(token_confidences, confidence);
302
+ }
303
  break;
304
  }
305
  }
 
335
  // Show new transcription
336
  this.transcriptionContent.innerHTML = `<p class="new">${text}</p>`;
337
  }
338
+
339
+ updateConfidenceTable(tokens, globalConf) {
340
+ this.confidencePanel.classList.remove('hidden');
341
+
342
+ // Update global confidence
343
+ const percentage = Math.round(globalConf * 100);
344
+ this.globalConfidence.textContent = `${percentage}%`;
345
+ this.globalConfidence.className = 'confidence-value';
346
+
347
+ if (percentage < 50) this.globalConfidence.classList.add('low');
348
+ else if (percentage < 80) this.globalConfidence.classList.add('medium');
349
+
350
+ // Update table
351
+ this.confidenceTableBody.innerHTML = '';
352
+
353
+ tokens.forEach(tk => {
354
+ const row = document.createElement('tr');
355
+ const prob = Math.round(tk.probability * 100);
356
+
357
+ let probClass = 'confidence-value';
358
+ if (prob < 50) probClass += ' low';
359
+ else if (prob < 80) probClass += ' medium';
360
+
361
+ row.innerHTML = `
362
+ <td>${tk.token}</td>
363
+ <td class="${probClass}">${prob}%</td>
364
+ `;
365
+ this.confidenceTableBody.appendChild(row);
366
+ });
367
+ }
368
 
369
  updateUI(state) {
370
  if (state === 'recording') {
static/index.html CHANGED
@@ -3,7 +3,7 @@
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
- <title>الصقر </title>
7
  <meta
8
  name="description"
9
  content="Real-time Arabic speech recognition for car plate numbers using VAD and Whisper ASR"
@@ -51,76 +51,109 @@
51
 
52
  <!-- Main content -->
53
  <main class="main-content">
54
- <!-- Status card -->
55
- <div class="status-card glass-card">
56
- <div class="status-indicator" id="statusIndicator">
57
- <div class="pulse-ring"></div>
58
- <div class="status-icon">
59
- <svg
60
- viewBox="0 0 24 24"
61
- fill="none"
62
- stroke="currentColor"
63
- stroke-width="2"
64
- >
65
- <path
66
- d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
67
- />
68
- <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
69
- </svg>
 
 
 
 
 
 
 
 
70
  </div>
71
- </div>
72
- <p class="status-message" id="statusMessage">اضغط للبدء</p>
73
- <div class="probability-bar" id="probabilityBar">
74
- <div class="probability-fill" id="probabilityFill"></div>
75
- </div>
76
- </div>
77
 
78
- <!-- Microphone button -->
79
- <button class="mic-button" id="micButton" aria-label="Start recording">
80
- <div class="mic-button-inner">
81
- <svg
82
- class="mic-icon"
83
- viewBox="0 0 24 24"
84
- fill="none"
85
- stroke="currentColor"
86
- stroke-width="2"
87
  >
88
- <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
89
- <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
90
- <line x1="12" y1="19" x2="12" y2="23" />
91
- <line x1="8" y1="23" x2="16" y2="23" />
92
- </svg>
93
- <svg
94
- class="stop-icon hidden"
95
- viewBox="0 0 24 24"
96
- fill="currentColor"
97
- >
98
- <rect x="6" y="6" width="12" height="12" rx="2" />
99
- </svg>
100
- </div>
101
- <div class="mic-ripple"></div>
102
- </button>
 
 
 
 
 
 
 
 
 
 
103
 
104
- <!-- Waveform visualization -->
105
- <div class="waveform-container glass-card" id="waveformContainer">
106
- <canvas id="waveformCanvas"></canvas>
107
- </div>
108
 
109
- <!-- Transcription result -->
110
- <div class="transcription-card glass-card" id="transcriptionCard">
111
- <h2>النتيجة</h2>
112
- <div class="transcription-content" id="transcriptionContent">
113
- <p class="placeholder-text">سيظهر النص هنا بعد انتهاء الكلام...</p>
 
 
 
 
 
 
 
114
  </div>
115
- <div class="transcription-history" id="transcriptionHistory">
116
- <!-- Previous transcriptions will be added here -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  </div>
118
  </div>
119
  </main>
120
 
121
  <!-- Footer -->
122
- <footer class="footer">
123
- </footer>
124
  </div>
125
 
126
  <script src="/static/app.js"></script>
 
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>الصقر</title>
7
  <meta
8
  name="description"
9
  content="Real-time Arabic speech recognition for car plate numbers using VAD and Whisper ASR"
 
51
 
52
  <!-- Main content -->
53
  <main class="main-content">
54
+ <div class="content-wrapper">
55
+ <div class="center-panel">
56
+ <!-- Status card -->
57
+ <div class="status-card glass-card">
58
+ <div class="status-indicator" id="statusIndicator">
59
+ <div class="pulse-ring"></div>
60
+ <div class="status-icon">
61
+ <svg
62
+ viewBox="0 0 24 24"
63
+ fill="none"
64
+ stroke="currentColor"
65
+ stroke-width="2"
66
+ >
67
+ <path
68
+ d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
69
+ />
70
+ <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
71
+ </svg>
72
+ </div>
73
+ </div>
74
+ <p class="status-message" id="statusMessage">اضغط للبدء</p>
75
+ <div class="probability-bar" id="probabilityBar">
76
+ <div class="probability-fill" id="probabilityFill"></div>
77
+ </div>
78
  </div>
 
 
 
 
 
 
79
 
80
+ <!-- Microphone button -->
81
+ <button
82
+ class="mic-button"
83
+ id="micButton"
84
+ aria-label="Start recording"
 
 
 
 
85
  >
86
+ <div class="mic-button-inner">
87
+ <svg
88
+ class="mic-icon"
89
+ viewBox="0 0 24 24"
90
+ fill="none"
91
+ stroke="currentColor"
92
+ stroke-width="2"
93
+ >
94
+ <path
95
+ d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
96
+ />
97
+ <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
98
+ <line x1="12" y1="19" x2="12" y2="23" />
99
+ <line x1="8" y1="23" x2="16" y2="23" />
100
+ </svg>
101
+ <svg
102
+ class="stop-icon hidden"
103
+ viewBox="0 0 24 24"
104
+ fill="currentColor"
105
+ >
106
+ <rect x="6" y="6" width="12" height="12" rx="2" />
107
+ </svg>
108
+ </div>
109
+ <div class="mic-ripple"></div>
110
+ </button>
111
 
112
+ <!-- Waveform visualization -->
113
+ <div class="waveform-container glass-card" id="waveformContainer">
114
+ <canvas id="waveformCanvas"></canvas>
115
+ </div>
116
 
117
+ <!-- Transcription result -->
118
+ <div class="transcription-card glass-card" id="transcriptionCard">
119
+ <h2>النتيجة</h2>
120
+ <div class="transcription-content" id="transcriptionContent">
121
+ <p class="placeholder-text">
122
+ سيظهر النص هنا بعد انتهاء الكلام...
123
+ </p>
124
+ </div>
125
+ <div class="transcription-history" id="transcriptionHistory">
126
+ <!-- Previous transcriptions will be added here -->
127
+ </div>
128
+ </div>
129
  </div>
130
+
131
+ <!-- Confidence Side Panel -->
132
+ <div class="side-panel glass-card hidden" id="confidencePanel">
133
+ <h3>تفاصيل الدقة</h3>
134
+ <div class="confidence-summary">
135
+ <span>نسبة الثقة العامة:</span>
136
+ <span id="globalConfidence" class="confidence-value">0%</span>
137
+ </div>
138
+ <div class="table-container">
139
+ <table class="confidence-table">
140
+ <thead>
141
+ <tr>
142
+ <th>الرمز</th>
143
+ <th>الثقة</th>
144
+ </tr>
145
+ </thead>
146
+ <tbody id="confidenceTableBody">
147
+ <!-- Rows will be added dynamically -->
148
+ </tbody>
149
+ </table>
150
+ </div>
151
  </div>
152
  </div>
153
  </main>
154
 
155
  <!-- Footer -->
156
+ <footer class="footer"></footer>
 
157
  </div>
158
 
159
  <script src="/static/app.js"></script>
static/styles.css CHANGED
@@ -81,7 +81,7 @@ body {
81
  display: flex;
82
  flex-direction: column;
83
  padding: var(--spacing-md);
84
- max-width: 800px;
85
  margin: 0 auto;
86
  }
87
 
@@ -227,11 +227,121 @@ body {
227
 
228
  /* Main content */
229
  .main-content {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  flex: 1;
231
  display: flex;
232
  flex-direction: column;
233
  align-items: center;
234
  gap: var(--spacing-xl);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  }
236
 
237
  /* Status card */
 
81
  display: flex;
82
  flex-direction: column;
83
  padding: var(--spacing-md);
84
+ max-width: 1400px;
85
  margin: 0 auto;
86
  }
87
 
 
227
 
228
  /* Main content */
229
  .main-content {
230
+ flex: 1;
231
+ display: flex;
232
+ flex-direction: column;
233
+ align-items: center;
234
+ width: 100%;
235
+ }
236
+
237
+ .content-wrapper {
238
+ display: flex;
239
+ flex-direction: row;
240
+ width: 100%;
241
+ gap: var(--spacing-lg);
242
+ justify-content: center;
243
+ align-items: flex-start;
244
+ flex-wrap: wrap;
245
+ }
246
+
247
+ .center-panel {
248
  flex: 1;
249
  display: flex;
250
  flex-direction: column;
251
  align-items: center;
252
  gap: var(--spacing-xl);
253
+ min-width: 300px;
254
+ max-width: 800px; /* Keep the original limit for the center part */
255
+ }
256
+
257
+ /* Side Panel & Confidence Table */
258
+ .side-panel {
259
+ width: 300px;
260
+ padding: var(--spacing-lg);
261
+ display: flex;
262
+ flex-direction: column;
263
+ gap: var(--spacing-md);
264
+ max-height: 80vh;
265
+ position: sticky;
266
+ top: var(--spacing-md);
267
+ margin-top: var(--spacing-xl); /* Align with content */
268
+ }
269
+
270
+ .side-panel h3 {
271
+ font-size: 1.1rem;
272
+ font-weight: 600;
273
+ color: var(--text-primary);
274
+ margin-bottom: var(--spacing-xs);
275
+ text-align: center;
276
+ }
277
+
278
+ .confidence-summary {
279
+ display: flex;
280
+ justify-content: space-between;
281
+ align-items: center;
282
+ padding: var(--spacing-sm) var(--spacing-md);
283
+ background: rgba(255, 255, 255, 0.05);
284
+ border-radius: var(--radius-sm);
285
+ margin-bottom: var(--spacing-sm);
286
+ }
287
+
288
+ .confidence-value {
289
+ font-weight: 700;
290
+ color: var(--success);
291
+ }
292
+ .confidence-value.low {
293
+ color: var(--error);
294
+ }
295
+ .confidence-value.medium {
296
+ color: var(--warning);
297
+ }
298
+
299
+ .table-container {
300
+ overflow-y: auto;
301
+ flex: 1;
302
+ border-radius: var(--radius-sm);
303
+ max-height: 400px;
304
+ }
305
+
306
+ .confidence-table {
307
+ width: 100%;
308
+ border-collapse: collapse;
309
+ font-size: 0.9rem;
310
+ }
311
+
312
+ .confidence-table th,
313
+ .confidence-table td {
314
+ padding: var(--spacing-xs) var(--spacing-sm);
315
+ text-align: right;
316
+ border-bottom: 1px solid rgba(255, 255, 255, 0.05);
317
+ }
318
+
319
+ .confidence-table th {
320
+ color: var(--text-secondary);
321
+ font-weight: 500;
322
+ position: sticky;
323
+ top: 0;
324
+ background: rgba(0, 0, 0, 0.2);
325
+ backdrop-filter: blur(5px);
326
+ }
327
+
328
+ .confidence-table td:last-child {
329
+ text-align: left;
330
+ direction: ltr; /* numbers look better LTR */
331
+ font-variant-numeric: tabular-nums;
332
+ }
333
+
334
+ @media (max-width: 900px) {
335
+ .content-wrapper {
336
+ flex-direction: column;
337
+ align-items: center;
338
+ }
339
+ .side-panel {
340
+ width: 100%;
341
+ max-width: 500px;
342
+ position: static;
343
+ margin-top: 0;
344
+ }
345
  }
346
 
347
  /* Status card */
vad/HumAwareVAD.jit ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82630272369a072c4d0ff8d3df7cb7894812fc2007b24b8c1823ec3c61430d47
3
+ size 2271580