sherif31 committed
Commit f1fc6e5 · 1 Parent(s): de55e06

Deploy to HF Spaces with Docker
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     git-lfs \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user (required by HF Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR /home/user/app
+
+ # Copy requirements first for caching
+ COPY --chown=user:user requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application
+ COPY --chown=user:user . .
+
+ # Expose the port (HF Spaces expects port 7860)
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
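A quick way to smoke-test this image locally is to build it, run it with port 7860 published, and poll the `/health` endpoint that app.py defines below. This is a minimal sketch; the `car-plate-asr` tag is an arbitrary choice, not part of the commit:

```python
# Assumes the container was started first, e.g.:
#   docker build -t car-plate-asr .
#   docker run -p 7860:7860 car-plate-asr
import json
import urllib.request

# /health (defined in app.py) reports service status and inference device
with urllib.request.urlopen("http://localhost:7860/health") as resp:
    print(json.load(resp))  # e.g. {"status": "healthy", "device": "cpu"}
```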
README.md CHANGED
@@ -1,12 +1,20 @@
  ---
  title: Car Plate ASR
- emoji: 🌖
+ emoji: 🚗
  colorFrom: blue
  colorTo: indigo
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
+ sdk: docker
  pinned: false
  ---

+ # Car Plate ASR
+
+ Real-time Arabic car plate speech recognition using Whisper and Silero VAD.
+
+ ## Features
+
+ - 🎤 Real-time voice activity detection (VAD)
+ - 🗣️ Arabic speech recognition for car plate numbers
+ - ⚡ WebSocket-based streaming for low latency
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,106 +1,256 @@
- import gradio as gr
  import torch
  import torchaudio
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
  import numpy as np
+ import asyncio
+ import time
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.responses import FileResponse
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ from pathlib import Path
+ import struct

- # Load model and processor globally
- MODEL_PATH = "arabic_car_plate"
- print("Loading model and processor...")
+ # Set torch threads
+ torch.set_num_threads(1)

- # Use GPU if available
+ # ============== LOAD MODELS ==============
+ print("Loading models...")
+
+ # Device setup
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")

- # Load processor and model with optimizations
+ # Load Whisper ASR model
+ MODEL_PATH = "arabic_car_plate"
  processor = WhisperProcessor.from_pretrained(
-     "openai/whisper-base",
-     language="arabic",
+     "openai/whisper-base",
+     language="arabic",
      task="transcribe"
  )

- model = WhisperForConditionalGeneration.from_pretrained(
-     MODEL_PATH,
- ).to(device)
-
+ model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
  model.generation_config.suppress_tokens = []
  model.generation_config.begin_suppress_tokens = [220, 50257]
-
- # Set model to evaluation mode
  model.eval()

- print("Model loaded successfully!")
+ # Load Silero VAD model
+ def load_vad(model_path="vad/silero_vad.jit"):
+     vad = torch.jit.load(model_path, map_location="cpu")
+     vad.eval()
+     return vad
+
+ vad_model = load_vad()
+
+ print("Models loaded successfully!")
+
+ # ============== FASTAPI APP ==============
+ app = FastAPI(title="Real-time VAD-ASR Pipeline")
+
+ # Mount static files
+ static_path = Path(__file__).parent / "static"
+ static_path.mkdir(exist_ok=True)
+ app.mount("/static", StaticFiles(directory=str(static_path)), name="static")

- def transcribe_audio(audio):
-     if audio is None:
-         return "No audio recorded. Please record audio first."
+ @app.get("/")
+ async def root():
+     """Serve the main HTML page"""
+     return FileResponse(static_path / "index.html")
+
+
+ # ============== AUDIO PROCESSOR ==============
+ class AudioProcessor:
+     """
+     Real-time audio processor with VAD and ASR.
+     - Detects speech using Silero VAD
+     - Accumulates audio while speaking
+     - After 0.5 s of silence (SILENCE_THRESHOLD), triggers ASR inference
+     """
+
+     SAMPLE_RATE = 16000
+     VAD_CHUNK_SIZE = 512  # Silero VAD requires exactly 512 samples at 16 kHz
+     SILENCE_THRESHOLD = 0.5  # seconds
+     VAD_THRESHOLD = 0.7  # Speech probability threshold

-     sample_rate, audio_data = audio
-
-     if len(audio_data.shape) > 1:
-         audio_data = audio_data.mean(axis=1)
-
-     audio_data = audio_data.astype(np.float32)
-     audio_data /= np.max(np.abs(audio_data))
-
-     # Resample to 16 kHz if needed
-     if sample_rate != 16000:
-         audio_tensor = torch.from_numpy(audio_data).float()
-         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-         audio_data = resampler(audio_tensor).numpy()
-
-     # Preprocess
-     inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt")
-     input_features = inputs.input_features.to(device)
-     attention_mask = inputs.get("attention_mask", None)
-     if attention_mask is not None:
-         attention_mask = attention_mask.to(device)
-
-     # Generate transcription
-     with torch.no_grad():
-         predicted_ids = model.generate(
-             input_features,
-             attention_mask=attention_mask,
-             language="arabic",
-             task="transcribe"
+     def __init__(self):
+         self.reset()
+
+     def reset(self):
+         """Reset the processor state"""
+         self.audio_buffer = []  # For ASR (full audio)
+         self.pending_samples = np.array([], dtype=np.float32)  # Buffer for incomplete VAD chunks
+         self.is_speaking = False
+         self.silence_start = None
+         self.speech_detected = False
+         self.last_result = {"status": "listening", "probability": 0.0}
+         # Reset VAD model state
+         vad_model.reset_states()
+
+     def process_chunk(self, audio_chunk: np.ndarray) -> dict:
+         """
+         Process an audio chunk and return the current state.
+         Splits incoming audio into 512-sample chunks for VAD.
+
+         Returns:
+             dict with keys:
+             - 'status': 'speaking', 'silence', 'listening', or 'transcription'
+             - 'transcription': str (only if status is 'transcription')
+         """
+         # Add incoming audio to pending buffer
+         self.pending_samples = np.concatenate([self.pending_samples, audio_chunk])
+
+         # Also add to audio buffer for later ASR
+         self.audio_buffer.append(audio_chunk)
+
+         # Process all complete 512-sample chunks
+         result = self.last_result
+
+         while len(self.pending_samples) >= self.VAD_CHUNK_SIZE:
+             # Extract exactly 512 samples
+             vad_chunk = self.pending_samples[:self.VAD_CHUNK_SIZE]
+             self.pending_samples = self.pending_samples[self.VAD_CHUNK_SIZE:]
+
+             # Process this chunk through VAD
+             result = self._process_vad_chunk(vad_chunk)
+
+             # If we got a transcription, return immediately
+             if result["status"] == "transcription":
+                 return result
+
+         self.last_result = result
+         return result
+
+     def _process_vad_chunk(self, audio_chunk: np.ndarray) -> dict:
+         """Process a single 512-sample chunk through VAD"""
+         # Convert to tensor
+         audio_tensor = torch.from_numpy(audio_chunk).float()
+
+         # Normalize audio
+         if audio_tensor.abs().max() > 0:
+             audio_tensor = audio_tensor / audio_tensor.abs().max()
+
+         # Run VAD on the chunk
+         speech_prob = vad_model(audio_tensor, self.SAMPLE_RATE).item()
+
+         current_time = time.time()
+
+         if speech_prob >= self.VAD_THRESHOLD:
+             # Speech detected
+             self.is_speaking = True
+             self.speech_detected = True
+             self.silence_start = None
+             return {"status": "speaking", "probability": speech_prob}
+         else:
+             # Silence detected
+             if self.is_speaking:
+                 # Just stopped speaking
+                 if self.silence_start is None:
+                     self.silence_start = current_time
+                     return {"status": "silence", "probability": speech_prob}
+
+                 # Check if silence duration exceeded threshold
+                 silence_duration = current_time - self.silence_start
+
+                 if silence_duration >= self.SILENCE_THRESHOLD:
+                     # Trigger ASR inference
+                     if self.speech_detected and len(self.audio_buffer) > 0:
+                         transcription = self._transcribe()
+                         self.reset()
+                         result = {
+                             "status": "transcription",
+                             "transcription": transcription,
+                             "probability": speech_prob
+                         }
+                         print(f"Sending transcription to client: {result}")
+                         return result
+                 else:
+                     # Still accumulating silence
+                     remaining = self.SILENCE_THRESHOLD - silence_duration
+                     return {
+                         "status": "silence",
+                         "probability": speech_prob,
+                         "remaining": round(remaining, 2)
+                     }
+
+             return {"status": "listening", "probability": speech_prob}
+
+     def _transcribe(self) -> str:
+         """Run ASR on accumulated audio"""
+         if not self.audio_buffer:
+             return ""
+
+         # Concatenate all audio chunks
+         audio_data = np.concatenate(self.audio_buffer)
+
+         # Normalize
+         if np.max(np.abs(audio_data)) > 0:
+             audio_data = audio_data / np.max(np.abs(audio_data))
+
+         # Preprocess for Whisper
+         inputs = processor(
+             audio_data,
+             sampling_rate=self.SAMPLE_RATE,
+             return_tensors="pt"
          )
+         input_features = inputs.input_features.to(device)
+
+         # Generate transcription
+         with torch.no_grad():
+             predicted_ids = model.generate(
+                 input_features,
+                 language="arabic",
+                 task="transcribe"
+             )

-     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-     return transcription
+         transcription = processor.batch_decode(
+             predicted_ids,
+             skip_special_tokens=True
+         )[0]
+
+         return transcription

- # Create Gradio interface
- with gr.Blocks(title="تفريع نصي للوح السياره") as demo:
+
+ # ============== WEBSOCKET ENDPOINT ==============
+ @app.websocket("/ws/audio")
+ async def websocket_audio(websocket: WebSocket):
+     """
+     WebSocket endpoint for real-time audio processing.

-     with gr.Row(equal_height=True):
-         with gr.Column(scale=1):
-             audio_input = gr.Audio(
-                 sources=["microphone"],
-                 type="numpy",
-                 label="🎤 Record Audio",
-                 show_download_button=True,
-             )
+     Expects binary audio data:
+     - Format: 16-bit PCM, mono, 16 kHz
+     - Chunks of any length; the server re-buffers them into 512-sample (1024-byte) VAD frames
+     """
+     await websocket.accept()
+     processor_instance = AudioProcessor()
+
+     try:
+         while True:
+             # Receive binary audio data
+             data = await websocket.receive_bytes()
+
+             # Convert bytes to numpy array (16-bit PCM)
+             audio_chunk = np.frombuffer(data, dtype=np.int16).astype(np.float32)
+             audio_chunk = audio_chunk / 32768.0  # Normalize to [-1, 1]

-         with gr.Column(scale=1):
-             transcribe_btn = gr.Button(
-                 "تم",
-                 variant="primary",
-                 size="lg"
-             )
-             output_text = gr.Textbox(
-                 label="📝لوحه السياره",
-                 placeholder="تفريغ هيظهر هنا",
-                 lines=2,
-                 max_lines=2,
-                 scale=1,
-                 interactive=False
-             )
+             # Process the chunk
+             result = processor_instance.process_chunk(audio_chunk)
+
+             # Send result back
+             await websocket.send_json(result)
+
+     except WebSocketDisconnect:
+         print("Client disconnected")
+     except Exception as e:
+         print(f"Error: {e}")
+         await websocket.close()

-     transcribe_btn.click(
-         fn=transcribe_audio,
-         inputs=audio_input,
-         outputs=output_text
-     )
+ # ============== HEALTH CHECK ==============
+ @app.get("/health")
+ async def health():
+     return {"status": "healthy", "device": device}
+

- # Launch the app
  if __name__ == "__main__":
-     demo.launch()
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
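To illustrate the wire protocol of `/ws/audio`, here is a minimal sketch of a test client. The WAV path and localhost URL are assumptions, not part of the commit; `websockets` is already pinned in requirements.txt. It streams raw 16-bit PCM chunks, just as app.js does, and prints the transcriptions the server pushes back:

```python
# Hypothetical test client; assumes a 16 kHz mono 16-bit WAV at this path
# and the server running locally on port 7860.
import asyncio
import json
import wave

import websockets


async def stream_wav(path="sample_16k_mono.wav", chunk_samples=4096):
    async with websockets.connect("ws://localhost:7860/ws/audio") as ws:
        with wave.open(path, "rb") as wav:
            # The endpoint expects 16-bit PCM, mono, 16 kHz
            assert wav.getframerate() == 16000
            assert wav.getnchannels() == 1 and wav.getsampwidth() == 2
            while frames := wav.readframes(chunk_samples):
                await ws.send(frames)  # raw int16 bytes, like app.js sends
                reply = json.loads(await ws.recv())
                if reply["status"] == "transcription":
                    print("Transcription:", reply["transcription"])


asyncio.run(stream_wav())
```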
arabic_car_plate/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b672757645b5a8dee35933502414666eebd835eedd811456949a976b5f8f250
+ size 290403936
requirements.txt CHANGED
@@ -1,4 +1,8 @@
  torch
  torchaudio
  transformers
- numpy
+ numpy
+ fastapi
+ uvicorn[standard]
+ websockets
+ python-multipart
static/app.js ADDED
@@ -0,0 +1,324 @@
+ /**
+  * Real-time VAD-ASR Pipeline - Frontend Application
+  * Handles microphone capture, WebSocket communication, and UI updates
+  */
+
+ class AudioRecorder {
+   constructor() {
+     // Audio settings
+     this.sampleRate = 16000;
+     this.chunkSize = 512; // Samples per chunk
+     this.bufferSize = 4096;
+
+     // State
+     this.isRecording = false;
+     this.audioContext = null;
+     this.mediaStream = null;
+     this.processor = null;
+     this.websocket = null;
+
+     // UI elements
+     this.micButton = document.getElementById('micButton');
+     this.micIcon = document.querySelector('.mic-icon');
+     this.stopIcon = document.querySelector('.stop-icon');
+     this.statusIndicator = document.getElementById('statusIndicator');
+     this.statusMessage = document.getElementById('statusMessage');
+     this.probabilityFill = document.getElementById('probabilityFill');
+     this.connectionStatus = document.getElementById('connectionStatus');
+     this.transcriptionContent = document.getElementById('transcriptionContent');
+     this.transcriptionHistory = document.getElementById('transcriptionHistory');
+     this.waveformCanvas = document.getElementById('waveformCanvas');
+     this.waveformCtx = this.waveformCanvas.getContext('2d');
+
+     // Audio buffer for visualization
+     this.audioDataBuffer = new Float32Array(128);
+
+     // Bind events
+     this.micButton.addEventListener('click', () => this.toggleRecording());
+
+     // Initialize canvas
+     this.initCanvas();
+     window.addEventListener('resize', () => this.initCanvas());
+   }
+
+   initCanvas() {
+     const container = this.waveformCanvas.parentElement;
+     this.waveformCanvas.width = container.clientWidth - 32;
+     this.waveformCanvas.height = 80;
+     this.drawIdleWaveform();
+   }
+
+   drawIdleWaveform() {
+     const { width, height } = this.waveformCanvas;
+     this.waveformCtx.fillStyle = 'rgba(99, 102, 241, 0.1)';
+     this.waveformCtx.fillRect(0, 0, width, height);
+
+     this.waveformCtx.strokeStyle = 'rgba(99, 102, 241, 0.3)';
+     this.waveformCtx.lineWidth = 2;
+     this.waveformCtx.beginPath();
+     this.waveformCtx.moveTo(0, height / 2);
+     this.waveformCtx.lineTo(width, height / 2);
+     this.waveformCtx.stroke();
+   }
+
+   drawWaveform(audioData) {
+     const { width, height } = this.waveformCanvas;
+     const ctx = this.waveformCtx;
+
+     // Clear canvas
+     ctx.fillStyle = 'rgba(10, 10, 26, 0.3)';
+     ctx.fillRect(0, 0, width, height);
+
+     // Draw waveform
+     const gradient = ctx.createLinearGradient(0, 0, width, 0);
+     gradient.addColorStop(0, '#6366f1');
+     gradient.addColorStop(0.5, '#8b5cf6');
+     gradient.addColorStop(1, '#a855f7');
+
+     ctx.strokeStyle = gradient;
+     ctx.lineWidth = 2;
+     ctx.beginPath();
+
+     const sliceWidth = width / audioData.length;
+     let x = 0;
+
+     for (let i = 0; i < audioData.length; i++) {
+       const v = audioData[i] * 0.5 + 0.5;
+       const y = v * height;
+
+       if (i === 0) {
+         ctx.moveTo(x, y);
+       } else {
+         ctx.lineTo(x, y);
+       }
+
+       x += sliceWidth;
+     }
+
+     ctx.stroke();
+
+     // Add glow effect
+     ctx.shadowColor = '#6366f1';
+     ctx.shadowBlur = 10;
+     ctx.stroke();
+     ctx.shadowBlur = 0;
+   }
+
+   async toggleRecording() {
+     if (this.isRecording) {
+       this.stopRecording();
+     } else {
+       await this.startRecording();
+     }
+   }
+
+   async startRecording() {
+     try {
+       // Request microphone access
+       this.mediaStream = await navigator.mediaDevices.getUserMedia({
+         audio: {
+           channelCount: 1,
+           sampleRate: this.sampleRate,
+           echoCancellation: true,
+           noiseSuppression: true
+         }
+       });
+
+       // Create audio context
+       this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
+         sampleRate: this.sampleRate
+       });
+
+       // Connect WebSocket
+       await this.connectWebSocket();
+
+       // Create audio processing pipeline
+       const source = this.audioContext.createMediaStreamSource(this.mediaStream);
+
+       // Use ScriptProcessorNode for audio processing
+       this.processor = this.audioContext.createScriptProcessor(this.bufferSize, 1, 1);
+
+       this.processor.onaudioprocess = (e) => {
+         if (!this.isRecording) return;
+
+         const inputData = e.inputBuffer.getChannelData(0);
+
+         // Update visualization buffer
+         this.audioDataBuffer = new Float32Array(inputData.slice(0, 128));
+         this.drawWaveform(this.audioDataBuffer);
+
+         // Send audio chunks to server
+         this.sendAudioChunk(inputData);
+       };
+
+       source.connect(this.processor);
+       this.processor.connect(this.audioContext.destination);
+
+       // Update UI
+       this.isRecording = true;
+       this.updateUI('recording');
+
+     } catch (error) {
+       console.error('Error starting recording:', error);
+       this.updateStatus('listening', 'خطأ في الوصول للميكروفون');
+     }
+   }
+
+   stopRecording() {
+     this.isRecording = false;
+
+     // Stop audio processing
+     if (this.processor) {
+       this.processor.disconnect();
+       this.processor = null;
+     }
+
+     if (this.audioContext) {
+       this.audioContext.close();
+       this.audioContext = null;
+     }
+
+     if (this.mediaStream) {
+       this.mediaStream.getTracks().forEach(track => track.stop());
+       this.mediaStream = null;
+     }
+
+     // Close WebSocket
+     if (this.websocket) {
+       this.websocket.close();
+       this.websocket = null;
+     }
+
+     // Update UI
+     this.updateUI('stopped');
+     this.drawIdleWaveform();
+   }
+
+   async connectWebSocket() {
+     return new Promise((resolve, reject) => {
+       const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+       const wsUrl = `${protocol}//${window.location.host}/ws/audio`;
+
+       this.websocket = new WebSocket(wsUrl);
+
+       this.websocket.onopen = () => {
+         console.log('WebSocket connected');
+         this.connectionStatus.classList.add('connected');
+         this.connectionStatus.querySelector('.status-text').textContent = 'متصل';
+         resolve();
+       };
+
+       this.websocket.onclose = () => {
+         console.log('WebSocket disconnected');
+         this.connectionStatus.classList.remove('connected');
+         this.connectionStatus.querySelector('.status-text').textContent = 'غير متصل';
+       };
+
+       this.websocket.onerror = (error) => {
+         console.error('WebSocket error:', error);
+         reject(error);
+       };
+
+       this.websocket.onmessage = (event) => {
+         const data = JSON.parse(event.data);
+         this.handleServerMessage(data);
+       };
+     });
+   }
+
+   sendAudioChunk(audioData) {
+     if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
+       return;
+     }
+
+     // Convert Float32 to Int16 for transmission
+     const int16Data = new Int16Array(audioData.length);
+     for (let i = 0; i < audioData.length; i++) {
+       const s = Math.max(-1, Math.min(1, audioData[i]));
+       int16Data[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+     }
+
+     // Send as binary
+     this.websocket.send(int16Data.buffer);
+   }
+
+   handleServerMessage(data) {
+     const { status, probability, transcription, remaining } = data;
+
+     // Update probability bar
+     if (probability !== undefined) {
+       this.probabilityFill.style.width = `${probability * 100}%`;
+     }
+
+     // Update status
+     switch (status) {
+       case 'speaking':
+         this.updateStatus('speaking', 'جاري التحدث...');
+         break;
+       case 'silence':
+         const remainingText = remaining ? ` (${remaining}s)` : '';
+         this.updateStatus('silence', `صمت${remainingText}`);
+         break;
+       case 'listening':
+         this.updateStatus('listening', 'في انتظار الكلام...');
+         break;
+       case 'transcription':
+         this.updateStatus('listening', 'تم التعرف على الكلام');
+         this.showTranscription(transcription);
+         break;
+     }
+   }
+
+   updateStatus(state, message) {
+     // Update status indicator class
+     this.statusIndicator.className = 'status-indicator';
+     if (state === 'speaking' || state === 'silence') {
+       this.statusIndicator.classList.add(state);
+     }
+
+     // Update message
+     this.statusMessage.textContent = message;
+   }
+
+   showTranscription(text) {
+     if (!text || text.trim() === '') return;
+
+     // Move current transcription to history
+     const currentText = this.transcriptionContent.querySelector('p:not(.placeholder-text)');
+     if (currentText && currentText.textContent.trim()) {
+       const historyItem = document.createElement('div');
+       historyItem.className = 'history-item new';
+       historyItem.textContent = currentText.textContent;
+       this.transcriptionHistory.insertBefore(historyItem, this.transcriptionHistory.firstChild);
+
+       // Limit history to 10 items
+       while (this.transcriptionHistory.children.length > 10) {
+         this.transcriptionHistory.removeChild(this.transcriptionHistory.lastChild);
+       }
+     }
+
+     // Show new transcription
+     this.transcriptionContent.innerHTML = `<p class="new">${text}</p>`;
+   }
+
+   updateUI(state) {
+     if (state === 'recording') {
+       this.micButton.classList.add('recording');
+       this.micIcon.classList.add('hidden');
+       this.stopIcon.classList.remove('hidden');
+       this.statusMessage.textContent = 'في انتظار الكلام...';
+     } else {
+       this.micButton.classList.remove('recording');
+       this.micIcon.classList.remove('hidden');
+       this.stopIcon.classList.add('hidden');
+       this.statusMessage.textContent = 'اضغط للبدء';
+       this.statusIndicator.className = 'status-indicator';
+       this.probabilityFill.style.width = '0%';
+     }
+   }
+ }
+
+ // Initialize on page load
+ document.addEventListener('DOMContentLoaded', () => {
+   new AudioRecorder();
+ });
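The `sendAudioChunk` encoding above is the exact inverse of the server-side decode in app.py: negative samples are scaled by 0x8000, positive ones by 0x7FFF, and the server divides by 32768.0. A small illustrative check of that round trip (not part of the app):

```python
# Verifies the app.js float -> int16 encoding against the app.py decode.
import numpy as np

x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
encoded = np.where(x < 0, x * 0x8000, x * 0x7FFF).astype(np.int16)
decoded = encoded.astype(np.float32) / 32768.0
print(encoded)  # [-32768 -16384      0  16383  32767]
print(decoded)  # ~x; +1.0 maps to 32767/32768 ~= 0.99997, so int16 never overflows
```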
static/index.html ADDED
@@ -0,0 +1,128 @@
+ <!DOCTYPE html>
+ <html lang="ar" dir="rtl">
+ <head>
+   <meta charset="UTF-8" />
+   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+   <title>الصقر</title>
+   <meta
+     name="description"
+     content="Real-time Arabic speech recognition for car plate numbers using VAD and Whisper ASR"
+   />
+   <link rel="preconnect" href="https://fonts.googleapis.com" />
+   <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+   <link
+     href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Noto+Sans+Arabic:wght@300;400;500;600;700&display=swap"
+     rel="stylesheet"
+   />
+   <link rel="stylesheet" href="/static/styles.css" />
+ </head>
+ <body>
+   <div class="app-container">
+     <!-- Animated background -->
+     <div class="bg-gradient"></div>
+     <div class="bg-orbs">
+       <div class="orb orb-1"></div>
+       <div class="orb orb-2"></div>
+       <div class="orb orb-3"></div>
+     </div>
+
+     <!-- Header -->
+     <header class="header">
+       <div class="logo">
+         <svg
+           class="logo-icon"
+           viewBox="0 0 24 24"
+           fill="none"
+           stroke="currentColor"
+           stroke-width="2"
+         >
+           <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
+           <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+           <line x1="12" y1="19" x2="12" y2="23" />
+           <line x1="8" y1="23" x2="16" y2="23" />
+         </svg>
+         <h1>التعرف على رقم السيارة</h1>
+       </div>
+       <div class="connection-status" id="connectionStatus">
+         <span class="status-dot"></span>
+         <span class="status-text">غير متصل</span>
+       </div>
+     </header>
+
+     <!-- Main content -->
+     <main class="main-content">
+       <!-- Status card -->
+       <div class="status-card glass-card">
+         <div class="status-indicator" id="statusIndicator">
+           <div class="pulse-ring"></div>
+           <div class="status-icon">
+             <svg
+               viewBox="0 0 24 24"
+               fill="none"
+               stroke="currentColor"
+               stroke-width="2"
+             >
+               <path
+                 d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
+               />
+               <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+             </svg>
+           </div>
+         </div>
+         <p class="status-message" id="statusMessage">اضغط للبدء</p>
+         <div class="probability-bar" id="probabilityBar">
+           <div class="probability-fill" id="probabilityFill"></div>
+         </div>
+       </div>
+
+       <!-- Microphone button -->
+       <button class="mic-button" id="micButton" aria-label="Start recording">
+         <div class="mic-button-inner">
+           <svg
+             class="mic-icon"
+             viewBox="0 0 24 24"
+             fill="none"
+             stroke="currentColor"
+             stroke-width="2"
+           >
+             <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
+             <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+             <line x1="12" y1="19" x2="12" y2="23" />
+             <line x1="8" y1="23" x2="16" y2="23" />
+           </svg>
+           <svg
+             class="stop-icon hidden"
+             viewBox="0 0 24 24"
+             fill="currentColor"
+           >
+             <rect x="6" y="6" width="12" height="12" rx="2" />
+           </svg>
+         </div>
+         <div class="mic-ripple"></div>
+       </button>
+
+       <!-- Waveform visualization -->
+       <div class="waveform-container glass-card" id="waveformContainer">
+         <canvas id="waveformCanvas"></canvas>
+       </div>
+
+       <!-- Transcription result -->
+       <div class="transcription-card glass-card" id="transcriptionCard">
+         <h2>النتيجة</h2>
+         <div class="transcription-content" id="transcriptionContent">
+           <p class="placeholder-text">سيظهر النص هنا بعد انتهاء الكلام...</p>
+         </div>
+         <div class="transcription-history" id="transcriptionHistory">
+           <!-- Previous transcriptions will be added here -->
+         </div>
+       </div>
+     </main>
+
+     <!-- Footer -->
+     <footer class="footer">
+     </footer>
+   </div>
+
+   <script src="/static/app.js"></script>
+ </body>
+ </html>
static/styles.css ADDED
@@ -0,0 +1,561 @@
+ /* ============================================
+    Real-time VAD-ASR Pipeline - Styles
+    Premium dark theme with glassmorphism
+    ============================================ */
+
+ :root {
+   /* Color palette */
+   --bg-primary: #0a0a1a;
+   --bg-secondary: #12122a;
+   --bg-gradient-start: #0f0f23;
+   --bg-gradient-end: #1a1a3e;
+
+   --accent-primary: #6366f1;
+   --accent-secondary: #8b5cf6;
+   --accent-tertiary: #a855f7;
+   --accent-glow: rgba(99, 102, 241, 0.3);
+
+   --text-primary: #ffffff;
+   --text-secondary: #a1a1aa;
+   --text-muted: #71717a;
+
+   --success: #22c55e;
+   --warning: #f59e0b;
+   --error: #ef4444;
+
+   --glass-bg: rgba(255, 255, 255, 0.05);
+   --glass-border: rgba(255, 255, 255, 0.1);
+   --glass-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
+
+   /* Typography */
+   --font-primary: "Inter", "Noto Sans Arabic", sans-serif;
+   --font-arabic: "Noto Sans Arabic", "Inter", sans-serif;
+
+   /* Spacing */
+   --spacing-xs: 0.25rem;
+   --spacing-sm: 0.5rem;
+   --spacing-md: 1rem;
+   --spacing-lg: 1.5rem;
+   --spacing-xl: 2rem;
+   --spacing-2xl: 3rem;
+
+   /* Border radius */
+   --radius-sm: 0.5rem;
+   --radius-md: 1rem;
+   --radius-lg: 1.5rem;
+   --radius-full: 50%;
+
+   /* Transitions */
+   --transition-fast: 150ms ease;
+   --transition-normal: 300ms ease;
+   --transition-slow: 500ms ease;
+ }
+
+ /* Reset and base styles */
+ *,
+ *::before,
+ *::after {
+   box-sizing: border-box;
+   margin: 0;
+   padding: 0;
+ }
+
+ html {
+   font-size: 16px;
+   scroll-behavior: smooth;
+ }
+
+ body {
+   font-family: var(--font-arabic);
+   background: var(--bg-primary);
+   color: var(--text-primary);
+   min-height: 100vh;
+   overflow-x: hidden;
+   line-height: 1.6;
+ }
+
+ /* App container */
+ .app-container {
+   position: relative;
+   min-height: 100vh;
+   display: flex;
+   flex-direction: column;
+   padding: var(--spacing-md);
+   max-width: 800px;
+   margin: 0 auto;
+ }
+
+ /* Animated background */
+ .bg-gradient {
+   position: fixed;
+   inset: 0;
+   background: linear-gradient(
+     135deg,
+     var(--bg-gradient-start) 0%,
+     var(--bg-gradient-end) 100%
+   );
+   z-index: -2;
+ }
+
+ .bg-orbs {
+   position: fixed;
+   inset: 0;
+   z-index: -1;
+   overflow: hidden;
+   pointer-events: none;
+ }
+
+ .orb {
+   position: absolute;
+   border-radius: var(--radius-full);
+   filter: blur(80px);
+   opacity: 0.4;
+   animation: float 20s infinite ease-in-out;
+ }
+
+ .orb-1 {
+   width: 400px;
+   height: 400px;
+   background: var(--accent-primary);
+   top: -100px;
+   right: -100px;
+   animation-delay: 0s;
+ }
+
+ .orb-2 {
+   width: 300px;
+   height: 300px;
+   background: var(--accent-secondary);
+   bottom: 20%;
+   left: -100px;
+   animation-delay: -7s;
+ }
+
+ .orb-3 {
+   width: 250px;
+   height: 250px;
+   background: var(--accent-tertiary);
+   bottom: -50px;
+   right: 20%;
+   animation-delay: -14s;
+ }
+
+ @keyframes float {
+   0%,
+   100% {
+     transform: translate(0, 0) scale(1);
+   }
+   33% {
+     transform: translate(30px, -30px) scale(1.1);
+   }
+   66% {
+     transform: translate(-20px, 20px) scale(0.9);
+   }
+ }
+
+ /* Glass card effect */
+ .glass-card {
+   background: var(--glass-bg);
+   backdrop-filter: blur(20px);
+   -webkit-backdrop-filter: blur(20px);
+   border: 1px solid var(--glass-border);
+   border-radius: var(--radius-lg);
+   box-shadow: var(--glass-shadow);
+ }
+
+ /* Header */
+ .header {
+   display: flex;
+   justify-content: space-between;
+   align-items: center;
+   padding: var(--spacing-md) 0;
+   margin-bottom: var(--spacing-xl);
+ }
+
+ .logo {
+   display: flex;
+   align-items: center;
+   gap: var(--spacing-sm);
+ }
+
+ .logo-icon {
+   width: 32px;
+   height: 32px;
+   color: var(--accent-primary);
+ }
+
+ .logo h1 {
+   font-size: 1.25rem;
+   font-weight: 600;
+   background: linear-gradient(
+     135deg,
+     var(--accent-primary),
+     var(--accent-tertiary)
+   );
+   -webkit-background-clip: text;
+   -webkit-text-fill-color: transparent;
+   background-clip: text;
+ }
+
+ .connection-status {
+   display: flex;
+   align-items: center;
+   gap: var(--spacing-xs);
+   padding: var(--spacing-xs) var(--spacing-md);
+   border-radius: var(--radius-full);
+   background: var(--glass-bg);
+   border: 1px solid var(--glass-border);
+   font-size: 0.875rem;
+ }
+
+ .status-dot {
+   width: 8px;
+   height: 8px;
+   border-radius: var(--radius-full);
+   background: var(--error);
+   transition: var(--transition-normal);
+ }
+
+ .connection-status.connected .status-dot {
+   background: var(--success);
+   box-shadow: 0 0 10px var(--success);
+ }
+
+ .status-text {
+   color: var(--text-secondary);
+ }
+
+ /* Main content */
+ .main-content {
+   flex: 1;
+   display: flex;
+   flex-direction: column;
+   align-items: center;
+   gap: var(--spacing-xl);
+ }
+
+ /* Status card */
+ .status-card {
+   width: 100%;
+   padding: var(--spacing-xl);
+   text-align: center;
+ }
+
+ .status-indicator {
+   position: relative;
+   width: 80px;
+   height: 80px;
+   margin: 0 auto var(--spacing-lg);
+   display: flex;
+   align-items: center;
+   justify-content: center;
+ }
+
+ .pulse-ring {
+   position: absolute;
+   inset: 0;
+   border-radius: var(--radius-full);
+   border: 2px solid var(--accent-primary);
+   opacity: 0;
+   transform: scale(0.8);
+   transition: var(--transition-normal);
+ }
+
+ .status-indicator.speaking .pulse-ring {
+   animation: pulse 1.5s infinite;
+ }
+
+ @keyframes pulse {
+   0% {
+     transform: scale(0.8);
+     opacity: 1;
+   }
+   100% {
+     transform: scale(1.5);
+     opacity: 0;
+   }
+ }
+
+ .status-icon {
+   width: 48px;
+   height: 48px;
+   color: var(--text-secondary);
+   transition: var(--transition-normal);
+ }
+
+ .status-indicator.speaking .status-icon {
+   color: var(--success);
+   filter: drop-shadow(0 0 10px var(--success));
+ }
+
+ .status-indicator.silence .status-icon {
+   color: var(--warning);
+   filter: drop-shadow(0 0 10px var(--warning));
+ }
+
+ .status-indicator.transcribing .status-icon {
+   color: var(--accent-primary);
+   filter: drop-shadow(0 0 10px var(--accent-primary));
+   animation: spin 1s linear infinite;
+ }
+
+ @keyframes spin {
+   from {
+     transform: rotate(0deg);
+   }
+   to {
+     transform: rotate(360deg);
+   }
+ }
+
+ .status-message {
+   font-size: 1.125rem;
+   color: var(--text-secondary);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .probability-bar {
+   width: 100%;
+   max-width: 300px;
+   height: 4px;
+   background: var(--glass-bg);
+   border-radius: var(--radius-full);
+   margin: 0 auto;
+   overflow: hidden;
+ }
+
+ .probability-fill {
+   height: 100%;
+   width: 0%;
+   background: linear-gradient(
+     90deg,
+     var(--accent-primary),
+     var(--accent-tertiary)
+   );
+   border-radius: var(--radius-full);
+   transition: width var(--transition-fast);
+ }
+
+ /* Microphone button */
+ .mic-button {
+   position: relative;
+   width: 100px;
+   height: 100px;
+   border: none;
+   border-radius: var(--radius-full);
+   background: linear-gradient(
+     135deg,
+     var(--accent-primary),
+     var(--accent-secondary)
+   );
+   cursor: pointer;
+   transition: var(--transition-normal);
+   box-shadow: 0 4px 30px var(--accent-glow);
+ }
+
+ .mic-button:hover {
+   transform: scale(1.05);
+   box-shadow: 0 8px 40px var(--accent-glow);
+ }
+
+ .mic-button:active {
+   transform: scale(0.98);
+ }
+
+ .mic-button-inner {
+   position: relative;
+   width: 100%;
+   height: 100%;
+   display: flex;
+   align-items: center;
+   justify-content: center;
+ }
+
+ .mic-icon,
+ .stop-icon {
+   width: 40px;
+   height: 40px;
+   color: white;
+   transition: var(--transition-normal);
+ }
+
+ .hidden {
+   display: none !important;
+ }
+
+ .mic-button.recording {
+   background: linear-gradient(135deg, var(--error), #dc2626);
+   animation: glow 1.5s infinite;
+ }
+
+ @keyframes glow {
+   0%,
+   100% {
+     box-shadow: 0 4px 30px rgba(239, 68, 68, 0.4);
+   }
+   50% {
+     box-shadow: 0 8px 50px rgba(239, 68, 68, 0.6);
+   }
+ }
+
+ .mic-ripple {
+   position: absolute;
+   inset: 0;
+   border-radius: var(--radius-full);
+   border: 2px solid currentColor;
+   color: var(--accent-primary);
+   opacity: 0;
+   transform: scale(1);
+   pointer-events: none;
+ }
+
+ .mic-button.recording .mic-ripple {
+   animation: ripple 1.5s infinite;
+   color: var(--error);
+ }
+
+ @keyframes ripple {
+   0% {
+     transform: scale(1);
+     opacity: 0.5;
+   }
+   100% {
+     transform: scale(1.8);
+     opacity: 0;
+   }
+ }
+
+ /* Waveform container */
+ .waveform-container {
+   width: 100%;
+   padding: var(--spacing-md);
+   overflow: hidden;
+ }
+
+ #waveformCanvas {
+   width: 100%;
+   height: 80px;
+   display: block;
+ }
+
+ /* Transcription card */
+ .transcription-card {
+   width: 100%;
+   padding: var(--spacing-xl);
+ }
+
+ .transcription-card h2 {
+   font-size: 1rem;
+   font-weight: 500;
+   color: var(--text-secondary);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .transcription-content {
+   min-height: 60px;
+   padding: var(--spacing-lg);
+   background: rgba(0, 0, 0, 0.2);
+   border-radius: var(--radius-md);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .transcription-content p {
+   font-size: 1.5rem;
+   font-weight: 600;
+   text-align: center;
+   margin: 0;
+ }
+
+ .placeholder-text {
+   color: var(--text-muted) !important;
+   font-size: 1rem !important;
+   font-weight: 400 !important;
+ }
+
+ .transcription-history {
+   display: flex;
+   flex-direction: column;
+   gap: var(--spacing-sm);
+   max-height: 200px;
+   overflow-y: auto;
+ }
+
+ .history-item {
+   padding: var(--spacing-sm) var(--spacing-md);
+   background: rgba(0, 0, 0, 0.1);
+   border-radius: var(--radius-sm);
+   font-size: 0.875rem;
+   color: var(--text-secondary);
+   border-right: 3px solid var(--accent-primary);
+ }
+
+ /* Footer */
+ .footer {
+   text-align: center;
+   padding: var(--spacing-lg) 0;
+   color: var(--text-muted);
+   font-size: 0.875rem;
+ }
+
+ /* Scrollbar styling */
+ ::-webkit-scrollbar {
+   width: 6px;
+ }
+
+ ::-webkit-scrollbar-track {
+   background: var(--glass-bg);
+   border-radius: var(--radius-full);
+ }
+
+ ::-webkit-scrollbar-thumb {
+   background: var(--accent-primary);
+   border-radius: var(--radius-full);
+ }
+
+ ::-webkit-scrollbar-thumb:hover {
+   background: var(--accent-secondary);
+ }
+
+ /* Responsive adjustments */
+ @media (max-width: 640px) {
+   .header {
+     flex-direction: column;
+     gap: var(--spacing-md);
+     text-align: center;
+   }
+
+   .mic-button {
+     width: 80px;
+     height: 80px;
+   }
+
+   .mic-icon,
+   .stop-icon {
+     width: 32px;
+     height: 32px;
+   }
+
+   .transcription-content p {
+     font-size: 1.25rem;
+   }
+ }
+
+ /* Animations for new transcriptions */
+ @keyframes slideIn {
+   from {
+     opacity: 0;
+     transform: translateY(-10px);
+   }
+   to {
+     opacity: 1;
+     transform: translateY(0);
+   }
+ }
+
+ .transcription-content.new {
+   animation: slideIn 0.3s ease;
+ }
+
+ .history-item.new {
+   animation: slideIn 0.3s ease;
+ }
vad/silero_vad.jit ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1122837f4154c511485fe0b9c64455f7b929c96fbb8d79fbdb336383ebd3720
+ size 2272526