sherif31 committed
Commit f1fc6e5 · 1 Parent(s): de55e06

Deploy to HF Spaces with Docker
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     git-lfs \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user (required by HF Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR /home/user/app
+
+ # Copy requirements first for caching
+ COPY --chown=user:user requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application
+ COPY --chown=user:user . .
+
+ # Expose the port (HF Spaces expects port 7860)
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
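A quick way to smoke-test this image locally is to build it, run it with port 7860 published, and poll the `/health` endpoint that app.py defines below. This is a minimal sketch; the `car-plate-asr` tag is an arbitrary choice, not part of the commit:

```python
# Assumes the container was started first, e.g.:
#   docker build -t car-plate-asr .
#   docker run -p 7860:7860 car-plate-asr
import json
import urllib.request

# /health (defined in app.py) reports service status and inference device
with urllib.request.urlopen("http://localhost:7860/health") as resp:
    print(json.load(resp))  # e.g. {"status": "healthy", "device": "cpu"}
```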
README.md CHANGED
@@ -1,12 +1,20 @@
  ---
  title: Car Plate ASR
- emoji: 🌖
+ emoji: 🚗
  colorFrom: blue
  colorTo: indigo
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
+ sdk: docker
  pinned: false
  ---

+ # Car Plate ASR
+
+ Real-time Arabic car plate speech recognition using Whisper and Silero VAD.
+
+ ## Features
+
+ - 🎤 Real-time voice activity detection (VAD)
+ - 🗣️ Arabic speech recognition for car plate numbers
+ - ⚡ WebSocket-based streaming for low latency
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,106 +1,256 @@
- import gradio as gr
  import torch
  import torchaudio
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
  import numpy as np
+ import asyncio
+ import time
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.responses import FileResponse
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ from pathlib import Path
+ import struct

- # Load model and processor globally
- MODEL_PATH = "arabic_car_plate"
- print("Loading model and processor...")
+ # Set torch threads
+ torch.set_num_threads(1)

- # Use GPU if available
+ # ============== LOAD MODELS ==============
+ print("Loading models...")
+
+ # Device setup
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")

- # Load processor and model with optimizations
+ # Load Whisper ASR model
+ MODEL_PATH = "arabic_car_plate"
  processor = WhisperProcessor.from_pretrained(
-     "openai/whisper-base",
-     language="arabic",
+     "openai/whisper-base",
+     language="arabic",
      task="transcribe"
  )

- model = WhisperForConditionalGeneration.from_pretrained(
-     MODEL_PATH,
- ).to(device)
-
+ model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
  model.generation_config.suppress_tokens = []
  model.generation_config.begin_suppress_tokens = [220, 50257]
-
- # Set model to evaluation mode
  model.eval()

- print("Model loaded successfully!")
+ # Load Silero VAD model
+ def load_vad(model_path="vad/silero_vad.jit"):
+     vad = torch.jit.load(model_path, map_location="cpu")
+     vad.eval()
+     return vad
+
+ vad_model = load_vad()
+
+ print("Models loaded successfully!")
+
+ # ============== FASTAPI APP ==============
+ app = FastAPI(title="Real-time VAD-ASR Pipeline")
+
+ # Mount static files
+ static_path = Path(__file__).parent / "static"
+ static_path.mkdir(exist_ok=True)
+ app.mount("/static", StaticFiles(directory=str(static_path)), name="static")

- def transcribe_audio(audio):
-     if audio is None:
-         return "No audio recorded. Please record audio first."
+ @app.get("/")
+ async def root():
+     """Serve the main HTML page"""
+     return FileResponse(static_path / "index.html")
+
+
+ # ============== AUDIO PROCESSOR ==============
+ class AudioProcessor:
+     """
+     Real-time audio processor with VAD and ASR.
+     - Detects speech using Silero VAD
+     - Accumulates audio while speaking
+     - After 0.5 s of silence (SILENCE_THRESHOLD), triggers ASR inference
+     """
+
+     SAMPLE_RATE = 16000
+     VAD_CHUNK_SIZE = 512  # Silero VAD requires exactly 512 samples at 16 kHz
+     SILENCE_THRESHOLD = 0.5  # seconds
+     VAD_THRESHOLD = 0.7  # Speech probability threshold

-     sample_rate, audio_data = audio
-
-     if len(audio_data.shape) > 1:
-         audio_data = audio_data.mean(axis=1)
-
-     audio_data = audio_data.astype(np.float32)
-     audio_data /= np.max(np.abs(audio_data))
-
-     # Resample to 16 kHz if needed
-     if sample_rate != 16000:
-         audio_tensor = torch.from_numpy(audio_data).float()
-         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-         audio_data = resampler(audio_tensor).numpy()
-
-     # Preprocess
-     inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt")
-     input_features = inputs.input_features.to(device)
-     attention_mask = inputs.get("attention_mask", None)
-     if attention_mask is not None:
-         attention_mask = attention_mask.to(device)
-
-     # Generate transcription
-     with torch.no_grad():
-         predicted_ids = model.generate(
-             input_features,
-             attention_mask=attention_mask,
-             language="arabic",
-             task="transcribe"
+     def __init__(self):
+         self.reset()
+
+     def reset(self):
+         """Reset the processor state"""
+         self.audio_buffer = []  # For ASR (full audio)
+         self.pending_samples = np.array([], dtype=np.float32)  # Buffer for incomplete VAD chunks
+         self.is_speaking = False
+         self.silence_start = None
+         self.speech_detected = False
+         self.last_result = {"status": "listening", "probability": 0.0}
+         # Reset VAD model state
+         vad_model.reset_states()
+
+     def process_chunk(self, audio_chunk: np.ndarray) -> dict:
+         """
+         Process an audio chunk and return the current state.
+         Splits incoming audio into 512-sample chunks for VAD.
+
+         Returns:
+             dict with keys:
+             - 'status': 'speaking', 'silence', 'listening', or 'transcription'
+             - 'transcription': str (only if status is 'transcription')
+         """
+         # Add incoming audio to pending buffer
+         self.pending_samples = np.concatenate([self.pending_samples, audio_chunk])
+
+         # Also add to audio buffer for later ASR
+         self.audio_buffer.append(audio_chunk)
+
+         # Process all complete 512-sample chunks
+         result = self.last_result
+
+         while len(self.pending_samples) >= self.VAD_CHUNK_SIZE:
+             # Extract exactly 512 samples
+             vad_chunk = self.pending_samples[:self.VAD_CHUNK_SIZE]
+             self.pending_samples = self.pending_samples[self.VAD_CHUNK_SIZE:]
+
+             # Process this chunk through VAD
+             result = self._process_vad_chunk(vad_chunk)
+
+             # If we got a transcription, return immediately
+             if result["status"] == "transcription":
+                 return result
+
+         self.last_result = result
+         return result
+
+     def _process_vad_chunk(self, audio_chunk: np.ndarray) -> dict:
+         """Process a single 512-sample chunk through VAD"""
+         # Convert to tensor
+         audio_tensor = torch.from_numpy(audio_chunk).float()
+
+         # Normalize audio
+         if audio_tensor.abs().max() > 0:
+             audio_tensor = audio_tensor / audio_tensor.abs().max()
+
+         # Run VAD on the chunk
+         speech_prob = vad_model(audio_tensor, self.SAMPLE_RATE).item()
+
+         current_time = time.time()
+
+         if speech_prob >= self.VAD_THRESHOLD:
+             # Speech detected
+             self.is_speaking = True
+             self.speech_detected = True
+             self.silence_start = None
+             return {"status": "speaking", "probability": speech_prob}
+         else:
+             # Silence detected
+             if self.is_speaking:
+                 # Just stopped speaking
+                 if self.silence_start is None:
+                     self.silence_start = current_time
+                     return {"status": "silence", "probability": speech_prob}
+
+                 # Check if silence duration exceeded threshold
+                 silence_duration = current_time - self.silence_start
+
+                 if silence_duration >= self.SILENCE_THRESHOLD:
+                     # Trigger ASR inference
+                     if self.speech_detected and len(self.audio_buffer) > 0:
+                         transcription = self._transcribe()
+                         self.reset()
+                         result = {
+                             "status": "transcription",
+                             "transcription": transcription,
+                             "probability": speech_prob
+                         }
+                         print(f"Sending transcription to client: {result}")
+                         return result
+                 else:
+                     # Still accumulating silence
+                     remaining = self.SILENCE_THRESHOLD - silence_duration
+                     return {
+                         "status": "silence",
+                         "probability": speech_prob,
+                         "remaining": round(remaining, 2)
+                     }
+
+             return {"status": "listening", "probability": speech_prob}
+
+     def _transcribe(self) -> str:
+         """Run ASR on accumulated audio"""
+         if not self.audio_buffer:
+             return ""
+
+         # Concatenate all audio chunks
+         audio_data = np.concatenate(self.audio_buffer)
+
+         # Normalize
+         if np.max(np.abs(audio_data)) > 0:
+             audio_data = audio_data / np.max(np.abs(audio_data))
+
+         # Preprocess for Whisper
+         inputs = processor(
+             audio_data,
+             sampling_rate=self.SAMPLE_RATE,
+             return_tensors="pt"
          )
+         input_features = inputs.input_features.to(device)
+
+         # Generate transcription
+         with torch.no_grad():
+             predicted_ids = model.generate(
+                 input_features,
+                 language="arabic",
+                 task="transcribe"
+             )

-     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-     return transcription
+         transcription = processor.batch_decode(
+             predicted_ids,
+             skip_special_tokens=True
+         )[0]
+
+         return transcription

- # Create Gradio interface
- with gr.Blocks(title="تفريع نصي للوح السياره") as demo:
+
+ # ============== WEBSOCKET ENDPOINT ==============
+ @app.websocket("/ws/audio")
+ async def websocket_audio(websocket: WebSocket):
+     """
+     WebSocket endpoint for real-time audio processing.

-     with gr.Row(equal_height=True):
-         with gr.Column(scale=1):
-             audio_input = gr.Audio(
-                 sources=["microphone"],
-                 type="numpy",
-                 label="🎤 Record Audio",
-                 show_download_button=True,
-             )
+     Expects binary audio data:
+     - Format: 16-bit PCM, mono, 16 kHz
+     - Chunks of any length; the server re-buffers them into 512-sample (1024-byte) VAD frames
+     """
+     await websocket.accept()
+     processor_instance = AudioProcessor()
+
+     try:
+         while True:
+             # Receive binary audio data
+             data = await websocket.receive_bytes()
+
+             # Convert bytes to numpy array (16-bit PCM)
+             audio_chunk = np.frombuffer(data, dtype=np.int16).astype(np.float32)
+             audio_chunk = audio_chunk / 32768.0  # Normalize to [-1, 1]

-         with gr.Column(scale=1):
-             transcribe_btn = gr.Button(
-                 "تم",
-                 variant="primary",
-                 size="lg"
-             )
-             output_text = gr.Textbox(
-                 label="📝لوحه السياره",
-                 placeholder="تفريغ هيظهر هنا",
-                 lines=2,
-                 max_lines=2,
-                 scale=1,
-                 interactive=False
-             )
+             # Process the chunk
+             result = processor_instance.process_chunk(audio_chunk)
+
+             # Send result back
+             await websocket.send_json(result)
+
+     except WebSocketDisconnect:
+         print("Client disconnected")
+     except Exception as e:
+         print(f"Error: {e}")
+         await websocket.close()

-     transcribe_btn.click(
-         fn=transcribe_audio,
-         inputs=audio_input,
-         outputs=output_text
-     )
+ # ============== HEALTH CHECK ==============
+ @app.get("/health")
+ async def health():
+     return {"status": "healthy", "device": device}
+

- # Launch the app
  if __name__ == "__main__":
-     demo.launch()
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
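To illustrate the wire protocol of `/ws/audio`, here is a minimal sketch of a test client. The WAV path and localhost URL are assumptions, not part of the commit; `websockets` is already pinned in requirements.txt. It streams raw 16-bit PCM chunks, just as app.js does, and prints the transcriptions the server pushes back:

```python
# Hypothetical test client; assumes a 16 kHz mono 16-bit WAV at this path
# and the server running locally on port 7860.
import asyncio
import json
import wave

import websockets


async def stream_wav(path="sample_16k_mono.wav", chunk_samples=4096):
    async with websockets.connect("ws://localhost:7860/ws/audio") as ws:
        with wave.open(path, "rb") as wav:
            # The endpoint expects 16-bit PCM, mono, 16 kHz
            assert wav.getframerate() == 16000
            assert wav.getnchannels() == 1 and wav.getsampwidth() == 2
            while frames := wav.readframes(chunk_samples):
                await ws.send(frames)  # raw int16 bytes, like app.js sends
                reply = json.loads(await ws.recv())
                if reply["status"] == "transcription":
                    print("Transcription:", reply["transcription"])


asyncio.run(stream_wav())
```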
arabic_car_plate/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b672757645b5a8dee35933502414666eebd835eedd811456949a976b5f8f250
+ size 290403936
requirements.txt CHANGED
@@ -1,4 +1,8 @@
  torch
  torchaudio
  transformers
- numpy
+ numpy
+ fastapi
+ uvicorn[standard]
+ websockets
+ python-multipart
static/app.js ADDED
@@ -0,0 +1,324 @@
+ /**
+  * Real-time VAD-ASR Pipeline - Frontend Application
+  * Handles microphone capture, WebSocket communication, and UI updates
+  */
+
+ class AudioRecorder {
+   constructor() {
+     // Audio settings
+     this.sampleRate = 16000;
+     this.chunkSize = 512; // Samples per chunk
+     this.bufferSize = 4096;
+
+     // State
+     this.isRecording = false;
+     this.audioContext = null;
+     this.mediaStream = null;
+     this.processor = null;
+     this.websocket = null;
+
+     // UI elements
+     this.micButton = document.getElementById('micButton');
+     this.micIcon = document.querySelector('.mic-icon');
+     this.stopIcon = document.querySelector('.stop-icon');
+     this.statusIndicator = document.getElementById('statusIndicator');
+     this.statusMessage = document.getElementById('statusMessage');
+     this.probabilityFill = document.getElementById('probabilityFill');
+     this.connectionStatus = document.getElementById('connectionStatus');
+     this.transcriptionContent = document.getElementById('transcriptionContent');
+     this.transcriptionHistory = document.getElementById('transcriptionHistory');
+     this.waveformCanvas = document.getElementById('waveformCanvas');
+     this.waveformCtx = this.waveformCanvas.getContext('2d');
+
+     // Audio buffer for visualization
+     this.audioDataBuffer = new Float32Array(128);
+
+     // Bind events
+     this.micButton.addEventListener('click', () => this.toggleRecording());
+
+     // Initialize canvas
+     this.initCanvas();
+     window.addEventListener('resize', () => this.initCanvas());
+   }
+
+   initCanvas() {
+     const container = this.waveformCanvas.parentElement;
+     this.waveformCanvas.width = container.clientWidth - 32;
+     this.waveformCanvas.height = 80;
+     this.drawIdleWaveform();
+   }
+
+   drawIdleWaveform() {
+     const { width, height } = this.waveformCanvas;
+     this.waveformCtx.fillStyle = 'rgba(99, 102, 241, 0.1)';
+     this.waveformCtx.fillRect(0, 0, width, height);
+
+     this.waveformCtx.strokeStyle = 'rgba(99, 102, 241, 0.3)';
+     this.waveformCtx.lineWidth = 2;
+     this.waveformCtx.beginPath();
+     this.waveformCtx.moveTo(0, height / 2);
+     this.waveformCtx.lineTo(width, height / 2);
+     this.waveformCtx.stroke();
+   }
+
+   drawWaveform(audioData) {
+     const { width, height } = this.waveformCanvas;
+     const ctx = this.waveformCtx;
+
+     // Clear canvas
+     ctx.fillStyle = 'rgba(10, 10, 26, 0.3)';
+     ctx.fillRect(0, 0, width, height);
+
+     // Draw waveform
+     const gradient = ctx.createLinearGradient(0, 0, width, 0);
+     gradient.addColorStop(0, '#6366f1');
+     gradient.addColorStop(0.5, '#8b5cf6');
+     gradient.addColorStop(1, '#a855f7');
+
+     ctx.strokeStyle = gradient;
+     ctx.lineWidth = 2;
+     ctx.beginPath();
+
+     const sliceWidth = width / audioData.length;
+     let x = 0;
+
+     for (let i = 0; i < audioData.length; i++) {
+       const v = audioData[i] * 0.5 + 0.5;
+       const y = v * height;
+
+       if (i === 0) {
+         ctx.moveTo(x, y);
+       } else {
+         ctx.lineTo(x, y);
+       }
+
+       x += sliceWidth;
+     }
+
+     ctx.stroke();
+
+     // Add glow effect
+     ctx.shadowColor = '#6366f1';
+     ctx.shadowBlur = 10;
+     ctx.stroke();
+     ctx.shadowBlur = 0;
+   }
+
+   async toggleRecording() {
+     if (this.isRecording) {
+       this.stopRecording();
+     } else {
+       await this.startRecording();
+     }
+   }
+
+   async startRecording() {
+     try {
+       // Request microphone access
+       this.mediaStream = await navigator.mediaDevices.getUserMedia({
+         audio: {
+           channelCount: 1,
+           sampleRate: this.sampleRate,
+           echoCancellation: true,
+           noiseSuppression: true
+         }
+       });
+
+       // Create audio context
+       this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
+         sampleRate: this.sampleRate
+       });
+
+       // Connect WebSocket
+       await this.connectWebSocket();
+
+       // Create audio processing pipeline
+       const source = this.audioContext.createMediaStreamSource(this.mediaStream);
+
+       // Use ScriptProcessorNode for audio processing
+       this.processor = this.audioContext.createScriptProcessor(this.bufferSize, 1, 1);
+
+       this.processor.onaudioprocess = (e) => {
+         if (!this.isRecording) return;
+
+         const inputData = e.inputBuffer.getChannelData(0);
+
+         // Update visualization buffer
+         this.audioDataBuffer = new Float32Array(inputData.slice(0, 128));
+         this.drawWaveform(this.audioDataBuffer);
+
+         // Send audio chunks to server
+         this.sendAudioChunk(inputData);
+       };
+
+       source.connect(this.processor);
+       this.processor.connect(this.audioContext.destination);
+
+       // Update UI
+       this.isRecording = true;
+       this.updateUI('recording');
+
+     } catch (error) {
+       console.error('Error starting recording:', error);
+       this.updateStatus('listening', 'خطأ في الوصول للميكروفون');
+     }
+   }
+
+   stopRecording() {
+     this.isRecording = false;
+
+     // Stop audio processing
+     if (this.processor) {
+       this.processor.disconnect();
+       this.processor = null;
+     }
+
+     if (this.audioContext) {
+       this.audioContext.close();
+       this.audioContext = null;
+     }
+
+     if (this.mediaStream) {
+       this.mediaStream.getTracks().forEach(track => track.stop());
+       this.mediaStream = null;
+     }
+
+     // Close WebSocket
+     if (this.websocket) {
+       this.websocket.close();
+       this.websocket = null;
+     }
+
+     // Update UI
+     this.updateUI('stopped');
+     this.drawIdleWaveform();
+   }
+
+   async connectWebSocket() {
+     return new Promise((resolve, reject) => {
+       const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+       const wsUrl = `${protocol}//${window.location.host}/ws/audio`;
+
+       this.websocket = new WebSocket(wsUrl);
+
+       this.websocket.onopen = () => {
+         console.log('WebSocket connected');
+         this.connectionStatus.classList.add('connected');
+         this.connectionStatus.querySelector('.status-text').textContent = 'متصل';
+         resolve();
+       };
+
+       this.websocket.onclose = () => {
+         console.log('WebSocket disconnected');
+         this.connectionStatus.classList.remove('connected');
+         this.connectionStatus.querySelector('.status-text').textContent = 'غير متصل';
+       };
+
+       this.websocket.onerror = (error) => {
+         console.error('WebSocket error:', error);
+         reject(error);
+       };
+
+       this.websocket.onmessage = (event) => {
+         const data = JSON.parse(event.data);
+         this.handleServerMessage(data);
+       };
+     });
+   }
+
+   sendAudioChunk(audioData) {
+     if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
+       return;
+     }
+
+     // Convert Float32 to Int16 for transmission
+     const int16Data = new Int16Array(audioData.length);
+     for (let i = 0; i < audioData.length; i++) {
+       const s = Math.max(-1, Math.min(1, audioData[i]));
+       int16Data[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+     }
+
+     // Send as binary
+     this.websocket.send(int16Data.buffer);
+   }
+
+   handleServerMessage(data) {
+     const { status, probability, transcription, remaining } = data;
+
+     // Update probability bar
+     if (probability !== undefined) {
+       this.probabilityFill.style.width = `${probability * 100}%`;
+     }
+
+     // Update status
+     switch (status) {
+       case 'speaking':
+         this.updateStatus('speaking', 'جاري التحدث...');
+         break;
+       case 'silence':
+         const remainingText = remaining ? ` (${remaining}s)` : '';
+         this.updateStatus('silence', `صمت${remainingText}`);
+         break;
+       case 'listening':
+         this.updateStatus('listening', 'في انتظار الكلام...');
+         break;
+       case 'transcription':
+         this.updateStatus('listening', 'تم التعرف على الكلام');
+         this.showTranscription(transcription);
+         break;
+     }
+   }
+
+   updateStatus(state, message) {
+     // Update status indicator class
+     this.statusIndicator.className = 'status-indicator';
+     if (state === 'speaking' || state === 'silence') {
+       this.statusIndicator.classList.add(state);
+     }
+
+     // Update message
+     this.statusMessage.textContent = message;
+   }
+
+   showTranscription(text) {
+     if (!text || text.trim() === '') return;
+
+     // Move current transcription to history
+     const currentText = this.transcriptionContent.querySelector('p:not(.placeholder-text)');
+     if (currentText && currentText.textContent.trim()) {
+       const historyItem = document.createElement('div');
+       historyItem.className = 'history-item new';
+       historyItem.textContent = currentText.textContent;
+       this.transcriptionHistory.insertBefore(historyItem, this.transcriptionHistory.firstChild);
+
+       // Limit history to 10 items
+       while (this.transcriptionHistory.children.length > 10) {
+         this.transcriptionHistory.removeChild(this.transcriptionHistory.lastChild);
+       }
+     }
+
+     // Show new transcription
+     this.transcriptionContent.innerHTML = `<p class="new">${text}</p>`;
+   }
+
+   updateUI(state) {
+     if (state === 'recording') {
+       this.micButton.classList.add('recording');
+       this.micIcon.classList.add('hidden');
+       this.stopIcon.classList.remove('hidden');
+       this.statusMessage.textContent = 'في انتظار الكلام...';
+     } else {
+       this.micButton.classList.remove('recording');
+       this.micIcon.classList.remove('hidden');
+       this.stopIcon.classList.add('hidden');
+       this.statusMessage.textContent = 'اضغط للبدء';
+       this.statusIndicator.className = 'status-indicator';
+       this.probabilityFill.style.width = '0%';
+     }
+   }
+ }
+
+ // Initialize on page load
+ document.addEventListener('DOMContentLoaded', () => {
+   new AudioRecorder();
+ });
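The `sendAudioChunk` encoding above is the exact inverse of the server-side decode in app.py: negative samples are scaled by 0x8000, positive ones by 0x7FFF, and the server divides by 32768.0. A small illustrative check of that round trip (not part of the app):

```python
# Verifies the app.js float -> int16 encoding against the app.py decode.
import numpy as np

x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
encoded = np.where(x < 0, x * 0x8000, x * 0x7FFF).astype(np.int16)
decoded = encoded.astype(np.float32) / 32768.0
print(encoded)  # [-32768 -16384      0  16383  32767]
print(decoded)  # ~x; +1.0 maps to 32767/32768 ~= 0.99997, so int16 never overflows
```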
static/index.html ADDED
@@ -0,0 +1,128 @@
+ <!DOCTYPE html>
+ <html lang="ar" dir="rtl">
+ <head>
+   <meta charset="UTF-8" />
+   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+   <title>الصقر</title>
+   <meta
+     name="description"
+     content="Real-time Arabic speech recognition for car plate numbers using VAD and Whisper ASR"
+   />
+   <link rel="preconnect" href="https://fonts.googleapis.com" />
+   <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+   <link
+     href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Noto+Sans+Arabic:wght@300;400;500;600;700&display=swap"
+     rel="stylesheet"
+   />
+   <link rel="stylesheet" href="/static/styles.css" />
+ </head>
+ <body>
+   <div class="app-container">
+     <!-- Animated background -->
+     <div class="bg-gradient"></div>
+     <div class="bg-orbs">
+       <div class="orb orb-1"></div>
+       <div class="orb orb-2"></div>
+       <div class="orb orb-3"></div>
+     </div>
+
+     <!-- Header -->
+     <header class="header">
+       <div class="logo">
+         <svg
+           class="logo-icon"
+           viewBox="0 0 24 24"
+           fill="none"
+           stroke="currentColor"
+           stroke-width="2"
+         >
+           <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
+           <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+           <line x1="12" y1="19" x2="12" y2="23" />
+           <line x1="8" y1="23" x2="16" y2="23" />
+         </svg>
+         <h1>التعرف على رقم السيارة</h1>
+       </div>
+       <div class="connection-status" id="connectionStatus">
+         <span class="status-dot"></span>
+         <span class="status-text">غير متصل</span>
+       </div>
+     </header>
+
+     <!-- Main content -->
+     <main class="main-content">
+       <!-- Status card -->
+       <div class="status-card glass-card">
+         <div class="status-indicator" id="statusIndicator">
+           <div class="pulse-ring"></div>
+           <div class="status-icon">
+             <svg
+               viewBox="0 0 24 24"
+               fill="none"
+               stroke="currentColor"
+               stroke-width="2"
+             >
+               <path
+                 d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"
+               />
+               <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+             </svg>
+           </div>
+         </div>
+         <p class="status-message" id="statusMessage">اضغط للبدء</p>
+         <div class="probability-bar" id="probabilityBar">
+           <div class="probability-fill" id="probabilityFill"></div>
+         </div>
+       </div>
+
+       <!-- Microphone button -->
+       <button class="mic-button" id="micButton" aria-label="Start recording">
+         <div class="mic-button-inner">
+           <svg
+             class="mic-icon"
+             viewBox="0 0 24 24"
+             fill="none"
+             stroke="currentColor"
+             stroke-width="2"
+           >
+             <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
+             <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
+             <line x1="12" y1="19" x2="12" y2="23" />
+             <line x1="8" y1="23" x2="16" y2="23" />
+           </svg>
+           <svg
+             class="stop-icon hidden"
+             viewBox="0 0 24 24"
+             fill="currentColor"
+           >
+             <rect x="6" y="6" width="12" height="12" rx="2" />
+           </svg>
+         </div>
+         <div class="mic-ripple"></div>
+       </button>
+
+       <!-- Waveform visualization -->
+       <div class="waveform-container glass-card" id="waveformContainer">
+         <canvas id="waveformCanvas"></canvas>
+       </div>
+
+       <!-- Transcription result -->
+       <div class="transcription-card glass-card" id="transcriptionCard">
+         <h2>النتيجة</h2>
+         <div class="transcription-content" id="transcriptionContent">
+           <p class="placeholder-text">سيظهر النص هنا بعد انتهاء الكلام...</p>
+         </div>
+         <div class="transcription-history" id="transcriptionHistory">
+           <!-- Previous transcriptions will be added here -->
+         </div>
+       </div>
+     </main>
+
+     <!-- Footer -->
+     <footer class="footer">
+     </footer>
+   </div>
+
+   <script src="/static/app.js"></script>
+ </body>
+ </html>
static/styles.css ADDED
@@ -0,0 +1,561 @@
+ /* ============================================
+    Real-time VAD-ASR Pipeline - Styles
+    Premium dark theme with glassmorphism
+    ============================================ */
+
+ :root {
+   /* Color palette */
+   --bg-primary: #0a0a1a;
+   --bg-secondary: #12122a;
+   --bg-gradient-start: #0f0f23;
+   --bg-gradient-end: #1a1a3e;
+
+   --accent-primary: #6366f1;
+   --accent-secondary: #8b5cf6;
+   --accent-tertiary: #a855f7;
+   --accent-glow: rgba(99, 102, 241, 0.3);
+
+   --text-primary: #ffffff;
+   --text-secondary: #a1a1aa;
+   --text-muted: #71717a;
+
+   --success: #22c55e;
+   --warning: #f59e0b;
+   --error: #ef4444;
+
+   --glass-bg: rgba(255, 255, 255, 0.05);
+   --glass-border: rgba(255, 255, 255, 0.1);
+   --glass-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
+
+   /* Typography */
+   --font-primary: "Inter", "Noto Sans Arabic", sans-serif;
+   --font-arabic: "Noto Sans Arabic", "Inter", sans-serif;
+
+   /* Spacing */
+   --spacing-xs: 0.25rem;
+   --spacing-sm: 0.5rem;
+   --spacing-md: 1rem;
+   --spacing-lg: 1.5rem;
+   --spacing-xl: 2rem;
+   --spacing-2xl: 3rem;
+
+   /* Border radius */
+   --radius-sm: 0.5rem;
+   --radius-md: 1rem;
+   --radius-lg: 1.5rem;
+   --radius-full: 50%;
+
+   /* Transitions */
+   --transition-fast: 150ms ease;
+   --transition-normal: 300ms ease;
+   --transition-slow: 500ms ease;
+ }
+
+ /* Reset and base styles */
+ *,
+ *::before,
+ *::after {
+   box-sizing: border-box;
+   margin: 0;
+   padding: 0;
+ }
+
+ html {
+   font-size: 16px;
+   scroll-behavior: smooth;
+ }
+
+ body {
+   font-family: var(--font-arabic);
+   background: var(--bg-primary);
+   color: var(--text-primary);
+   min-height: 100vh;
+   overflow-x: hidden;
+   line-height: 1.6;
+ }
+
+ /* App container */
+ .app-container {
+   position: relative;
+   min-height: 100vh;
+   display: flex;
+   flex-direction: column;
+   padding: var(--spacing-md);
+   max-width: 800px;
+   margin: 0 auto;
+ }
+
+ /* Animated background */
+ .bg-gradient {
+   position: fixed;
+   inset: 0;
+   background: linear-gradient(
+     135deg,
+     var(--bg-gradient-start) 0%,
+     var(--bg-gradient-end) 100%
+   );
+   z-index: -2;
+ }
+
+ .bg-orbs {
+   position: fixed;
+   inset: 0;
+   z-index: -1;
+   overflow: hidden;
+   pointer-events: none;
+ }
+
+ .orb {
+   position: absolute;
+   border-radius: var(--radius-full);
+   filter: blur(80px);
+   opacity: 0.4;
+   animation: float 20s infinite ease-in-out;
+ }
+
+ .orb-1 {
+   width: 400px;
+   height: 400px;
+   background: var(--accent-primary);
+   top: -100px;
+   right: -100px;
+   animation-delay: 0s;
+ }
+
+ .orb-2 {
+   width: 300px;
+   height: 300px;
+   background: var(--accent-secondary);
+   bottom: 20%;
+   left: -100px;
+   animation-delay: -7s;
+ }
+
+ .orb-3 {
+   width: 250px;
+   height: 250px;
+   background: var(--accent-tertiary);
+   bottom: -50px;
+   right: 20%;
+   animation-delay: -14s;
+ }
+
+ @keyframes float {
+   0%,
+   100% {
+     transform: translate(0, 0) scale(1);
+   }
+   33% {
+     transform: translate(30px, -30px) scale(1.1);
+   }
+   66% {
+     transform: translate(-20px, 20px) scale(0.9);
+   }
+ }
+
+ /* Glass card effect */
+ .glass-card {
+   background: var(--glass-bg);
+   backdrop-filter: blur(20px);
+   -webkit-backdrop-filter: blur(20px);
+   border: 1px solid var(--glass-border);
+   border-radius: var(--radius-lg);
+   box-shadow: var(--glass-shadow);
+ }
+
+ /* Header */
+ .header {
+   display: flex;
+   justify-content: space-between;
+   align-items: center;
+   padding: var(--spacing-md) 0;
+   margin-bottom: var(--spacing-xl);
+ }
+
+ .logo {
+   display: flex;
+   align-items: center;
+   gap: var(--spacing-sm);
+ }
+
+ .logo-icon {
+   width: 32px;
+   height: 32px;
+   color: var(--accent-primary);
+ }
+
+ .logo h1 {
+   font-size: 1.25rem;
+   font-weight: 600;
+   background: linear-gradient(
+     135deg,
+     var(--accent-primary),
+     var(--accent-tertiary)
+   );
+   -webkit-background-clip: text;
+   -webkit-text-fill-color: transparent;
+   background-clip: text;
+ }
+
+ .connection-status {
+   display: flex;
+   align-items: center;
+   gap: var(--spacing-xs);
+   padding: var(--spacing-xs) var(--spacing-md);
+   border-radius: var(--radius-full);
+   background: var(--glass-bg);
+   border: 1px solid var(--glass-border);
+   font-size: 0.875rem;
+ }
+
+ .status-dot {
+   width: 8px;
+   height: 8px;
+   border-radius: var(--radius-full);
+   background: var(--error);
+   transition: var(--transition-normal);
+ }
+
+ .connection-status.connected .status-dot {
+   background: var(--success);
+   box-shadow: 0 0 10px var(--success);
+ }
+
+ .status-text {
+   color: var(--text-secondary);
+ }
+
+ /* Main content */
+ .main-content {
+   flex: 1;
+   display: flex;
+   flex-direction: column;
+   align-items: center;
+   gap: var(--spacing-xl);
+ }
+
+ /* Status card */
+ .status-card {
+   width: 100%;
+   padding: var(--spacing-xl);
+   text-align: center;
+ }
+
+ .status-indicator {
+   position: relative;
+   width: 80px;
+   height: 80px;
+   margin: 0 auto var(--spacing-lg);
+   display: flex;
+   align-items: center;
+   justify-content: center;
+ }
+
+ .pulse-ring {
+   position: absolute;
+   inset: 0;
+   border-radius: var(--radius-full);
+   border: 2px solid var(--accent-primary);
+   opacity: 0;
+   transform: scale(0.8);
+   transition: var(--transition-normal);
+ }
+
+ .status-indicator.speaking .pulse-ring {
+   animation: pulse 1.5s infinite;
+ }
+
+ @keyframes pulse {
+   0% {
+     transform: scale(0.8);
+     opacity: 1;
+   }
+   100% {
+     transform: scale(1.5);
+     opacity: 0;
+   }
+ }
+
+ .status-icon {
+   width: 48px;
+   height: 48px;
+   color: var(--text-secondary);
+   transition: var(--transition-normal);
+ }
+
+ .status-indicator.speaking .status-icon {
+   color: var(--success);
+   filter: drop-shadow(0 0 10px var(--success));
+ }
+
+ .status-indicator.silence .status-icon {
+   color: var(--warning);
+   filter: drop-shadow(0 0 10px var(--warning));
+ }
+
+ .status-indicator.transcribing .status-icon {
+   color: var(--accent-primary);
+   filter: drop-shadow(0 0 10px var(--accent-primary));
+   animation: spin 1s linear infinite;
+ }
+
+ @keyframes spin {
+   from {
+     transform: rotate(0deg);
+   }
+   to {
+     transform: rotate(360deg);
+   }
+ }
+
+ .status-message {
+   font-size: 1.125rem;
+   color: var(--text-secondary);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .probability-bar {
+   width: 100%;
+   max-width: 300px;
+   height: 4px;
+   background: var(--glass-bg);
+   border-radius: var(--radius-full);
+   margin: 0 auto;
+   overflow: hidden;
+ }
+
+ .probability-fill {
+   height: 100%;
+   width: 0%;
+   background: linear-gradient(
+     90deg,
+     var(--accent-primary),
+     var(--accent-tertiary)
+   );
+   border-radius: var(--radius-full);
+   transition: width var(--transition-fast);
+ }
+
+ /* Microphone button */
+ .mic-button {
+   position: relative;
+   width: 100px;
+   height: 100px;
+   border: none;
+   border-radius: var(--radius-full);
+   background: linear-gradient(
+     135deg,
+     var(--accent-primary),
+     var(--accent-secondary)
+   );
+   cursor: pointer;
+   transition: var(--transition-normal);
+   box-shadow: 0 4px 30px var(--accent-glow);
+ }
+
+ .mic-button:hover {
+   transform: scale(1.05);
+   box-shadow: 0 8px 40px var(--accent-glow);
+ }
+
+ .mic-button:active {
+   transform: scale(0.98);
+ }
+
+ .mic-button-inner {
+   position: relative;
+   width: 100%;
+   height: 100%;
+   display: flex;
+   align-items: center;
+   justify-content: center;
+ }
+
+ .mic-icon,
+ .stop-icon {
+   width: 40px;
+   height: 40px;
+   color: white;
+   transition: var(--transition-normal);
+ }
+
+ .hidden {
+   display: none !important;
+ }
+
+ .mic-button.recording {
+   background: linear-gradient(135deg, var(--error), #dc2626);
+   animation: glow 1.5s infinite;
+ }
+
+ @keyframes glow {
+   0%,
+   100% {
+     box-shadow: 0 4px 30px rgba(239, 68, 68, 0.4);
+   }
+   50% {
+     box-shadow: 0 8px 50px rgba(239, 68, 68, 0.6);
+   }
+ }
+
+ .mic-ripple {
+   position: absolute;
+   inset: 0;
+   border-radius: var(--radius-full);
+   border: 2px solid currentColor;
+   color: var(--accent-primary);
+   opacity: 0;
+   transform: scale(1);
+   pointer-events: none;
+ }
+
+ .mic-button.recording .mic-ripple {
+   animation: ripple 1.5s infinite;
+   color: var(--error);
+ }
+
+ @keyframes ripple {
+   0% {
+     transform: scale(1);
+     opacity: 0.5;
+   }
+   100% {
+     transform: scale(1.8);
+     opacity: 0;
+   }
+ }
+
+ /* Waveform container */
+ .waveform-container {
+   width: 100%;
+   padding: var(--spacing-md);
+   overflow: hidden;
+ }
+
+ #waveformCanvas {
+   width: 100%;
+   height: 80px;
+   display: block;
+ }
+
+ /* Transcription card */
+ .transcription-card {
+   width: 100%;
+   padding: var(--spacing-xl);
+ }
+
+ .transcription-card h2 {
+   font-size: 1rem;
+   font-weight: 500;
+   color: var(--text-secondary);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .transcription-content {
+   min-height: 60px;
+   padding: var(--spacing-lg);
+   background: rgba(0, 0, 0, 0.2);
+   border-radius: var(--radius-md);
+   margin-bottom: var(--spacing-md);
+ }
+
+ .transcription-content p {
+   font-size: 1.5rem;
+   font-weight: 600;
+   text-align: center;
+   margin: 0;
+ }
+
+ .placeholder-text {
+   color: var(--text-muted) !important;
+   font-size: 1rem !important;
+   font-weight: 400 !important;
+ }
+
+ .transcription-history {
+   display: flex;
+   flex-direction: column;
+   gap: var(--spacing-sm);
+   max-height: 200px;
+   overflow-y: auto;
+ }
+
+ .history-item {
+   padding: var(--spacing-sm) var(--spacing-md);
+   background: rgba(0, 0, 0, 0.1);
+   border-radius: var(--radius-sm);
+   font-size: 0.875rem;
+   color: var(--text-secondary);
+   border-right: 3px solid var(--accent-primary);
+ }
+
+ /* Footer */
+ .footer {
+   text-align: center;
+   padding: var(--spacing-lg) 0;
+   color: var(--text-muted);
+   font-size: 0.875rem;
+ }
+
+ /* Scrollbar styling */
+ ::-webkit-scrollbar {
+   width: 6px;
+ }
+
+ ::-webkit-scrollbar-track {
+   background: var(--glass-bg);
+   border-radius: var(--radius-full);
+ }
+
+ ::-webkit-scrollbar-thumb {
+   background: var(--accent-primary);
+   border-radius: var(--radius-full);
+ }
+
+ ::-webkit-scrollbar-thumb:hover {
+   background: var(--accent-secondary);
+ }
+
+ /* Responsive adjustments */
+ @media (max-width: 640px) {
+   .header {
+     flex-direction: column;
+     gap: var(--spacing-md);
+     text-align: center;
+   }
+
+   .mic-button {
+     width: 80px;
+     height: 80px;
+   }
+
+   .mic-icon,
+   .stop-icon {
+     width: 32px;
+     height: 32px;
+   }
+
+   .transcription-content p {
+     font-size: 1.25rem;
+   }
+ }
+
+ /* Animations for new transcriptions */
+ @keyframes slideIn {
+   from {
+     opacity: 0;
+     transform: translateY(-10px);
+   }
+   to {
+     opacity: 1;
+     transform: translateY(0);
+   }
+ }
+
+ .transcription-content.new {
+   animation: slideIn 0.3s ease;
+ }
+
+ .history-item.new {
+   animation: slideIn 0.3s ease;
+ }
vad/silero_vad.jit ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1122837f4154c511485fe0b9c64455f7b929c96fbb8d79fbdb336383ebd3720
+ size 2272526