alaatiger989 committed on
Commit 04c4cd1 · verified · 1 Parent(s): 36ea430

Upload folder using huggingface_hub

Files changed (24)
  1. .gitattributes +2 -0
  2. aqib-nemo-asr.py +601 -0
  3. aqib-whipser4-arabic.py +654 -0
  4. aqib-whipser_ft-arabic_denoiser_meta.py +787 -0
  5. aqib-whipser_ft-arabic_noise_reducer.py +746 -0
  6. asr_websocket_client.html +606 -0
  7. best_nemo_whisper_jambonz.py +1338 -0
  8. best_nemo_whisper_jambonz_denoiser.py +1357 -0
  9. denoiser_model.py +8 -0
  10. improved_asr_web_ui.html +729 -0
  11. pretrained_models/asr-whisper-large-v2-commonvoice-ar/hyperparams.yaml +58 -0
  12. pretrained_models/asr-whisper-large-v2-commonvoice-ar/whisper.ckpt +3 -0
  13. requirements_denoiser.txt +3 -0
  14. speech_brain_whisper_denoiser.py +741 -0
  15. stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo +3 -0
  16. w_nemo.py +1033 -0
  17. whisper_checkpoints/models--openai--whisper-large-v2/.no_exist/ae4642769ce2ad8fc292556ccea8e901f1530655/processor_config.json +0 -0
  18. whisper_checkpoints/models--openai--whisper-large-v2/blobs/1ce74630ed587e80f3db2b3d434f7026327f131e +144 -0
  19. whisper_checkpoints/models--openai--whisper-large-v2/blobs/57a1ba2a82c093cabff2541409ae778c97145378b9ddfa722763cb1cb8f9020b +3 -0
  20. whisper_checkpoints/models--openai--whisper-large-v2/blobs/c2048dfa9fd94a052e62e908d2c4dfb18534b4d2 +0 -0
  21. whisper_checkpoints/models--openai--whisper-large-v2/refs/main +1 -0
  22. whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/config.json +144 -0
  23. whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/model.safetensors +3 -0
  24. whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/preprocessor_config.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo filter=lfs diff=lfs merge=lfs -text
37
+ whisper_checkpoints/models--openai--whisper-large-v2/blobs/57a1ba2a82c093cabff2541409ae778c97145378b9ddfa722763cb1cb8f9020b filter=lfs diff=lfs merge=lfs -text
aqib-nemo-asr.py ADDED
@@ -0,0 +1,601 @@
1
+ import asyncio
2
+ import websockets
3
+ import json
4
+ import threading
5
+ import numpy as np
6
+ import logging
7
+ import time
8
+ import tempfile
9
+ import os
10
+ import re
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ import nemo.collections.asr as nemo_asr
13
+ import soundfile as sf
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # ===== Arabic number mapping (expanded) =====
21
+ arabic_numbers = {
22
+ # Basic digits
23
+ "صفر": "0", "زيرو": "0", "٠": "0","زيو": "0","زير": "0",
24
+ "واحد": "1", "واحدة": "1", "١": "1",
25
+ "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
26
+ "تلاتة": "3", "ثلاثة": "3", "٣": "3",
27
+ "اربعة": "4", "أربعة": "4", "٤": "4",
28
+ "خمسة": "5", "٥": "5",
29
+ "ستة": "6", "٦": "6",
30
+ "سبعة": "7", "٧": "7",
31
+ "تمانية": "8", "ثمانية": "8", "٨": "8",
32
+ "تسعة": "9", "٩": "9",
33
+
34
+ # Teens
35
+ "عشرة": "10", "١٠": "10",
36
+ "حداشر": "11", "احد عشر": "11","احداشر": "11",
37
+ "اتناشر": "12", "اثنا عشر": "12",
38
+ "تلتاشر": "13", "ثلاثة عشر": "13",
39
+ "اربعتاشر": "14", "أربعة عشر": "14",
40
+ "خمستاشر": "15", "خمسة عشر": "15",
41
+ "ستاشر": "16", "ستة عشر": "16",
42
+ "سبعتاشر": "17", "سبعة عشر": "17",
43
+ "طمنتاشر": "18", "ثمانية عشر": "18",
44
+ "تسعتاشر": "19", "تسعة عشر": "19",
45
+
46
+ # Tens
47
+ "عشرين": "20", "٢٠": "20",
48
+ "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
49
+ "اربعين": "40", "أربعين": "40", "٤٠": "40",
50
+ "خمسين": "50", "٥٠": "50",
51
+ "ستين": "60", "٦٠": "60",
52
+ "سبعين": "70", "٧٠": "70",
53
+ "تمانين": "80", "ثمانين": "80", "٨٠": "80","تمانون": "80","ثمانون": "80",
54
+ "تسعين": "90", "٩٠": "90",
55
+
56
+ # Hundreds
57
+ "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
58
+ "ميتين": "200", "مائتين": "200",
59
+ "تلاتمية": "300", "ثلاثمائة": "300",
60
+ "اربعمية": "400", "أربعمائة": "400",
61
+ "خمسمية": "500", "خمسمائة": "500",
62
+ "ستمية": "600", "ستمائة": "600",
63
+ "سبعمية": "700", "سبعمائة": "700",
64
+ "تمانمية": "800", "ثمانمائة": "800",
65
+ "تسعمية": "900", "تسعمائة": "900",
66
+
67
+ # Thousands
68
+ "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
69
+ "ألفين": "2000", "الفين": "2000",
70
+ "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
71
+ "اربعة آلاف": "4000", "أربعة آلاف": "4000",
72
+ "خمسة آلاف": "5000",
73
+ "ستة آلاف": "6000",
74
+ "سبعة آلاف": "7000",
75
+ "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
76
+ "تسعة آلاف": "9000",
77
+
78
+ # Large numbers
79
+ "عشرة آلاف": "10000",
80
+ "مية ألف": "100000", "مائة ألف": "100000",
81
+ "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
82
+ "ملايين": "1000000",
83
+ "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000"
84
+ }
85
+
86
+ def replace_arabic_numbers(text: str) -> str:
87
+ for word in sorted(arabic_numbers, key=len, reverse=True):  # longest (multi-word) forms first, so "خمسة عشر" -> "15" is applied before "خمسة" -> "5"
88
+ text = re.sub(rf"\b{word}\b", arabic_numbers[word], text)
89
+ return text
90
+
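A quick illustration of the mapping above — a minimal sketch; the sample phrases are invented:

# Hypothetical usage of replace_arabic_numbers (sample phrases invented).
print(replace_arabic_numbers("خمسة عشر جنيه"))      # -> "15 جنيه" (multi-word form matched first)
print(replace_arabic_numbers("واحد اتنين تلاتة"))   # -> "1 2 3"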
91
+
92
+ # Global NeMo model
93
+ asr_model = None
94
+
95
+ def initialize_nemo_model():
96
+ """Initialize NeMo FastConformer model"""
97
+ global asr_model
98
+
99
+ logger.info("Loading NeMo FastConformer Arabic ASR model...")
100
+
101
+ # Model path - adjust this to your model location
102
+ model_path = os.getenv(
103
+ "NEMO_MODEL_PATH",
104
+ "/path/to/stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo" # Update this path
105
+ )
106
+
107
+ if not os.path.exists(model_path):
108
+ logger.error(f"Model not found at: {model_path}")
109
+ logger.info("Please download the model from: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_ar_fastconformer_hybrid_large_pcd")
110
+ raise FileNotFoundError(f"NeMo model not found: {model_path}")
111
+
112
+ try:
113
+ asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
114
+ logger.info("NeMo FastConformer model loaded successfully")
115
+
116
+ # Set model to eval mode for inference
117
+ asr_model.eval()
118
+
119
+ except Exception as e:
120
+ logger.error(f"Failed to load NeMo model: {e}")
121
+ raise
122
+
123
+ # Initialize model on startup
124
+ initialize_nemo_model()
125
+
126
+ # Thread pool for processing
127
+ executor = ThreadPoolExecutor(max_workers=4)
128
+
129
+ class JambonzAudioBuffer:
130
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
131
+ self.sample_rate = sample_rate
132
+ self.chunk_duration = chunk_duration
133
+ self.chunk_samples = int(chunk_duration * sample_rate)
134
+
135
+ self.buffer = np.array([], dtype=np.float32)
136
+ self.lock = threading.Lock()
137
+ self.total_audio = np.array([], dtype=np.float32)
138
+
139
+ # Voice Activity Detection
140
+ self.silence_threshold = 0.05
141
+ self.min_speech_samples = int(0.5 * sample_rate)
142
+
143
+ def add_audio(self, audio_data):
144
+ with self.lock:
145
+ self.buffer = np.concatenate([self.buffer, audio_data])
146
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
147
+
148
+ def has_chunk_ready(self):
149
+ with self.lock:
150
+ return len(self.buffer) >= self.chunk_samples
151
+
152
+ def is_speech(self, audio_chunk):
153
+ """Simple VAD based on energy"""
154
+ if len(audio_chunk) < self.min_speech_samples:
155
+ return False
156
+ energy = np.mean(np.abs(audio_chunk))
157
+ return energy > self.silence_threshold
158
+
159
+ def get_chunk_for_processing(self):
160
+ """Get audio chunk for processing"""
161
+ with self.lock:
162
+ if len(self.buffer) < self.chunk_samples:
163
+ return None
164
+ return np.array([1]) # Signal that chunk is ready
165
+
166
+ def get_all_audio(self):
167
+ """Get all accumulated audio"""
168
+ with self.lock:
169
+ return self.total_audio.copy()
170
+
171
+ def clear(self):
172
+ with self.lock:
173
+ self.buffer = np.array([], dtype=np.float32)
174
+ self.total_audio = np.array([], dtype=np.float32)
175
+
176
+ def reset_for_new_segment(self):
177
+ """Reset buffers for new transcription segment"""
178
+ with self.lock:
179
+ self.buffer = np.array([], dtype=np.float32)
180
+ self.total_audio = np.array([], dtype=np.float32)
181
+
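A minimal sketch of how the buffer and its energy-based VAD interact, using synthetic audio (the test values are invented; thresholds are the ones set in __init__ above):

# Hypothetical smoke test for JambonzAudioBuffer (8 kHz, 1 s chunks).
buf = JambonzAudioBuffer(sample_rate=8000, chunk_duration=1.0)
silence = np.zeros(8000, dtype=np.float32)
tone = (0.2 * np.sin(2 * np.pi * 440 * np.arange(8000) / 8000)).astype(np.float32)
buf.add_audio(silence)
buf.add_audio(tone)
print(buf.has_chunk_ready())   # True: 2 s buffered >= 1 s chunk
print(buf.is_speech(silence))  # False: zero energy, below the 0.05 threshold
print(buf.is_speech(tone))     # True: mean |x| ~= 0.127 > 0.05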
182
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
183
+ """Convert LINEAR16 PCM bytes to numpy array"""
184
+ try:
185
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
186
+ audio_array = audio_array.astype(np.float32) / 32768.0
187
+ return audio_array
188
+ except Exception as e:
189
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
190
+ return np.array([], dtype=np.float32)
191
+
192
+ def resample_audio(audio_data, source_rate, target_rate):
193
+ """Resample audio to target sample rate"""
194
+ if source_rate == target_rate:
195
+ return audio_data
196
+
197
+ if source_rate == 8000 and target_rate == 16000:
198
+ # Simple 2x upsampling for common case
199
+ upsampled = np.repeat(audio_data, 2)
200
+ return upsampled.astype(np.float32)
201
+
202
+ # Fallback: Linear interpolation resampling
203
+ ratio = target_rate / source_rate
204
+ indices = np.arange(0, len(audio_data), 1/ratio)
205
+ indices = indices[indices < len(audio_data)]
206
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
207
+
208
+ return resampled.astype(np.float32)
209
+
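Sample repetition and linear interpolation are crude resamplers (no anti-aliasing filter). Where scipy is available, a polyphase resampler is a drop-in upgrade; a sketch, assuming scipy is installed:

# Optional higher-quality resampling (assumes scipy is installed).
from scipy.signal import resample_poly

def resample_audio_polyphase(audio_data, source_rate=8000, target_rate=16000):
    """Polyphase resampling with a built-in anti-aliasing filter."""
    if source_rate == target_rate:
        return audio_data
    return resample_poly(audio_data, target_rate, source_rate).astype(np.float32)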
210
+ def transcribe_with_nemo(audio_data, source_sample_rate=8000, target_sample_rate=16000):
211
+ """Transcribe audio using NeMo FastConformer"""
212
+ try:
213
+ if len(audio_data) == 0:
214
+ return ""
215
+
216
+ # Resample to 16kHz (NeMo models typically expect 16kHz)
217
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
218
+
219
+ # Skip very short audio
220
+ min_samples = int(0.3 * target_sample_rate)
221
+ if len(resampled_audio) < min_samples:
222
+ return ""
223
+
224
+ start_time = time.time()
225
+
226
+ # Save audio to temporary file (NeMo expects file path)
227
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
228
+ # Write audio as WAV file
229
+ sf.write(tmp_file.name, resampled_audio, target_sample_rate)
230
+ tmp_path = tmp_file.name
231
+
232
+ try:
233
+ # Transcribe with NeMo
234
+ result = asr_model.transcribe([tmp_path])
235
+
236
+ # Debug logging to understand result format
237
+ logger.info(f"NeMo result type: {type(result)}")
238
+ if result and len(result) > 0:
239
+ logger.info(f"First result type: {type(result[0])}")
240
+ logger.info(f"First result content: {result[0]}")
241
+
242
+ if result and len(result) > 0:
243
+ # Handle different NeMo result formats
244
+ if hasattr(result[0], 'text'):
245
+ # If result has .text attribute (newer NeMo versions)
246
+ raw_text = result[0].text
247
+ logger.info(f"Using .text attribute: {raw_text}")
248
+ elif isinstance(result[0], str):
249
+ # If result is directly a string
250
+ raw_text = result[0]
251
+ logger.info(f"Using direct string: {raw_text}")
252
+ else:
253
+ # If result is some other format, convert to string
254
+ raw_text = str(result[0])
255
+ logger.info(f"Using str() conversion: {raw_text}")
256
+
257
+ # Ensure raw_text is a string before processing
258
+ if not isinstance(raw_text, str):
259
+ raw_text = str(raw_text)
260
+
261
+ # Only process if we have actual text content
262
+ if raw_text and raw_text.strip():
263
+ # Convert Arabic numbers to digits
264
+
265
+ logger.info(f"before sending to FXN--- {raw_text}")
266
+ cleaned_text = replace_arabic_numbers(raw_text)
267
+ logger.info(f"after FXN--- {cleaned_text}")
268
+ end_time = time.time()
269
+
270
+ if cleaned_text.strip():
271
+ logger.info(f"NeMo transcription: '{cleaned_text}' (processed in {end_time - start_time:.2f}s)")
272
+
273
+ return cleaned_text.strip()
274
+ else:
275
+ logger.info("No transcription text found")
276
+ return ""
277
+ else:
278
+ logger.info("No results from NeMo transcription")
279
+ return ""
280
+
281
+ finally:
282
+ # Clean up temporary file
283
+ if os.path.exists(tmp_path):
284
+ os.remove(tmp_path)
285
+
286
+ except Exception as e:
287
+ logger.error(f"Error during NeMo transcription: {e}")
288
+ return ""
289
+
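For offline testing, transcribe_with_nemo can be exercised without the WebSocket layer; a minimal sketch (the wav file name is hypothetical; sf is already imported at the top of this script):

# Hypothetical offline check: feed a mono 8 kHz file through the same path.
audio, sr = sf.read("sample_8khz.wav", dtype="float32")  # hypothetical file, assumed mono
assert sr == 8000, "this pipeline assumes 8 kHz input"
print(transcribe_with_nemo(audio, source_sample_rate=sr))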
290
+ class JambonzSTTHandler:
291
+ def __init__(self, websocket):
292
+ self.websocket = websocket
293
+ self.audio_buffer = None
294
+ self.config = {}
295
+ self.running = False
296
+ self.transcription_task = None
297
+
298
+ # Auto-final detection variables
299
+ self.interim_count = 0
300
+ self.last_interim_time = None
301
+ self.silence_timeout = 2.0
302
+ self.min_interim_count = 2
303
+ self.auto_final_task = None
304
+ self.accumulated_transcript = ""
305
+ self.final_sent = False
306
+ self.segment_number = 0
307
+ self.last_partial = ""
308
+
309
+ # Processing tracking
310
+ self.processing_count = 0
311
+
312
+ async def start_processing(self, start_message):
313
+ """Initialize with start message from jambonz"""
314
+ self.config = {
315
+ "language": start_message.get("language", "ar-EG"),
316
+ "format": start_message.get("format", "raw"),
317
+ "encoding": start_message.get("encoding", "LINEAR16"),
318
+ "sample_rate": start_message.get("sampleRateHz", 8000),
319
+ "interim_results": True, # Always enable for internal processing
320
+ "options": start_message.get("options", {})
321
+ }
322
+
323
+ logger.info(f"NeMo STT session started with config: {self.config}")
324
+
325
+ # Initialize audio buffer
326
+ self.audio_buffer = JambonzAudioBuffer(
327
+ sample_rate=self.config["sample_rate"],
328
+ chunk_duration=1.0 # 1 second chunks for NeMo
329
+ )
330
+
331
+ # Reset session variables
332
+ self.running = True
333
+ self.interim_count = 0
334
+ self.last_interim_time = None
335
+ self.accumulated_transcript = ""
336
+ self.final_sent = False
337
+ self.segment_number = 0
338
+ self.processing_count = 0
339
+ self.last_partial = ""
340
+
341
+ # Start background transcription task
342
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
343
+
344
+ # Start auto-final detection task
345
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
346
+
347
+ async def stop_processing(self):
348
+ """Stop current processing session"""
349
+ logger.info("Stopping NeMo STT session...")
350
+ self.running = False
351
+
352
+ # Cancel background tasks
353
+ for task in [self.transcription_task, self.auto_final_task]:
354
+ if task:
355
+ task.cancel()
356
+ try:
357
+ await task
358
+ except asyncio.CancelledError:
359
+ pass
360
+
361
+ # Send final transcription if not already sent
362
+ if not self.final_sent and self.accumulated_transcript.strip():
363
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
364
+
365
+ # Process any remaining audio for comprehensive final transcription
366
+ if self.audio_buffer:
367
+ all_audio = self.audio_buffer.get_all_audio()
368
+ if len(all_audio) > 0 and not self.final_sent:
369
+ loop = asyncio.get_running_loop()  # get_event_loop() is deprecated inside a running loop
370
+ final_transcription = await loop.run_in_executor(
371
+ executor,
372
+ transcribe_with_nemo,
373
+ all_audio,
374
+ self.config["sample_rate"]
375
+ )
376
+
377
+ if final_transcription.strip():
378
+ await self.send_transcription(final_transcription, is_final=True)
379
+
380
+ # Clear audio buffer
381
+ if self.audio_buffer:
382
+ self.audio_buffer.clear()
383
+
384
+ logger.info("NeMo STT session stopped")
385
+
386
+ async def start_new_segment(self):
387
+ """Start a new transcription segment"""
388
+ self.segment_number += 1
389
+ self.interim_count = 0
390
+ self.last_interim_time = None
391
+ self.accumulated_transcript = ""
392
+ self.final_sent = False
393
+ self.last_partial = ""
394
+ self.processing_count = 0
395
+
396
+ if self.audio_buffer:
397
+ self.audio_buffer.reset_for_new_segment()
398
+
399
+ logger.info(f"Started new transcription segment #{self.segment_number}")
400
+
401
+ async def add_audio_data(self, audio_bytes):
402
+ """Add audio data to buffer"""
403
+ if self.audio_buffer and self.running:
404
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
405
+ self.audio_buffer.add_audio(audio_data)
406
+
407
+ async def _process_audio_chunks(self):
408
+ """Process audio chunks for interim results"""
409
+ while self.running:
410
+ try:
411
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
412
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
413
+ if chunk_signal is not None:
414
+ all_audio = self.audio_buffer.get_all_audio()
415
+
416
+ if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
417
+ loop = asyncio.get_running_loop()
418
+ transcription = await loop.run_in_executor(
419
+ executor,
420
+ transcribe_with_nemo,
421
+ all_audio,
422
+ self.config["sample_rate"]
423
+ )
424
+
425
+ if transcription.strip():
426
+ self.processing_count += 1
427
+ self.accumulated_transcript = transcription
428
+
429
+ if transcription != self.last_partial or self.interim_count == 0:
430
+ self.last_partial = transcription
431
+ self.interim_count += 1
432
+ self.last_interim_time = time.time()
433
+ logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
434
+ else:
435
+ self.last_interim_time = time.time()
436
+
437
+ await asyncio.sleep(0.1) # Check every 100ms
438
+
439
+ except Exception as e:
440
+ logger.error(f"Error in chunk processing: {e}")
441
+ await asyncio.sleep(0.1)
442
+
443
+ async def _monitor_for_auto_final(self):
444
+ """Monitor for auto-final conditions"""
445
+ while self.running:
446
+ try:
447
+ current_time = time.time()
448
+
449
+ if (self.interim_count >= self.min_interim_count and
450
+ self.last_interim_time is not None and
451
+ (current_time - self.last_interim_time) >= self.silence_timeout and
452
+ not self.final_sent and
453
+ self.accumulated_transcript.strip()):
454
+
455
+ logger.info(f"Auto-final triggered for segment #{self.segment_number}")
456
+
457
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
458
+ await self.start_new_segment()
459
+
460
+ await asyncio.sleep(0.5) # Check every 500ms
461
+
462
+ except Exception as e:
463
+ logger.error(f"Error in auto-final monitoring: {e}")
464
+ await asyncio.sleep(0.5)
465
+
466
+ async def send_transcription(self, text, is_final=True, confidence=0.9):
467
+ """Send transcription in jambonz format"""
468
+ try:
469
+ message = {
470
+ "type": "transcription",
471
+ "is_final": True, # Always send as final
472
+ "alternatives": [
473
+ {
474
+ "transcript": text,
475
+ "confidence": confidence
476
+ }
477
+ ],
478
+ "language": self.config.get("language", "ar-EG"),
479
+ "channel": 1
480
+ }
481
+
482
+ await self.websocket.send(json.dumps(message))
483
+ self.final_sent = True
484
+
485
+ logger.info(f"Sent FINAL transcription to Jambonz: '{text}'")
486
+
487
+ except Exception as e:
488
+ logger.error(f"Error sending transcription: {e}")
489
+
490
+ async def send_error(self, error_message):
491
+ """Send error message in jambonz format"""
492
+ try:
493
+ message = {
494
+ "type": "error",
495
+ "error": error_message
496
+ }
497
+ await self.websocket.send(json.dumps(message))
498
+ logger.error(f"Sent error: {error_message}")
499
+ except Exception as e:
500
+ logger.error(f"Error sending error message: {e}")
501
+
502
+ async def handle_jambonz_websocket(websocket):
503
+ """Handle jambonz WebSocket connections"""
504
+
505
+ client_id = f"jambonz_{id(websocket)}"
506
+ logger.info(f"New NeMo jambonz connection: {client_id}")
507
+
508
+ handler = JambonzSTTHandler(websocket)
509
+
510
+ try:
511
+ async for message in websocket:
512
+ try:
513
+ if isinstance(message, str):
514
+ data = json.loads(message)
515
+ message_type = data.get("type")
516
+
517
+ if message_type == "start":
518
+ logger.info(f"Received start message: {data}")
519
+ await handler.start_processing(data)
520
+
521
+ elif message_type == "stop":
522
+ logger.info("Received stop message - closing WebSocket")
523
+ await handler.stop_processing()
524
+ await websocket.close(code=1000, reason="Session stopped by client")
525
+ break
526
+
527
+ else:
528
+ logger.warning(f"Unknown message type: {message_type}")
529
+ await handler.send_error(f"Unknown message type: {message_type}")
530
+
531
+ else:
532
+ # Handle binary audio data
533
+ if not handler.running or handler.audio_buffer is None:
534
+ logger.warning("Received audio data outside of active session")
535
+ await handler.send_error("Received audio before start message or after stop")
536
+ continue
537
+
538
+ await handler.add_audio_data(message)
539
+
540
+ except json.JSONDecodeError as e:
541
+ logger.error(f"JSON decode error: {e}")
542
+ await handler.send_error(f"Invalid JSON: {str(e)}")
543
+ except Exception as e:
544
+ logger.error(f"Error processing message: {e}")
545
+ await handler.send_error(f"Processing error: {str(e)}")
546
+
547
+ except websockets.exceptions.ConnectionClosed:
548
+ logger.info(f"NeMo jambonz connection closed: {client_id}")
549
+ except Exception as e:
550
+ logger.error(f"NeMo jambonz WebSocket error: {e}")
551
+ try:
552
+ await handler.send_error(str(e))
553
+ except:
554
+ pass
555
+ finally:
556
+ if handler.running:
557
+ await handler.stop_processing()
558
+ logger.info(f"NeMo jambonz connection ended: {client_id}")
559
+
560
+ async def main():
561
+ """Start the NeMo jambonz STT WebSocket server"""
562
+ logger.info("Starting NeMo Jambonz STT WebSocket server on port 3007...")
563
+
564
+ # Start WebSocket server
565
+ server = await websockets.serve(
566
+ handle_jambonz_websocket,
567
+ "0.0.0.0",
568
+ 3007,
569
+ ping_interval=20,
570
+ ping_timeout=10,
571
+ close_timeout=10
572
+ )
573
+
574
+ logger.info("NeMo Jambonz STT WebSocket server started on ws://0.0.0.0:3007")
575
+ logger.info("Ready to handle jambonz STT requests with NeMo FastConformer")
576
+ logger.info("FEATURES:")
577
+ logger.info("- Arabic ASR using NeMo FastConformer model")
578
+ logger.info("- Arabic number word to digit conversion")
579
+ logger.info("- Continuous transcription with segmentation")
580
+ logger.info("- Voice Activity Detection")
581
+
582
+ # Wait for the server to close
583
+ await server.wait_closed()
584
+
585
+ if __name__ == "__main__":
586
+ print("=" * 80)
587
+ print("NeMo FastConformer Jambonz STT Server")
588
+ print("=" * 80)
589
+ print("Model: NeMo FastConformer Arabic ASR")
590
+ print("WebSocket Port: 3007")
591
+ print("Protocol: jambonz STT API")
592
+ print("Audio Format: LINEAR16 PCM @ 8kHz → 16kHz")
593
+ print("Language: Arabic with number conversion")
594
+ print("=" * 80)
595
+
596
+ try:
597
+ asyncio.run(main())
598
+ except KeyboardInterrupt:
599
+ print("\nShutting down NeMo server...")
600
+ except Exception as e:
601
+ print(f"Server error: {e}")
aqib-whipser4-arabic.py ADDED
@@ -0,0 +1,654 @@
1
+ import torch
2
+ import asyncio
3
+ import websockets
4
+ import json
5
+ import threading
6
+ import numpy as np
7
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
8
+ import subprocess
9
+ import logging
10
+ import time
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ import struct
13
+ import re
14
+
15
+ # Arabic number conversion imports
16
+ try:
17
+ from pyarabic.number import text2number
18
+ arabic_numbers_available = True
19
+ print("Arabic number conversion available")
20
+ except ImportError:
21
+ arabic_numbers_available = False
22
+ print("pyarabic not available - install with: pip install pyarabic")
23
+ print("Arabic numbers will not be converted to digits")
24
+
25
+ # Set up logging
26
+ logging.basicConfig(level=logging.INFO)
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ def convert_arabic_numbers_in_sentence(sentence: str) -> str:
31
+ """
32
+ Replace Arabic number words in a sentence with digits,
33
+ preserving all other words and punctuation.
34
+ Handles common spelling variants and zero explicitly.
35
+ """
36
+ try:
37
+ print("Fxn called--------------")
38
+
39
+ # --- Normalization step ---
40
+ replacements = {
41
+ "اربعة": "أربعة",
42
+ "اربع": "أربع",
43
+ "اثنين": "اثنان",
44
+ "اتنين": "اثنان", # Egyptian variant
45
+ "ثلاث": "ثلاثة",
46
+ "خمس": "خمسة",
47
+ "ست": "ستة",
48
+ "سبع": "سبعة",
49
+ "ثمان": "ثمانية",
50
+ "تسع": "تسعة",
51
+ "عشر": "عشرة",
52
+ }
53
+ for wrong, correct in replacements.items():
54
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
55
+
56
+ # --- Split by whitespace but keep spaces ---
57
+ words = re.split(r'(\s+)', sentence)
58
+ converted_words = []
59
+
60
+ for word in words:
61
+ stripped = word.strip()
62
+ if not stripped: # skip spaces
63
+ converted_words.append(word)
64
+ continue
65
+
66
+ try:
67
+ num = text2number(stripped)
68
+
69
+ # Accept valid numbers, including zero explicitly
70
+ if isinstance(num, int):
71
+ if num != 0 or stripped == "صفر":
72
+ converted_words.append(str(num))
73
+ else:
74
+ converted_words.append(word)
75
+ else:
76
+ converted_words.append(word)
77
+
78
+ except Exception:
79
+ converted_words.append(word)
80
+
81
+ return ''.join(converted_words)
82
+
83
+ except Exception as e:
84
+ logger.warning(f"Error converting Arabic numbers: {e}")
85
+ return sentence
86
+
87
+
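A minimal illustration of the pyarabic-backed path (assumes pyarabic is installed; the sample sentence is invented). Note that because the sentence is split per word, compound forms such as "خمسة عشر" are converted word-by-word rather than as one number:

# Hypothetical usage of convert_arabic_numbers_in_sentence defined above.
print(convert_arabic_numbers_in_sentence("عايز خمسة كيلو رز"))
# Expected: "عايز 5 كيلو رز" — per-word text2number lookup; non-number words pass through.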
88
+ # Try to install flash-attn if not available
89
+ try:
90
+ import flash_attn
91
+ use_flash_attn = True
92
+ except ImportError:
93
+ print("Flash attention not available, using standard attention")
94
+ use_flash_attn = False
95
+ try:
96
+ subprocess.run(
97
+ "pip install websockets",
98
+ shell=True,
99
+ check=False
100
+ )
101
+ subprocess.run(
102
+ "pip install flash-attn --no-build-isolation",
103
+ shell=True,
104
+ check=False
105
+ )
106
+ except:
107
+ pass
108
+
109
+ device = "cuda" if torch.cuda.is_available() else "cpu"
110
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
111
+ MODEL_NAME = "openai/whisper-large-v3-turbo"
112
+
113
+ print(f"Using device: {device}")
114
+ print(f"CUDA available: {torch.cuda.is_available()}")
115
+ if torch.cuda.is_available():
116
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
117
+
118
+ # Model initialization with fallback for attention implementation
119
+ try:
120
+ if use_flash_attn and torch.cuda.is_available():
121
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
122
+ MODEL_NAME,
123
+ torch_dtype=torch_dtype,
124
+ low_cpu_mem_usage=True,
125
+ use_safetensors=True,
126
+ attn_implementation="flash_attention_2"
127
+ )
128
+ else:
129
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
130
+ MODEL_NAME,
131
+ torch_dtype=torch_dtype,
132
+ low_cpu_mem_usage=True,
133
+ use_safetensors=True
134
+ )
135
+ except Exception as e:
136
+ print(f"Error loading model with flash attention: {e}")
137
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
138
+ MODEL_NAME,
139
+ torch_dtype=torch_dtype,
140
+ low_cpu_mem_usage=True,
141
+ use_safetensors=True
142
+ )
143
+
144
+ model.to(device)
145
+
146
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
147
+ tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
148
+
149
+ # Thread pool for processing audio
150
+ executor = ThreadPoolExecutor(max_workers=4)
151
+
152
+ class JambonzAudioBuffer:
153
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
154
+ self.sample_rate = sample_rate
155
+ self.chunk_duration = chunk_duration
156
+ self.chunk_samples = int(chunk_duration * sample_rate)
157
+
158
+ self.buffer = np.array([], dtype=np.float32)
159
+ self.lock = threading.Lock()
160
+ self.total_audio = np.array([], dtype=np.float32)
161
+
162
+ # Voice Activity Detection (simple energy-based)
163
+ self.silence_threshold = 0.01
164
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
165
+
166
+ def add_audio(self, audio_data):
167
+ with self.lock:
168
+ self.buffer = np.concatenate([self.buffer, audio_data])
169
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
170
+
171
+ def has_chunk_ready(self):
172
+ with self.lock:
173
+ return len(self.buffer) >= self.chunk_samples
174
+
175
+ def is_speech(self, audio_chunk):
176
+ """Simple VAD based on energy"""
177
+ if len(audio_chunk) < self.min_speech_samples:
178
+ return False
179
+ energy = np.mean(np.abs(audio_chunk))
180
+ return energy > self.silence_threshold
181
+
182
+ def get_chunk_for_processing(self):
183
+ """Get audio chunk for processing - but don't remove it from buffer for interim results"""
184
+ with self.lock:
185
+ if len(self.buffer) < self.chunk_samples:
186
+ return None
187
+
188
+ # For interim results, we want to trigger processing but keep accumulating audio
189
+ # So we just return a signal that we have enough audio, but don't consume it
190
+ return np.array([1]) # Return a dummy array to signal chunk is ready
191
+
192
+ def get_all_audio(self):
193
+ """Get all accumulated audio for final transcription"""
194
+ with self.lock:
195
+ return self.total_audio.copy()
196
+
197
+ def clear(self):
198
+ with self.lock:
199
+ self.buffer = np.array([], dtype=np.float32)
200
+ self.total_audio = np.array([], dtype=np.float32)
201
+
202
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
203
+ """Convert LINEAR16 PCM bytes to numpy array (jambonz format)"""
204
+ try:
205
+ # jambonz sends LINEAR16 PCM at 8kHz
206
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
207
+ # Convert to float32 and normalize
208
+ audio_array = audio_array.astype(np.float32) / 32768.0
209
+ return audio_array
210
+ except Exception as e:
211
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
212
+ return np.array([], dtype=np.float32)
213
+
214
+ def resample_audio(audio_data, source_rate, target_rate):
215
+ """Simple resampling from 8kHz to 16kHz for Whisper"""
216
+ if source_rate == target_rate:
217
+ return audio_data
218
+
219
+ # Simple linear interpolation resampling
220
+ ratio = target_rate / source_rate
221
+ indices = np.arange(0, len(audio_data), 1/ratio)
222
+ indices = indices[indices < len(audio_data)]
223
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
224
+
225
+ # Ensure proper float32 dtype for consistency
226
+ return resampled.astype(np.float32)
227
+
228
+ def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
229
+ """Transcribe audio chunk using model's generate method directly"""
230
+ try:
231
+ if len(audio_data) == 0:
232
+ return ""
233
+
234
+ # Resample from 8kHz to 16kHz for Whisper
235
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
236
+
237
+ # Ensure minimum length for Whisper
238
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
239
+ if len(resampled_audio) < min_samples:
240
+ return ""
241
+
242
+ start_time = time.time()
243
+
244
+ # Prepare input features with proper dtype
245
+ input_features = processor(
246
+ resampled_audio,
247
+ sampling_rate=target_sample_rate,
248
+ return_tensors="pt"
249
+ ).input_features
250
+
251
+ # Ensure correct dtype and device
252
+ input_features = input_features.to(device=device, dtype=torch_dtype)
253
+
254
+ # Create attention mask to avoid warnings
255
+ attention_mask = torch.ones(
256
+ input_features.shape[:-1],
257
+ dtype=torch.long,
258
+ device=device
259
+ )
260
+
261
+ # Generate transcription using model directly
262
+ with torch.no_grad():
263
+ predicted_ids = model.generate(
264
+ input_features,
265
+ attention_mask=attention_mask,
266
+ max_new_tokens=128,
267
+ do_sample=False,
268
+ temperature=0.0,
269
+ num_beams=1,
270
+ language="ar",
271
+ task="transcribe",
272
+ pad_token_id=tokenizer.pad_token_id,
273
+ eos_token_id=tokenizer.eos_token_id
274
+ )
275
+
276
+ # Decode the transcription
277
+ transcription = tokenizer.batch_decode(
278
+ predicted_ids,
279
+ skip_special_tokens=True
280
+ )[0].strip()
281
+
282
+ end_time = time.time()
283
+
284
+ logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
285
+ return transcription
286
+
287
+ except Exception as e:
288
+ logger.error(f"Error during direct transcription: {e}")
289
+ return ""
290
+
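A rough latency check of the direct generate path, useful when sizing the 1 s chunk interval below (a sketch; np and time are already imported in this script, and 2 s of silence stands in for real audio):

# Hypothetical micro-benchmark: 2 s of silence through transcribe_chunk_direct.
dummy = np.zeros(16000, dtype=np.float32)   # 2 s at the 8 kHz source rate
t0 = time.time()
text = transcribe_chunk_direct(dummy, source_sample_rate=8000)
print(f"'{text}' in {time.time() - t0:.2f}s")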
291
+ class JambonzSTTHandler:
292
+ def __init__(self, websocket):
293
+ self.websocket = websocket
294
+ self.audio_buffer = None
295
+ self.config = {}
296
+ self.running = True
297
+ self.transcription_task = None
298
+ self.full_transcript = ""
299
+ self.last_partial = ""
300
+
301
+ # Auto-final detection variables
302
+ self.interim_count = 0
303
+ self.last_interim_time = None
304
+ self.silence_timeout = 1.5 # 1.5 seconds of silence triggers a final
305
+ self.min_interim_count = 1 # Minimum interim results before considering final
306
+ self.auto_final_task = None
307
+ self.accumulated_transcript = ""
308
+ self.final_sent = False
309
+
310
+ async def start_processing(self, start_message):
311
+ """Initialize with start message from jambonz"""
312
+ self.config = {
313
+ "language": start_message.get("language", "ar-EG"),
314
+ "format": start_message.get("format", "raw"),
315
+ "encoding": start_message.get("encoding", "LINEAR16"),
316
+ "sample_rate": start_message.get("sampleRateHz", 8000),
317
+ "interim_results": start_message.get("interimResults", True),
318
+ "options": start_message.get("options", {})
319
+ }
320
+
321
+ logger.info(f"STT session started with config: {self.config}")
322
+
323
+ # Initialize audio buffer
324
+ self.audio_buffer = JambonzAudioBuffer(
325
+ sample_rate=self.config["sample_rate"],
326
+ chunk_duration=1.0 # Process every 1 second
327
+ )
328
+
329
+ # Reset auto-final detection variables
330
+ self.interim_count = 0
331
+ self.last_interim_time = None
332
+ self.accumulated_transcript = ""
333
+ self.final_sent = False
334
+
335
+ # Start background transcription task
336
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
337
+
338
+ # Start auto-final detection task
339
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
340
+
341
+ async def stop_processing(self):
342
+ """Stop processing and send final transcription"""
343
+ self.running = False
344
+
345
+ # Cancel background tasks
346
+ if self.transcription_task:
347
+ self.transcription_task.cancel()
348
+ try:
349
+ await self.transcription_task
350
+ except asyncio.CancelledError:
351
+ pass
352
+
353
+ if self.auto_final_task:
354
+ self.auto_final_task.cancel()
355
+ try:
356
+ await self.auto_final_task
357
+ except asyncio.CancelledError:
358
+ pass
359
+
360
+ # Send final transcription if not already sent
361
+ if not self.final_sent and self.accumulated_transcript.strip():
362
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
363
+
364
+ # Also process any remaining audio for comprehensive final transcription
365
+ if self.audio_buffer:
366
+ all_audio = self.audio_buffer.get_all_audio()
367
+ if len(all_audio) > 0 and not self.final_sent:
368
+ loop = asyncio.get_running_loop()
369
+ final_transcription = await loop.run_in_executor(
370
+ executor,
371
+ transcribe_chunk_direct,
372
+ all_audio,
373
+ self.config["sample_rate"]
374
+ )
375
+
376
+ if final_transcription.strip():
377
+ # Send comprehensive final transcription
378
+ await self.send_transcription(final_transcription, is_final=True)
379
+
380
+ logger.info("STT session ended")
381
+
382
+ async def add_audio_data(self, audio_bytes):
383
+ """Add audio data to buffer"""
384
+ if self.audio_buffer:
385
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
386
+ self.audio_buffer.add_audio(audio_data)
387
+
388
+ async def _process_audio_chunks(self):
389
+ """Process audio chunks for interim results"""
390
+ while self.running and self.config.get("interim_results", False):
391
+ try:
392
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
393
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
394
+ if chunk_signal is not None:
395
+ # Get all accumulated audio so far for complete transcription
396
+ all_audio = self.audio_buffer.get_all_audio()
397
+
398
+ # Only process if we have actual speech content
399
+ if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
400
+ # Run transcription on all accumulated audio
401
+ loop = asyncio.get_running_loop()
402
+ transcription = await loop.run_in_executor(
403
+ executor,
404
+ transcribe_chunk_direct,
405
+ all_audio,
406
+ self.config["sample_rate"]
407
+ )
408
+
409
+ if transcription.strip() and transcription != self.last_partial:
410
+ self.last_partial = transcription
411
+ self.accumulated_transcript = transcription # Update accumulated transcript
412
+ self.interim_count += 1
413
+ self.last_interim_time = time.time()
414
+
415
+ # Send interim result
416
+ await self.send_transcription(transcription, is_final=False)
417
+
418
+ logger.info(f"Interim #{self.interim_count}: '{transcription}'")
419
+
420
+ # Small delay to prevent excessive processing
421
+ await asyncio.sleep(0.1)
422
+
423
+ except Exception as e:
424
+ logger.error(f"Error in chunk processing: {e}")
425
+ await asyncio.sleep(1)
426
+
427
+ async def _monitor_for_auto_final(self):
428
+ """Monitor for auto-final conditions: 3 seconds silence after 3+ interim results"""
429
+ while self.running:
430
+ try:
431
+ current_time = time.time()
432
+
433
+ # Check if we should send auto-final transcription
434
+ if (self.interim_count >= self.min_interim_count and
435
+ self.last_interim_time is not None and
436
+ (current_time - self.last_interim_time) >= self.silence_timeout and
437
+ not self.final_sent and
438
+ self.accumulated_transcript.strip()):
439
+
440
+ logger.info(f"Auto-final triggered: {self.interim_count} interim results, "
441
+ f"{current_time - self.last_interim_time:.1f}s silence")
442
+
443
+ # Send the accumulated transcript as final
444
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
445
+ self.final_sent = True
446
+
447
+ # Reset counters for potential next utterance
448
+ self.interim_count = 0
449
+ self.last_interim_time = None
450
+ self.accumulated_transcript = ""
451
+
452
+ # Check every 0.5 seconds
453
+ await asyncio.sleep(0.5)
454
+
455
+ except Exception as e:
456
+ logger.error(f"Error in auto-final monitoring: {e}")
457
+ await asyncio.sleep(1)
458
+
459
+ # async def send_transcription(self, text, is_final=False, confidence=0.9):
460
+ # """Send transcription in jambonz format with Arabic number conversion"""
461
+ # try:
462
+ # # Convert Arabic numbers to digits before sending
463
+ # original_text = text
464
+ # converted_text = convert_arabic_numbers_in_sentence(text)
465
+
466
+ # # Log the conversion if numbers were found and converted
467
+ # if original_text != converted_text:
468
+ # logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
469
+
470
+ # message = {
471
+ # "type": "transcription",
472
+ # "is_final": is_final,
473
+ # "alternatives": [
474
+ # {
475
+ # "transcript": converted_text,
476
+ # "confidence": confidence
477
+ # }
478
+ # ],
479
+ # "language": self.config.get("language", "ar-EG"),
480
+ # "channel": 1
481
+ # }
482
+
483
+ # await self.websocket.send(json.dumps(message))
484
+ # logger.info(f"Sent {'FINAL' if is_final else 'interim'} transcription: '{converted_text}'")
485
+
486
+ # if is_final:
487
+ # self.final_sent = True
488
+
489
+ # except Exception as e:
490
+ # logger.error(f"Error sending transcription: {e}")
491
+
492
+
493
+
494
+ async def send_transcription(self, text, is_final=False, confidence=0.9):
495
+ """Send transcription in jambonz format with Arabic number conversion, only for final results"""
496
+ try:
497
+ if not is_final:
498
+ # Do nothing for interim results
499
+ logger.debug("Skipping interim transcription (not final).")
500
+ return
501
+
502
+ # Convert Arabic numbers only for final transcripts
503
+ original_text = text
504
+ converted_text = convert_arabic_numbers_in_sentence(text)
505
+
506
+ # Log the conversion if numbers were found and converted
507
+ if original_text != converted_text:
508
+ logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
509
+
510
+ message = {
511
+ "type": "transcription",
512
+ "is_final": True,
513
+ "alternatives": [
514
+ {
515
+ "transcript": converted_text,
516
+ "confidence": confidence
517
+ }
518
+ ],
519
+ "language": self.config.get("language", "ar-EG"),
520
+ "channel": 1
521
+ }
522
+
523
+ # Send only final messages
524
+ await self.websocket.send(json.dumps(message))
525
+ logger.info(f"Sent FINAL transcription: '{converted_text}'")
526
+
527
+ self.final_sent = True
528
+
529
+ except Exception as e:
530
+ logger.error(f"Error sending transcription: {e}")
531
+
532
+
533
+
534
+
535
+ async def send_error(self, error_message):
536
+ """Send error message in jambonz format"""
537
+ try:
538
+ message = {
539
+ "type": "error",
540
+ "error": error_message
541
+ }
542
+ await self.websocket.send(json.dumps(message))
543
+ logger.error(f"Sent error: {error_message}")
544
+ except Exception as e:
545
+ logger.error(f"Error sending error message: {e}")
546
+
547
+ async def handle_jambonz_websocket(websocket):
548
+ """Handle jambonz WebSocket connections"""
549
+
550
+ client_id = f"jambonz_{id(websocket)}"
551
+ logger.info(f"New jambonz connection: {client_id}")
552
+
553
+ handler = JambonzSTTHandler(websocket)
554
+
555
+ try:
556
+ async for message in websocket:
557
+ try:
558
+ if isinstance(message, str):
559
+ # Handle JSON control messages
560
+ data = json.loads(message)
561
+ message_type = data.get("type")
562
+
563
+ if message_type == "start":
564
+ logger.info(f"Received start message: {data}")
565
+ await handler.start_processing(data)
566
+
567
+ elif message_type == "stop":
568
+ logger.info("Received stop message")
569
+ await handler.stop_processing()
570
+ # Close websocket after final transcription
571
+ await websocket.close(code=1000, reason="Session completed")
572
+ break
573
+
574
+ else:
575
+ logger.warning(f"Unknown message type: {message_type}")
576
+ await handler.send_error(f"Unknown message type: {message_type}")
577
+
578
+ else:
579
+ # Handle binary audio data (LINEAR16 PCM)
580
+ if handler.audio_buffer is None:
581
+ await handler.send_error("Received audio before start message")
582
+ continue
583
+
584
+ await handler.add_audio_data(message)
585
+
586
+ except json.JSONDecodeError as e:
587
+ logger.error(f"JSON decode error: {e}")
588
+ await handler.send_error(f"Invalid JSON: {str(e)}")
589
+ except Exception as e:
590
+ logger.error(f"Error processing message: {e}")
591
+ await handler.send_error(f"Processing error: {str(e)}")
592
+
593
+ except websockets.exceptions.ConnectionClosed:
594
+ logger.info(f"jambonz connection closed: {client_id}")
595
+ except Exception as e:
596
+ logger.error(f"jambonz WebSocket error: {e}")
597
+ try:
598
+ await handler.send_error(str(e))
599
+ except:
600
+ pass
601
+ finally:
602
+ if handler.running:
603
+ await handler.stop_processing()
604
+ logger.info(f"jambonz connection ended: {client_id}")
605
+
606
+ async def main():
607
+ """Start the jambonz STT WebSocket server"""
608
+ logger.info("Starting Jambonz Custom STT WebSocket server on port 3006...")
609
+
610
+ # Start WebSocket server
611
+ server = await websockets.serve(
612
+ handle_jambonz_websocket,
613
+ "0.0.0.0",
614
+ 3006,
615
+ ping_interval=20,
616
+ ping_timeout=10,
617
+ close_timeout=10
618
+ )
619
+
620
+ logger.info("Jambonz Custom STT WebSocket server started on ws://0.0.0.0:3006")
621
+ logger.info("Ready to handle jambonz STT requests")
622
+ logger.info("- Expects LINEAR16 PCM audio at 8kHz")
623
+ logger.info("- Supports interim results with auto-final detection")
624
+ logger.info("- Auto-final: 3+ interim results + 1.3s silence")
625
+ logger.info("- Resamples to 16kHz for Whisper processing")
626
+ logger.info("- Converts Arabic numbers to digits before sending")
627
+
628
+ # Wait for the server to close
629
+ await server.wait_closed()
630
+
631
+ if __name__ == "__main__":
632
+ print("=" * 60)
633
+ print("Jambonz Custom STT Server with Whisper + Arabic Numbers")
634
+ print("=" * 60)
635
+ print(f"Model: {MODEL_NAME}")
636
+ print(f"Device: {device}")
637
+ print("WebSocket Port: 3006")
638
+ print("Protocol: jambonz STT API")
639
+ print("Audio Format: LINEAR16 PCM @ 8kHz")
640
+ print("Auto-Final: 2+ speech activities + 1.3s silence")
641
+ print("Arabic Numbers: Converted to digits in FINAL transcriptions only")
642
+ print("Interim Results: DISABLED (final transcription only)")
643
+ if arabic_numbers_available:
644
+ print("✓ pyarabic library available for number conversion")
645
+ else:
646
+ print("✗ pyarabic library not available - install with: pip install pyarabic")
647
+ print("=" * 60)
648
+
649
+ try:
650
+ asyncio.run(main())
651
+ except KeyboardInterrupt:
652
+ print("\nShutting down server...")
653
+ except Exception as e:
654
+ print(f"Server error: {e}")
aqib-whipser_ft-arabic_denoiser_meta.py ADDED
@@ -0,0 +1,787 @@
1
+ # import torch
2
+ # import asyncio
3
+ # import websockets
4
+ # import json
5
+ # import threading
6
+ # import numpy as np
7
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline , WhisperForConditionalGeneration, WhisperProcessor
8
+ # import subprocess
9
+ # import logging
10
+ # import time
11
+ # from concurrent.futures import ThreadPoolExecutor
12
+ # import struct
13
+ # import re
14
+ # 3 - 10 - 2025
15
+ import torch
16
+ import asyncio
17
+ import websockets
18
+ import json
19
+ import threading
20
+ import numpy as np
21
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer
22
+ import subprocess
23
+ import logging
24
+ import time
25
+ from concurrent.futures import ThreadPoolExecutor
26
+ import re
27
+
28
+ # --- Denoiser added ---
29
+ try:
30
+ import noisereduce as nr
31
+ denoiser_available = True
32
+ print("Denoiser available (using noisereduce)")
33
+ except ImportError:
34
+ denoiser_available = False
35
+ print("noisereduce not available - install with: pip install noisereduce")
36
+ ##############################################################################################
37
+ # Arabic number conversion imports
38
+ try:
39
+ from pyarabic.number import text2number
40
+ arabic_numbers_available = True
41
+ print("Arabic number conversion available")
42
+ except ImportError:
43
+ arabic_numbers_available = False
44
+ print("pyarabic not available - install with: pip install pyarabic")
45
+ print("Arabic numbers will not be converted to digits")
46
+
47
+ # Set up logging
48
+ logging.basicConfig(level=logging.INFO)
49
+ logger = logging.getLogger(__name__)
50
+ # 3 - 10 - 2025
51
+ # def denoise_audio(audio_data, sample_rate=16000):
52
+ # """Apply noise reduction to audio using noisereduce."""
53
+ # if not denoiser_available or len(audio_data) == 0:
54
+ # return audio_data
55
+ # try:
56
+ # reduced = nr.reduce_noise(y=audio_data, sr=sample_rate)
57
+ # return reduced.astype(np.float32)
58
+ # except Exception as e:
59
+ # logger.warning(f"Denoiser failed: {e}")
60
+ # return audio_data
61
+ #############################################################################################
62
+ def convert_arabic_numbers_in_sentence(sentence: str) -> str:
63
+ """
64
+ Replace Arabic number words in a sentence with digits,
65
+ preserving all other words and punctuation.
66
+ Handles common spelling variants and zero explicitly.
67
+ """
68
+ try:
69
+ print("Fxn called--------------")
70
+
71
+ # --- Normalization step ---
72
+ replacements = {
73
+ "اربعة": "أربعة",
74
+ "اربع": "أربع",
75
+ "اثنين": "اثنان",
76
+ "اتنين": "اثنان", # Egyptian variant
77
+ "ثلاث": "ثلاثة",
78
+ "خمس": "خمسة",
79
+ "ست": "ستة",
80
+ "سبع": "سبعة",
81
+ "ثمان": "ثمانية",
82
+ "تسع": "تسعة",
83
+ "عشر": "عشرة",
84
+ }
85
+ for wrong, correct in replacements.items():
86
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
87
+
88
+ # --- Split by whitespace but keep spaces ---
89
+ words = re.split(r'(\s+)', sentence)
90
+ converted_words = []
91
+
92
+ for word in words:
93
+ stripped = word.strip()
94
+ if not stripped: # skip spaces
95
+ converted_words.append(word)
96
+ continue
97
+
98
+ try:
99
+ num = text2number(stripped)
100
+
101
+ # Accept valid numbers, including zero explicitly
102
+ if isinstance(num, int):
103
+ if num != 0 or stripped == "صفر":
104
+ converted_words.append(str(num))
105
+ else:
106
+ converted_words.append(word)
107
+ else:
108
+ converted_words.append(word)
109
+
110
+ except Exception:
111
+ converted_words.append(word)
112
+
113
+ return ''.join(converted_words)
114
+
115
+ except Exception as e:
116
+ logger.warning(f"Error converting Arabic numbers: {e}")
117
+ return sentence
118
+
119
+
120
+ # Try to install flash-attn if not available
121
+ try:
122
+ import flash_attn
123
+ use_flash_attn = True
124
+ except ImportError:
125
+ print("Flash attention not available, using standard attention")
126
+ use_flash_attn = False
127
+ try:
128
+ subprocess.run(
129
+ "pip install websockets",
130
+ shell=True,
131
+ check=False
132
+ )
133
+ subprocess.run(
134
+ "pip install flash-attn --no-build-isolation",
135
+ shell=True,
136
+ check=False
137
+ )
138
+ except:
139
+ pass
140
+
141
+ device = "cuda" if torch.cuda.is_available() else "cpu"
142
+ # --- Facebook Denoiser added ---
143
+ try:
144
+ import torchaudio
145
+ from denoiser import pretrained
146
+ # Load DNS64 pretrained model (auto-downloads if not cached)
147
+ denoiser_model = pretrained.dns64().to(device)
148
+ denoiser_model.eval()
149
+ denoiser_available = True
150
+ print("facebook/denoiser loaded successfully")
151
+ except ImportError as e:
152
+ denoiser_available = False
153
+ print("facebook/denoiser not available - install with: pip install denoiser torchaudio")
154
+ denoiser_model = None
155
+
156
+
157
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
158
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"#"openai/whisper-large-v3-turbo"
159
+
160
+ print(f"Using device: {device}")
161
+ print(f"CUDA available: {torch.cuda.is_available()}")
162
+ if torch.cuda.is_available():
163
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
164
+
165
+ # Model initialization with fallback for attention implementation
166
+ try:
167
+ if use_flash_attn and torch.cuda.is_available():
168
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
169
+ MODEL_NAME,
170
+ torch_dtype=torch_dtype,
171
+ low_cpu_mem_usage=True,
172
+ use_safetensors=True,
173
+ attn_implementation="flash_attention_2"
174
+ )
175
+ else:
176
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
177
+ MODEL_NAME,
178
+ torch_dtype=torch_dtype,
179
+ low_cpu_mem_usage=True,
180
+ use_safetensors=True
181
+ )
182
+ except Exception as e:
183
+ print(f"Error loading model with flash attention: {e}")
184
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
185
+ MODEL_NAME,
186
+ torch_dtype=torch_dtype,
187
+ low_cpu_mem_usage=True,
188
+ use_safetensors=True
189
+ )
190
+
191
+ model.to(device)
192
+
193
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
194
+ tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
195
+ def denoise_audio(audio_data, sample_rate=16000):
196
+ """Apply denoising using facebook/denoiser pretrained model."""
197
+ if denoiser_model is None or len(audio_data) == 0:
198
+ return audio_data
199
+ try:
200
+ audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device=device).unsqueeze(0)
201
+ with torch.no_grad():
202
+ denoised_tensor = denoiser_model(audio_tensor)[0]  # Demucs forward takes only the waveform (no sample_rate kwarg); DNS64 expects 16 kHz input
203
+ return denoised_tensor.squeeze().cpu().numpy().astype("float32")
204
+ except Exception as e:
205
+ print(f"[WARN] Denoiser failed: {e}")
206
+ return audio_data
207
+ # def denoise_audio(audio_data, sample_rate=16000):
208
+ # """Apply denoising using facebook/denoiser pretrained model."""
209
+ # if not denoiser_available or denoiser_model is None or len(audio_data) == 0:
210
+ # return audio_data
211
+ # try:
212
+ # # Convert numpy -> torch tensor
213
+ # audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device=device).unsqueeze(0)
214
+ # with torch.no_grad():
215
+ # denoised_tensor = denoiser_model(audio_tensor)[0]
216
+ # # Back to numpy
217
+ # denoised_audio = denoised_tensor.squeeze().cpu().numpy().astype(np.float32)
218
+ # return denoised_audio
219
+ # except Exception as e:
220
+ # logger.warning(f"Denoiser failed: {e}")
221
+ # return audio_data
222
+ # Thread pool for processing audio
223
+ executor = ThreadPoolExecutor(max_workers=4)
224
+
225
+ class JambonzAudioBuffer:
226
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
227
+ self.sample_rate = sample_rate
228
+ self.chunk_duration = chunk_duration
229
+ self.chunk_samples = int(chunk_duration * sample_rate)
230
+
231
+ self.buffer = np.array([], dtype=np.float32)
232
+ self.lock = threading.Lock()
233
+ self.total_audio = np.array([], dtype=np.float32)
234
+
235
+ # Voice Activity Detection (simple energy-based)
236
+ self.silence_threshold = 0.01
237
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
238
+
239
+ def add_audio(self, audio_data):
240
+ with self.lock:
241
+ self.buffer = np.concatenate([self.buffer, audio_data])
242
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
243
+
244
+ def has_chunk_ready(self):
245
+ with self.lock:
246
+ return len(self.buffer) >= self.chunk_samples
247
+
248
+ def is_speech(self, audio_chunk):
249
+ """Simple VAD based on energy"""
250
+ if len(audio_chunk) < self.min_speech_samples:
251
+ return False
252
+ energy = np.mean(np.abs(audio_chunk))
253
+ return energy > self.silence_threshold
254
+
255
+ def get_chunk_for_processing(self):
256
+ """Get audio chunk for processing - but don't remove it from buffer for interim results"""
257
+ with self.lock:
258
+ if len(self.buffer) < self.chunk_samples:
259
+ return None
260
+
261
+ # For interim results, we want to trigger processing but keep accumulating audio
262
+ # So we just return a signal that we have enough audio, but don't consume it
263
+ return np.array([1]) # Return a dummy array to signal chunk is ready
264
+
265
+ def get_all_audio(self):
266
+ """Get all accumulated audio for final transcription"""
267
+ with self.lock:
268
+ return self.total_audio.copy()
269
+
270
+ def clear(self):
271
+ with self.lock:
272
+ self.buffer = np.array([], dtype=np.float32)
273
+ self.total_audio = np.array([], dtype=np.float32)
274
+
275
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
276
+ """Convert LINEAR16 PCM bytes to numpy array (jambonz format)"""
277
+ try:
278
+ # jambonz sends LINEAR16 PCM at 8kHz
279
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
280
+ # Convert to float32 and normalize
281
+ audio_array = audio_array.astype(np.float32) / 32768.0
282
+ return audio_array
283
+ except Exception as e:
284
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
285
+ return np.array([], dtype=np.float32)
286
+
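# Quick round-trip check for linear16_to_audio (a sketch; values are
# illustrative). Int16 full scale maps to roughly [-1.0, 1.0):
#
#   pcm = (np.array([0.0, 0.5, -0.5], dtype=np.float32) * 32767).astype(np.int16).tobytes()
#   print(linear16_to_audio(pcm))   # approximately [0.0, 0.5, -0.5]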
287
+ def resample_audio(audio_data, source_rate, target_rate):
288
+ """Simple resampling from 8kHz to 16kHz for Whisper"""
289
+ if source_rate == target_rate:
290
+ return audio_data
291
+
292
+ # Simple linear interpolation resampling
293
+ ratio = target_rate / source_rate
294
+ indices = np.arange(0, len(audio_data), 1/ratio)
295
+ indices = indices[indices < len(audio_data)]
296
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
297
+
298
+ # Ensure proper float32 dtype for consistency
299
+ return resampled.astype(np.float32)
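# Example of the resampler above on a 1 s test tone (a sketch). Note that
# linear interpolation is a rough upsampler; if scipy is acceptable as a
# dependency, scipy.signal.resample_poly(tone, 2, 1) would give a properly
# anti-aliased 8 kHz -> 16 kHz conversion.
#
#   tone = np.sin(2 * np.pi * 440 * np.arange(8000) / 8000).astype(np.float32)
#   up = resample_audio(tone, 8000, 16000)
#   print(len(tone), len(up))   # 8000 -> ~16000 samples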
300
+ def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
301
+ """Transcribe audio chunk using model's generate method directly"""
302
+ try:
303
+ if len(audio_data) == 0:
304
+ return ""
305
+
306
+ # Resample from 8kHz to 16kHz for Whisper
307
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
308
+
309
+ # --- Denoiser added ---
310
+ resampled_audio = denoise_audio(resampled_audio, sample_rate=target_sample_rate)
311
+
312
+ # Ensure minimum length for Whisper
313
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
314
+ if len(resampled_audio) < min_samples:
315
+ return ""
316
+
317
+ start_time = time.time()
318
+
319
+ # Prepare input features
320
+ input_features = processor(
321
+ resampled_audio,
322
+ sampling_rate=target_sample_rate,
323
+ return_tensors="pt"
324
+ ).input_features
325
+
326
+ input_features = input_features.to(device=device, dtype=torch_dtype)
327
+
328
+ attention_mask = torch.ones(
329
+ input_features.shape[:-1],
330
+ dtype=torch.long,
331
+ device=device
332
+ )
333
+
334
+ with torch.no_grad():
335
+ predicted_ids = model.generate(
336
+ input_features,
337
+ attention_mask=attention_mask,
338
+ max_new_tokens=128,
339
+ do_sample=False,
340
+ temperature=0.0,
341
+ num_beams=1,
342
+ language="ar",
343
+ task="transcribe",
344
+ pad_token_id=tokenizer.pad_token_id,
345
+ eos_token_id=tokenizer.eos_token_id
346
+ )
347
+
348
+ transcription = tokenizer.batch_decode(
349
+ predicted_ids,
350
+ skip_special_tokens=True
351
+ )[0].strip()
352
+
353
+ end_time = time.time()
354
+
355
+ logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
356
+ return transcription
357
+
358
+ except Exception as e:
359
+ logger.error(f"Error during direct transcription: {e}")
360
+ return ""
361
+ # def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
362
+ # """Transcribe audio chunk using model's generate method directly"""
363
+ # try:
364
+ # if len(audio_data) == 0:
365
+ # return ""
366
+
367
+ # # Resample from 8kHz to 16kHz for Whisper
368
+ # resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
369
+
370
+ # # Ensure minimum length for Whisper
371
+ # min_samples = int(0.1 * target_sample_rate) # 100ms minimum
372
+ # if len(resampled_audio) < min_samples:
373
+ # return ""
374
+
375
+ # start_time = time.time()
376
+
377
+ # # Prepare input features with proper dtype
378
+ # input_features = processor(
379
+ # resampled_audio,
380
+ # sampling_rate=target_sample_rate,
381
+ # return_tensors="pt"
382
+ # ).input_features
383
+
384
+ # # Ensure correct dtype and device
385
+ # input_features = input_features.to(device=device, dtype=torch_dtype)
386
+
387
+ # # Create attention mask to avoid warnings
388
+ # attention_mask = torch.ones(
389
+ # input_features.shape[:-1],
390
+ # dtype=torch.long,
391
+ # device=device
392
+ # )
393
+
394
+ # # Generate transcription using model directly
395
+ # with torch.no_grad():
396
+ # predicted_ids = model.generate(
397
+ # input_features,
398
+ # attention_mask=attention_mask,
399
+ # max_new_tokens=128,
400
+ # do_sample=False,
401
+ # temperature=0.0,
402
+ # num_beams=1,
403
+ # language="ar",
404
+ # task="transcribe",
405
+ # pad_token_id=tokenizer.pad_token_id,
406
+ # eos_token_id=tokenizer.eos_token_id
407
+ # )
408
+
409
+ # # Decode the transcription
410
+ # transcription = tokenizer.batch_decode(
411
+ # predicted_ids,
412
+ # skip_special_tokens=True
413
+ # )[0].strip()
414
+
415
+ # end_time = time.time()
416
+
417
+ # logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
418
+ # return transcription
419
+
420
+ # except Exception as e:
421
+ # logger.error(f"Error during direct transcription: {e}")
422
+ # return ""
423
+
424
+ class JambonzSTTHandler:
425
+ def __init__(self, websocket):
426
+ self.websocket = websocket
427
+ self.audio_buffer = None
428
+ self.config = {}
429
+ self.running = True
430
+ self.transcription_task = None
431
+ self.full_transcript = ""
432
+ self.last_partial = ""
433
+
434
+ # Auto-final detection variables
435
+ self.interim_count = 0
436
+ self.last_interim_time = None
437
+ self.silence_timeout = 1.5 # 1.5 seconds of silence to trigger final
438
+ self.min_interim_count = 1 # Minimum interim results before considering final
439
+ self.auto_final_task = None
440
+ self.accumulated_transcript = ""
441
+ self.final_sent = False
442
+
443
+ async def start_processing(self, start_message):
444
+ """Initialize with start message from jambonz"""
445
+ self.config = {
446
+ "language": start_message.get("language", "ar-EG"),
447
+ "format": start_message.get("format", "raw"),
448
+ "encoding": start_message.get("encoding", "LINEAR16"),
449
+ "sample_rate": start_message.get("sampleRateHz", 8000),
450
+ "interim_results": start_message.get("interimResults", True),
451
+ "options": start_message.get("options", {})
452
+ }
453
+
454
+ logger.info(f"STT session started with config: {self.config}")
455
+
456
+ # Initialize audio buffer
457
+ self.audio_buffer = JambonzAudioBuffer(
458
+ sample_rate=self.config["sample_rate"],
459
+ chunk_duration=1.0 # Process every 1 second
460
+ )
461
+
462
+ # Reset auto-final detection variables
463
+ self.interim_count = 0
464
+ self.last_interim_time = None
465
+ self.accumulated_transcript = ""
466
+ self.final_sent = False
467
+
468
+ # Start background transcription task
469
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
470
+
471
+ # Start auto-final detection task
472
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
473
+
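# Shape of the jambonz "start" control message this handler expects (field
# names follow the parsing above; the values shown are illustrative):
#
#   {
#     "type": "start",
#     "language": "ar-EG",
#     "format": "raw",
#     "encoding": "LINEAR16",
#     "sampleRateHz": 8000,
#     "interimResults": true,
#     "options": {}
#   }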
474
+ async def stop_processing(self):
475
+ """Stop processing and send final transcription"""
476
+ self.running = False
477
+
478
+ # Cancel background tasks
479
+ if self.transcription_task:
480
+ self.transcription_task.cancel()
481
+ try:
482
+ await self.transcription_task
483
+ except asyncio.CancelledError:
484
+ pass
485
+
486
+ if self.auto_final_task:
487
+ self.auto_final_task.cancel()
488
+ try:
489
+ await self.auto_final_task
490
+ except asyncio.CancelledError:
491
+ pass
492
+
493
+ # Send final transcription if not already sent
494
+ if not self.final_sent and self.accumulated_transcript.strip():
495
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
496
+
497
+ # Also process any remaining audio for comprehensive final transcription
498
+ if self.audio_buffer:
499
+ all_audio = self.audio_buffer.get_all_audio()
500
+ if len(all_audio) > 0 and not self.final_sent:
501
+ loop = asyncio.get_event_loop()
502
+ final_transcription = await loop.run_in_executor(
503
+ executor,
504
+ transcribe_chunk_direct,
505
+ all_audio,
506
+ self.config["sample_rate"]
507
+ )
508
+
509
+ if final_transcription.strip():
510
+ # Send comprehensive final transcription
511
+ await self.send_transcription(final_transcription, is_final=True)
512
+
513
+ logger.info("STT session ended")
514
+
515
+ async def add_audio_data(self, audio_bytes):
516
+ """Add audio data to buffer"""
517
+ if self.audio_buffer:
518
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
519
+ self.audio_buffer.add_audio(audio_data)
520
+
521
+ async def _process_audio_chunks(self):
522
+ """Process audio chunks for interim results"""
523
+ while self.running and self.config.get("interim_results", False):
524
+ try:
525
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
526
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
527
+ if chunk_signal is not None:
528
+ # Get all accumulated audio so far for complete transcription
529
+ all_audio = self.audio_buffer.get_all_audio()
530
+
531
+ # Only process if we have actual speech content
532
+ if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
533
+ # Run transcription on all accumulated audio
534
+ loop = asyncio.get_event_loop()
535
+ transcription = await loop.run_in_executor(
536
+ executor,
537
+ transcribe_chunk_direct,
538
+ all_audio,
539
+ self.config["sample_rate"]
540
+ )
541
+
542
+ if transcription.strip() and transcription != self.last_partial:
543
+ self.last_partial = transcription
544
+ self.accumulated_transcript = transcription # Update accumulated transcript
545
+ self.interim_count += 1
546
+ self.last_interim_time = time.time()
547
+
548
+ # Send interim result
549
+ await self.send_transcription(transcription, is_final=False)
550
+
551
+ logger.info(f"Interim #{self.interim_count}: '{transcription}'")
552
+
553
+ # Small delay to prevent excessive processing
554
+ await asyncio.sleep(0.1)
555
+
556
+ except Exception as e:
557
+ logger.error(f"Error in chunk processing: {e}")
558
+ await asyncio.sleep(1)
559
+
560
+ async def _monitor_for_auto_final(self):
561
+ """Monitor for auto-final conditions: 3 seconds silence after 3+ interim results"""
562
+ while self.running:
563
+ try:
564
+ current_time = time.time()
565
+
566
+ # Check if we should send auto-final transcription
567
+ if (self.interim_count >= self.min_interim_count and
568
+ self.last_interim_time is not None and
569
+ (current_time - self.last_interim_time) >= self.silence_timeout and
570
+ not self.final_sent and
571
+ self.accumulated_transcript.strip()):
572
+
573
+ logger.info(f"Auto-final triggered: {self.interim_count} interim results, "
574
+ f"{current_time - self.last_interim_time:.1f}s silence")
575
+
576
+ # Send the accumulated transcript as final
577
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
578
+ self.final_sent = True
579
+
580
+ # Reset counters for potential next utterance
581
+ self.interim_count = 0
582
+ self.last_interim_time = None
583
+ self.accumulated_transcript = ""
584
+
585
+ # Check every 0.5 seconds
586
+ await asyncio.sleep(0.5)
587
+
588
+ except Exception as e:
589
+ logger.error(f"Error in auto-final monitoring: {e}")
590
+ await asyncio.sleep(1)
591
+
592
+ # async def send_transcription(self, text, is_final=False, confidence=0.9):
593
+ # """Send transcription in jambonz format with Arabic number conversion"""
594
+ # try:
595
+ # # Convert Arabic numbers to digits before sending
596
+ # original_text = text
597
+ # converted_text = convert_arabic_numbers_in_sentence(text)
598
+
599
+ # # Log the conversion if numbers were found and converted
600
+ # if original_text != converted_text:
601
+ # logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
602
+
603
+ # message = {
604
+ # "type": "transcription",
605
+ # "is_final": is_final,
606
+ # "alternatives": [
607
+ # {
608
+ # "transcript": converted_text,
609
+ # "confidence": confidence
610
+ # }
611
+ # ],
612
+ # "language": self.config.get("language", "ar-EG"),
613
+ # "channel": 1
614
+ # }
615
+
616
+ # await self.websocket.send(json.dumps(message))
617
+ # logger.info(f"Sent {'FINAL' if is_final else 'interim'} transcription: '{converted_text}'")
618
+
619
+ # if is_final:
620
+ # self.final_sent = True
621
+
622
+ # except Exception as e:
623
+ # logger.error(f"Error sending transcription: {e}")
624
+
625
+
626
+
627
+ async def send_transcription(self, text, is_final=False, confidence=0.9):
628
+ """Send transcription in jambonz format with Arabic number conversion, only for final results"""
629
+ try:
630
+ if not is_final:
631
+ # Do nothing for interim results
632
+ logger.debug("Skipping interim transcription (not final).")
633
+ return
634
+
635
+ # Convert Arabic numbers only for final transcripts
636
+ original_text = text
637
+ converted_text = convert_arabic_numbers_in_sentence(text)
638
+
639
+ # Log the conversion if numbers were found and converted
640
+ if original_text != converted_text:
641
+ logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
642
+
643
+ message = {
644
+ "type": "transcription",
645
+ "is_final": True,
646
+ "alternatives": [
647
+ {
648
+ "transcript": original_text,#converted_text,
649
+ "confidence": confidence
650
+ }
651
+ ],
652
+ "language": self.config.get("language", "ar-EG"),
653
+ "channel": 1
654
+ }
655
+
656
+ # Send only final messages
657
+ await self.websocket.send(json.dumps(message))
658
+ logger.info(f"Sent FINAL transcription: '{converted_text}'")
659
+
660
+ self.final_sent = True
661
+
662
+ except Exception as e:
663
+ logger.error(f"Error sending transcription: {e}")
664
+
665
+
666
+
667
+
668
+ async def send_error(self, error_message):
669
+ """Send error message in jambonz format"""
670
+ try:
671
+ message = {
672
+ "type": "error",
673
+ "error": error_message
674
+ }
675
+ await self.websocket.send(json.dumps(message))
676
+ logger.error(f"Sent error: {error_message}")
677
+ except Exception as e:
678
+ logger.error(f"Error sending error message: {e}")
679
+
680
+ async def handle_jambonz_websocket(websocket):
681
+ """Handle jambonz WebSocket connections"""
682
+
683
+ client_id = f"jambonz_{id(websocket)}"
684
+ logger.info(f"New jambonz connection: {client_id}")
685
+
686
+ handler = JambonzSTTHandler(websocket)
687
+
688
+ try:
689
+ async for message in websocket:
690
+ try:
691
+ if isinstance(message, str):
692
+ # Handle JSON control messages
693
+ data = json.loads(message)
694
+ message_type = data.get("type")
695
+
696
+ if message_type == "start":
697
+ logger.info(f"Received start message: {data}")
698
+ await handler.start_processing(data)
699
+
700
+ elif message_type == "stop":
701
+ logger.info("Received stop message")
702
+ await handler.stop_processing()
703
+ # Close websocket after final transcription
704
+ await websocket.close(code=1000, reason="Session completed")
705
+ break
706
+
707
+ else:
708
+ logger.warning(f"Unknown message type: {message_type}")
709
+ await handler.send_error(f"Unknown message type: {message_type}")
710
+
711
+ else:
712
+ # Handle binary audio data (LINEAR16 PCM)
713
+ if handler.audio_buffer is None:
714
+ await handler.send_error("Received audio before start message")
715
+ continue
716
+
717
+ await handler.add_audio_data(message)
718
+
719
+ except json.JSONDecodeError as e:
720
+ logger.error(f"JSON decode error: {e}")
721
+ await handler.send_error(f"Invalid JSON: {str(e)}")
722
+ except Exception as e:
723
+ logger.error(f"Error processing message: {e}")
724
+ await handler.send_error(f"Processing error: {str(e)}")
725
+
726
+ except websockets.exceptions.ConnectionClosed:
727
+ logger.info(f"jambonz connection closed: {client_id}")
728
+ except Exception as e:
729
+ logger.error(f"jambonz WebSocket error: {e}")
730
+ try:
731
+ await handler.send_error(str(e))
732
+ except Exception:
733
+ pass
734
+ finally:
735
+ if handler.running:
736
+ await handler.stop_processing()
737
+ logger.info(f"jambonz connection ended: {client_id}")
738
+
739
+ async def main():
740
+ """Start the jambonz STT WebSocket server"""
741
+ logger.info("Starting Jambonz Custom STT WebSocket server on port 3006...")
742
+
743
+ # Start WebSocket server
744
+ server = await websockets.serve(
745
+ handle_jambonz_websocket,
746
+ "0.0.0.0",
747
+ 3006,
748
+ ping_interval=20,
749
+ ping_timeout=10,
750
+ close_timeout=10
751
+ )
752
+
753
+ logger.info("Jambonz Custom STT WebSocket server started on ws://0.0.0.0:3006")
754
+ logger.info("Ready to handle jambonz STT requests")
755
+ logger.info("- Expects LINEAR16 PCM audio at 8kHz")
756
+ logger.info("- Supports interim results with auto-final detection")
757
+ logger.info("- Auto-final: 3+ interim results + 1.3s silence")
758
+ logger.info("- Resamples to 16kHz for Whisper processing")
759
+ logger.info("- Converts Arabic numbers to digits before sending")
760
+
761
+ # Wait for the server to close
762
+ await server.wait_closed()
763
+
764
+ if __name__ == "__main__":
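# A minimal test-client sketch for this server (run separately; endpoint and
# timings are illustrative). It mimics the jambonz flow: a JSON "start"
# message, raw LINEAR16 frames as binary, then a JSON "stop":
#
#   import asyncio, json
#   import numpy as np
#   import websockets
#
#   async def demo():
#       async with websockets.connect("ws://localhost:3006") as ws:
#           await ws.send(json.dumps({"type": "start", "language": "ar-EG",
#                                     "sampleRateHz": 8000, "interimResults": True}))
#           pcm = (np.random.randn(8000) * 1000).astype(np.int16).tobytes()
#           await ws.send(pcm)                  # 1 s of 8 kHz LINEAR16 audio
#           await asyncio.sleep(3)              # give auto-final a chance to fire
#           await ws.send(json.dumps({"type": "stop"}))
#           print(await ws.recv())              # transcription JSON from the server
#
#   asyncio.run(demo())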
765
+ print("=" * 60)
766
+ print("Jambonz Custom STT Server with Whisper + Arabic Numbers")
767
+ print("=" * 60)
768
+ print(f"Model: {MODEL_NAME}")
769
+ print(f"Device: {device}")
770
+ print("WebSocket Port: 3006")
771
+ print("Protocol: jambonz STT API")
772
+ print("Audio Format: LINEAR16 PCM @ 8kHz")
773
+ print("Auto-Final: 2+ speech activities + 1.3s silence")
774
+ print("Arabic Numbers: Converted to digits in FINAL transcriptions only")
775
+ print("Interim Results: DISABLED (final transcription only)")
776
+ if arabic_numbers_available:
777
+ print("✓ pyarabic library available for number conversion")
778
+ else:
779
+ print("✗ pyarabic library not available - install with: pip install pyarabic")
780
+ print("=" * 60)
781
+
782
+ try:
783
+ asyncio.run(main())
784
+ except KeyboardInterrupt:
785
+ print("\nShutting down server...")
786
+ except Exception as e:
787
+ print(f"Server error: {e}")
aqib-whipser_ft-arabic_noise_reducer.py ADDED
@@ -0,0 +1,746 @@
1
+ # import torch
2
+ # import asyncio
3
+ # import websockets
4
+ # import json
5
+ # import threading
6
+ # import numpy as np
7
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline , WhisperForConditionalGeneration, WhisperProcessor
8
+ # import subprocess
9
+ # import logging
10
+ # import time
11
+ # from concurrent.futures import ThreadPoolExecutor
12
+ # import struct
13
+ # import re
14
+ # 3 - 10 - 2025
15
+ import torch
16
+ import asyncio
17
+ import websockets
18
+ import json
19
+ import threading
20
+ import numpy as np
21
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer
22
+ import subprocess
23
+ import logging
24
+ import time
25
+ from concurrent.futures import ThreadPoolExecutor
26
+ import re
27
+
28
+ # --- Denoiser added ---
29
+ try:
30
+ import noisereduce as nr
31
+ denoiser_available = True
32
+ print("Denoiser available (using noisereduce)")
33
+ except ImportError:
34
+ denoiser_available = False
35
+ print("noisereduce not available - install with: pip install noisereduce")
36
+ ##############################################################################################
37
+ # Arabic number conversion imports
38
+ try:
39
+ from pyarabic.number import text2number
40
+ arabic_numbers_available = True
41
+ print("Arabic number conversion available")
42
+ except ImportError:
43
+ arabic_numbers_available = False
44
+ print("pyarabic not available - install with: pip install pyarabic")
45
+ print("Arabic numbers will not be converted to digits")
46
+
47
+ # Set up logging
48
+ logging.basicConfig(level=logging.INFO)
49
+ logger = logging.getLogger(__name__)
50
+ # 3 - 10 - 2025
51
+ def denoise_audio(audio_data, sample_rate=16000):
52
+ """Apply noise reduction to audio using noisereduce."""
53
+ if not denoiser_available or len(audio_data) == 0:
54
+ return audio_data
55
+ try:
56
+ reduced = nr.reduce_noise(y=audio_data, sr=sample_rate)
57
+ return reduced.astype(np.float32)
58
+ except Exception as e:
59
+ logger.warning(f"Denoiser failed: {e}")
60
+ return audio_data
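# Standalone sketch of the noisereduce call above (assumes noisereduce is
# installed; the test signal is illustrative):
#
#   import numpy as np
#   import noisereduce as nr
#
#   t = np.arange(16000) / 16000.0
#   noisy = (np.sin(2 * np.pi * 440 * t) + 0.05 * np.random.randn(16000)).astype(np.float32)
#   clean = nr.reduce_noise(y=noisy, sr=16000)
#   print(clean.shape)   # (16000,)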
61
+ #############################################################################################
62
+ def convert_arabic_numbers_in_sentence(sentence: str) -> str:
63
+ """
64
+ Replace Arabic number words in a sentence with digits,
65
+ preserving all other words and punctuation.
66
+ Handles common spelling variants and zero explicitly.
67
+ """
68
+ try:
69
+ print("Fxn called--------------")
70
+
71
+ # --- Normalization step ---
72
+ replacements = {
73
+ "اربعة": "أربعة",
74
+ "اربع": "أربع",
75
+ "اثنين": "اثنان",
76
+ "اتنين": "اثنان", # Egyptian variant
77
+ "ثلاث": "ثلاثة",
78
+ "خمس": "خمسة",
79
+ "ست": "ستة",
80
+ "سبع": "سبعة",
81
+ "ثمان": "ثمانية",
82
+ "تسع": "تسعة",
83
+ "عشر": "عشرة",
84
+ }
85
+ for wrong, correct in replacements.items():
86
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
87
+
88
+ # --- Split by whitespace but keep spaces ---
89
+ words = re.split(r'(\s+)', sentence)
90
+ converted_words = []
91
+
92
+ for word in words:
93
+ stripped = word.strip()
94
+ if not stripped: # skip spaces
95
+ converted_words.append(word)
96
+ continue
97
+
98
+ try:
99
+ num = text2number(stripped)
100
+
101
+ # Accept valid numbers, including zero explicitly
102
+ if isinstance(num, int):
103
+ if num != 0 or stripped == "صفر":
104
+ converted_words.append(str(num))
105
+ else:
106
+ converted_words.append(word)
107
+ else:
108
+ converted_words.append(word)
109
+
110
+ except Exception:
111
+ converted_words.append(word)
112
+
113
+ return ''.join(converted_words)
114
+
115
+ except Exception as e:
116
+ logger.warning(f"Error converting Arabic numbers: {e}")
117
+ return sentence
118
+
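# Expected behaviour of the converter above (a sketch; the exact output depends
# on the installed pyarabic version):
#
#   >>> convert_arabic_numbers_in_sentence("عندي خمسة كتب و عشرة أقلام")
#   'عندي 5 كتب و 10 أقلام'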
119
+
120
+ # Try to install flash-attn if not available
121
+ try:
122
+ import flash_attn
123
+ use_flash_attn = True
124
+ except ImportError:
125
+ print("Flash attention not available, using standard attention")
126
+ use_flash_attn = False
127
+ try:
128
+ subprocess.run(
129
+ "pip install websockets",
130
+ shell=True,
131
+ check=False
132
+ )
133
+ subprocess.run(
134
+ "pip install flash-attn --no-build-isolation",
135
+ shell=True,
136
+ check=False
137
+ )
138
+ except Exception:
139
+ pass
140
+
141
+ device = "cuda" if torch.cuda.is_available() else "cpu"
142
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
143
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"#"openai/whisper-large-v3-turbo"
144
+
145
+ print(f"Using device: {device}")
146
+ print(f"CUDA available: {torch.cuda.is_available()}")
147
+ if torch.cuda.is_available():
148
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
149
+
150
+ # Model initialization with fallback for attention implementation
151
+ try:
152
+ if use_flash_attn and torch.cuda.is_available():
153
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
154
+ MODEL_NAME,
155
+ torch_dtype=torch_dtype,
156
+ low_cpu_mem_usage=True,
157
+ use_safetensors=True,
158
+ attn_implementation="flash_attention_2"
159
+ )
160
+ else:
161
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
162
+ MODEL_NAME,
163
+ torch_dtype=torch_dtype,
164
+ low_cpu_mem_usage=True,
165
+ use_safetensors=True
166
+ )
167
+ except Exception as e:
168
+ print(f"Error loading model with flash attention: {e}")
169
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
170
+ MODEL_NAME,
171
+ torch_dtype=torch_dtype,
172
+ low_cpu_mem_usage=True,
173
+ use_safetensors=True
174
+ )
175
+
176
+ model.to(device)
177
+
178
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
179
+ tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
180
+
181
+ # Thread pool for processing audio
182
+ executor = ThreadPoolExecutor(max_workers=4)
183
+
184
+ class JambonzAudioBuffer:
185
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
186
+ self.sample_rate = sample_rate
187
+ self.chunk_duration = chunk_duration
188
+ self.chunk_samples = int(chunk_duration * sample_rate)
189
+
190
+ self.buffer = np.array([], dtype=np.float32)
191
+ self.lock = threading.Lock()
192
+ self.total_audio = np.array([], dtype=np.float32)
193
+
194
+ # Voice Activity Detection (simple energy-based)
195
+ self.silence_threshold = 0.01
196
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
197
+
198
+ def add_audio(self, audio_data):
199
+ with self.lock:
200
+ self.buffer = np.concatenate([self.buffer, audio_data])
201
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
202
+
203
+ def has_chunk_ready(self):
204
+ with self.lock:
205
+ return len(self.buffer) >= self.chunk_samples
206
+
207
+ def is_speech(self, audio_chunk):
208
+ """Simple VAD based on energy"""
209
+ if len(audio_chunk) < self.min_speech_samples:
210
+ return False
211
+ energy = np.mean(np.abs(audio_chunk))
212
+ return energy > self.silence_threshold
213
+
214
+ def get_chunk_for_processing(self):
215
+ """Get audio chunk for processing - but don't remove it from buffer for interim results"""
216
+ with self.lock:
217
+ if len(self.buffer) < self.chunk_samples:
218
+ return None
219
+
220
+ # For interim results, we want to trigger processing but keep accumulating audio
221
+ # So we just return a signal that we have enough audio, but don't consume it
222
+ return np.array([1]) # Return a dummy array to signal chunk is ready
223
+
224
+ def get_all_audio(self):
225
+ """Get all accumulated audio for final transcription"""
226
+ with self.lock:
227
+ return self.total_audio.copy()
228
+
229
+ def clear(self):
230
+ with self.lock:
231
+ self.buffer = np.array([], dtype=np.float32)
232
+ self.total_audio = np.array([], dtype=np.float32)
233
+
234
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
235
+ """Convert LINEAR16 PCM bytes to numpy array (jambonz format)"""
236
+ try:
237
+ # jambonz sends LINEAR16 PCM at 8kHz
238
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
239
+ # Convert to float32 and normalize
240
+ audio_array = audio_array.astype(np.float32) / 32768.0
241
+ return audio_array
242
+ except Exception as e:
243
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
244
+ return np.array([], dtype=np.float32)
245
+
246
+ def resample_audio(audio_data, source_rate, target_rate):
247
+ """Simple resampling from 8kHz to 16kHz for Whisper"""
248
+ if source_rate == target_rate:
249
+ return audio_data
250
+
251
+ # Simple linear interpolation resampling
252
+ ratio = target_rate / source_rate
253
+ indices = np.arange(0, len(audio_data), 1/ratio)
254
+ indices = indices[indices < len(audio_data)]
255
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
256
+
257
+ # Ensure proper float32 dtype for consistency
258
+ return resampled.astype(np.float32)
259
+ def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
260
+ """Transcribe audio chunk using model's generate method directly"""
261
+ try:
262
+ if len(audio_data) == 0:
263
+ return ""
264
+
265
+ # Resample from 8kHz to 16kHz for Whisper
266
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
267
+
268
+ # --- Denoiser added ---
269
+ resampled_audio = denoise_audio(resampled_audio, sample_rate=target_sample_rate)
270
+
271
+ # Ensure minimum length for Whisper
272
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
273
+ if len(resampled_audio) < min_samples:
274
+ return ""
275
+
276
+ start_time = time.time()
277
+
278
+ # Prepare input features
279
+ input_features = processor(
280
+ resampled_audio,
281
+ sampling_rate=target_sample_rate,
282
+ return_tensors="pt"
283
+ ).input_features
284
+
285
+ input_features = input_features.to(device=device, dtype=torch_dtype)
286
+
287
+ attention_mask = torch.ones(
288
+ input_features.shape[:-1],
289
+ dtype=torch.long,
290
+ device=device
291
+ )
292
+
293
+ with torch.no_grad():
294
+ predicted_ids = model.generate(
295
+ input_features,
296
+ attention_mask=attention_mask,
297
+ max_new_tokens=128,
298
+ do_sample=False,
299
+ temperature=0.0,
300
+ num_beams=1,
301
+ language="ar",
302
+ task="transcribe",
303
+ pad_token_id=tokenizer.pad_token_id,
304
+ eos_token_id=tokenizer.eos_token_id
305
+ )
306
+
307
+ transcription = tokenizer.batch_decode(
308
+ predicted_ids,
309
+ skip_special_tokens=True
310
+ )[0].strip()
311
+
312
+ end_time = time.time()
313
+
314
+ logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
315
+ return transcription
316
+
317
+ except Exception as e:
318
+ logger.error(f"Error during direct transcription: {e}")
319
+ return ""
320
+ # def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
321
+ # """Transcribe audio chunk using model's generate method directly"""
322
+ # try:
323
+ # if len(audio_data) == 0:
324
+ # return ""
325
+
326
+ # # Resample from 8kHz to 16kHz for Whisper
327
+ # resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
328
+
329
+ # # Ensure minimum length for Whisper
330
+ # min_samples = int(0.1 * target_sample_rate) # 100ms minimum
331
+ # if len(resampled_audio) < min_samples:
332
+ # return ""
333
+
334
+ # start_time = time.time()
335
+
336
+ # # Prepare input features with proper dtype
337
+ # input_features = processor(
338
+ # resampled_audio,
339
+ # sampling_rate=target_sample_rate,
340
+ # return_tensors="pt"
341
+ # ).input_features
342
+
343
+ # # Ensure correct dtype and device
344
+ # input_features = input_features.to(device=device, dtype=torch_dtype)
345
+
346
+ # # Create attention mask to avoid warnings
347
+ # attention_mask = torch.ones(
348
+ # input_features.shape[:-1],
349
+ # dtype=torch.long,
350
+ # device=device
351
+ # )
352
+
353
+ # # Generate transcription using model directly
354
+ # with torch.no_grad():
355
+ # predicted_ids = model.generate(
356
+ # input_features,
357
+ # attention_mask=attention_mask,
358
+ # max_new_tokens=128,
359
+ # do_sample=False,
360
+ # temperature=0.0,
361
+ # num_beams=1,
362
+ # language="ar",
363
+ # task="transcribe",
364
+ # pad_token_id=tokenizer.pad_token_id,
365
+ # eos_token_id=tokenizer.eos_token_id
366
+ # )
367
+
368
+ # # Decode the transcription
369
+ # transcription = tokenizer.batch_decode(
370
+ # predicted_ids,
371
+ # skip_special_tokens=True
372
+ # )[0].strip()
373
+
374
+ # end_time = time.time()
375
+
376
+ # logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
377
+ # return transcription
378
+
379
+ # except Exception as e:
380
+ # logger.error(f"Error during direct transcription: {e}")
381
+ # return ""
382
+
383
+ class JambonzSTTHandler:
384
+ def __init__(self, websocket):
385
+ self.websocket = websocket
386
+ self.audio_buffer = None
387
+ self.config = {}
388
+ self.running = True
389
+ self.transcription_task = None
390
+ self.full_transcript = ""
391
+ self.last_partial = ""
392
+
393
+ # Auto-final detection variables
394
+ self.interim_count = 0
395
+ self.last_interim_time = None
396
+ self.silence_timeout = 1.5 # 1.5 seconds of silence to trigger final
397
+ self.min_interim_count = 1 # Minimum interim results before considering final
398
+ self.auto_final_task = None
399
+ self.accumulated_transcript = ""
400
+ self.final_sent = False
401
+
402
+ async def start_processing(self, start_message):
403
+ """Initialize with start message from jambonz"""
404
+ self.config = {
405
+ "language": start_message.get("language", "ar-EG"),
406
+ "format": start_message.get("format", "raw"),
407
+ "encoding": start_message.get("encoding", "LINEAR16"),
408
+ "sample_rate": start_message.get("sampleRateHz", 8000),
409
+ "interim_results": start_message.get("interimResults", True),
410
+ "options": start_message.get("options", {})
411
+ }
412
+
413
+ logger.info(f"STT session started with config: {self.config}")
414
+
415
+ # Initialize audio buffer
416
+ self.audio_buffer = JambonzAudioBuffer(
417
+ sample_rate=self.config["sample_rate"],
418
+ chunk_duration=1.0 # Process every 1 second
419
+ )
420
+
421
+ # Reset auto-final detection variables
422
+ self.interim_count = 0
423
+ self.last_interim_time = None
424
+ self.accumulated_transcript = ""
425
+ self.final_sent = False
426
+
427
+ # Start background transcription task
428
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
429
+
430
+ # Start auto-final detection task
431
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
432
+
433
+ async def stop_processing(self):
434
+ """Stop processing and send final transcription"""
435
+ self.running = False
436
+
437
+ # Cancel background tasks
438
+ if self.transcription_task:
439
+ self.transcription_task.cancel()
440
+ try:
441
+ await self.transcription_task
442
+ except asyncio.CancelledError:
443
+ pass
444
+
445
+ if self.auto_final_task:
446
+ self.auto_final_task.cancel()
447
+ try:
448
+ await self.auto_final_task
449
+ except asyncio.CancelledError:
450
+ pass
451
+
452
+ # Send final transcription if not already sent
453
+ if not self.final_sent and self.accumulated_transcript.strip():
454
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
455
+
456
+ # Also process any remaining audio for comprehensive final transcription
457
+ if self.audio_buffer:
458
+ all_audio = self.audio_buffer.get_all_audio()
459
+ if len(all_audio) > 0 and not self.final_sent:
460
+ loop = asyncio.get_event_loop()
461
+ final_transcription = await loop.run_in_executor(
462
+ executor,
463
+ transcribe_chunk_direct,
464
+ all_audio,
465
+ self.config["sample_rate"]
466
+ )
467
+
468
+ if final_transcription.strip():
469
+ # Send comprehensive final transcription
470
+ await self.send_transcription(final_transcription, is_final=True)
471
+
472
+ logger.info("STT session ended")
473
+
474
+ async def add_audio_data(self, audio_bytes):
475
+ """Add audio data to buffer"""
476
+ if self.audio_buffer:
477
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
478
+ self.audio_buffer.add_audio(audio_data)
479
+
480
+ async def _process_audio_chunks(self):
481
+ """Process audio chunks for interim results"""
482
+ while self.running and self.config.get("interim_results", False):
483
+ try:
484
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
485
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
486
+ if chunk_signal is not None:
487
+ # Get all accumulated audio so far for complete transcription
488
+ all_audio = self.audio_buffer.get_all_audio()
489
+
490
+ # Only process if we have actual speech content
491
+ if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
492
+ # Run transcription on all accumulated audio
493
+ loop = asyncio.get_event_loop()
494
+ transcription = await loop.run_in_executor(
495
+ executor,
496
+ transcribe_chunk_direct,
497
+ all_audio,
498
+ self.config["sample_rate"]
499
+ )
500
+
501
+ if transcription.strip() and transcription != self.last_partial:
502
+ self.last_partial = transcription
503
+ self.accumulated_transcript = transcription # Update accumulated transcript
504
+ self.interim_count += 1
505
+ self.last_interim_time = time.time()
506
+
507
+ # Send interim result
508
+ await self.send_transcription(transcription, is_final=False)
509
+
510
+ logger.info(f"Interim #{self.interim_count}: '{transcription}'")
511
+
512
+ # Small delay to prevent excessive processing
513
+ await asyncio.sleep(0.1)
514
+
515
+ except Exception as e:
516
+ logger.error(f"Error in chunk processing: {e}")
517
+ await asyncio.sleep(1)
518
+
519
+ async def _monitor_for_auto_final(self):
520
+ """Monitor for auto-final conditions: 3 seconds silence after 3+ interim results"""
521
+ while self.running:
522
+ try:
523
+ current_time = time.time()
524
+
525
+ # Check if we should send auto-final transcription
526
+ if (self.interim_count >= self.min_interim_count and
527
+ self.last_interim_time is not None and
528
+ (current_time - self.last_interim_time) >= self.silence_timeout and
529
+ not self.final_sent and
530
+ self.accumulated_transcript.strip()):
531
+
532
+ logger.info(f"Auto-final triggered: {self.interim_count} interim results, "
533
+ f"{current_time - self.last_interim_time:.1f}s silence")
534
+
535
+ # Send the accumulated transcript as final
536
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
537
+ self.final_sent = True
538
+
539
+ # Reset counters for potential next utterance
540
+ self.interim_count = 0
541
+ self.last_interim_time = None
542
+ self.accumulated_transcript = ""
543
+
544
+ # Check every 0.5 seconds
545
+ await asyncio.sleep(0.5)
546
+
547
+ except Exception as e:
548
+ logger.error(f"Error in auto-final monitoring: {e}")
549
+ await asyncio.sleep(1)
550
+
551
+ # async def send_transcription(self, text, is_final=False, confidence=0.9):
552
+ # """Send transcription in jambonz format with Arabic number conversion"""
553
+ # try:
554
+ # # Convert Arabic numbers to digits before sending
555
+ # original_text = text
556
+ # converted_text = convert_arabic_numbers_in_sentence(text)
557
+
558
+ # # Log the conversion if numbers were found and converted
559
+ # if original_text != converted_text:
560
+ # logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
561
+
562
+ # message = {
563
+ # "type": "transcription",
564
+ # "is_final": is_final,
565
+ # "alternatives": [
566
+ # {
567
+ # "transcript": converted_text,
568
+ # "confidence": confidence
569
+ # }
570
+ # ],
571
+ # "language": self.config.get("language", "ar-EG"),
572
+ # "channel": 1
573
+ # }
574
+
575
+ # await self.websocket.send(json.dumps(message))
576
+ # logger.info(f"Sent {'FINAL' if is_final else 'interim'} transcription: '{converted_text}'")
577
+
578
+ # if is_final:
579
+ # self.final_sent = True
580
+
581
+ # except Exception as e:
582
+ # logger.error(f"Error sending transcription: {e}")
583
+
584
+
585
+
586
+ async def send_transcription(self, text, is_final=False, confidence=0.9):
587
+ """Send transcription in jambonz format with Arabic number conversion, only for final results"""
588
+ try:
589
+ if not is_final:
590
+ # Do nothing for interim results
591
+ logger.debug("Skipping interim transcription (not final).")
592
+ return
593
+
594
+ # Convert Arabic numbers only for final transcripts
595
+ original_text = text
596
+ converted_text = convert_arabic_numbers_in_sentence(text)
597
+
598
+ # Log the conversion if numbers were found and converted
599
+ if original_text != converted_text:
600
+ logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
601
+
602
+ message = {
603
+ "type": "transcription",
604
+ "is_final": True,
605
+ "alternatives": [
606
+ {
607
+ "transcript": original_text,#converted_text,
608
+ "confidence": confidence
609
+ }
610
+ ],
611
+ "language": self.config.get("language", "ar-EG"),
612
+ "channel": 1
613
+ }
614
+
615
+ # Send only final messages
616
+ await self.websocket.send(json.dumps(message))
617
+ logger.info(f"Sent FINAL transcription: '{converted_text}'")
618
+
619
+ self.final_sent = True
620
+
621
+ except Exception as e:
622
+ logger.error(f"Error sending transcription: {e}")
623
+
624
+
625
+
626
+
627
+ async def send_error(self, error_message):
628
+ """Send error message in jambonz format"""
629
+ try:
630
+ message = {
631
+ "type": "error",
632
+ "error": error_message
633
+ }
634
+ await self.websocket.send(json.dumps(message))
635
+ logger.error(f"Sent error: {error_message}")
636
+ except Exception as e:
637
+ logger.error(f"Error sending error message: {e}")
638
+
639
+ async def handle_jambonz_websocket(websocket):
640
+ """Handle jambonz WebSocket connections"""
641
+
642
+ client_id = f"jambonz_{id(websocket)}"
643
+ logger.info(f"New jambonz connection: {client_id}")
644
+
645
+ handler = JambonzSTTHandler(websocket)
646
+
647
+ try:
648
+ async for message in websocket:
649
+ try:
650
+ if isinstance(message, str):
651
+ # Handle JSON control messages
652
+ data = json.loads(message)
653
+ message_type = data.get("type")
654
+
655
+ if message_type == "start":
656
+ logger.info(f"Received start message: {data}")
657
+ await handler.start_processing(data)
658
+
659
+ elif message_type == "stop":
660
+ logger.info("Received stop message")
661
+ await handler.stop_processing()
662
+ # Close websocket after final transcription
663
+ await websocket.close(code=1000, reason="Session completed")
664
+ break
665
+
666
+ else:
667
+ logger.warning(f"Unknown message type: {message_type}")
668
+ await handler.send_error(f"Unknown message type: {message_type}")
669
+
670
+ else:
671
+ # Handle binary audio data (LINEAR16 PCM)
672
+ if handler.audio_buffer is None:
673
+ await handler.send_error("Received audio before start message")
674
+ continue
675
+
676
+ await handler.add_audio_data(message)
677
+
678
+ except json.JSONDecodeError as e:
679
+ logger.error(f"JSON decode error: {e}")
680
+ await handler.send_error(f"Invalid JSON: {str(e)}")
681
+ except Exception as e:
682
+ logger.error(f"Error processing message: {e}")
683
+ await handler.send_error(f"Processing error: {str(e)}")
684
+
685
+ except websockets.exceptions.ConnectionClosed:
686
+ logger.info(f"jambonz connection closed: {client_id}")
687
+ except Exception as e:
688
+ logger.error(f"jambonz WebSocket error: {e}")
689
+ try:
690
+ await handler.send_error(str(e))
691
+ except Exception:
692
+ pass
693
+ finally:
694
+ if handler.running:
695
+ await handler.stop_processing()
696
+ logger.info(f"jambonz connection ended: {client_id}")
697
+
698
+ async def main():
699
+ """Start the jambonz STT WebSocket server"""
700
+ logger.info("Starting Jambonz Custom STT WebSocket server on port 3006...")
701
+
702
+ # Start WebSocket server
703
+ server = await websockets.serve(
704
+ handle_jambonz_websocket,
705
+ "0.0.0.0",
706
+ 3006,
707
+ ping_interval=20,
708
+ ping_timeout=10,
709
+ close_timeout=10
710
+ )
711
+
712
+ logger.info("Jambonz Custom STT WebSocket server started on ws://0.0.0.0:3006")
713
+ logger.info("Ready to handle jambonz STT requests")
714
+ logger.info("- Expects LINEAR16 PCM audio at 8kHz")
715
+ logger.info("- Supports interim results with auto-final detection")
716
+ logger.info("- Auto-final: 3+ interim results + 1.3s silence")
717
+ logger.info("- Resamples to 16kHz for Whisper processing")
718
+ logger.info("- Converts Arabic numbers to digits before sending")
719
+
720
+ # Wait for the server to close
721
+ await server.wait_closed()
722
+
723
+ if __name__ == "__main__":
724
+ print("=" * 60)
725
+ print("Jambonz Custom STT Server with Whisper + Arabic Numbers")
726
+ print("=" * 60)
727
+ print(f"Model: {MODEL_NAME}")
728
+ print(f"Device: {device}")
729
+ print("WebSocket Port: 3006")
730
+ print("Protocol: jambonz STT API")
731
+ print("Audio Format: LINEAR16 PCM @ 8kHz")
732
+ print("Auto-Final: 2+ speech activities + 1.3s silence")
733
+ print("Arabic Numbers: Converted to digits in FINAL transcriptions only")
734
+ print("Interim Results: DISABLED (final transcription only)")
735
+ if arabic_numbers_available:
736
+ print("✓ pyarabic library available for number conversion")
737
+ else:
738
+ print("✗ pyarabic library not available - install with: pip install pyarabic")
739
+ print("=" * 60)
740
+
741
+ try:
742
+ asyncio.run(main())
743
+ except KeyboardInterrupt:
744
+ print("\nShutting down server...")
745
+ except Exception as e:
746
+ print(f"Server error: {e}")
asr_websocket_client.html ADDED
@@ -0,0 +1,606 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ASR WebSocket Testing Client</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ display: flex;
19
+ align-items: center;
20
+ justify-content: center;
21
+ padding: 20px;
22
+ }
23
+
24
+ .container {
25
+ background: rgba(255, 255, 255, 0.95);
26
+ backdrop-filter: blur(10px);
27
+ border-radius: 20px;
28
+ padding: 40px;
29
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
30
+ max-width: 600px;
31
+ width: 100%;
32
+ border: 1px solid rgba(255, 255, 255, 0.2);
33
+ }
34
+
35
+ .header {
36
+ text-align: center;
37
+ margin-bottom: 30px;
38
+ }
39
+
40
+ .header h1 {
41
+ color: #333;
42
+ font-size: 2.5em;
43
+ font-weight: 300;
44
+ margin-bottom: 10px;
45
+ }
46
+
47
+ .header p {
48
+ color: #666;
49
+ font-size: 1.1em;
50
+ }
51
+
52
+ .connection-section {
53
+ margin-bottom: 30px;
54
+ }
55
+
56
+ .input-group {
57
+ margin-bottom: 20px;
58
+ }
59
+
60
+ .input-group label {
61
+ display: block;
62
+ margin-bottom: 8px;
63
+ color: #333;
64
+ font-weight: 500;
65
+ }
66
+
67
+ .input-group input {
68
+ width: 100%;
69
+ padding: 12px 16px;
70
+ border: 2px solid #e1e5e9;
71
+ border-radius: 10px;
72
+ font-size: 16px;
73
+ transition: all 0.3s ease;
74
+ background: rgba(255, 255, 255, 0.8);
75
+ }
76
+
77
+ .input-group input:focus {
78
+ outline: none;
79
+ border-color: #667eea;
80
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
81
+ }
82
+
83
+ .btn {
84
+ padding: 12px 24px;
85
+ border: none;
86
+ border-radius: 10px;
87
+ font-size: 16px;
88
+ font-weight: 500;
89
+ cursor: pointer;
90
+ transition: all 0.3s ease;
91
+ text-transform: uppercase;
92
+ letter-spacing: 0.5px;
93
+ }
94
+
95
+ .btn:disabled {
96
+ opacity: 0.6;
97
+ cursor: not-allowed;
98
+ }
99
+
100
+ .btn-connect {
101
+ background: linear-gradient(135deg, #4CAF50, #45a049);
102
+ color: white;
103
+ width: 100%;
104
+ }
105
+
106
+ .btn-connect:hover:not(:disabled) {
107
+ transform: translateY(-2px);
108
+ box-shadow: 0 5px 15px rgba(76, 175, 80, 0.3);
109
+ }
110
+
111
+ .btn-disconnect {
112
+ background: linear-gradient(135deg, #f44336, #da190b);
113
+ color: white;
114
+ width: 100%;
115
+ }
116
+
117
+ .btn-disconnect:hover:not(:disabled) {
118
+ transform: translateY(-2px);
119
+ box-shadow: 0 5px 15px rgba(244, 67, 54, 0.3);
120
+ }
121
+
122
+ .audio-controls {
123
+ display: flex;
124
+ justify-content: center;
125
+ gap: 20px;
126
+ margin: 30px 0;
127
+ }
128
+
129
+ .btn-mic {
130
+ background: linear-gradient(135deg, #2196F3, #1976D2);
131
+ color: white;
132
+ width: 80px;
133
+ height: 80px;
134
+ border-radius: 50%;
135
+ display: flex;
136
+ align-items: center;
137
+ justify-content: center;
138
+ font-size: 24px;
139
+ }
140
+
141
+ .btn-mic:hover:not(:disabled) {
142
+ transform: scale(1.1);
143
+ box-shadow: 0 10px 25px rgba(33, 150, 243, 0.3);
144
+ }
145
+
146
+ .btn-mic.recording {
147
+ background: linear-gradient(135deg, #f44336, #da190b);
148
+ animation: pulse 1.5s infinite;
149
+ }
150
+
151
+ .btn-stop {
152
+ background: linear-gradient(135deg, #FF9800, #F57C00);
153
+ color: white;
154
+ width: 80px;
155
+ height: 80px;
156
+ border-radius: 50%;
157
+ display: flex;
158
+ align-items: center;
159
+ justify-content: center;
160
+ font-size: 24px;
161
+ }
162
+
163
+ .btn-stop:hover:not(:disabled) {
164
+ transform: scale(1.1);
165
+ box-shadow: 0 10px 25px rgba(255, 152, 0, 0.3);
166
+ }
167
+
168
+ @keyframes pulse {
169
+ 0% { transform: scale(1); }
170
+ 50% { transform: scale(1.05); }
171
+ 100% { transform: scale(1); }
172
+ }
173
+
174
+ .status {
175
+ text-align: center;
176
+ margin: 20px 0;
177
+ padding: 12px;
178
+ border-radius: 10px;
179
+ font-weight: 500;
180
+ }
181
+
182
+ .status.connected {
183
+ background: rgba(76, 175, 80, 0.1);
184
+ color: #4CAF50;
185
+ border: 1px solid rgba(76, 175, 80, 0.3);
186
+ }
187
+
188
+ .status.disconnected {
189
+ background: rgba(244, 67, 54, 0.1);
190
+ color: #f44336;
191
+ border: 1px solid rgba(244, 67, 54, 0.3);
192
+ }
193
+
194
+ .status.recording {
195
+ background: rgba(33, 150, 243, 0.1);
196
+ color: #2196F3;
197
+ border: 1px solid rgba(33, 150, 243, 0.3);
198
+ }
199
+
200
+ .response-section {
201
+ margin-top: 30px;
202
+ }
203
+
204
+ .response-box {
205
+ background: rgba(0, 0, 0, 0.05);
206
+ border-radius: 10px;
207
+ padding: 20px;
208
+ min-height: 120px;
209
+ border: 1px solid rgba(0, 0, 0, 0.1);
210
+ font-family: 'Courier New', monospace;
211
+ white-space: pre-wrap;
212
+ word-wrap: break-word;
213
+ }
214
+
215
+ .loading {
216
+ display: flex;
217
+ align-items: center;
218
+ justify-content: center;
219
+ color: #666;
220
+ }
221
+
222
+ .loading::after {
223
+ content: '';
224
+ width: 20px;
225
+ height: 20px;
226
+ border: 2px solid #f3f3f3;
227
+ border-top: 2px solid #667eea;
228
+ border-radius: 50%;
229
+ animation: spin 1s linear infinite;
230
+ margin-left: 10px;
231
+ }
232
+
233
+ @keyframes spin {
234
+ 0% { transform: rotate(0deg); }
235
+ 100% { transform: rotate(360deg); }
236
+ }
237
+
238
+ .audio-visualizer {
239
+ display: flex;
240
+ align-items: center;
241
+ justify-content: center;
242
+ height: 40px;
243
+ margin: 10px 0;
244
+ }
245
+
246
+ .bar {
247
+ width: 3px;
248
+ height: 10px;
249
+ background: #667eea;
250
+ margin: 0 1px;
251
+ border-radius: 2px;
252
+ animation: wave 1s ease-in-out infinite;
253
+ }
254
+
255
+ .bar:nth-child(2) { animation-delay: 0.1s; }
256
+ .bar:nth-child(3) { animation-delay: 0.2s; }
257
+ .bar:nth-child(4) { animation-delay: 0.3s; }
258
+ .bar:nth-child(5) { animation-delay: 0.4s; }
259
+
260
+ @keyframes wave {
261
+ 0%, 100% { height: 10px; }
262
+ 50% { height: 30px; }
263
+ }
264
+ </style>
265
+ </head>
266
+ <body>
267
+ <div class="container">
268
+ <div class="header">
269
+ <h1>🎤 ASR Tester</h1>
270
+ <p>WebSocket-based Speech Recognition Testing</p>
271
+ </div>
272
+
273
+ <div class="connection-section">
274
+ <div class="input-group">
275
+ <label for="websocketUrl">WebSocket URL:</label>
276
+ <input type="text" id="websocketUrl" value="ws://52.59.169.24:3015" placeholder="ws://localhost:5005/url">
277
+ </div>
278
+ <button id="connectBtn" class="btn btn-connect">Connect</button>
279
+ <button id="disconnectBtn" class="btn btn-disconnect" style="display: none;">Disconnect</button>
280
+ </div>
281
+
282
+ <div id="status" class="status disconnected">Disconnected</div>
283
+
284
+ <div class="audio-controls">
285
+ <button id="micBtn" class="btn btn-mic" disabled title="Start Recording">🎤</button>
286
+ <button id="stopBtn" class="btn btn-stop" disabled title="Stop Recording">⏹️</button>
287
+ </div>
288
+
289
+ <div id="visualizer" class="audio-visualizer" style="display: none;">
290
+ <div class="bar"></div>
291
+ <div class="bar"></div>
292
+ <div class="bar"></div>
293
+ <div class="bar"></div>
294
+ <div class="bar"></div>
295
+ </div>
296
+
297
+ <div class="response-section">
298
+ <h3>ASR Response:</h3>
299
+ <div id="responseBox" class="response-box">Waiting for audio input...</div>
300
+ </div>
301
+ </div>
302
+
303
+ <script>
304
+ class JambonzASRClient {
305
+ constructor() {
306
+ this.websocket = null;
307
+ this.audioContext = null;
308
+ this.mediaRecorder = null;
309
+ this.audioStream = null;
310
+ this.processor = null;
311
+ this.isRecording = false;
312
+ this.isConnected = false;
313
+
314
+ this.initializeElements();
315
+ this.attachEventListeners();
316
+ }
317
+
318
+ initializeElements() {
319
+ this.elements = {
320
+ websocketUrl: document.getElementById('websocketUrl'),
321
+ connectBtn: document.getElementById('connectBtn'),
322
+ disconnectBtn: document.getElementById('disconnectBtn'),
323
+ micBtn: document.getElementById('micBtn'),
324
+ stopBtn: document.getElementById('stopBtn'),
325
+ status: document.getElementById('status'),
326
+ responseBox: document.getElementById('responseBox'),
327
+ visualizer: document.getElementById('visualizer')
328
+ };
329
+ }
330
+
331
+ attachEventListeners() {
332
+ this.elements.connectBtn.addEventListener('click', () => this.connect());
333
+ this.elements.disconnectBtn.addEventListener('click', () => this.disconnect());
334
+ this.elements.micBtn.addEventListener('click', () => this.startRecording());
335
+ this.elements.stopBtn.addEventListener('click', () => this.stopRecording());
336
+ }
337
+
338
+ updateStatus(message, type) {
339
+ this.elements.status.textContent = message;
340
+ this.elements.status.className = `status ${type}`;
341
+ }
342
+
343
+ async connect() {
344
+ const url = this.elements.websocketUrl.value.trim();
345
+ if (!url) {
346
+ alert('Please enter a WebSocket URL');
347
+ return;
348
+ }
349
+
350
+ try {
351
+ this.updateStatus('Connecting...', 'disconnected');
352
+ this.elements.connectBtn.disabled = true;
353
+
354
+ this.websocket = new WebSocket(url);
355
+ this.websocket.binaryType = 'arraybuffer';
356
+
357
+ this.websocket.onopen = () => {
358
+ this.isConnected = true;
359
+ this.updateStatus('Connected - Ready for Jambonz Protocol', 'connected');
360
+ this.elements.connectBtn.style.display = 'none';
361
+ this.elements.disconnectBtn.style.display = 'block';
362
+ this.elements.micBtn.disabled = false;
363
+ this.elements.responseBox.textContent = 'Connected. Ready to start ASR session...';
364
+ };
365
+
366
+ this.websocket.onmessage = (event) => {
367
+ if (typeof event.data === 'string') {
368
+ try {
369
+ const response = JSON.parse(event.data);
370
+ this.displayResponse('JSON Control Message', response);
371
+ } catch (e) {
372
+ this.displayResponse('Text Message', event.data);
373
+ }
374
+ } else {
375
+ // Binary data (should not happen in normal Jambonz flow from server)
376
+ this.displayResponse('Binary Message', `Received binary data: ${event.data.byteLength} bytes`);
377
+ }
378
+ };
379
+
380
+ this.websocket.onerror = (error) => {
381
+ console.error('WebSocket error:', error);
382
+ this.updateStatus('Connection Error', 'disconnected');
383
+ this.resetConnection();
384
+ };
385
+
386
+ this.websocket.onclose = (event) => {
387
+ this.isConnected = false;
388
+ this.updateStatus(`Disconnected (Code: ${event.code})`, 'disconnected');
389
+ this.resetConnection();
390
+ this.displayResponse('Connection Closed', `WebSocket closed with code: ${event.code}, reason: ${event.reason || 'No reason provided'}`);
391
+ };
392
+
393
+ } catch (error) {
394
+ console.error('Connection failed:', error);
395
+ this.updateStatus('Connection Failed', 'disconnected');
396
+ this.resetConnection();
397
+ }
398
+ }
399
+
400
+ disconnect() {
401
+ if (this.isRecording) {
402
+ this.stopRecording();
403
+ }
404
+ if (this.websocket && this.websocket.readyState === WebSocket.OPEN) {
405
+ this.websocket.close(1000, 'Client disconnect');
406
+ }
407
+ this.resetConnection();
408
+ }
409
+
410
+ resetConnection() {
411
+ this.isConnected = false;
412
+ this.elements.connectBtn.disabled = false;
413
+ this.elements.connectBtn.style.display = 'block';
414
+ this.elements.disconnectBtn.style.display = 'none';
415
+ this.elements.micBtn.disabled = true;
416
+ this.elements.stopBtn.disabled = true;
417
+ this.stopRecording();
418
+ }
419
+
420
+ // Convert Float32Array to Int16Array (LINEAR16 PCM)
421
+ floatTo16BitPCM(float32Array) {
422
+ const int16Array = new Int16Array(float32Array.length);
423
+ for (let i = 0; i < float32Array.length; i++) {
424
+ const clipped = Math.max(-1, Math.min(1, float32Array[i]));
425
+ int16Array[i] = clipped * 0x7FFF;
426
+ }
427
+ return int16Array;
428
+ }
429
+
430
+ // Resample audio from source sample rate to 8kHz
431
+ resampleTo8kHz(audioBuffer, sourceSampleRate) {
432
+ const targetSampleRate = 8000;
433
+ const ratio = sourceSampleRate / targetSampleRate;
434
+ const targetLength = Math.round(audioBuffer.length / ratio);
435
+ const resampled = new Float32Array(targetLength);
436
+
437
+ for (let i = 0; i < targetLength; i++) {
438
+ const sourceIndex = i * ratio;
439
+ const sourceIndexFloor = Math.floor(sourceIndex);
440
+ const sourceIndexCeil = Math.min(sourceIndexFloor + 1, audioBuffer.length - 1);
441
+ const weight = sourceIndex - sourceIndexFloor;
442
+
443
+ resampled[i] = audioBuffer[sourceIndexFloor] * (1 - weight) +
444
+ audioBuffer[sourceIndexCeil] * weight;
445
+ }
446
+
447
+ return resampled;
448
+ }
449
+
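+             // Note: the linear-interpolation resampler above has no anti-aliasing
+             // low-pass filter, so content above 4 kHz will fold back (alias) into
+             // the 8 kHz stream; acceptable for a test page, but a production client
+             // should use a proper resampler (e.g. an AudioWorklet-based one).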
450
+ async startRecording() {
451
+ if (!this.isConnected) {
452
+ alert('Please connect to WebSocket first');
453
+ return;
454
+ }
455
+
456
+ try {
457
+ // Initialize audio context
458
+ this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
459
+                         sampleRate: 44100 // request 44.1 kHz; the browser may pick a different rate, so we resample from audioContext.sampleRate below
460
+ });
461
+
462
+ // Get microphone stream
463
+ this.audioStream = await navigator.mediaDevices.getUserMedia({
464
+ audio: {
465
+ echoCancellation: false, // Jambonz typically handles this
466
+ noiseSuppression: false, // Jambonz typically handles this
467
+ autoGainControl: false, // Jambonz typically handles this
468
+ channelCount: 1 // Mono audio
469
+ }
470
+ });
471
+
472
+ const source = this.audioContext.createMediaStreamSource(this.audioStream);
473
+
474
+ // Create ScriptProcessorNode for audio processing
475
+ // Note: ScriptProcessorNode is deprecated but still widely supported
476
+ // In production, consider using AudioWorklet
477
+ this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
478
+
479
+ this.processor.onaudioprocess = (event) => {
480
+ if (!this.isRecording || !this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
481
+ return;
482
+ }
483
+
484
+ const inputBuffer = event.inputBuffer;
485
+ const audioData = inputBuffer.getChannelData(0); // Get mono channel
486
+
487
+ // Resample to 8kHz
488
+ const resampled = this.resampleTo8kHz(audioData, this.audioContext.sampleRate);
489
+
490
+ // Convert to LINEAR16 PCM
491
+ const pcmData = this.floatTo16BitPCM(resampled);
492
+
493
+ // Send binary audio data
494
+ this.websocket.send(pcmData.buffer);
495
+ };
496
+
497
+ // Connect audio nodes
498
+ source.connect(this.processor);
499
+ this.processor.connect(this.audioContext.destination);
500
+
501
+ // Send Jambonz START control message
502
+ const startMessage = {
503
+ type: "start",
504
+ language: "en-US",
505
+ format: "raw",
506
+ encoding: "LINEAR16",
507
+ interimResults: true,
508
+ sampleRateHz: 8000,
509
+ options: {
510
+ callSid: Date.now().toString()
511
+ }
512
+ };
513
+
514
+ this.websocket.send(JSON.stringify(startMessage));
515
+ this.displayResponse('Sent START Message', startMessage);
516
+
517
+ this.isRecording = true;
518
+
519
+ // Update UI
520
+ this.elements.micBtn.classList.add('recording');
521
+ this.elements.micBtn.disabled = true;
522
+ this.elements.stopBtn.disabled = false;
523
+ this.elements.visualizer.style.display = 'flex';
524
+ this.updateStatus('Recording - Sending LINEAR16 PCM @ 8kHz', 'recording');
525
+
526
+ } catch (error) {
527
+ console.error('Failed to start recording:', error);
528
+ alert('Failed to access microphone. Please check permissions.');
529
+ this.stopRecording();
530
+ }
531
+ }
532
+
533
+ stopRecording() {
534
+ if (this.isRecording) {
535
+ this.isRecording = false;
536
+
537
+ // Send Jambonz STOP control message
538
+ if (this.websocket && this.websocket.readyState === WebSocket.OPEN) {
539
+ const stopMessage = {
540
+ type: "stop"
541
+ };
542
+ this.websocket.send(JSON.stringify(stopMessage));
543
+ this.displayResponse('Sent STOP Message', stopMessage);
544
+ }
545
+ }
546
+
547
+ // Clean up audio resources
548
+ if (this.processor) {
549
+ this.processor.disconnect();
550
+ this.processor = null;
551
+ }
552
+
553
+ if (this.audioContext) {
554
+ this.audioContext.close().then(() => {
555
+ this.audioContext = null;
556
+ });
557
+ }
558
+
559
+ if (this.audioStream) {
560
+ this.audioStream.getTracks().forEach(track => track.stop());
561
+ this.audioStream = null;
562
+ }
563
+
564
+ // Update UI
565
+ this.elements.micBtn.classList.remove('recording');
566
+ this.elements.micBtn.disabled = false;
567
+ this.elements.stopBtn.disabled = true;
568
+ this.elements.visualizer.style.display = 'none';
569
+
570
+ if (this.isConnected) {
571
+ this.updateStatus('Connected - Waiting for final transcript...', 'connected');
572
+ }
573
+ }
574
+
575
+ displayResponse(messageType, response) {
576
+ const responseBox = this.elements.responseBox;
577
+ const timestamp = new Date().toLocaleTimeString();
578
+
579
+ let content = `<strong>[${timestamp}] ${messageType}:</strong>\n`;
580
+
581
+ if (typeof response === 'object') {
582
+ content += JSON.stringify(response, null, 2);
583
+ } else {
584
+ content += response;
585
+ }
586
+
587
+ // Append to existing content
588
+ if (responseBox.innerHTML.includes('Connected. Ready to start ASR session...') ||
589
+ responseBox.innerHTML.includes('Processing audio...')) {
590
+ responseBox.innerHTML = content;
591
+ } else {
592
+ responseBox.innerHTML += '\n\n' + content;
593
+ }
594
+
595
+ // Auto-scroll to bottom
596
+ responseBox.scrollTop = responseBox.scrollHeight;
597
+ }
598
+ }
599
+
600
+ // Initialize the client when page loads
601
+ document.addEventListener('DOMContentLoaded', () => {
602
+ new JambonzASRClient();
603
+ });
604
+ </script>
605
+ </body>
606
+ </html>
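For scripted testing without the browser page, a minimal Python counterpart is sketched below. It is illustrative only: it assumes the `websockets` package, a hypothetical endpoint at ws://localhost:3015, and sends one second of 8 kHz LINEAR16 silence as a stand-in for microphone audio.

import asyncio
import json
import websockets

async def test_session(url="ws://localhost:3015"):
    async with websockets.connect(url) as ws:
        # Same start frame the browser client above builds
        await ws.send(json.dumps({
            "type": "start", "language": "en-US", "format": "raw",
            "encoding": "LINEAR16", "interimResults": True,
            "sampleRateHz": 8000,
        }))
        # One second of 8 kHz, 16-bit mono silence (8000 samples * 2 bytes)
        await ws.send(b"\x00\x00" * 8000)
        await ws.send(json.dumps({"type": "stop"}))
        async for message in ws:
            print(message)  # transcription / error JSON from the server

asyncio.run(test_session())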
best_nemo_whisper_jambonz.py ADDED
@@ -0,0 +1,1338 @@
1
+ import torch
2
+ import asyncio
3
+ import websockets
4
+ import json
5
+ import threading
6
+ import numpy as np
7
+ import logging
8
+ import time
9
+ import tempfile
10
+ import os
11
+ import re
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ import subprocess
14
+ import struct
15
+
16
+ # NeMo imports
17
+ import nemo.collections.asr as nemo_asr
18
+ import soundfile as sf
19
+
20
+ # Whisper imports
21
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
22
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
23
+
24
+
25
+ # Arabic number conversion imports for Whisper
26
+ try:
27
+ from pyarabic.number import text2number
28
+ arabic_numbers_available = True
29
+ print("✓ pyarabic library available for Whisper number conversion")
30
+ except ImportError:
31
+ arabic_numbers_available = False
32
+ print("✗ pyarabic not available - install with: pip install pyarabic")
33
+ print("Arabic numbers will not be converted to digits for Whisper")
34
+
35
+ # Set up logging
36
+ logging.basicConfig(level=logging.INFO)
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # ===== NeMo Arabic number mapping =====
40
+ arabic_numbers_nemo = {
41
+ # Basic digits
42
+ "سفر": "0", "فيرو": "0", "هيرو": "0","صفر": "0", "زيرو": "0", "٠": "0","زيو": "0","زير": "0","زير": "0","زر": "0","زروا": "0","زرا": "0","زيره ": "0","زرو ": "0",
43
+ "واحد": "1", "واحدة": "1", "١": "1",
44
+ "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
45
+ "تلاتة": "3", "ثلاثة": "3", "٣": "3","تلاته": "3","ثلاثه": "3","ثلاثا": "3","تلاتا": "3",
46
+ "اربعة": "4", "أربعة": "4", "٤": "4","اربعه": "4","أربعه": "4","أربع": "4","اربع": "4","اربعا": "4","أربعا": "4",
47
+ "خمسة": "5", "خمسه": "5", "٥": "5", "خمس": "5", "خمسا": "5",
48
+ "ستة": "6", "سته": "6", "٦": "6", "ست": "6", "ستّا": "6", "ستةً": "6",
49
+ "سبعة": "7", "سبعه": "7", "٧": "7", "سبع": "7", "سبعا": "7",
50
+ "ثمانية": "8", "ثمانيه": "8", "٨": "8", "ثمان": "8", "ثمنية": "8", "ثمنيه": "8", "ثمانيا": "8", "ثمن": "8",
51
+ "تسعة": "9", "تسعه": "9", "٩": "9", "تسع": "9", "تسعا": "9",
52
+
53
+ # Teens
54
+ "عشرة": "10", "١٠": "10",
55
+ "حداشر": "11", "احد عشر": "11","احداشر": "11",
56
+ "اتناشر": "12", "اثنا عشر": "12",
57
+ "تلتاشر": "13", "ثلاثة عشر": "13",
58
+ "اربعتاشر": "14", "أربعة عشر": "14",
59
+ "خمستاشر": "15", "خمسة عشر": "15",
60
+ "ستاشر": "16", "ستة عشر": "16",
61
+ "سبعتاشر": "17", "سبعة عشر": "17",
62
+ "طمنتاشر": "18", "ثمانية عشر": "18",
63
+ "تسعتاشر": "19", "تسعة عشر": "19",
64
+
65
+ # Tens
66
+ "عشرين": "20", "٢٠": "20",
67
+ "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
68
+ "اربعين": "40", "أربعين": "40", "٤٠": "40",
69
+ "خمسين": "50", "٥٠": "50",
70
+ "ستين": "60", "٦٠": "60",
71
+ "سبعين": "70", "٧٠": "70",
72
+ "تمانين": "80", "ثمانين": "80", "٨٠": "80","تمانون": "80","ثمانون": "80",
73
+ "تسعين": "90", "٩٠": "90",
74
+
75
+ # Hundreds
76
+ "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
77
+ "ميتين": "200", "مائتين": "200",
78
+ "تلاتمية": "300", "ثلاثمائة": "300",
79
+ "اربعمية": "400", "أربعمائة": "400",
80
+ "خمسمية": "500", "خمسمائة": "500",
81
+ "ستمية": "600", "ستمائة": "600",
82
+ "سبعمية": "700", "سبعمائة": "700",
83
+ "تمانمية": "800", "ثمانمائة": "800",
84
+ "تسعمية": "900", "تسعمائة": "900",
85
+
86
+ # Thousands
87
+ "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
88
+ "ألفين": "2000", "الفين": "2000",
89
+ "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
90
+ "اربعة آلاف": "4000", "أربعة آلاف": "4000",
91
+ "خمسة آلاف": "5000",
92
+ "ستة آلاف": "6000",
93
+ "سبعة آلاف": "7000",
94
+ "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
95
+ "تسعة آلاف": "9000",
96
+
97
+ # Large numbers
98
+ "عشرة آلاف": "10000",
99
+ "مية ألف": "100000", "مائة ألف": "100000",
100
+ "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
101
+ "ملايين": "1000000",
102
+ "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000"
103
+ }
104
+
105
+ def replace_arabic_numbers_nemo(text: str) -> str:
106
+ """Convert Arabic number words to digits for NeMo"""
107
+     # Replace longer phrases first so multi-word entries like "عشرة آلاف" (10000)
+     # are not clobbered by their shorter prefixes such as "عشرة" (10).
+     for word, digit in sorted(arabic_numbers_nemo.items(), key=lambda kv: len(kv[0]), reverse=True):
108
+         text = re.sub(rf"\b{re.escape(word)}\b", digit, text)
109
+ return text
110
+
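+ # Quick illustration (values are examples, not test fixtures):
+ #   replace_arabic_numbers_nemo("عندي خمسة كتب")  ->  "عندي 5 كتب"
+ #   replace_arabic_numbers_nemo("عشرة آلاف")      ->  "10000"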
111
+ def convert_arabic_numbers_whisper(sentence: str) -> str:
112
+ """
113
+ Replace Arabic number words in a sentence with digits for Whisper,
114
+ preserving all other words and punctuation.
115
+ """
116
+ if not arabic_numbers_available or not sentence.strip():
117
+ return sentence
118
+
119
+ try:
120
+ # Normalization step
121
+ replacements = {
122
+ "اربعة": "أربعة", "اربع": "أربع", "اثنين": "اثنان",
123
+ "اتنين": "اثنان", "ثلاث": "ثلاثة", "خمس": "خمسة",
124
+ "ست": "ستة", "سبع": "سبعة", "ثمان": "ثمانية",
125
+ "تسع": "تسعة", "عشر": "عشرة",
126
+ }
127
+ for wrong, correct in replacements.items():
128
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
129
+
130
+ # Split by whitespace but keep spaces
131
+ words = re.split(r'(\s+)', sentence)
132
+ converted_words = []
133
+
134
+ for word in words:
135
+ stripped = word.strip()
136
+ if not stripped: # skip spaces
137
+ converted_words.append(word)
138
+ continue
139
+
140
+ try:
141
+ num = text2number(stripped)
142
+ if isinstance(num, int):
143
+ if num != 0 or stripped == "صفر":
144
+ converted_words.append(str(num))
145
+ else:
146
+ converted_words.append(word)
147
+ else:
148
+ converted_words.append(word)
149
+ except Exception:
150
+ converted_words.append(word)
151
+
152
+ return ''.join(converted_words)
153
+
154
+ except Exception as e:
155
+ logger.warning(f"Error converting Arabic numbers: {e}")
156
+ return sentence
157
+
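+ # Quick illustration (assumes pyarabic is installed; sample values only):
+ #   convert_arabic_numbers_whisper("عندي خمسة كتب")  ->  "عندي 5 كتب"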
158
+ # Global models
159
+ asr_model_nemo = None
160
+ whisper_model = None
161
+ whisper_processor = None
162
+ whisper_tokenizer = None
163
+ device = None
164
+ torch_dtype = None
165
+
166
+ def initialize_models():
167
+ """Initialize both NeMo and Whisper models"""
168
+ global asr_model_nemo, whisper_model, whisper_processor, whisper_tokenizer, device, torch_dtype
169
+
170
+ # Initialize device settings
171
+ device = "cuda" if torch.cuda.is_available() else "cpu"
172
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
173
+
174
+ logger.info(f"Using device: {device}")
175
+ logger.info(f"CUDA available: {torch.cuda.is_available()}")
176
+
177
+ # Initialize NeMo model
178
+ logger.info("Loading NeMo FastConformer Arabic ASR model...")
179
+ model_path = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
180
+
181
+ if os.path.exists(model_path):
182
+ try:
183
+ asr_model_nemo = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
184
+ asr_model_nemo.eval()
185
+ logger.info("✓ NeMo FastConformer model loaded successfully")
186
+ except Exception as e:
187
+ logger.error(f"Failed to load NeMo model: {e}")
188
+ asr_model_nemo = None
189
+ else:
190
+ logger.warning(f"NeMo model not found at: {model_path}")
191
+ asr_model_nemo = None
192
+
193
+ # Initialize Whisper model
194
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
195
+
196
+ logger.info("Loading Whisper large-v3 model...")
197
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"
198
+
199
+ try:
200
+ # Try with flash attention first
201
+ try:
202
+ import flash_attn
203
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
204
+ MODEL_NAME,
205
+ torch_dtype=torch_dtype,
206
+ low_cpu_mem_usage=True,
207
+ use_safetensors=True,
208
+ attn_implementation="flash_attention_2"
209
+ )
210
+ logger.info("✓ Whisper loaded with flash attention")
211
+         except Exception:  # flash-attn missing or incompatible; fall back to standard attention
212
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
213
+ MODEL_NAME,
214
+ torch_dtype=torch_dtype,
215
+ low_cpu_mem_usage=True,
216
+ use_safetensors=True
217
+ )
218
+ logger.info("✓ Whisper loaded with standard attention")
219
+
220
+ whisper_model.to(device)
221
+ whisper_processor = AutoProcessor.from_pretrained(MODEL_NAME)
222
+
223
+ # Use processor.tokenizer, don’t reload separately
224
+ whisper_tokenizer = whisper_processor.tokenizer
225
+
226
+ logger.info("✓ Whisper model + tokenizer loaded successfully")
227
+
228
+ except Exception as e:
229
+ logger.error(f"Failed to load Whisper model: {e}")
230
+ whisper_model = None
231
+
232
+
233
+
234
+
235
+ # logger.info("Loading Whisper large-v3-turbo model...")
236
+ # MODEL_NAME = "openai/whisper-large-v3-turbo"
237
+
238
+ # try:
239
+ # # Try with flash attention first
240
+ # try:
241
+ # import flash_attn
242
+ # whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
243
+ # MODEL_NAME,
244
+ # torch_dtype=torch_dtype,
245
+ # low_cpu_mem_usage=True,
246
+ # use_safetensors=True,
247
+ # attn_implementation="flash_attention_2"
248
+ # )
249
+ # logger.info("✓ Whisper loaded with flash attention")
250
+ # except:
251
+ # whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
252
+ # MODEL_NAME,
253
+ # torch_dtype=torch_dtype,
254
+ # low_cpu_mem_usage=True,
255
+ # use_safetensors=True
256
+ # )
257
+ # logger.info("✓ Whisper loaded with standard attention")
258
+
259
+ # whisper_model.to(device)
260
+ # whisper_processor = AutoProcessor.from_pretrained(MODEL_NAME)
261
+ # whisper_tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
262
+ # logger.info("✓ Whisper model loaded successfully")
263
+
264
+ # except Exception as e:
265
+ # logger.error(f"Failed to load Whisper model: {e}")
266
+ # whisper_model = None
267
+
268
+ # Initialize models on startup
269
+ initialize_models()
270
+
271
+ # Thread pool for processing
272
+ executor = ThreadPoolExecutor(max_workers=4)
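+ # Model inference is blocking, so the handlers below run it through this pool
+ # via loop.run_in_executor(executor, ...) to keep the asyncio event loop responsive.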
273
+
274
+ # class JambonzAudioBuffer:
275
+ # def __init__(self, sample_rate=8000, chunk_duration=1.0):
276
+ # self.sample_rate = sample_rate
277
+ # self.chunk_duration = chunk_duration
278
+ # self.chunk_samples = int(chunk_duration * sample_rate)
279
+
280
+ # self.buffer = np.array([], dtype=np.float32)
281
+ # self.lock = threading.Lock()
282
+ # self.total_audio = np.array([], dtype=np.float32)
283
+
284
+ # # Voice Activity Detection
285
+ # self.silence_threshold = 0.05
286
+ # self.min_speech_samples = int(0.5 * sample_rate)
287
+
288
+ # def add_audio(self, audio_data):
289
+ # with self.lock:
290
+ # self.buffer = np.concatenate([self.buffer, audio_data])
291
+ # self.total_audio = np.concatenate([self.total_audio, audio_data])
292
+
293
+ # def has_chunk_ready(self):
294
+ # with self.lock:
295
+ # return len(self.buffer) >= self.chunk_samples
296
+
297
+ # def is_speech(self, audio_chunk):
298
+ # """Simple VAD based on energy"""
299
+ # if len(audio_chunk) < self.min_speech_samples:
300
+ # return False
301
+ # energy = np.mean(np.abs(audio_chunk))
302
+ # return energy > self.silence_threshold
303
+
304
+ # def get_chunk_for_processing(self):
305
+ # """Get audio chunk for processing"""
306
+ # with self.lock:
307
+ # if len(self.buffer) < self.chunk_samples:
308
+ # return None
309
+ # return np.array([1]) # Signal that chunk is ready
310
+
311
+ # def get_all_audio(self):
312
+ # """Get all accumulated audio"""
313
+ # with self.lock:
314
+ # return self.total_audio.copy()
315
+
316
+ # def clear(self):
317
+ # with self.lock:
318
+ # self.buffer = np.array([], dtype=np.float32)
319
+ # self.total_audio = np.array([], dtype=np.float32)
320
+
321
+ # def reset_for_new_segment(self):
322
+ # """Reset buffers for new transcription segment"""
323
+ # with self.lock:
324
+ # self.buffer = np.array([], dtype=np.float32)
325
+ # self.total_audio = np.array([], dtype=np.float32)
326
+
327
+ class JambonzAudioBuffer:
328
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
329
+ self.sample_rate = sample_rate
330
+ self.chunk_duration = chunk_duration
331
+ self.chunk_samples = int(chunk_duration * sample_rate)
332
+
333
+ self.buffer = np.array([], dtype=np.float32)
334
+ self.lock = threading.Lock()
335
+ self.total_audio = np.array([], dtype=np.float32)
336
+
337
+ # Voice Activity Detection - ADJUSTED FOR WHISPER
338
+ self.silence_threshold = 0.01 # Lower threshold for Whisper
339
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
340
+
341
+ def add_audio(self, audio_data):
342
+ with self.lock:
343
+ self.buffer = np.concatenate([self.buffer, audio_data])
344
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
345
+
346
+ # Log audio addition for debugging
347
+ logger.debug(f"Added {len(audio_data)} audio samples, total: {len(self.total_audio)}")
348
+
349
+ def has_chunk_ready(self):
350
+ with self.lock:
351
+ ready = len(self.buffer) >= self.chunk_samples
352
+ if ready:
353
+ logger.debug(f"Chunk ready: {len(self.buffer)} >= {self.chunk_samples}")
354
+ return ready
355
+
356
+ def is_speech(self, audio_chunk):
357
+ """Enhanced VAD based on energy - better for Whisper"""
358
+ if len(audio_chunk) < self.min_speech_samples:
359
+ logger.debug(f"Audio too short for VAD: {len(audio_chunk)} < {self.min_speech_samples}")
360
+ return False
361
+
362
+ # Calculate RMS energy
363
+ rms_energy = np.sqrt(np.mean(audio_chunk ** 2))
364
+
365
+ # Also check peak amplitude
366
+ peak_amplitude = np.max(np.abs(audio_chunk))
367
+
368
+ is_speech = rms_energy > self.silence_threshold or peak_amplitude > (self.silence_threshold * 2)
369
+
370
+ logger.debug(f"VAD check - RMS: {rms_energy:.4f}, Peak: {peak_amplitude:.4f}, "
371
+ f"Threshold: {self.silence_threshold}, Speech: {is_speech}")
372
+
373
+ return is_speech
374
+
375
+ def get_chunk_for_processing(self):
376
+ """Get audio chunk for processing"""
377
+ with self.lock:
378
+ if len(self.buffer) < self.chunk_samples:
379
+ return None
380
+
381
+ logger.debug(f"Returning processing signal, buffer size: {len(self.buffer)}")
382
+ return np.array([1]) # Signal that chunk is ready
383
+
384
+ def get_all_audio(self):
385
+ """Get all accumulated audio"""
386
+ with self.lock:
387
+ audio_copy = self.total_audio.copy()
388
+ logger.debug(f"Returning {len(audio_copy)} total audio samples")
389
+ return audio_copy
390
+
391
+ def clear(self):
392
+ with self.lock:
393
+ self.buffer = np.array([], dtype=np.float32)
394
+ self.total_audio = np.array([], dtype=np.float32)
395
+ logger.debug("Audio buffer cleared")
396
+
397
+ def reset_for_new_segment(self):
398
+ """Reset buffers for new transcription segment"""
399
+ with self.lock:
400
+ self.buffer = np.array([], dtype=np.float32)
401
+ self.total_audio = np.array([], dtype=np.float32)
402
+ logger.debug("Audio buffer reset for new segment")
403
+
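+ # Design note: every buffer access above goes through self.lock because audio
+ # frames arrive on the asyncio websocket task while transcription reads the
+ # buffer from ThreadPoolExecutor worker threads.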
404
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
405
+ """Convert LINEAR16 PCM bytes to numpy array"""
406
+ try:
407
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
408
+ audio_array = audio_array.astype(np.float32) / 32768.0
409
+ return audio_array
410
+ except Exception as e:
411
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
412
+ return np.array([], dtype=np.float32)
413
+
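+ # Quick illustration: on a little-endian platform,
+ #   linear16_to_audio(b"\x00\x00\xff\x7f")  ->  approximately [0.0, 0.99997]
+ # i.e. two 16-bit samples scaled into [-1, 1).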
414
+ def resample_audio(audio_data, source_rate, target_rate):
415
+ """Resample audio to target sample rate"""
416
+ if source_rate == target_rate:
417
+ return audio_data
418
+
419
+ if source_rate == 8000 and target_rate == 16000:
420
+ # Simple 2x upsampling for common case
421
+ upsampled = np.repeat(audio_data, 2)
422
+ return upsampled.astype(np.float32)
423
+
424
+ # Fallback: Linear interpolation resampling
425
+ ratio = target_rate / source_rate
426
+ indices = np.arange(0, len(audio_data), 1/ratio)
427
+ indices = indices[indices < len(audio_data)]
428
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
429
+
430
+ return resampled.astype(np.float32)
431
+
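+ # Note: the 2x fast path duplicates each sample (zero-order hold), which is cheap
+ # but leaves imaging above 4 kHz; e.g. resample_audio(np.zeros(8000, np.float32), 8000, 16000)
+ # returns 16000 samples. Both transcription paths below feed 16 kHz audio produced here.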
432
+ def transcribe_with_nemo(audio_data, source_sample_rate=8000, target_sample_rate=16000):
433
+ """Transcribe audio using NeMo FastConformer"""
434
+ try:
435
+ if len(audio_data) == 0 or asr_model_nemo is None:
436
+ return ""
437
+
438
+ # Resample to 16kHz (NeMo models typically expect 16kHz)
439
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
440
+
441
+ # Skip very short audio
442
+ min_samples = int(0.3 * target_sample_rate)
443
+ if len(resampled_audio) < min_samples:
444
+ return ""
445
+
446
+ start_time = time.time()
447
+
448
+ # Save audio to temporary file (NeMo expects file path)
449
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
450
+ sf.write(tmp_file.name, resampled_audio, target_sample_rate)
451
+ tmp_path = tmp_file.name
452
+
453
+ try:
454
+ # Transcribe with NeMo
455
+ result = asr_model_nemo.transcribe([tmp_path])
456
+
457
+ if result and len(result) > 0:
458
+ # Handle different NeMo result formats
459
+ if hasattr(result[0], 'text'):
460
+ raw_text = result[0].text
461
+ elif isinstance(result[0], str):
462
+ raw_text = result[0]
463
+ else:
464
+ raw_text = str(result[0])
465
+
466
+ if not isinstance(raw_text, str):
467
+ raw_text = str(raw_text)
468
+
469
+ if raw_text and raw_text.strip():
470
+ # Convert Arabic numbers to digits for NeMo
471
+ cleaned_text = replace_arabic_numbers_nemo(raw_text)
472
+ end_time = time.time()
473
+
474
+ if cleaned_text.strip():
475
+ logger.info(f"NeMo transcription: '{cleaned_text}' (processed in {end_time - start_time:.2f}s)")
476
+
477
+ return cleaned_text.strip()
478
+
479
+ finally:
480
+ # Clean up temporary file
481
+ if os.path.exists(tmp_path):
482
+ os.remove(tmp_path)
483
+
484
+ return ""
485
+
486
+ except Exception as e:
487
+ logger.error(f"Error during NeMo transcription: {e}")
488
+ return ""
489
+
490
+ def transcribe_with_whisper(audio_data, source_sample_rate=8000, target_sample_rate=16000):
491
+ """Transcribe audio chunk using Whisper model directly"""
492
+ try:
493
+ if len(audio_data) == 0 or whisper_model is None:
494
+ return ""
495
+
496
+ # Resample from 8kHz to 16kHz for Whisper
497
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
498
+
499
+ # Ensure minimum length for Whisper
500
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
501
+ if len(resampled_audio) < min_samples:
502
+ return ""
503
+
504
+ start_time = time.time()
505
+
506
+ # Prepare input features with proper dtype
507
+ input_features = whisper_processor(
508
+ resampled_audio,
509
+ sampling_rate=target_sample_rate,
510
+ return_tensors="pt"
511
+ ).input_features
512
+
513
+ # Ensure correct dtype and device
514
+ input_features = input_features.to(device=device, dtype=torch_dtype)
515
+
516
+ # Create attention mask to avoid warnings
517
+ attention_mask = torch.ones(
518
+ input_features.shape[:-1],
519
+ dtype=torch.long,
520
+ device=device
521
+ )
522
+
523
+ # Generate transcription using model directly
524
+ with torch.no_grad():
525
+ predicted_ids = whisper_model.generate(
526
+ input_features,
527
+ attention_mask=attention_mask,
528
+ max_new_tokens=128,
529
+ do_sample=False,
530
+ # temperature=0.0,
531
+ num_beams=1,
532
+ language="english",
533
+ task="translate",
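+                 # NB: task="translate" asks Whisper for English output, while the
+                 # pyarabic number post-processing in send_transcription assumes
+                 # Arabic text; Arabic transcripts would use language="arabic",
+                 # task="transcribe".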
534
+ pad_token_id=whisper_tokenizer.pad_token_id,
535
+ eos_token_id=whisper_tokenizer.eos_token_id
536
+ )
537
+
538
+ # Decode the transcription
539
+ transcription = whisper_tokenizer.batch_decode(
540
+ predicted_ids,
541
+ skip_special_tokens=True
542
+ )[0].strip()
543
+
544
+ end_time = time.time()
545
+
546
+ logger.info(f"Whisper transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
547
+ return transcription
548
+
549
+ except Exception as e:
550
+ logger.error(f"Error during Whisper transcription: {e}")
551
+ return ""
552
+
553
+ class UnifiedSTTHandler:
554
+ def __init__(self, websocket):
555
+ self.websocket = websocket
556
+ self.audio_buffer = None
557
+ self.config = {}
558
+ self.running = False
559
+ self.transcription_task = None
560
+ self.use_nemo = False # Flag to determine which model to use
561
+
562
+ # Auto-final detection variables
563
+ self.interim_count = 0
564
+ self.last_interim_time = None
565
+         self.silence_timeout = 2.9  # superseded by the per-model timeout in _monitor_for_auto_final
566
+ self.min_interim_count = 1
567
+ self.auto_final_task = None
568
+ self.accumulated_transcript = ""
569
+ self.final_sent = False
570
+ self.segment_number = 0
571
+ self.last_partial = ""
572
+
573
+ # Processing tracking
574
+ self.processing_count = 0
575
+
576
+     # Debugging variant of add_audio_data. NOTE: a simpler add_audio_data is
+     # re-defined further down in this class; Python keeps the last definition,
+     # so that later version is the one that actually runs.
577
+
578
+ async def add_audio_data(self, audio_bytes):
579
+ """Add audio data to buffer with enhanced debugging"""
580
+ if self.audio_buffer and self.running:
581
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
582
+ self.audio_buffer.add_audio(audio_data)
583
+
584
+ model_name = "NeMo" if self.use_nemo else "Whisper"
585
+
586
+ # Debug logging every few audio packets
587
+ if len(audio_data) > 0:
588
+ total_samples = len(self.audio_buffer.get_all_audio())
589
+ total_seconds = total_samples / self.config["sample_rate"]
590
+
591
+ # Log every second of audio
592
+ if int(total_seconds) != getattr(self, '_last_logged_second', -1):
593
+ logger.info(f"{model_name} - Accumulated {total_seconds:.1f}s of audio ({total_samples} samples)")
594
+ self._last_logged_second = int(total_seconds)
595
+
596
+ # Check if we should have chunks ready
597
+ chunk_ready = self.audio_buffer.has_chunk_ready()
598
+ logger.info(f"{model_name} - Chunk ready: {chunk_ready}")
599
+ # async def start_processing(self, start_message):
600
+ # """Initialize with start message from jambonz"""
601
+ # self.config = {
602
+ # "language": start_message.get("language", "ar-EG"),
603
+ # "format": start_message.get("format", "raw"),
604
+ # "encoding": start_message.get("encoding", "LINEAR16"),
605
+ # "sample_rate": start_message.get("sampleRateHz", 8000),
606
+ # "interim_results": True, # Always enable for internal processing
607
+ # "options": start_message.get("options", {})
608
+ # }
609
+
610
+ # # Determine which model to use based on language parameter
611
+ # language = self.config["language"]
612
+ # if language == "ar-EG":
613
+ # logger.info("nemooooooooooooooooooooooooooo")
614
+ # self.use_nemo = True
615
+ # model_name = "NeMo FastConformer"
616
+ # elif language == "ar-EG-whis":
617
+ # logger.info("whisperrrrrrrrrrrrrrrrrrrrrrrrrrrrr")
618
+ # self.use_nemo = False
619
+ # model_name = "Whisper large-v3"
620
+ # else:
621
+ # # Default to NeMo for any other Arabic variant
622
+ # self.use_nemo = True
623
+ # model_name = "NeMo FastConformer (default)"
624
+
625
+ # logger.info(f"STT session started with {model_name} for language: {language}")
626
+ # logger.info(f"Config: {self.config}")
627
+
628
+ # # Check if selected model is available
629
+ # if self.use_nemo and asr_model_nemo is None:
630
+ # await self.send_error("NeMo model not available")
631
+ # return
632
+ # elif not self.use_nemo and whisper_model is None:
633
+ # await self.send_error("Whisper model not available")
634
+ # return
635
+
636
+ # # Initialize audio buffer
637
+ # self.audio_buffer = JambonzAudioBuffer(
638
+ # sample_rate=self.config["sample_rate"],
639
+ # chunk_duration=1.0 # 1 second chunks
640
+ # )
641
+
642
+ # # Reset session variables
643
+ # self.running = True
644
+ # self.interim_count = 0
645
+ # self.last_interim_time = None
646
+ # self.accumulated_transcript = ""
647
+ # self.final_sent = False
648
+ # self.segment_number = 0
649
+ # self.processing_count = 0
650
+ # self.last_partial = ""
651
+
652
+ # # Start background transcription task
653
+ # self.transcription_task = asyncio.create_task(self._process_audio_chunks())
654
+
655
+ # # Start auto-final detection task
656
+ # self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
657
+
658
+ # Replace these methods in your UnifiedSTTHandler class
659
+
660
+ async def start_processing(self, start_message):
661
+ """Initialize with start message from jambonz"""
662
+ self.config = {
663
+ "language": start_message.get("language", "ar-EG"),
664
+ "format": start_message.get("format", "raw"),
665
+ "encoding": start_message.get("encoding", "LINEAR16"),
666
+ "sample_rate": start_message.get("sampleRateHz", 8000),
667
+ "interim_results": True, # Always enable for internal processing
668
+ "options": start_message.get("options", {})
669
+ }
670
+
671
+ # Determine which model to use based on language parameter
672
+ language = self.config["language"]
673
+ if language == "ar-EG":
674
+ logger.info("Selected NeMo FastConformer")
675
+ self.use_nemo = True
676
+ model_name = "NeMo FastConformer"
677
+ elif language == "ar-EG-whis":
678
+ logger.info("Selected Whisper large-v3")
679
+ self.use_nemo = False
680
+ model_name = "Whisper large-v3"
681
+ else:
682
+ # Default to NeMo for any other Arabic variant
683
+ self.use_nemo = True
684
+ model_name = "NeMo FastConformer (default)"
685
+
686
+ logger.info(f"STT session started with {model_name} for language: {language}")
687
+ logger.info(f"Config: {self.config}")
688
+
689
+ # Check if selected model is available
690
+ if self.use_nemo and asr_model_nemo is None:
691
+ await self.send_error("NeMo model not available")
692
+ return
693
+ elif not self.use_nemo and whisper_model is None:
694
+ await self.send_error("Whisper model not available")
695
+ return
696
+
697
+ # Initialize audio buffer with model-specific settings
698
+ if self.use_nemo:
699
+ chunk_duration = 1.0 # NeMo processes every 1 second
700
+ else:
701
+ chunk_duration = 2.0 # Whisper processes every 2 seconds for better accuracy
702
+
703
+ self.audio_buffer = JambonzAudioBuffer(
704
+ sample_rate=self.config["sample_rate"],
705
+ chunk_duration=chunk_duration
706
+ )
707
+
708
+ # Adjust VAD threshold for Whisper
709
+ if not self.use_nemo:
710
+ self.audio_buffer.silence_threshold = 0.005 # Lower threshold for Whisper
711
+
712
+ # Reset session variables
713
+ self.running = True
714
+ self.interim_count = 0
715
+ self.last_interim_time = None
716
+ self.accumulated_transcript = ""
717
+ self.final_sent = False
718
+ self.segment_number = 0
719
+ self.processing_count = 0
720
+ self.last_partial = ""
721
+
722
+ # Start background transcription task
723
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
724
+
725
+ # Start auto-final detection task
726
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
727
+
728
+ logger.info(f"Background tasks started for {model_name}")
729
+
730
+
731
+
732
+ async def stop_processing(self):
733
+ """Stop current processing session"""
734
+ logger.info("Stopping STT session...")
735
+ self.running = False
736
+
737
+ # Cancel background tasks
738
+ for task in [self.transcription_task, self.auto_final_task]:
739
+ if task:
740
+ task.cancel()
741
+ try:
742
+ await task
743
+ except asyncio.CancelledError:
744
+ pass
745
+
746
+ # Send final transcription if not already sent
747
+ if not self.final_sent and self.accumulated_transcript.strip():
748
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
749
+
750
+ # Process any remaining audio for comprehensive final transcription
751
+ if self.audio_buffer:
752
+ all_audio = self.audio_buffer.get_all_audio()
753
+ if len(all_audio) > 0 and not self.final_sent:
754
+ loop = asyncio.get_event_loop()
755
+
756
+ if self.use_nemo:
757
+ final_transcription = await loop.run_in_executor(
758
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
759
+ )
760
+ else:
761
+ final_transcription = await loop.run_in_executor(
762
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
763
+ )
764
+
765
+ if final_transcription.strip():
766
+ await self.send_transcription(final_transcription, is_final=True)
767
+
768
+ # Clear audio buffer
769
+ if self.audio_buffer:
770
+ self.audio_buffer.clear()
771
+
772
+ logger.info("STT session stopped")
773
+
774
+ async def start_new_segment(self):
775
+ """Start a new transcription segment"""
776
+ self.segment_number += 1
777
+ self.interim_count = 0
778
+ self.last_interim_time = None
779
+ self.accumulated_transcript = ""
780
+ self.final_sent = False
781
+ self.last_partial = ""
782
+ self.processing_count = 0
783
+
784
+ if self.audio_buffer:
785
+ self.audio_buffer.reset_for_new_segment()
786
+
787
+ logger.info(f"Started new transcription segment #{self.segment_number}")
788
+
789
+ async def add_audio_data(self, audio_bytes):
790
+ """Add audio data to buffer"""
791
+ if self.audio_buffer and self.running:
792
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
793
+ self.audio_buffer.add_audio(audio_data)
794
+
795
+ # async def _process_audio_chunks(self):
796
+ # """Process audio chunks for interim results"""
797
+ # while self.running:
798
+ # try:
799
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
800
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
801
+ # if chunk_signal is not None:
802
+ # all_audio = self.audio_buffer.get_all_audio()
803
+
804
+ # if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
805
+ # loop = asyncio.get_event_loop()
806
+
807
+ # # Choose transcription method based on model selection
808
+ # if self.use_nemo:
809
+ # transcription = await loop.run_in_executor(
810
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
811
+ # )
812
+ # else:
813
+ # transcription = await loop.run_in_executor(
814
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
815
+ # )
816
+
817
+ # if transcription.strip():
818
+ # self.processing_count += 1
819
+ # self.accumulated_transcript = transcription
820
+
821
+ # if transcription != self.last_partial or self.interim_count == 0:
822
+ # self.last_partial = transcription
823
+ # self.interim_count += 1
824
+ # self.last_interim_time = time.time()
825
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
826
+ # else:
827
+ # self.last_interim_time = time.time()
828
+
829
+ # await asyncio.sleep(0.1) # Check every 100ms
830
+
831
+ # except Exception as e:
832
+ # logger.error(f"Error in chunk processing: {e}")
833
+ # await asyncio.sleep(0.1)
834
+
835
+
836
+ # async def _monitor_for_auto_final(self):
837
+ # """Monitor for auto-final conditions"""
838
+ # while self.running:
839
+ # try:
840
+ # current_time = time.time()
841
+
842
+ # if (self.interim_count >= self.min_interim_count and
843
+ # self.last_interim_time is not None and
844
+ # (current_time - self.last_interim_time) >= self.silence_timeout and
845
+ # not self.final_sent and
846
+ # self.accumulated_transcript.strip()):
847
+
848
+ # logger.info(f"Auto-final triggered for segment #{self.segment_number}")
849
+
850
+ # await self.send_transcription(self.accumulated_transcript, is_final=True)
851
+ # await self.start_new_segment()
852
+
853
+ # await asyncio.sleep(0.5) # Check every 500ms
854
+
855
+ # except Exception as e:
856
+ # logger.error(f"Error in auto-final monitoring: {e}")
857
+ # await asyncio.sleep(0.5)
858
+
859
+ # async def _process_audio_chunks(self):
860
+ # """Process audio chunks for interim results - FIXED for Whisper streaming"""
861
+ # logger.info(f"Starting audio chunk processing for {'NeMo' if self.use_nemo else 'Whisper'}")
862
+
863
+ # while self.running:
864
+ # try:
865
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
866
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
867
+ # if chunk_signal is not None:
868
+ # all_audio = self.audio_buffer.get_all_audio()
869
+
870
+ # # Check if we have enough audio and speech activity
871
+ # if len(all_audio) > 0:
872
+ # # Get the latest chunk for VAD check
873
+ # latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
874
+ # latest_chunk = all_audio[latest_chunk_start:]
875
+
876
+ # # For debugging
877
+ # logger.debug(f"Audio buffer size: {len(all_audio)} samples, Latest chunk: {len(latest_chunk)} samples")
878
+
879
+ # if self.audio_buffer.is_speech(latest_chunk):
880
+ # logger.info(f"Speech detected, processing with {'NeMo' if self.use_nemo else 'Whisper'}")
881
+
882
+ # loop = asyncio.get_event_loop()
883
+
884
+ # # Choose transcription method based on model selection
885
+ # if self.use_nemo:
886
+ # transcription = await loop.run_in_executor(
887
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
888
+ # )
889
+ # else:
890
+ # # For Whisper, ensure we process the accumulated audio
891
+ # transcription = await loop.run_in_executor(
892
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
893
+ # )
894
+
895
+ # logger.info(f"Transcription result: '{transcription}'")
896
+
897
+ # if transcription.strip():
898
+ # self.processing_count += 1
899
+ # self.accumulated_transcript = transcription
900
+
901
+ # if transcription != self.last_partial or self.interim_count == 0:
902
+ # self.last_partial = transcription
903
+ # self.interim_count += 1
904
+ # self.last_interim_time = time.time()
905
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
906
+ # else:
907
+ # self.last_interim_time = time.time()
908
+ # logger.info("Same transcription, updating time only")
909
+ # else:
910
+ # logger.debug("No speech detected in latest chunk")
911
+
912
+ # await asyncio.sleep(0.1) # Check every 100ms
913
+
914
+ # except Exception as e:
915
+ # logger.error(f"Error in chunk processing: {e}")
916
+ # import traceback
917
+ # traceback.print_exc()
918
+ # await asyncio.sleep(0.1)
919
+
920
+ # async def _monitor_for_auto_final(self):
921
+ # """Monitor for auto-final conditions - Enhanced logging"""
922
+ # logger.info("Starting auto-final monitoring")
923
+
924
+ # while self.running:
925
+ # try:
926
+ # current_time = time.time()
927
+
928
+ # if (self.interim_count >= self.min_interim_count and
929
+ # self.last_interim_time is not None and
930
+ # (current_time - self.last_interim_time) >= self.silence_timeout and
931
+ # not self.final_sent and
932
+ # self.accumulated_transcript.strip()):
933
+
934
+ # silence_duration = current_time - self.last_interim_time
935
+ # logger.info(f"Auto-final triggered for segment #{self.segment_number} - "
936
+ # f"Interim count: {self.interim_count}, Silence: {silence_duration:.1f}s")
937
+
938
+ # await self.send_transcription(self.accumulated_transcript, is_final=True)
939
+ # await self.start_new_segment()
940
+
941
+ # # Debug logging every 5 seconds
942
+ # if int(current_time) % 5 == 0:
943
+ # logger.debug(f"Auto-final status - Interim count: {self.interim_count}, "
944
+ # f"Last interim: {self.last_interim_time}, "
945
+ # f"Final sent: {self.final_sent}, "
946
+ # f"Transcript: '{self.accumulated_transcript[:50]}...'")
947
+
948
+ # await asyncio.sleep(0.5) # Check every 500ms
949
+
950
+ # except Exception as e:
951
+ # logger.error(f"Error in auto-final monitoring: {e}")
952
+ # await asyncio.sleep(0.5)
953
+
954
+ # async def _process_audio_chunks(self):
955
+ # """Process audio chunks for interim results - FIXED for both models"""
956
+ # model_name = "NeMo" if self.use_nemo else "Whisper"
957
+ # logger.info(f"Starting audio chunk processing for {model_name}")
958
+
959
+ # while self.running:
960
+ # try:
961
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
962
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
963
+ # if chunk_signal is not None:
964
+ # all_audio = self.audio_buffer.get_all_audio()
965
+
966
+ # # Debug logging
967
+ # logger.debug(f"Processing chunk - Total audio: {len(all_audio)} samples")
968
+
969
+ # if len(all_audio) > 0:
970
+ # # Get the latest chunk for VAD check
971
+ # latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
972
+ # latest_chunk = all_audio[latest_chunk_start:]
973
+
974
+ # # Check for speech activity
975
+ # has_speech = self.audio_buffer.is_speech(latest_chunk)
976
+ # logger.debug(f"Speech detection result: {has_speech}")
977
+
978
+ # if has_speech:
979
+ # logger.info(f"Processing audio with {model_name} - {len(all_audio)} samples")
980
+
981
+ # loop = asyncio.get_event_loop()
982
+ # start_time = time.time()
983
+
984
+ # try:
985
+ # # Choose transcription method based on model selection
986
+ # if self.use_nemo:
987
+ # transcription = await loop.run_in_executor(
988
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
989
+ # )
990
+ # else:
991
+ # # For Whisper, ensure we have enough audio
992
+ # if len(all_audio) >= int(0.5 * 16000): # At least 0.5 seconds at 16kHz
993
+ # transcription = await loop.run_in_executor(
994
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
995
+ # )
996
+ # else:
997
+ # transcription = ""
998
+ # logger.debug("Whisper: Not enough audio for transcription")
999
+
1000
+ # process_time = time.time() - start_time
1001
+ # logger.info(f"{model_name} processing took {process_time:.2f}s, result: '{transcription}'")
1002
+
1003
+ # if transcription and transcription.strip():
1004
+ # self.processing_count += 1
1005
+ # self.accumulated_transcript = transcription
1006
+
1007
+ # if transcription != self.last_partial or self.interim_count == 0:
1008
+ # self.last_partial = transcription
1009
+ # self.interim_count += 1
1010
+ # self.last_interim_time = time.time()
1011
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
1012
+ # else:
1013
+ # self.last_interim_time = time.time()
1014
+ # logger.debug("Same transcription, updating time only")
1015
+ # else:
1016
+ # logger.debug(f"{model_name} returned empty transcription")
1017
+
1018
+ # except Exception as e:
1019
+ # logger.error(f"Error in {model_name} transcription: {e}")
1020
+ # else:
1021
+ # logger.debug("No speech detected in latest chunk")
1022
+
1023
+ # # Different sleep intervals for different models
1024
+ # sleep_interval = 0.1 if self.use_nemo else 0.2 # Whisper can be less frequent
1025
+ # await asyncio.sleep(sleep_interval)
1026
+
1027
+ # except Exception as e:
1028
+ # logger.error(f"Error in chunk processing: {e}")
1029
+ # import traceback
1030
+ # traceback.print_exc()
1031
+ # await asyncio.sleep(1) # Longer sleep on error
1032
+
1033
+     # Active implementation of _process_audio_chunks, with per-chunk debug logging:
1034
+
1035
+ async def _process_audio_chunks(self):
1036
+ """Process audio chunks for interim results - with debugging"""
1037
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1038
+ logger.info(f"Starting audio chunk processing for {model_name}")
1039
+
1040
+ chunk_count = 0
1041
+
1042
+ while self.running:
1043
+ try:
1044
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
1045
+ chunk_count += 1
1046
+ logger.info(f"{model_name} - Processing chunk #{chunk_count}")
1047
+
1048
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
1049
+ if chunk_signal is not None:
1050
+ all_audio = self.audio_buffer.get_all_audio()
1051
+
1052
+ logger.info(f"{model_name} - Got {len(all_audio)} samples for processing")
1053
+
1054
+ if len(all_audio) > 0:
1055
+ # Get the latest chunk for VAD check
1056
+ latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
1057
+ latest_chunk = all_audio[latest_chunk_start:]
1058
+
1059
+ # Check for speech activity
1060
+ has_speech = self.audio_buffer.is_speech(latest_chunk)
1061
+ logger.info(f"{model_name} - Speech detected: {has_speech}")
1062
+
1063
+ if has_speech:
1064
+ logger.info(f"{model_name} - Starting transcription...")
1065
+
1066
+ loop = asyncio.get_event_loop()
1067
+ start_time = time.time()
1068
+
1069
+ try:
1070
+ # Choose transcription method based on model selection
1071
+ if self.use_nemo:
1072
+ transcription = await loop.run_in_executor(
1073
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
1074
+ )
1075
+ else:
1076
+ transcription = await loop.run_in_executor(
1077
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
1078
+ )
1079
+
1080
+ process_time = time.time() - start_time
1081
+ logger.info(f"{model_name} - Transcription completed in {process_time:.2f}s: '{transcription}'")
1082
+
1083
+ if transcription and transcription.strip():
1084
+ self.processing_count += 1
1085
+ self.accumulated_transcript = transcription
1086
+
1087
+ if transcription != self.last_partial or self.interim_count == 0:
1088
+ self.last_partial = transcription
1089
+ self.interim_count += 1
1090
+ self.last_interim_time = time.time()
1091
+ logger.info(f"{model_name} - Updated interim_count to {self.interim_count}")
1092
+ else:
1093
+ self.last_interim_time = time.time()
1094
+ logger.info(f"{model_name} - Same transcription, updating time only")
1095
+ else:
1096
+ logger.info(f"{model_name} - No transcription result")
1097
+
1098
+ except Exception as e:
1099
+ logger.error(f"{model_name} - Transcription error: {e}")
1100
+ import traceback
1101
+ traceback.print_exc()
1102
+ else:
1103
+ logger.debug(f"{model_name} - No speech in chunk")
1104
+ else:
1105
+ logger.warning(f"{model_name} - Chunk signal was None")
1106
+ else:
1107
+ # Log why chunk is not ready
1108
+ if self.audio_buffer:
1109
+ current_size = len(self.audio_buffer.buffer)
1110
+ required_size = self.audio_buffer.chunk_samples
1111
+ if current_size > 0:
1112
+ logger.debug(f"{model_name} - Buffer: {current_size}/{required_size} samples")
1113
+
1114
+ await asyncio.sleep(0.1)
1115
+
1116
+ except Exception as e:
1117
+ logger.error(f"{model_name} - Error in chunk processing: {e}")
1118
+ import traceback
1119
+ traceback.print_exc()
1120
+ await asyncio.sleep(1)
1121
+
1122
+ async def _monitor_for_auto_final(self):
1123
+ """Monitor for auto-final conditions with model-specific timeouts"""
1124
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1125
+ timeout = 2.0 if self.use_nemo else 3.0 # Longer timeout for Whisper
1126
+
1127
+ logger.info(f"Starting auto-final monitoring for {model_name} (timeout: {timeout}s)")
1128
+
1129
+ while self.running:
1130
+ try:
1131
+ current_time = time.time()
1132
+
1133
+ if (self.interim_count >= self.min_interim_count and
1134
+ self.last_interim_time is not None and
1135
+ (current_time - self.last_interim_time) >= timeout and
1136
+ not self.final_sent and
1137
+ self.accumulated_transcript.strip()):
1138
+
1139
+ silence_duration = current_time - self.last_interim_time
1140
+ logger.info(f"Auto-final triggered for segment #{self.segment_number} ({model_name}) - "
1141
+ f"Interim count: {self.interim_count}, Silence: {silence_duration:.1f}s")
1142
+
1143
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
1144
+ await self.start_new_segment()
1145
+
1146
+ await asyncio.sleep(0.5) # Check every 500ms
1147
+
1148
+ except Exception as e:
1149
+ logger.error(f"Error in auto-final monitoring: {e}")
1150
+ await asyncio.sleep(0.5)
1151
+
1152
+
1153
+
1154
+ async def send_transcription(self, text, is_final=True, confidence=0.9):
1155
+ """Send transcription in jambonz format"""
1156
+ try:
1157
+ # Apply number conversion only for Whisper
1158
+ if not self.use_nemo and is_final:
1159
+ original_text = text
1160
+ converted_text = convert_arabic_numbers_whisper(text)
1161
+
1162
+ if original_text != converted_text:
1163
+ logger.info(f"Whisper - Arabic numbers converted: '{original_text}' -> '{converted_text}'")
1164
+ text = converted_text
1165
+
1166
+ message = {
1167
+ "type": "transcription",
1168
+ "is_final": True, # Always send as final
1169
+ "alternatives": [
1170
+ {
1171
+ "transcript": text,
1172
+ "confidence": confidence
1173
+ }
1174
+ ],
1175
+ "language": self.config.get("language", "ar-EG"),
1176
+ "channel": 1
1177
+ }
1178
+
1179
+ await self.websocket.send(json.dumps(message))
1180
+ self.final_sent = True
1181
+
1182
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1183
+ logger.info(f"Sent FINAL transcription ({model_name}): '{text}'")
1184
+
1185
+ except Exception as e:
1186
+ logger.error(f"Error sending transcription: {e}")
1187
+
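+     # On the wire, a final result looks like (illustrative values):
+     #   {"type": "transcription", "is_final": true,
+     #    "alternatives": [{"transcript": "مرحبا", "confidence": 0.9}],
+     #    "language": "ar-EG", "channel": 1}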
1188
+ async def send_error(self, error_message):
1189
+ """Send error message in jambonz format"""
1190
+ try:
1191
+ message = {
1192
+ "type": "error",
1193
+ "error": error_message
1194
+ }
1195
+ await self.websocket.send(json.dumps(message))
1196
+ logger.error(f"Sent error: {error_message}")
1197
+ except Exception as e:
1198
+ logger.error(f"Error sending error message: {e}")
1199
+
1200
+ async def handle_jambonz_websocket(websocket):
1201
+ """Handle jambonz WebSocket connections"""
1202
+
1203
+ client_id = f"jambonz_{id(websocket)}"
1204
+ logger.info(f"New unified STT connection: {client_id}")
1205
+
1206
+ handler = UnifiedSTTHandler(websocket)
1207
+
1208
+ try:
1209
+ async for message in websocket:
1210
+ try:
1211
+ if isinstance(message, str):
1212
+ data = json.loads(message)
1213
+ message_type = data.get("type")
1214
+
1215
+ if message_type == "start":
1216
+ logger.info(f"Received start message: {data}")
1217
+ await handler.start_processing(data)
1218
+
1219
+ elif message_type == "stop":
1220
+ logger.info("Received stop message - closing WebSocket")
1221
+ await handler.stop_processing()
1222
+ await websocket.close(code=1000, reason="Session stopped by client")
1223
+ break
1224
+
1225
+ else:
1226
+ logger.warning(f"Unknown message type: {message_type}")
1227
+ await handler.send_error(f"Unknown message type: {message_type}")
1228
+
1229
+ else:
1230
+ # Handle binary audio data
1231
+ if not handler.running or handler.audio_buffer is None:
1232
+ logger.warning("Received audio data outside of active session")
1233
+ await handler.send_error("Received audio before start message or after stop")
1234
+ continue
1235
+
1236
+ await handler.add_audio_data(message)
1237
+
1238
+ except json.JSONDecodeError as e:
1239
+ logger.error(f"JSON decode error: {e}")
1240
+ await handler.send_error(f"Invalid JSON: {str(e)}")
1241
+ except Exception as e:
1242
+ logger.error(f"Error processing message: {e}")
1243
+ await handler.send_error(f"Processing error: {str(e)}")
1244
+
1245
+ except websockets.exceptions.ConnectionClosed:
1246
+ logger.info(f"Unified STT connection closed: {client_id}")
1247
+ except Exception as e:
1248
+ logger.error(f"Unified STT WebSocket error: {e}")
1249
+ try:
1250
+ await handler.send_error(str(e))
1251
+ except Exception:
1252
+ pass
1253
+ finally:
1254
+ if handler.running:
1255
+ await handler.stop_processing()
1256
+ logger.info(f"Unified STT connection ended: {client_id}")
1257
+
1258
+ async def main():
1259
+ """Start the Unified Arabic STT WebSocket server"""
1260
+ logger.info("Starting Unified Arabic STT WebSocket server on port 3007...")
1261
+
1262
+ # Check model availability
1263
+ models_available = []
1264
+ if asr_model_nemo is not None:
1265
+ models_available.append("NeMo FastConformer (ar-EG)")
1266
+ if whisper_model is not None:
1267
+ models_available.append("Whisper large-v3 (ar-EG-whis)")
1268
+
1269
+ if not models_available:
1270
+ logger.error("No models available! Please check model paths and installations.")
1271
+ return
1272
+
1273
+ # Start WebSocket server
1274
+ server = await websockets.serve(
1275
+ handle_jambonz_websocket,
1276
+ "0.0.0.0",
1277
+ 3007,
1278
+ ping_interval=20,
1279
+ ping_timeout=10,
1280
+ close_timeout=10
1281
+ )
1282
+
1283
+ logger.info("Unified Arabic STT WebSocket server started on ws://0.0.0.0:3007")
1284
+ logger.info("Ready to handle jambonz STT requests with both models")
1285
+ logger.info("ROUTING:")
1286
+ logger.info("- language: 'ar-EG' → NeMo FastConformer (with built-in number conversion)")
1287
+ logger.info("- language: 'ar-EG-whis' → Whisper large-v3 (with pyarabic number conversion)")
1288
+ logger.info("FEATURES:")
1289
+ logger.info("- Continuous transcription with segmentation")
1290
+ logger.info("- Voice Activity Detection")
1291
+ logger.info("- Auto-final detection (2s silence timeout)")
1292
+ logger.info("- Model-specific number conversion")
1293
+ logger.info(f"AVAILABLE MODELS: {', '.join(models_available)}")
1294
+
1295
+ # Wait for the server to close
1296
+ await server.wait_closed()
1297
+
1298
+ if __name__ == "__main__":
1299
+ print("=" * 80)
1300
+ print("Unified Arabic STT Server (NeMo + Whisper)")
1301
+ print("=" * 80)
1302
+ print("WebSocket Port: 3007")
1303
+ print("Protocol: jambonz STT API")
1304
+ print("Audio Format: LINEAR16 PCM @ 8kHz → 16kHz")
1305
+ print()
1306
+ print("LANGUAGE ROUTING:")
1307
+ print("- 'ar-EG' → NeMo FastConformer")
1308
+ print(" • Built-in Arabic number word to digit conversion")
1309
+ print(" • Optimized for Arabic dialects")
1310
+ print("- 'ar-EG-whis' → Whisper large-v3")
1311
+ print(" • pyarabic library number conversion (final transcripts only)")
1312
+ print(" • OpenAI Whisper model")
1313
+ print()
1314
+ print("FEATURES:")
1315
+ print("- Automatic model selection based on language parameter")
1316
+ print("- Voice Activity Detection")
1317
+ print("- Auto-final detection (2 seconds silence)")
1318
+ print("- Model-specific number conversion strategies")
1319
+ print("- Continuous transcription with segmentation")
1320
+ print()
1321
+
1322
+ # Check model availability for startup info
1323
+ nemo_status = "✓ Available" if asr_model_nemo is not None else "✗ Not Available"
1324
+ whisper_status = "✓ Available" if whisper_model is not None else "✗ Not Available"
1325
+ arabic_numbers_status = "✓ Available" if arabic_numbers_available else "✗ Not Available (install pyarabic)"
1326
+
1327
+ print("MODEL STATUS:")
1328
+ print(f"- NeMo FastConformer: {nemo_status}")
1329
+ print(f"- Whisper large-v3: {whisper_status}")
1330
+ print(f"- pyarabic (Whisper numbers): {arabic_numbers_status}")
1331
+ print("=" * 80)
1332
+
1333
+ try:
1334
+ asyncio.run(main())
1335
+ except KeyboardInterrupt:
1336
+ print("\nShutting down unified server...")
1337
+ except Exception as e:
1338
+ print(f"Server error: {e}")
best_nemo_whisper_jambonz_denoiser.py ADDED
@@ -0,0 +1,1357 @@
1
+ import torch
2
+ import asyncio
3
+ import websockets
4
+ import json
5
+ import threading
6
+ import numpy as np
7
+ import logging
8
+ import time
9
+ import tempfile
10
+ import os
11
+ import re
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ import subprocess
14
+ import struct
15
+
16
+ # NeMo imports
17
+ import nemo.collections.asr as nemo_asr
18
+ import soundfile as sf
19
+
20
+ # Whisper imports
21
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
22
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
23
+
24
+
25
+ # Arabic number conversion imports for Whisper
26
+ try:
27
+ from pyarabic.number import text2number
28
+ arabic_numbers_available = True
29
+ print("✓ pyarabic library available for Whisper number conversion")
30
+ except ImportError:
31
+ arabic_numbers_available = False
32
+ print("✗ pyarabic not available - install with: pip install pyarabic")
33
+ print("Arabic numbers will not be converted to digits for Whisper")
34
+
35
+ # Set up logging
36
+ logging.basicConfig(level=logging.INFO)
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # ===== NeMo Arabic number mapping =====
40
+ arabic_numbers_nemo = {
41
+ # Basic digits
42
+ "سفر": "0", "فيرو": "0", "هيرو": "0","صفر": "0", "زيرو": "0", "٠": "0","زيو": "0","زير": "0","زير": "0","زر": "0","زروا": "0","زرا": "0","زيره ": "0","زرو ": "0",
43
+ "واحد": "1", "واحدة": "1", "١": "1",
44
+ "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
45
+ "تلاتة": "3", "ثلاثة": "3", "٣": "3","تلاته": "3","ثلاثه": "3","ثلاثا": "3","تلاتا": "3",
46
+ "اربعة": "4", "أربعة": "4", "٤": "4","اربعه": "4","أربعه": "4","أربع": "4","اربع": "4","اربعا": "4","أربعا": "4",
47
+ "خمسة": "5", "خمسه": "5", "٥": "5", "خمس": "5", "خمسا": "5",
48
+ "ستة": "6", "سته": "6", "٦": "6", "ست": "6", "ستّا": "6", "ستةً": "6",
49
+ "سبعة": "7", "سبعه": "7", "٧": "7", "سبع": "7", "سبعا": "7",
50
+ "ثمانية": "8", "ثمانيه": "8", "٨": "8", "ثمان": "8", "ثمنية": "8", "ثمنيه": "8", "ثمانيا": "8", "ثمن": "8",
51
+ "تسعة": "9", "تسعه": "9", "٩": "9", "تسع": "9", "تسعا": "9",
52
+
53
+ # Teens
54
+ "عشرة": "10", "١٠": "10",
55
+ "حداشر": "11", "احد عشر": "11","احداشر": "11",
56
+ "اتناشر": "12", "اثنا عشر": "12",
57
+ "تلتاشر": "13", "ثلاثة عشر": "13",
58
+ "اربعتاشر": "14", "أربعة عشر": "14",
59
+ "خمستاشر": "15", "خمسة عشر": "15",
60
+ "ستاشر": "16", "ستة عشر": "16",
61
+ "سبعتاشر": "17", "سبعة عشر": "17",
62
+ "طمنتاشر": "18", "ثمانية عشر": "18",
63
+ "تسعتاشر": "19", "تسعة عشر": "19",
64
+
65
+ # Tens
66
+ "عشرين": "20", "٢٠": "20",
67
+ "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
68
+ "اربعين": "40", "أربعين": "40", "٤٠": "40",
69
+ "خمسين": "50", "٥٠": "50",
70
+ "ستين": "60", "٦٠": "60",
71
+ "سبعين": "70", "٧٠": "70",
72
+ "تمانين": "80", "ثمانين": "80", "٨٠": "80","تمانون": "80","ثمانون": "80",
73
+ "تسعين": "90", "٩٠": "90",
74
+
75
+ # Hundreds
76
+ "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
77
+ "ميتين": "200", "مائتين": "200",
78
+ "تلاتمية": "300", "ثلاثمائة": "300",
79
+ "اربعمية": "400", "أربعمائة": "400",
80
+ "خمسمية": "500", "خمسمائة": "500",
81
+ "ستمية": "600", "ستمائة": "600",
82
+ "سبعمية": "700", "سبعمائة": "700",
83
+ "تمانمية": "800", "ثمانمائة": "800",
84
+ "تسعمية": "900", "تسعمائة": "900",
85
+
86
+ # Thousands
87
+ "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
88
+ "ألفين": "2000", "الفين": "2000",
89
+ "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
90
+ "اربعة آلاف": "4000", "أربعة آلاف": "4000",
91
+ "خمسة آلاف": "5000",
92
+ "ستة آلاف": "6000",
93
+ "سبعة آلاف": "7000",
94
+ "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
95
+ "تسعة آلاف": "9000",
96
+
97
+ # Large numbers
98
+ "عشرة آلاف": "10000",
99
+ "مية ألف": "100000", "مائة ألف": "100000",
100
+ "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
101
+ "ملايين": "1000000",
102
+ "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000"
103
+ }
104
+
105
+ def replace_arabic_numbers_nemo(text: str) -> str:
106
+ """Convert Arabic number words to digits for NeMo"""
107
+ for word, digit in arabic_numbers_nemo.items():
108
+ text = re.sub(rf"\b{word}\b", digit, text)
109
+ return text
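A quick illustration (output inferred from the mapping above): replace_arabic_numbers_nemo("عندي خمسة كتب") returns "عندي 5 كتب". Note that prefixed forms such as "وخمسة" are left unchanged, because \b requires a boundary and Arabic letters count as word characters in Python's re module.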
110
+
111
+ def convert_arabic_numbers_whisper(sentence: str) -> str:
112
+ """
113
+ Replace Arabic number words in a sentence with digits for Whisper,
114
+ preserving all other words and punctuation.
115
+ """
116
+ if not arabic_numbers_available or not sentence.strip():
117
+ return sentence
118
+
119
+ try:
120
+ # Normalization step
121
+ replacements = {
122
+ "اربعة": "أربعة", "اربع": "أربع", "اثنين": "اثنان",
123
+ "اتنين": "اثنان", "ثلاث": "ثلاثة", "خمس": "خمسة",
124
+ "ست": "ستة", "سبع": "سبعة", "ثمان": "ثمانية",
125
+ "تسع": "تسعة", "عشر": "عشرة",
126
+ }
127
+ for wrong, correct in replacements.items():
128
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
129
+
130
+ # Split by whitespace but keep spaces
131
+ words = re.split(r'(\s+)', sentence)
132
+ converted_words = []
133
+
134
+ for word in words:
135
+ stripped = word.strip()
136
+ if not stripped: # skip spaces
137
+ converted_words.append(word)
138
+ continue
139
+
140
+ try:
141
+ num = text2number(stripped)
142
+ if isinstance(num, int):
143
+ if num != 0 or stripped == "صفر":
144
+ converted_words.append(str(num))
145
+ else:
146
+ converted_words.append(word)
147
+ else:
148
+ converted_words.append(word)
149
+ except Exception:
150
+ converted_words.append(word)
151
+
152
+ return ''.join(converted_words)
153
+
154
+ except Exception as e:
155
+ logger.warning(f"Error converting Arabic numbers: {e}")
156
+ return sentence
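A minimal check of the helper, assuming pyarabic is installed (expected output inferred from the token-by-token logic above):

    print(convert_arabic_numbers_whisper("عندي خمسة كتب"))  # -> عندي 5 كتب

Multi-word numerals split by the whitespace tokenization (e.g. "خمسة وعشرون") are converted token by token rather than parsed as one value.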
157
+
158
+ # Global models
159
+ asr_model_nemo = None
160
+ whisper_model = None
161
+ whisper_processor = None
162
+ whisper_tokenizer = None
163
+ device = None
164
+ torch_dtype = None
166
+ from denoiser import pretrained
167
+
168
+
169
+ def initialize_models():
170
+ """Initialize both NeMo and Whisper models"""
171
+ global asr_model_nemo, whisper_model, whisper_processor, whisper_tokenizer, device, torch_dtype, denoiser_model
172
+
173
+ # Initialize device settings
174
+ device = "cuda" if torch.cuda.is_available() else "cpu"
175
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
177
+
178
+ # Load DNS64 pretrained model (auto-downloads if not cached)
179
+ denoiser_model = pretrained.dns64().to(device)
180
+ denoiser_model.eval()
181
+ logger.info(f"Using device: {device}")
182
+ logger.info(f"CUDA available: {torch.cuda.is_available()}")
183
+
184
+ # Initialize NeMo model
185
+ logger.info("Loading NeMo FastConformer Arabic ASR model...")
186
+ model_path = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
187
+
188
+ if os.path.exists(model_path):
189
+ try:
190
+ asr_model_nemo = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
191
+ asr_model_nemo.eval()
192
+ logger.info("✓ NeMo FastConformer model loaded successfully")
193
+ except Exception as e:
194
+ logger.error(f"Failed to load NeMo model: {e}")
195
+ asr_model_nemo = None
196
+ else:
197
+ logger.warning(f"NeMo model not found at: {model_path}")
198
+ asr_model_nemo = None
199
+
200
+ # Initialize Whisper model
201
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
202
+
203
+ logger.info("Loading Whisper large-v3 model...")
204
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"
205
+
206
+ try:
207
+ # Try with flash attention first
208
+ try:
209
+ import flash_attn
210
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
211
+ MODEL_NAME,
212
+ torch_dtype=torch_dtype,
213
+ low_cpu_mem_usage=True,
214
+ use_safetensors=True,
215
+ attn_implementation="flash_attention_2"
216
+ )
217
+ logger.info("✓ Whisper loaded with flash attention")
218
+ except Exception:
219
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
220
+ MODEL_NAME,
221
+ torch_dtype=torch_dtype,
222
+ low_cpu_mem_usage=True,
223
+ use_safetensors=True
224
+ )
225
+ logger.info("✓ Whisper loaded with standard attention")
226
+
227
+ whisper_model.to(device)
228
+ whisper_processor = AutoProcessor.from_pretrained(MODEL_NAME)
229
+
230
+ # Use processor.tokenizer, don’t reload separately
231
+ whisper_tokenizer = whisper_processor.tokenizer
232
+
233
+ logger.info("✓ Whisper model + tokenizer loaded successfully")
234
+
235
+ except Exception as e:
236
+ logger.error(f"Failed to load Whisper model: {e}")
237
+ whisper_model = None
238
+
239
+
240
+
241
+
242
+ # logger.info("Loading Whisper large-v3-turbo model...")
243
+ # MODEL_NAME = "openai/whisper-large-v3-turbo"
244
+
245
+ # try:
246
+ # # Try with flash attention first
247
+ # try:
248
+ # import flash_attn
249
+ # whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
250
+ # MODEL_NAME,
251
+ # torch_dtype=torch_dtype,
252
+ # low_cpu_mem_usage=True,
253
+ # use_safetensors=True,
254
+ # attn_implementation="flash_attention_2"
255
+ # )
256
+ # logger.info("✓ Whisper loaded with flash attention")
257
+ # except:
258
+ # whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
259
+ # MODEL_NAME,
260
+ # torch_dtype=torch_dtype,
261
+ # low_cpu_mem_usage=True,
262
+ # use_safetensors=True
263
+ # )
264
+ # logger.info("✓ Whisper loaded with standard attention")
265
+
266
+ # whisper_model.to(device)
267
+ # whisper_processor = AutoProcessor.from_pretrained(MODEL_NAME)
268
+ # whisper_tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
269
+ # logger.info("✓ Whisper model loaded successfully")
270
+
271
+ # except Exception as e:
272
+ # logger.error(f"Failed to load Whisper model: {e}")
273
+ # whisper_model = None
274
+
275
+ # Initialize models on startup
276
+ initialize_models()
277
+ def denoise_audio(audio_data, sample_rate=16000):
278
+ """Apply denoising using facebook/denoiser pretrained model."""
279
+ if denoiser_model is None or len(audio_data) == 0:
280
+ return audio_data
281
+ try:
282
+ audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device=device).unsqueeze(0)
283
+ with torch.no_grad():
284
+ # Demucs' forward takes a (batch, channels, time) tensor and has no sample_rate kwarg;
+ # audio_tensor is (channels, time), so add a batch dim (dns64 expects 16 kHz input)
+ denoised_tensor = denoiser_model(audio_tensor.unsqueeze(0))[0]
285
+ return denoised_tensor.squeeze().cpu().numpy().astype("float32")
286
+ except Exception as e:
287
+ print(f"[WARN] Denoiser failed: {e}")
288
+ return audio_data
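For reference, the denoiser package's documented usage pattern (which the corrected call above follows) is roughly this; torchaudio and the file name are assumptions:

    import torch, torchaudio
    from denoiser import pretrained
    from denoiser.dsp import convert_audio

    model = pretrained.dns64().eval()
    wav, sr = torchaudio.load("noisy.wav")
    wav = convert_audio(wav, sr, model.sample_rate, model.chin)  # 16 kHz mono for dns64
    with torch.no_grad():
        denoised = model(wav[None])[0]  # (batch, channels, time) in -> (channels, time) out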
289
+ # Thread pool for processing
290
+ executor = ThreadPoolExecutor(max_workers=4)
291
+
292
+ # class JambonzAudioBuffer:
293
+ # def __init__(self, sample_rate=8000, chunk_duration=1.0):
294
+ # self.sample_rate = sample_rate
295
+ # self.chunk_duration = chunk_duration
296
+ # self.chunk_samples = int(chunk_duration * sample_rate)
297
+
298
+ # self.buffer = np.array([], dtype=np.float32)
299
+ # self.lock = threading.Lock()
300
+ # self.total_audio = np.array([], dtype=np.float32)
301
+
302
+ # # Voice Activity Detection
303
+ # self.silence_threshold = 0.05
304
+ # self.min_speech_samples = int(0.5 * sample_rate)
305
+
306
+ # def add_audio(self, audio_data):
307
+ # with self.lock:
308
+ # self.buffer = np.concatenate([self.buffer, audio_data])
309
+ # self.total_audio = np.concatenate([self.total_audio, audio_data])
310
+
311
+ # def has_chunk_ready(self):
312
+ # with self.lock:
313
+ # return len(self.buffer) >= self.chunk_samples
314
+
315
+ # def is_speech(self, audio_chunk):
316
+ # """Simple VAD based on energy"""
317
+ # if len(audio_chunk) < self.min_speech_samples:
318
+ # return False
319
+ # energy = np.mean(np.abs(audio_chunk))
320
+ # return energy > self.silence_threshold
321
+
322
+ # def get_chunk_for_processing(self):
323
+ # """Get audio chunk for processing"""
324
+ # with self.lock:
325
+ # if len(self.buffer) < self.chunk_samples:
326
+ # return None
327
+ # return np.array([1]) # Signal that chunk is ready
328
+
329
+ # def get_all_audio(self):
330
+ # """Get all accumulated audio"""
331
+ # with self.lock:
332
+ # return self.total_audio.copy()
333
+
334
+ # def clear(self):
335
+ # with self.lock:
336
+ # self.buffer = np.array([], dtype=np.float32)
337
+ # self.total_audio = np.array([], dtype=np.float32)
338
+
339
+ # def reset_for_new_segment(self):
340
+ # """Reset buffers for new transcription segment"""
341
+ # with self.lock:
342
+ # self.buffer = np.array([], dtype=np.float32)
343
+ # self.total_audio = np.array([], dtype=np.float32)
344
+
345
+ class JambonzAudioBuffer:
346
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
347
+ self.sample_rate = sample_rate
348
+ self.chunk_duration = chunk_duration
349
+ self.chunk_samples = int(chunk_duration * sample_rate)
350
+
351
+ self.buffer = np.array([], dtype=np.float32)
352
+ self.lock = threading.Lock()
353
+ self.total_audio = np.array([], dtype=np.float32)
354
+
355
+ # Voice Activity Detection - ADJUSTED FOR WHISPER
356
+ self.silence_threshold = 0.01 # Lower threshold for Whisper
357
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
358
+
359
+ def add_audio(self, audio_data):
360
+ with self.lock:
361
+ self.buffer = np.concatenate([self.buffer, audio_data])
362
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
363
+
364
+ # Log audio addition for debugging
365
+ logger.debug(f"Added {len(audio_data)} audio samples, total: {len(self.total_audio)}")
366
+
367
+ def has_chunk_ready(self):
368
+ with self.lock:
369
+ ready = len(self.buffer) >= self.chunk_samples
370
+ if ready:
371
+ logger.debug(f"Chunk ready: {len(self.buffer)} >= {self.chunk_samples}")
372
+ return ready
373
+
374
+ def is_speech(self, audio_chunk):
375
+ """Enhanced VAD based on energy - better for Whisper"""
376
+ if len(audio_chunk) < self.min_speech_samples:
377
+ logger.debug(f"Audio too short for VAD: {len(audio_chunk)} < {self.min_speech_samples}")
378
+ return False
379
+
380
+ # Calculate RMS energy
381
+ rms_energy = np.sqrt(np.mean(audio_chunk ** 2))
382
+
383
+ # Also check peak amplitude
384
+ peak_amplitude = np.max(np.abs(audio_chunk))
385
+
386
+ is_speech = rms_energy > self.silence_threshold or peak_amplitude > (self.silence_threshold * 2)
387
+
388
+ logger.debug(f"VAD check - RMS: {rms_energy:.4f}, Peak: {peak_amplitude:.4f}, "
389
+ f"Threshold: {self.silence_threshold}, Speech: {is_speech}")
390
+
391
+ return is_speech
392
+
393
+ def get_chunk_for_processing(self):
394
+ """Get audio chunk for processing"""
395
+ with self.lock:
396
+ if len(self.buffer) < self.chunk_samples:
397
+ return None
398
+
399
+ logger.debug(f"Returning processing signal, buffer size: {len(self.buffer)}")
400
+ return np.array([1]) # Signal that chunk is ready
401
+
402
+ def get_all_audio(self):
403
+ """Get all accumulated audio"""
404
+ with self.lock:
405
+ audio_copy = self.total_audio.copy()
406
+ logger.debug(f"Returning {len(audio_copy)} total audio samples")
407
+ return audio_copy
408
+
409
+ def clear(self):
410
+ with self.lock:
411
+ self.buffer = np.array([], dtype=np.float32)
412
+ self.total_audio = np.array([], dtype=np.float32)
413
+ logger.debug("Audio buffer cleared")
414
+
415
+ def reset_for_new_segment(self):
416
+ """Reset buffers for new transcription segment"""
417
+ with self.lock:
418
+ self.buffer = np.array([], dtype=np.float32)
419
+ self.total_audio = np.array([], dtype=np.float32)
420
+ logger.debug("Audio buffer reset for new segment")
421
+
422
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
423
+ """Convert LINEAR16 PCM bytes to numpy array"""
424
+ try:
425
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
426
+ audio_array = audio_array.astype(np.float32) / 32768.0
427
+ return audio_array
428
+ except Exception as e:
429
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
430
+ return np.array([], dtype=np.float32)
431
+
432
+ def resample_audio(audio_data, source_rate, target_rate):
433
+ """Resample audio to target sample rate"""
434
+ if source_rate == target_rate:
435
+ return audio_data
436
+
437
+ if source_rate == 8000 and target_rate == 16000:
438
+ # Simple 2x upsampling for common case
439
+ upsampled = np.repeat(audio_data, 2)
440
+ return upsampled.astype(np.float32)
441
+
442
+ # Fallback: Linear interpolation resampling
443
+ ratio = target_rate / source_rate
444
+ indices = np.arange(0, len(audio_data), 1/ratio)
445
+ indices = indices[indices < len(audio_data)]
446
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
447
+
448
+ return resampled.astype(np.float32)
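Both np.repeat duplication and linear interpolation introduce imaging artifacts above 4 kHz; if scipy is available, a filtered polyphase resampler is a drop-in alternative (a sketch, not what this file ships):

    from scipy.signal import resample_poly

    def resample_audio_polyphase(audio_data, source_rate, target_rate):
        if source_rate == target_rate:
            return audio_data
        # resample_poly applies an anti-aliasing/anti-imaging FIR internally
        return resample_poly(audio_data, target_rate, source_rate).astype(np.float32)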
449
+
450
+ def transcribe_with_nemo(audio_data, source_sample_rate=8000, target_sample_rate=16000):
451
+ """Transcribe audio using NeMo FastConformer"""
452
+ try:
453
+ if len(audio_data) == 0 or asr_model_nemo is None:
454
+ return ""
455
+
456
+ # Resample to 16kHz (NeMo models typically expect 16kHz)
457
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
458
+ # --- Denoiser added ---
459
+ resampled_audio = denoise_audio(resampled_audio, sample_rate=target_sample_rate)
460
+ # Skip very short audio
461
+ min_samples = int(0.3 * target_sample_rate)
462
+ if len(resampled_audio) < min_samples:
463
+ return ""
464
+
465
+ start_time = time.time()
466
+
467
+ # Save audio to temporary file (NeMo expects file path)
468
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
469
+ sf.write(tmp_file.name, resampled_audio, target_sample_rate)
470
+ tmp_path = tmp_file.name
471
+
472
+ try:
473
+ # Transcribe with NeMo
474
+ result = asr_model_nemo.transcribe([tmp_path])
475
+
476
+ if result and len(result) > 0:
477
+ # Handle different NeMo result formats
478
+ if hasattr(result[0], 'text'):
479
+ raw_text = result[0].text
480
+ elif isinstance(result[0], str):
481
+ raw_text = result[0]
482
+ else:
483
+ raw_text = str(result[0])
484
+
485
+ if not isinstance(raw_text, str):
486
+ raw_text = str(raw_text)
487
+
488
+ if raw_text and raw_text.strip():
489
+ # Convert Arabic numbers to digits for NeMo
490
+ cleaned_text = replace_arabic_numbers_nemo(raw_text)
491
+ end_time = time.time()
492
+
493
+ if cleaned_text.strip():
494
+ logger.info(f"NeMo transcription: '{cleaned_text}' (processed in {end_time - start_time:.2f}s)")
495
+
496
+ return cleaned_text.strip()
497
+
498
+ finally:
499
+ # Clean up temporary file
500
+ if os.path.exists(tmp_path):
501
+ os.remove(tmp_path)
502
+
503
+ return ""
504
+
505
+ except Exception as e:
506
+ logger.error(f"Error during NeMo transcription: {e}")
507
+ return ""
508
+
509
+ def transcribe_with_whisper(audio_data, source_sample_rate=8000, target_sample_rate=16000):
510
+ """Transcribe audio chunk using Whisper model directly"""
511
+ try:
512
+ if len(audio_data) == 0 or whisper_model is None:
513
+ return ""
514
+
515
+ # Resample from 8kHz to 16kHz for Whisper
516
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
517
+
518
+ # Ensure minimum length for Whisper
519
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
520
+ if len(resampled_audio) < min_samples:
521
+ return ""
522
+
523
+ start_time = time.time()
524
+
525
+ # Prepare input features with proper dtype
526
+ input_features = whisper_processor(
527
+ resampled_audio,
528
+ sampling_rate=target_sample_rate,
529
+ return_tensors="pt"
530
+ ).input_features
531
+
532
+ # Ensure correct dtype and device
533
+ input_features = input_features.to(device=device, dtype=torch_dtype)
534
+
535
+ # Create attention mask to avoid warnings
536
+ attention_mask = torch.ones(
537
+ input_features.shape[:-1],
538
+ dtype=torch.long,
539
+ device=device
540
+ )
541
+
542
+ # Generate transcription using model directly
543
+ with torch.no_grad():
544
+ predicted_ids = whisper_model.generate(
545
+ input_features,
546
+ attention_mask=attention_mask,
547
+ max_new_tokens=128,
548
+ do_sample=False,
549
+ # temperature=0.0,
550
+ num_beams=1,
551
+ language="english",
552
+ task="translate",
553
+ pad_token_id=whisper_tokenizer.pad_token_id,
554
+ eos_token_id=whisper_tokenizer.eos_token_id
555
+ )
556
+
557
+ # Decode the transcription
558
+ transcription = whisper_tokenizer.batch_decode(
559
+ predicted_ids,
560
+ skip_special_tokens=True
561
+ )[0].strip()
562
+
563
+ end_time = time.time()
564
+
565
+ logger.info(f"Whisper transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
566
+ return transcription
567
+
568
+ except Exception as e:
569
+ logger.error(f"Error during Whisper transcription: {e}")
570
+ return ""
571
+
572
+ class UnifiedSTTHandler:
573
+ def __init__(self, websocket):
574
+ self.websocket = websocket
575
+ self.audio_buffer = None
576
+ self.config = {}
577
+ self.running = False
578
+ self.transcription_task = None
579
+ self.use_nemo = False # Flag to determine which model to use
580
+
581
+ # Auto-final detection variables
582
+ self.interim_count = 0
583
+ self.last_interim_time = None
584
+ self.silence_timeout = 2.9 # note: the active _monitor_for_auto_final uses per-model timeouts (2.0s NeMo / 3.0s Whisper) instead
585
+ self.min_interim_count = 1
586
+ self.auto_final_task = None
587
+ self.accumulated_transcript = ""
588
+ self.final_sent = False
589
+ self.segment_number = 0
590
+ self.last_partial = ""
591
+
592
+ # Processing tracking
593
+ self.processing_count = 0
594
+
595
+ # Debug-instrumented audio ingestion (kept for reference; the plain add_audio_data defined later in this class overrides it)
596
+
597
+ async def add_audio_data(self, audio_bytes):
598
+ """Add audio data to buffer with enhanced debugging"""
599
+ if self.audio_buffer and self.running:
600
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
601
+ self.audio_buffer.add_audio(audio_data)
602
+
603
+ model_name = "NeMo" if self.use_nemo else "Whisper"
604
+
605
+ # Debug logging every few audio packets
606
+ if len(audio_data) > 0:
607
+ total_samples = len(self.audio_buffer.get_all_audio())
608
+ total_seconds = total_samples / self.config["sample_rate"]
609
+
610
+ # Log every second of audio
611
+ if int(total_seconds) != getattr(self, '_last_logged_second', -1):
612
+ logger.info(f"{model_name} - Accumulated {total_seconds:.1f}s of audio ({total_samples} samples)")
613
+ self._last_logged_second = int(total_seconds)
614
+
615
+ # Check if we should have chunks ready
616
+ chunk_ready = self.audio_buffer.has_chunk_ready()
617
+ logger.info(f"{model_name} - Chunk ready: {chunk_ready}")
618
+ # async def start_processing(self, start_message):
619
+ # """Initialize with start message from jambonz"""
620
+ # self.config = {
621
+ # "language": start_message.get("language", "ar-EG"),
622
+ # "format": start_message.get("format", "raw"),
623
+ # "encoding": start_message.get("encoding", "LINEAR16"),
624
+ # "sample_rate": start_message.get("sampleRateHz", 8000),
625
+ # "interim_results": True, # Always enable for internal processing
626
+ # "options": start_message.get("options", {})
627
+ # }
628
+
629
+ # # Determine which model to use based on language parameter
630
+ # language = self.config["language"]
631
+ # if language == "ar-EG":
632
+ # logger.info("nemooooooooooooooooooooooooooo")
633
+ # self.use_nemo = True
634
+ # model_name = "NeMo FastConformer"
635
+ # elif language == "ar-EG-whis":
636
+ # logger.info("whisperrrrrrrrrrrrrrrrrrrrrrrrrrrrr")
637
+ # self.use_nemo = False
638
+ # model_name = "Whisper large-v3"
639
+ # else:
640
+ # # Default to NeMo for any other Arabic variant
641
+ # self.use_nemo = True
642
+ # model_name = "NeMo FastConformer (default)"
643
+
644
+ # logger.info(f"STT session started with {model_name} for language: {language}")
645
+ # logger.info(f"Config: {self.config}")
646
+
647
+ # # Check if selected model is available
648
+ # if self.use_nemo and asr_model_nemo is None:
649
+ # await self.send_error("NeMo model not available")
650
+ # return
651
+ # elif not self.use_nemo and whisper_model is None:
652
+ # await self.send_error("Whisper model not available")
653
+ # return
654
+
655
+ # # Initialize audio buffer
656
+ # self.audio_buffer = JambonzAudioBuffer(
657
+ # sample_rate=self.config["sample_rate"],
658
+ # chunk_duration=1.0 # 1 second chunks
659
+ # )
660
+
661
+ # # Reset session variables
662
+ # self.running = True
663
+ # self.interim_count = 0
664
+ # self.last_interim_time = None
665
+ # self.accumulated_transcript = ""
666
+ # self.final_sent = False
667
+ # self.segment_number = 0
668
+ # self.processing_count = 0
669
+ # self.last_partial = ""
670
+
671
+ # # Start background transcription task
672
+ # self.transcription_task = asyncio.create_task(self._process_audio_chunks())
673
+
674
+ # # Start auto-final detection task
675
+ # self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
676
+
677
+ # Replace these methods in your UnifiedSTTHandler class
678
+
679
+ async def start_processing(self, start_message):
680
+ """Initialize with start message from jambonz"""
681
+ self.config = {
682
+ "language": start_message.get("language", "ar-EG"),
683
+ "format": start_message.get("format", "raw"),
684
+ "encoding": start_message.get("encoding", "LINEAR16"),
685
+ "sample_rate": start_message.get("sampleRateHz", 8000),
686
+ "interim_results": True, # Always enable for internal processing
687
+ "options": start_message.get("options", {})
688
+ }
689
+
690
+ # Determine which model to use based on language parameter
691
+ language = self.config["language"]
692
+ if language == "ar-EG":
693
+ logger.info("Selected NeMo FastConformer")
694
+ self.use_nemo = True
695
+ model_name = "NeMo FastConformer"
696
+ elif language == "ar-EG-whis":
697
+ logger.info("Selected Whisper large-v3")
698
+ self.use_nemo = False
699
+ model_name = "Whisper large-v3"
700
+ else:
701
+ # Default to NeMo for any other Arabic variant
702
+ self.use_nemo = True
703
+ model_name = "NeMo FastConformer (default)"
704
+
705
+ logger.info(f"STT session started with {model_name} for language: {language}")
706
+ logger.info(f"Config: {self.config}")
707
+
708
+ # Check if selected model is available
709
+ if self.use_nemo and asr_model_nemo is None:
710
+ await self.send_error("NeMo model not available")
711
+ return
712
+ elif not self.use_nemo and whisper_model is None:
713
+ await self.send_error("Whisper model not available")
714
+ return
715
+
716
+ # Initialize audio buffer with model-specific settings
717
+ if self.use_nemo:
718
+ chunk_duration = 1.0 # NeMo processes every 1 second
719
+ else:
720
+ chunk_duration = 2.0 # Whisper processes every 2 seconds for better accuracy
721
+
722
+ self.audio_buffer = JambonzAudioBuffer(
723
+ sample_rate=self.config["sample_rate"],
724
+ chunk_duration=chunk_duration
725
+ )
726
+
727
+ # Adjust VAD threshold for Whisper
728
+ if not self.use_nemo:
729
+ self.audio_buffer.silence_threshold = 0.005 # Lower threshold for Whisper
730
+
731
+ # Reset session variables
732
+ self.running = True
733
+ self.interim_count = 0
734
+ self.last_interim_time = None
735
+ self.accumulated_transcript = ""
736
+ self.final_sent = False
737
+ self.segment_number = 0
738
+ self.processing_count = 0
739
+ self.last_partial = ""
740
+
741
+ # Start background transcription task
742
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
743
+
744
+ # Start auto-final detection task
745
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
746
+
747
+ logger.info(f"Background tasks started for {model_name}")
748
+
749
+
750
+
751
+ async def stop_processing(self):
752
+ """Stop current processing session"""
753
+ logger.info("Stopping STT session...")
754
+ self.running = False
755
+
756
+ # Cancel background tasks
757
+ for task in [self.transcription_task, self.auto_final_task]:
758
+ if task:
759
+ task.cancel()
760
+ try:
761
+ await task
762
+ except asyncio.CancelledError:
763
+ pass
764
+
765
+ # Send final transcription if not already sent
766
+ if not self.final_sent and self.accumulated_transcript.strip():
767
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
768
+
769
+ # Process any remaining audio for comprehensive final transcription
770
+ if self.audio_buffer:
771
+ all_audio = self.audio_buffer.get_all_audio()
772
+ if len(all_audio) > 0 and not self.final_sent:
773
+ loop = asyncio.get_event_loop()
774
+
775
+ if self.use_nemo:
776
+ final_transcription = await loop.run_in_executor(
777
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
778
+ )
779
+ else:
780
+ final_transcription = await loop.run_in_executor(
781
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
782
+ )
783
+
784
+ if final_transcription.strip():
785
+ await self.send_transcription(final_transcription, is_final=True)
786
+
787
+ # Clear audio buffer
788
+ if self.audio_buffer:
789
+ self.audio_buffer.clear()
790
+
791
+ logger.info("STT session stopped")
792
+
793
+ async def start_new_segment(self):
794
+ """Start a new transcription segment"""
795
+ self.segment_number += 1
796
+ self.interim_count = 0
797
+ self.last_interim_time = None
798
+ self.accumulated_transcript = ""
799
+ self.final_sent = False
800
+ self.last_partial = ""
801
+ self.processing_count = 0
802
+
803
+ if self.audio_buffer:
804
+ self.audio_buffer.reset_for_new_segment()
805
+
806
+ logger.info(f"Started new transcription segment #{self.segment_number}")
807
+
808
+ # NOTE: this plain definition overrides the debug-instrumented add_audio_data defined earlier in this class
+ async def add_audio_data(self, audio_bytes):
809
+ """Add audio data to buffer"""
810
+ if self.audio_buffer and self.running:
811
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
812
+ self.audio_buffer.add_audio(audio_data)
813
+
814
+ # async def _process_audio_chunks(self):
815
+ # """Process audio chunks for interim results"""
816
+ # while self.running:
817
+ # try:
818
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
819
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
820
+ # if chunk_signal is not None:
821
+ # all_audio = self.audio_buffer.get_all_audio()
822
+
823
+ # if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
824
+ # loop = asyncio.get_event_loop()
825
+
826
+ # # Choose transcription method based on model selection
827
+ # if self.use_nemo:
828
+ # transcription = await loop.run_in_executor(
829
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
830
+ # )
831
+ # else:
832
+ # transcription = await loop.run_in_executor(
833
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
834
+ # )
835
+
836
+ # if transcription.strip():
837
+ # self.processing_count += 1
838
+ # self.accumulated_transcript = transcription
839
+
840
+ # if transcription != self.last_partial or self.interim_count == 0:
841
+ # self.last_partial = transcription
842
+ # self.interim_count += 1
843
+ # self.last_interim_time = time.time()
844
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
845
+ # else:
846
+ # self.last_interim_time = time.time()
847
+
848
+ # await asyncio.sleep(0.1) # Check every 100ms
849
+
850
+ # except Exception as e:
851
+ # logger.error(f"Error in chunk processing: {e}")
852
+ # await asyncio.sleep(0.1)
853
+
854
+
855
+ # async def _monitor_for_auto_final(self):
856
+ # """Monitor for auto-final conditions"""
857
+ # while self.running:
858
+ # try:
859
+ # current_time = time.time()
860
+
861
+ # if (self.interim_count >= self.min_interim_count and
862
+ # self.last_interim_time is not None and
863
+ # (current_time - self.last_interim_time) >= self.silence_timeout and
864
+ # not self.final_sent and
865
+ # self.accumulated_transcript.strip()):
866
+
867
+ # logger.info(f"Auto-final triggered for segment #{self.segment_number}")
868
+
869
+ # await self.send_transcription(self.accumulated_transcript, is_final=True)
870
+ # await self.start_new_segment()
871
+
872
+ # await asyncio.sleep(0.5) # Check every 500ms
873
+
874
+ # except Exception as e:
875
+ # logger.error(f"Error in auto-final monitoring: {e}")
876
+ # await asyncio.sleep(0.5)
877
+
878
+ # async def _process_audio_chunks(self):
879
+ # """Process audio chunks for interim results - FIXED for Whisper streaming"""
880
+ # logger.info(f"Starting audio chunk processing for {'NeMo' if self.use_nemo else 'Whisper'}")
881
+
882
+ # while self.running:
883
+ # try:
884
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
885
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
886
+ # if chunk_signal is not None:
887
+ # all_audio = self.audio_buffer.get_all_audio()
888
+
889
+ # # Check if we have enough audio and speech activity
890
+ # if len(all_audio) > 0:
891
+ # # Get the latest chunk for VAD check
892
+ # latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
893
+ # latest_chunk = all_audio[latest_chunk_start:]
894
+
895
+ # # For debugging
896
+ # logger.debug(f"Audio buffer size: {len(all_audio)} samples, Latest chunk: {len(latest_chunk)} samples")
897
+
898
+ # if self.audio_buffer.is_speech(latest_chunk):
899
+ # logger.info(f"Speech detected, processing with {'NeMo' if self.use_nemo else 'Whisper'}")
900
+
901
+ # loop = asyncio.get_event_loop()
902
+
903
+ # # Choose transcription method based on model selection
904
+ # if self.use_nemo:
905
+ # transcription = await loop.run_in_executor(
906
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
907
+ # )
908
+ # else:
909
+ # # For Whisper, ensure we process the accumulated audio
910
+ # transcription = await loop.run_in_executor(
911
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
912
+ # )
913
+
914
+ # logger.info(f"Transcription result: '{transcription}'")
915
+
916
+ # if transcription.strip():
917
+ # self.processing_count += 1
918
+ # self.accumulated_transcript = transcription
919
+
920
+ # if transcription != self.last_partial or self.interim_count == 0:
921
+ # self.last_partial = transcription
922
+ # self.interim_count += 1
923
+ # self.last_interim_time = time.time()
924
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
925
+ # else:
926
+ # self.last_interim_time = time.time()
927
+ # logger.info("Same transcription, updating time only")
928
+ # else:
929
+ # logger.debug("No speech detected in latest chunk")
930
+
931
+ # await asyncio.sleep(0.1) # Check every 100ms
932
+
933
+ # except Exception as e:
934
+ # logger.error(f"Error in chunk processing: {e}")
935
+ # import traceback
936
+ # traceback.print_exc()
937
+ # await asyncio.sleep(0.1)
938
+
939
+ # async def _monitor_for_auto_final(self):
940
+ # """Monitor for auto-final conditions - Enhanced logging"""
941
+ # logger.info("Starting auto-final monitoring")
942
+
943
+ # while self.running:
944
+ # try:
945
+ # current_time = time.time()
946
+
947
+ # if (self.interim_count >= self.min_interim_count and
948
+ # self.last_interim_time is not None and
949
+ # (current_time - self.last_interim_time) >= self.silence_timeout and
950
+ # not self.final_sent and
951
+ # self.accumulated_transcript.strip()):
952
+
953
+ # silence_duration = current_time - self.last_interim_time
954
+ # logger.info(f"Auto-final triggered for segment #{self.segment_number} - "
955
+ # f"Interim count: {self.interim_count}, Silence: {silence_duration:.1f}s")
956
+
957
+ # await self.send_transcription(self.accumulated_transcript, is_final=True)
958
+ # await self.start_new_segment()
959
+
960
+ # # Debug logging every 5 seconds
961
+ # if int(current_time) % 5 == 0:
962
+ # logger.debug(f"Auto-final status - Interim count: {self.interim_count}, "
963
+ # f"Last interim: {self.last_interim_time}, "
964
+ # f"Final sent: {self.final_sent}, "
965
+ # f"Transcript: '{self.accumulated_transcript[:50]}...'")
966
+
967
+ # await asyncio.sleep(0.5) # Check every 500ms
968
+
969
+ # except Exception as e:
970
+ # logger.error(f"Error in auto-final monitoring: {e}")
971
+ # await asyncio.sleep(0.5)
972
+
973
+ # async def _process_audio_chunks(self):
974
+ # """Process audio chunks for interim results - FIXED for both models"""
975
+ # model_name = "NeMo" if self.use_nemo else "Whisper"
976
+ # logger.info(f"Starting audio chunk processing for {model_name}")
977
+
978
+ # while self.running:
979
+ # try:
980
+ # if self.audio_buffer and self.audio_buffer.has_chunk_ready():
981
+ # chunk_signal = self.audio_buffer.get_chunk_for_processing()
982
+ # if chunk_signal is not None:
983
+ # all_audio = self.audio_buffer.get_all_audio()
984
+
985
+ # # Debug logging
986
+ # logger.debug(f"Processing chunk - Total audio: {len(all_audio)} samples")
987
+
988
+ # if len(all_audio) > 0:
989
+ # # Get the latest chunk for VAD check
990
+ # latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
991
+ # latest_chunk = all_audio[latest_chunk_start:]
992
+
993
+ # # Check for speech activity
994
+ # has_speech = self.audio_buffer.is_speech(latest_chunk)
995
+ # logger.debug(f"Speech detection result: {has_speech}")
996
+
997
+ # if has_speech:
998
+ # logger.info(f"Processing audio with {model_name} - {len(all_audio)} samples")
999
+
1000
+ # loop = asyncio.get_event_loop()
1001
+ # start_time = time.time()
1002
+
1003
+ # try:
1004
+ # # Choose transcription method based on model selection
1005
+ # if self.use_nemo:
1006
+ # transcription = await loop.run_in_executor(
1007
+ # executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
1008
+ # )
1009
+ # else:
1010
+ # # For Whisper, ensure we have enough audio
1011
+ # if len(all_audio) >= int(0.5 * 16000): # At least 0.5 seconds at 16kHz
1012
+ # transcription = await loop.run_in_executor(
1013
+ # executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
1014
+ # )
1015
+ # else:
1016
+ # transcription = ""
1017
+ # logger.debug("Whisper: Not enough audio for transcription")
1018
+
1019
+ # process_time = time.time() - start_time
1020
+ # logger.info(f"{model_name} processing took {process_time:.2f}s, result: '{transcription}'")
1021
+
1022
+ # if transcription and transcription.strip():
1023
+ # self.processing_count += 1
1024
+ # self.accumulated_transcript = transcription
1025
+
1026
+ # if transcription != self.last_partial or self.interim_count == 0:
1027
+ # self.last_partial = transcription
1028
+ # self.interim_count += 1
1029
+ # self.last_interim_time = time.time()
1030
+ # logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
1031
+ # else:
1032
+ # self.last_interim_time = time.time()
1033
+ # logger.debug("Same transcription, updating time only")
1034
+ # else:
1035
+ # logger.debug(f"{model_name} returned empty transcription")
1036
+
1037
+ # except Exception as e:
1038
+ # logger.error(f"Error in {model_name} transcription: {e}")
1039
+ # else:
1040
+ # logger.debug("No speech detected in latest chunk")
1041
+
1042
+ # # Different sleep intervals for different models
1043
+ # sleep_interval = 0.1 if self.use_nemo else 0.2 # Whisper can be less frequent
1044
+ # await asyncio.sleep(sleep_interval)
1045
+
1046
+ # except Exception as e:
1047
+ # logger.error(f"Error in chunk processing: {e}")
1048
+ # import traceback
1049
+ # traceback.print_exc()
1050
+ # await asyncio.sleep(1) # Longer sleep on error
1051
+
1052
+ # Active implementation (the commented-out variants above are earlier iterations kept for reference):
1053
+
1054
+ async def _process_audio_chunks(self):
1055
+ """Process audio chunks for interim results - with debugging"""
1056
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1057
+ logger.info(f"Starting audio chunk processing for {model_name}")
1058
+
1059
+ chunk_count = 0
1060
+
1061
+ while self.running:
1062
+ try:
1063
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
1064
+ chunk_count += 1
1065
+ logger.info(f"{model_name} - Processing chunk #{chunk_count}")
1066
+
1067
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
1068
+ if chunk_signal is not None:
1069
+ all_audio = self.audio_buffer.get_all_audio()
1070
+
1071
+ logger.info(f"{model_name} - Got {len(all_audio)} samples for processing")
1072
+
1073
+ if len(all_audio) > 0:
1074
+ # Get the latest chunk for VAD check
1075
+ latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
1076
+ latest_chunk = all_audio[latest_chunk_start:]
1077
+
1078
+ # Check for speech activity
1079
+ has_speech = self.audio_buffer.is_speech(latest_chunk)
1080
+ logger.info(f"{model_name} - Speech detected: {has_speech}")
1081
+
1082
+ if has_speech:
1083
+ logger.info(f"{model_name} - Starting transcription...")
1084
+
1085
+ loop = asyncio.get_event_loop()
1086
+ start_time = time.time()
1087
+
1088
+ try:
1089
+ # Choose transcription method based on model selection
1090
+ if self.use_nemo:
1091
+ transcription = await loop.run_in_executor(
1092
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
1093
+ )
1094
+ else:
1095
+ transcription = await loop.run_in_executor(
1096
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
1097
+ )
1098
+
1099
+ process_time = time.time() - start_time
1100
+ logger.info(f"{model_name} - Transcription completed in {process_time:.2f}s: '{transcription}'")
1101
+
1102
+ if transcription and transcription.strip():
1103
+ self.processing_count += 1
1104
+ self.accumulated_transcript = transcription
1105
+
1106
+ if transcription != self.last_partial or self.interim_count == 0:
1107
+ self.last_partial = transcription
1108
+ self.interim_count += 1
1109
+ self.last_interim_time = time.time()
1110
+ logger.info(f"{model_name} - Updated interim_count to {self.interim_count}")
1111
+ else:
1112
+ self.last_interim_time = time.time()
1113
+ logger.info(f"{model_name} - Same transcription, updating time only")
1114
+ else:
1115
+ logger.info(f"{model_name} - No transcription result")
1116
+
1117
+ except Exception as e:
1118
+ logger.error(f"{model_name} - Transcription error: {e}")
1119
+ import traceback
1120
+ traceback.print_exc()
1121
+ else:
1122
+ logger.debug(f"{model_name} - No speech in chunk")
1123
+ else:
1124
+ logger.warning(f"{model_name} - Chunk signal was None")
1125
+ else:
1126
+ # Log why chunk is not ready
1127
+ if self.audio_buffer:
1128
+ current_size = len(self.audio_buffer.buffer)
1129
+ required_size = self.audio_buffer.chunk_samples
1130
+ if current_size > 0:
1131
+ logger.debug(f"{model_name} - Buffer: {current_size}/{required_size} samples")
1132
+
1133
+ await asyncio.sleep(0.1)
1134
+
1135
+ except Exception as e:
1136
+ logger.error(f"{model_name} - Error in chunk processing: {e}")
1137
+ import traceback
1138
+ traceback.print_exc()
1139
+ await asyncio.sleep(1)
1140
+
1141
+ async def _monitor_for_auto_final(self):
1142
+ """Monitor for auto-final conditions with model-specific timeouts"""
1143
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1144
+ timeout = 2.0 if self.use_nemo else 3.0 # Longer timeout for Whisper
1145
+
1146
+ logger.info(f"Starting auto-final monitoring for {model_name} (timeout: {timeout}s)")
1147
+
1148
+ while self.running:
1149
+ try:
1150
+ current_time = time.time()
1151
+
1152
+ if (self.interim_count >= self.min_interim_count and
1153
+ self.last_interim_time is not None and
1154
+ (current_time - self.last_interim_time) >= timeout and
1155
+ not self.final_sent and
1156
+ self.accumulated_transcript.strip()):
1157
+
1158
+ silence_duration = current_time - self.last_interim_time
1159
+ logger.info(f"Auto-final triggered for segment #{self.segment_number} ({model_name}) - "
1160
+ f"Interim count: {self.interim_count}, Silence: {silence_duration:.1f}s")
1161
+
1162
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
1163
+ await self.start_new_segment()
1164
+
1165
+ await asyncio.sleep(0.5) # Check every 500ms
1166
+
1167
+ except Exception as e:
1168
+ logger.error(f"Error in auto-final monitoring: {e}")
1169
+ await asyncio.sleep(0.5)
1170
+
1171
+
1172
+
1173
+ async def send_transcription(self, text, is_final=True, confidence=0.9):
1174
+ """Send transcription in jambonz format"""
1175
+ try:
1176
+ # Apply number conversion only for Whisper
1177
+ if not self.use_nemo and is_final:
1178
+ original_text = text
1179
+ converted_text = convert_arabic_numbers_whisper(text)
1180
+
1181
+ if original_text != converted_text:
1182
+ logger.info(f"Whisper - Arabic numbers converted: '{original_text}' -> '{converted_text}'")
1183
+ text = converted_text
1184
+
1185
+ message = {
1186
+ "type": "transcription",
1187
+ "is_final": True, # Always send as final
1188
+ "alternatives": [
1189
+ {
1190
+ "transcript": text,
1191
+ "confidence": confidence
1192
+ }
1193
+ ],
1194
+ "language": self.config.get("language", "ar-EG"),
1195
+ "channel": 1
1196
+ }
1197
+
1198
+ await self.websocket.send(json.dumps(message))
1199
+ self.final_sent = True
1200
+
1201
+ model_name = "NeMo" if self.use_nemo else "Whisper"
1202
+ logger.info(f"Sent FINAL transcription ({model_name}): '{text}'")
1203
+
1204
+ except Exception as e:
1205
+ logger.error(f"Error sending transcription: {e}")
1206
+
1207
+ async def send_error(self, error_message):
1208
+ """Send error message in jambonz format"""
1209
+ try:
1210
+ message = {
1211
+ "type": "error",
1212
+ "error": error_message
1213
+ }
1214
+ await self.websocket.send(json.dumps(message))
1215
+ logger.error(f"Sent error: {error_message}")
1216
+ except Exception as e:
1217
+ logger.error(f"Error sending error message: {e}")
1218
+
1219
+ async def handle_jambonz_websocket(websocket):
1220
+ """Handle jambonz WebSocket connections"""
1221
+
1222
+ client_id = f"jambonz_{id(websocket)}"
1223
+ logger.info(f"New unified STT connection: {client_id}")
1224
+
1225
+ handler = UnifiedSTTHandler(websocket)
1226
+
1227
+ try:
1228
+ async for message in websocket:
1229
+ try:
1230
+ if isinstance(message, str):
1231
+ data = json.loads(message)
1232
+ message_type = data.get("type")
1233
+
1234
+ if message_type == "start":
1235
+ logger.info(f"Received start message: {data}")
1236
+ await handler.start_processing(data)
1237
+
1238
+ elif message_type == "stop":
1239
+ logger.info("Received stop message - closing WebSocket")
1240
+ await handler.stop_processing()
1241
+ await websocket.close(code=1000, reason="Session stopped by client")
1242
+ break
1243
+
1244
+ else:
1245
+ logger.warning(f"Unknown message type: {message_type}")
1246
+ await handler.send_error(f"Unknown message type: {message_type}")
1247
+
1248
+ else:
1249
+ # Handle binary audio data
1250
+ if not handler.running or handler.audio_buffer is None:
1251
+ logger.warning("Received audio data outside of active session")
1252
+ await handler.send_error("Received audio before start message or after stop")
1253
+ continue
1254
+
1255
+ await handler.add_audio_data(message)
1256
+
1257
+ except json.JSONDecodeError as e:
1258
+ logger.error(f"JSON decode error: {e}")
1259
+ await handler.send_error(f"Invalid JSON: {str(e)}")
1260
+ except Exception as e:
1261
+ logger.error(f"Error processing message: {e}")
1262
+ await handler.send_error(f"Processing error: {str(e)}")
1263
+
1264
+ except websockets.exceptions.ConnectionClosed:
1265
+ logger.info(f"Unified STT connection closed: {client_id}")
1266
+ except Exception as e:
1267
+ logger.error(f"Unified STT WebSocket error: {e}")
1268
+ try:
1269
+ await handler.send_error(str(e))
1270
+ except Exception:
1271
+ pass
1272
+ finally:
1273
+ if handler.running:
1274
+ await handler.stop_processing()
1275
+ logger.info(f"Unified STT connection ended: {client_id}")
1276
+
1277
+ async def main():
1278
+ """Start the Unified Arabic STT WebSocket server"""
1279
+ logger.info("Starting Unified Arabic STT WebSocket server on port 3007...")
1280
+
1281
+ # Check model availability
1282
+ models_available = []
1283
+ if asr_model_nemo is not None:
1284
+ models_available.append("NeMo FastConformer (ar-EG)")
1285
+ if whisper_model is not None:
1286
+ models_available.append("Whisper large-v3 (ar-EG-whis)")
1287
+
1288
+ if not models_available:
1289
+ logger.error("No models available! Please check model paths and installations.")
1290
+ return
1291
+
1292
+ # Start WebSocket server
1293
+ server = await websockets.serve(
1294
+ handle_jambonz_websocket,
1295
+ "0.0.0.0",
1296
+ 3007,
1297
+ ping_interval=20,
1298
+ ping_timeout=10,
1299
+ close_timeout=10
1300
+ )
1301
+
1302
+ logger.info("Unified Arabic STT WebSocket server started on ws://0.0.0.0:3007")
1303
+ logger.info("Ready to handle jambonz STT requests with both models")
1304
+ logger.info("ROUTING:")
1305
+ logger.info("- language: 'ar-EG' → NeMo FastConformer (with built-in number conversion)")
1306
+ logger.info("- language: 'ar-EG-whis' → Whisper large-v3 (with pyarabic number conversion)")
1307
+ logger.info("FEATURES:")
1308
+ logger.info("- Continuous transcription with segmentation")
1309
+ logger.info("- Voice Activity Detection")
1310
+ logger.info("- Auto-final detection (2s silence timeout)")
1311
+ logger.info("- Model-specific number conversion")
1312
+ logger.info(f"AVAILABLE MODELS: {', '.join(models_available)}")
1313
+
1314
+ # Wait for the server to close
1315
+ await server.wait_closed()
1316
+
1317
+ if __name__ == "__main__":
1318
+ print("=" * 80)
1319
+ print("Unified Arabic STT Server (NeMo + Whisper)")
1320
+ print("=" * 80)
1321
+ print("WebSocket Port: 3007")
1322
+ print("Protocol: jambonz STT API")
1323
+ print("Audio Format: LINEAR16 PCM @ 8kHz → 16kHz")
1324
+ print()
1325
+ print("LANGUAGE ROUTING:")
1326
+ print("- 'ar-EG' → NeMo FastConformer")
1327
+ print(" • Built-in Arabic number word to digit conversion")
1328
+ print(" • Optimized for Arabic dialects")
1329
+ print("- 'ar-EG-whis' → Whisper large-v3")
1330
+ print(" • pyarabic library number conversion (final transcripts only)")
1331
+ print(" • OpenAI Whisper model")
1332
+ print()
1333
+ print("FEATURES:")
1334
+ print("- Automatic model selection based on language parameter")
1335
+ print("- Voice Activity Detection")
1336
+ print("- Auto-final detection (2 seconds silence)")
1337
+ print("- Model-specific number conversion strategies")
1338
+ print("- Continuous transcription with segmentation")
1339
+ print()
1340
+
1341
+ # Check model availability for startup info
1342
+ nemo_status = "✓ Available" if asr_model_nemo is not None else "✗ Not Available"
1343
+ whisper_status = "✓ Available" if whisper_model is not None else "✗ Not Available"
1344
+ arabic_numbers_status = "✓ Available" if arabic_numbers_available else "✗ Not Available (install pyarabic)"
1345
+
1346
+ print("MODEL STATUS:")
1347
+ print(f"- NeMo FastConformer: {nemo_status}")
1348
+ print(f"- Whisper large-v3: {whisper_status}")
1349
+ print(f"- pyarabic (Whisper numbers): {arabic_numbers_status}")
1350
+ print("=" * 80)
1351
+
1352
+ try:
1353
+ asyncio.run(main())
1354
+ except KeyboardInterrupt:
1355
+ print("\nShutting down unified server...")
1356
+ except Exception as e:
1357
+ print(f"Server error: {e}")
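Routing in the server above is keyed entirely off the `language` field of the start message. A minimal test client for this protocol could look like the sketch below (illustrative, not part of the upload; it assumes a headerless 16-bit mono PCM file at 8 kHz named sample_8k.pcm and the websockets package):

    import asyncio
    import json
    import websockets

    async def test_stt(pcm_path="sample_8k.pcm", language="ar-EG"):
        # "ar-EG" routes to NeMo FastConformer, "ar-EG-whis" to Whisper large-v3.
        async with websockets.connect("ws://localhost:3007") as ws:
            await ws.send(json.dumps({
                "type": "start",
                "language": language,
                "format": "raw",
                "encoding": "LINEAR16",
                "sampleRateHz": 8000,
                "interimResults": True,
            }))
            # Stream raw LINEAR16 PCM in ~100 ms chunks (1600 bytes at 8 kHz).
            with open(pcm_path, "rb") as f:
                while chunk := f.read(1600):
                    await ws.send(chunk)
                    await asyncio.sleep(0.1)
            await ws.send(json.dumps({"type": "stop"}))
            # For brevity, drain interim/final replies only after stopping.
            async for message in ws:
                print(message)

    asyncio.run(test_stt())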
denoiser_model.py ADDED
@@ -0,0 +1,8 @@
1
+ import torch
2
+ from denoiser import pretrained
3
+
4
+ device = "cuda" if torch.cuda.is_available() else "cpu"
5
+
6
+ # Load DNS64 pretrained model (auto-downloads if not cached)
7
+ denoiser_model = pretrained.dns64().to(device)
8
+ denoiser_model.eval()
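As loaded here, DNS64 takes a float32 waveform at 16 kHz shaped (batch, samples) and returns a denoised waveform of the same length; this mirrors how the model is called in speech_brain_whisper_denoiser.py below. A minimal usage sketch (the random array is just a stand-in for real audio):

    import numpy as np

    noisy = np.random.randn(16000).astype(np.float32)     # 1 s of stand-in audio
    wav = torch.tensor(noisy, device=device).unsqueeze(0)  # -> (1, samples)
    with torch.no_grad():
        clean = denoiser_model(wav)[0]
    clean = clean.squeeze().cpu().numpy()                  # back to float32 numpy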
improved_asr_web_ui.html ADDED
@@ -0,0 +1,729 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ASR WebSocket Testing Client with Sample Rate Analysis</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ padding: 20px;
19
+ }
20
+
21
+ .container {
22
+ background: rgba(255, 255, 255, 0.95);
23
+ backdrop-filter: blur(10px);
24
+ border-radius: 20px;
25
+ padding: 40px;
26
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
27
+ max-width: 800px;
28
+ margin: 0 auto;
29
+ border: 1px solid rgba(255, 255, 255, 0.2);
30
+ }
31
+
32
+ .header {
33
+ text-align: center;
34
+ margin-bottom: 30px;
35
+ }
36
+
37
+ .header h1 {
38
+ color: #333;
39
+ font-size: 2.5em;
40
+ font-weight: 300;
41
+ margin-bottom: 10px;
42
+ }
43
+
44
+ .header p {
45
+ color: #666;
46
+ font-size: 1.1em;
47
+ }
48
+
49
+ .connection-section {
50
+ margin-bottom: 30px;
51
+ }
52
+
53
+ .input-group {
54
+ margin-bottom: 20px;
55
+ }
56
+
57
+ .input-group label {
58
+ display: block;
59
+ margin-bottom: 8px;
60
+ color: #333;
61
+ font-weight: 500;
62
+ }
63
+
64
+ .input-group input, .input-group select {
65
+ width: 100%;
66
+ padding: 12px 16px;
67
+ border: 2px solid #e1e5e9;
68
+ border-radius: 10px;
69
+ font-size: 16px;
70
+ transition: all 0.3s ease;
71
+ background: rgba(255, 255, 255, 0.8);
72
+ }
73
+
74
+ .input-group input:focus, .input-group select:focus {
75
+ outline: none;
76
+ border-color: #667eea;
77
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
78
+ }
79
+
80
+ .btn {
81
+ padding: 12px 24px;
82
+ border: none;
83
+ border-radius: 10px;
84
+ font-size: 16px;
85
+ font-weight: 500;
86
+ cursor: pointer;
87
+ transition: all 0.3s ease;
88
+ text-transform: uppercase;
89
+ letter-spacing: 0.5px;
90
+ }
91
+
92
+ .btn:disabled {
93
+ opacity: 0.6;
94
+ cursor: not-allowed;
95
+ }
96
+
97
+ .btn-connect {
98
+ background: linear-gradient(135deg, #4CAF50, #45a049);
99
+ color: white;
100
+ width: 100%;
101
+ }
102
+
103
+ .btn-connect:hover:not(:disabled) {
104
+ transform: translateY(-2px);
105
+ box-shadow: 0 5px 15px rgba(76, 175, 80, 0.3);
106
+ }
107
+
108
+ .btn-disconnect {
109
+ background: linear-gradient(135deg, #f44336, #da190b);
110
+ color: white;
111
+ width: 100%;
112
+ }
113
+
114
+ .audio-controls {
115
+ display: flex;
116
+ justify-content: center;
117
+ gap: 20px;
118
+ margin: 30px 0;
119
+ }
120
+
121
+ .btn-mic {
122
+ background: linear-gradient(135deg, #2196F3, #1976D2);
123
+ color: white;
124
+ width: 80px;
125
+ height: 80px;
126
+ border-radius: 50%;
127
+ display: flex;
128
+ align-items: center;
129
+ justify-content: center;
130
+ font-size: 24px;
131
+ }
132
+
133
+ .btn-mic:hover:not(:disabled) {
134
+ transform: scale(1.1);
135
+ box-shadow: 0 10px 25px rgba(33, 150, 243, 0.3);
136
+ }
137
+
138
+ .btn-mic.recording {
139
+ background: linear-gradient(135deg, #f44336, #da190b);
140
+ animation: pulse 1.5s infinite;
141
+ }
142
+
143
+ .btn-stop {
144
+ background: linear-gradient(135deg, #FF9800, #F57C00);
145
+ color: white;
146
+ width: 80px;
147
+ height: 80px;
148
+ border-radius: 50%;
149
+ display: flex;
150
+ align-items: center;
151
+ justify-content: center;
152
+ font-size: 24px;
153
+ }
154
+
155
+ @keyframes pulse {
156
+ 0% { transform: scale(1); }
157
+ 50% { transform: scale(1.05); }
158
+ 100% { transform: scale(1); }
159
+ }
160
+
161
+ .status {
162
+ text-align: center;
163
+ margin: 20px 0;
164
+ padding: 12px;
165
+ border-radius: 10px;
166
+ font-weight: 500;
167
+ }
168
+
169
+ .status.connected {
170
+ background: rgba(76, 175, 80, 0.1);
171
+ color: #4CAF50;
172
+ border: 1px solid rgba(76, 175, 80, 0.3);
173
+ }
174
+
175
+ .status.disconnected {
176
+ background: rgba(244, 67, 54, 0.1);
177
+ color: #f44336;
178
+ border: 1px solid rgba(244, 67, 54, 0.3);
179
+ }
180
+
181
+ .status.recording {
182
+ background: rgba(33, 150, 243, 0.1);
183
+ color: #2196F3;
184
+ border: 1px solid rgba(33, 150, 243, 0.3);
185
+ }
186
+
187
+ .stats-section {
188
+ display: grid;
189
+ grid-template-columns: 1fr 1fr;
190
+ gap: 20px;
191
+ margin: 20px 0;
192
+ }
193
+
194
+ .stat-box {
195
+ background: rgba(0, 0, 0, 0.05);
196
+ border-radius: 10px;
197
+ padding: 15px;
198
+ text-align: center;
199
+ border: 1px solid rgba(0, 0, 0, 0.1);
200
+ }
201
+
202
+ .stat-value {
203
+ font-size: 2em;
204
+ font-weight: bold;
205
+ color: #667eea;
206
+ margin-bottom: 5px;
207
+ }
208
+
209
+ .stat-label {
210
+ color: #666;
211
+ font-size: 0.9em;
212
+ }
213
+
214
+ .response-section {
215
+ margin-top: 30px;
216
+ }
217
+
218
+ .response-box {
219
+ background: rgba(0, 0, 0, 0.05);
220
+ border-radius: 10px;
221
+ padding: 20px;
222
+ min-height: 200px;
223
+ max-height: 400px;
224
+ overflow-y: auto;
225
+ border: 1px solid rgba(0, 0, 0, 0.1);
226
+ font-family: 'Courier New', monospace;
227
+ font-size: 12px;
228
+ white-space: pre-wrap;
229
+ word-wrap: break-word;
230
+ }
231
+
232
+ .debug-section {
233
+ margin-top: 20px;
234
+ }
235
+
236
+ .debug-box {
237
+ background: rgba(0, 0, 0, 0.8);
238
+ color: #00ff00;
239
+ border-radius: 10px;
240
+ padding: 15px;
241
+ min-height: 150px;
242
+ max-height: 300px;
243
+ overflow-y: auto;
244
+ font-family: 'Courier New', monospace;
245
+ font-size: 11px;
246
+ }
247
+ </style>
248
+ </head>
249
+ <body>
250
+ <div class="container">
251
+ <div class="header">
252
+ <h1>🎤 ASR Sample Rate Analyzer</h1>
253
+ <p>WebSocket-based Speech Recognition with Audio Analysis</p>
254
+ </div>
255
+
256
+ <div class="connection-section">
257
+ <div class="input-group">
258
+ <label for="websocketUrl">WebSocket URL:</label>
259
+ <input type="text" id="websocketUrl" value="ws://185.208.206.135:5005" placeholder="ws://185.208.206.135:5005">
260
+ </div>
261
+
262
+ <div class="input-group">
263
+ <label for="targetSampleRate">Target Sample Rate (Hz):</label>
264
+ <select id="targetSampleRate">
265
+ <option value="8000">8000 Hz (Default)</option>
266
+ <option value="16000">16000 Hz</option>
267
+ <option value="22050">22050 Hz</option>
268
+ <option value="44100">44100 Hz</option>
269
+ </select>
270
+ </div>
271
+
272
+ <div class="input-group">
273
+ <label for="chunkSize">Audio Chunk Size (samples):</label>
274
+ <select id="chunkSize">
275
+ <option value="1024">1024 samples</option>
276
+ <option value="2048">2048 samples</option>
277
+ <option value="4096" selected>4096 samples</option>
278
+ <option value="8192">8192 samples</option>
279
+ </select>
280
+ </div>
281
+
282
+ <div class="input-group">
283
+ <label for="Interim-Results"> Interim-Results:</label>
284
+ <select id="interimResults">
285
+ <option value="true">True</option>
286
+ <option value="false">False</option>
287
+ </select>
288
+ </div>
289
+
290
+ <div class="input-group">
291
+ <label for="language">Language Code:</label>
292
+ <input type="text" id="language_code" value="en-US" placeholder="en-US">
293
+ </div>
294
+
295
+
296
+ <button id="connectBtn" class="btn btn-connect">Connect to Debug Server</button>
297
+ <button id="disconnectBtn" class="btn btn-disconnect" style="display: none;">Disconnect</button>
298
+ </div>
299
+
300
+ <div id="status" class="status disconnected">Disconnected</div>
301
+
302
+ <div class="stats-section">
303
+ <div class="stat-box">
304
+ <div class="stat-value" id="actualSampleRate">0</div>
305
+ <div class="stat-label">Calculated Sample Rate (Hz)</div>
306
+ </div>
307
+ <div class="stat-box">
308
+ <div class="stat-value" id="bytesSent">0</div>
309
+ <div class="stat-label">Total Bytes Sent</div>
310
+ </div>
311
+ <div class="stat-box">
312
+ <div class="stat-value" id="chunksSent">0</div>
313
+ <div class="stat-label">Audio Chunks Sent</div>
314
+ </div>
315
+ <div class="stat-box">
316
+ <div class="stat-value" id="duration">0.0s</div>
317
+ <div class="stat-label">Recording Duration</div>
318
+ </div>
319
+ </div>
320
+
321
+ <div class="audio-controls">
322
+ <button id="micBtn" class="btn btn-mic" disabled title="Start Recording">🎤</button>
323
+ <button id="stopBtn" class="btn btn-stop" disabled title="Stop Recording">⏹️</button>
324
+ </div>
325
+
326
+ <div class="response-section">
327
+ <h3>Server Responses:</h3>
328
+ <div id="responseBox" class="response-box">Waiting for connection...</div>
329
+ </div>
330
+
331
+ <div class="debug-section">
332
+ <h3>Debug Console:</h3>
333
+ <div id="debugBox" class="debug-box">Ready to connect...</div>
334
+ </div>
335
+ </div>
336
+
337
+ <script>
338
+ class SampleRateAnalyzer {
339
+ constructor() {
340
+ this.websocket = null;
341
+ this.audioContext = null;
342
+ this.mediaRecorder = null;
343
+ this.audioStream = null;
344
+ this.processor = null;
345
+ this.isRecording = false;
346
+ this.isConnected = false;
347
+
348
+ // Audio analysis variables
349
+ this.startTime = null;
350
+ this.totalBytesSent = 0;
351
+ this.chunksSent = 0;
352
+ this.targetSampleRate = 8000;
353
+ this.chunkSize = 4096;
354
+
355
+ this.initializeElements();
356
+ this.attachEventListeners();
357
+ }
358
+
359
+ initializeElements() {
360
+ this.elements = {
361
+ websocketUrl: document.getElementById('websocketUrl'),
362
+ targetSampleRate: document.getElementById('targetSampleRate'),
363
+ chunkSize: document.getElementById('chunkSize'),
364
+ connectBtn: document.getElementById('connectBtn'),
365
+ disconnectBtn: document.getElementById('disconnectBtn'),
366
+ micBtn: document.getElementById('micBtn'),
367
+ stopBtn: document.getElementById('stopBtn'),
368
+ status: document.getElementById('status'),
369
+ responseBox: document.getElementById('responseBox'),
370
+ debugBox: document.getElementById('debugBox'),
371
+ actualSampleRate: document.getElementById('actualSampleRate'),
372
+ bytesSent: document.getElementById('bytesSent'),
373
+ chunksSent: document.getElementById('chunksSent'),
374
+ duration: document.getElementById('duration')
375
+ };
376
+ }
377
+
378
+ attachEventListeners() {
379
+ this.elements.connectBtn.addEventListener('click', () => this.connect());
380
+ this.elements.disconnectBtn.addEventListener('click', () => this.disconnect());
381
+ this.elements.micBtn.addEventListener('click', () => this.startRecording());
382
+ this.elements.stopBtn.addEventListener('click', () => this.stopRecording());
383
+ this.elements.targetSampleRate.addEventListener('change', (e) => {
384
+ this.targetSampleRate = parseInt(e.target.value);
385
+ this.debugLog(`Target sample rate changed to: ${this.targetSampleRate} Hz`);
386
+ });
387
+ this.elements.chunkSize.addEventListener('change', (e) => {
388
+ this.chunkSize = parseInt(e.target.value);
389
+ this.debugLog(`Chunk size changed to: ${this.chunkSize} samples`);
390
+ });
391
+ }
392
+
393
+ debugLog(message) {
394
+ const timestamp = new Date().toLocaleTimeString();
395
+ const debugBox = this.elements.debugBox;
396
+ debugBox.innerHTML += `[${timestamp}] ${message}\n`;
397
+ debugBox.scrollTop = debugBox.scrollHeight;
398
+ }
399
+
400
+ updateStatus(message, type) {
401
+ this.elements.status.textContent = message;
402
+ this.elements.status.className = `status ${type}`;
403
+ }
404
+
405
+ updateStats() {
406
+ if (this.startTime) {
407
+ const elapsed = (Date.now() - this.startTime) / 1000;
408
+ this.elements.duration.textContent = `${elapsed.toFixed(1)}s`;
409
+
410
+ // Calculate actual sample rate
411
+ if (elapsed > 0.5) {
412
+ const totalSamples = this.totalBytesSent / 2; // 16-bit samples
413
+ const calculatedRate = totalSamples / elapsed;
414
+ this.elements.actualSampleRate.textContent = Math.round(calculatedRate);
415
+
416
+ // Log if there's a significant difference
417
+ const difference = Math.abs(calculatedRate - this.targetSampleRate);
418
+ if (difference > 100) {
419
+ this.debugLog(`⚠️ Sample rate mismatch! Target: ${this.targetSampleRate}Hz, Actual: ${Math.round(calculatedRate)}Hz`);
420
+ }
421
+ }
422
+ }
423
+
424
+ this.elements.bytesSent.textContent = this.totalBytesSent.toLocaleString();
425
+ this.elements.chunksSent.textContent = this.chunksSent.toLocaleString();
426
+ }
427
+
428
+ async connect() {
429
+ const url = this.elements.websocketUrl.value.trim();
430
+ if (!url) {
431
+ alert('Please enter a WebSocket URL');
432
+ return;
433
+ }
434
+
435
+ try {
436
+ this.updateStatus('Connecting...', 'disconnected');
437
+ this.elements.connectBtn.disabled = true;
438
+ this.debugLog(`Attempting to connect to: ${url}`);
439
+
440
+ this.websocket = new WebSocket(url);
441
+ this.websocket.binaryType = 'arraybuffer';
442
+
443
+ // Set a connection timeout
444
+ const connectionTimeout = setTimeout(() => {
445
+ if (this.websocket.readyState === WebSocket.CONNECTING) {
446
+ this.websocket.close();
447
+ this.debugLog('❌ Connection timeout');
448
+ this.updateStatus('Connection Timeout', 'disconnected');
449
+ this.resetConnection();
450
+ }
451
+ }, 10000); // 10 second timeout
452
+
453
+ this.websocket.onopen = () => {
454
+ clearTimeout(connectionTimeout);
455
+ this.isConnected = true;
456
+ this.updateStatus('Connected to Debug Server', 'connected');
457
+ this.elements.connectBtn.style.display = 'none';
458
+ this.elements.disconnectBtn.style.display = 'block';
459
+ this.elements.micBtn.disabled = false;
460
+ this.elements.responseBox.textContent = 'Connected to debug server. Ready to test sample rate...';
461
+ this.debugLog('✅ WebSocket connected successfully');
462
+ };
463
+
464
+ this.websocket.onmessage = (event) => {
465
+ if (typeof event.data === 'string') {
466
+ try {
467
+ const response = JSON.parse(event.data);
468
+ this.displayResponse('JSON Response', response);
469
+ this.debugLog(`📨 Received JSON: ${JSON.stringify(response)}`);
470
+ } catch (e) {
471
+ this.displayResponse('Text Response', event.data);
472
+ this.debugLog(`📨 Received Text: ${event.data}`);
473
+ }
474
+ } else {
475
+ this.displayResponse('Binary Response', `Received binary data: ${event.data.byteLength} bytes`);
476
+ this.debugLog(`📨 Received Binary: ${event.data.byteLength} bytes`);
477
+ }
478
+ };
479
+
480
+ this.websocket.onerror = (error) => {
481
+ clearTimeout(connectionTimeout);
482
+ console.error('WebSocket error:', error);
483
+ this.debugLog(`❌ WebSocket error: ${error.message || 'Unknown error'}`);
484
+ this.updateStatus('Connection Error', 'disconnected');
485
+ this.resetConnection();
486
+ };
487
+
488
+ this.websocket.onclose = (event) => {
489
+ clearTimeout(connectionTimeout);
490
+ this.isConnected = false;
491
+ const reason = event.reason || 'No reason provided';
492
+ this.debugLog(`🔌 WebSocket closed: Code ${event.code}, Reason: ${reason}`);
493
+ this.updateStatus(`Disconnected (Code: ${event.code})`, 'disconnected');
494
+ this.resetConnection();
495
+ this.displayResponse('Connection Closed', `Code: ${event.code}, Reason: ${reason}`);
496
+ };
497
+
498
+ } catch (error) {
499
+ console.error('Connection failed:', error);
500
+ this.debugLog(`❌ Connection failed: ${error.message}`);
501
+ this.updateStatus('Connection Failed', 'disconnected');
502
+ this.resetConnection();
503
+ }
504
+ }
505
+
506
+ disconnect() {
507
+ if (this.isRecording) {
508
+ this.stopRecording();
509
+ }
510
+ if (this.websocket && this.websocket.readyState === WebSocket.OPEN) {
511
+ this.debugLog('🔌 Manually disconnecting...');
512
+ this.websocket.close(1000, 'Client disconnect');
513
+ }
514
+ this.resetConnection();
515
+ }
516
+
517
+ resetConnection() {
518
+ this.isConnected = false;
519
+ this.elements.connectBtn.disabled = false;
520
+ this.elements.connectBtn.style.display = 'block';
521
+ this.elements.disconnectBtn.style.display = 'none';
522
+ this.elements.micBtn.disabled = true;
523
+ this.elements.stopBtn.disabled = true;
524
+ this.stopRecording();
525
+
526
+ // Reset stats
527
+ this.totalBytesSent = 0;
528
+ this.chunksSent = 0;
529
+ this.startTime = null;
530
+ this.updateStats();
531
+ }
532
+
533
+ // Convert Float32Array to Int16Array (LINEAR16 PCM)
534
+ floatTo16BitPCM(float32Array) {
535
+ const int16Array = new Int16Array(float32Array.length);
536
+ for (let i = 0; i < float32Array.length; i++) {
537
+ const clipped = Math.max(-1, Math.min(1, float32Array[i]));
538
+ int16Array[i] = clipped * 0x7FFF;
539
+ }
540
+ return int16Array;
541
+ }
542
+
543
+ // Resample audio to target sample rate
544
+ resampleAudio(audioBuffer, sourceSampleRate, targetSampleRate) {
545
+ if (sourceSampleRate === targetSampleRate) {
546
+ return audioBuffer;
547
+ }
548
+
549
+ const ratio = sourceSampleRate / targetSampleRate;
550
+ const targetLength = Math.round(audioBuffer.length / ratio);
551
+ const resampled = new Float32Array(targetLength);
552
+
553
+ for (let i = 0; i < targetLength; i++) {
554
+ const sourceIndex = i * ratio;
555
+ const sourceIndexFloor = Math.floor(sourceIndex);
556
+ const sourceIndexCeil = Math.min(sourceIndexFloor + 1, audioBuffer.length - 1);
557
+ const weight = sourceIndex - sourceIndexFloor;
558
+
559
+ resampled[i] = audioBuffer[sourceIndexFloor] * (1 - weight) +
560
+ audioBuffer[sourceIndexCeil] * weight;
561
+ }
562
+
563
+ return resampled;
564
+ }
565
+
566
+ async startRecording() {
567
+ if (!this.isConnected) {
568
+ alert('Please connect to debug server first');
569
+ return;
570
+ }
571
+
572
+ try {
573
+ this.debugLog(`🎤 Starting recording with target sample rate: ${this.targetSampleRate} Hz`);
574
+
575
+ // Initialize audio context
576
+ this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
577
+ this.debugLog(`🔊 Audio context created with sample rate: ${this.audioContext.sampleRate} Hz`);
578
+
579
+ // Get microphone stream
580
+ this.audioStream = await navigator.mediaDevices.getUserMedia({
581
+ audio: {
582
+ echoCancellation: false,
583
+ noiseSuppression: false,
584
+ autoGainControl: false,
585
+ channelCount: 1
586
+ }
587
+ });
588
+
589
+ const source = this.audioContext.createMediaStreamSource(this.audioStream);
590
+
591
+ // Create processor with specified chunk size
592
+ this.processor = this.audioContext.createScriptProcessor(this.chunkSize, 1, 1);
593
+
594
+ this.processor.onaudioprocess = (event) => {
595
+ if (!this.isRecording || !this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
596
+ return;
597
+ }
598
+
599
+ const inputBuffer = event.inputBuffer;
600
+ const audioData = inputBuffer.getChannelData(0);
601
+
602
+ // Resample to target sample rate
603
+ const resampled = this.resampleAudio(audioData, this.audioContext.sampleRate, this.targetSampleRate);
604
+
605
+ // Convert to LINEAR16 PCM
606
+ const pcmData = this.floatTo16BitPCM(resampled);
607
+
608
+ // Send binary audio data
609
+ this.websocket.send(pcmData.buffer);
610
+
611
+ // Update stats
612
+ this.totalBytesSent += pcmData.buffer.byteLength;
613
+ this.chunksSent += 1;
614
+ this.updateStats();
615
+
616
+ if (this.chunksSent % 10 === 0) { // Log every 10 chunks
617
+ this.debugLog(`📊 Sent ${this.chunksSent} chunks, ${this.totalBytesSent} bytes`);
618
+ }
619
+ };
620
+
621
+ // Connect audio nodes
622
+ source.connect(this.processor);
623
+ this.processor.connect(this.audioContext.destination);
624
+
625
+ // Send START message
626
+ const startMessage = {
627
+ type: "start",
628
+ language: this.language_code,
629
+ format: "raw",
630
+ encoding: "LINEAR16",
631
+ interimResults: this.interimResults,
632
+ sampleRateHz: this.targetSampleRate,
633
+ options: {
634
+ testClient: true,
635
+ chunkSize: this.chunkSize
636
+ }
637
+ };
638
+
639
+ this.websocket.send(JSON.stringify(startMessage));
640
+ this.displayResponse('Sent START', startMessage);
641
+ this.debugLog(`📤 Sent START message: ${JSON.stringify(startMessage)}`);
642
+
643
+ this.isRecording = true;
644
+ this.startTime = Date.now();
645
+
646
+ // Update UI
647
+ this.elements.micBtn.classList.add('recording');
648
+ this.elements.micBtn.disabled = true;
649
+ this.elements.stopBtn.disabled = false;
650
+ this.updateStatus(`Recording @ ${this.targetSampleRate}Hz`, 'recording');
651
+
652
+ } catch (error) {
653
+ console.error('Failed to start recording:', error);
654
+ this.debugLog(`❌ Recording failed: ${error.message}`);
655
+ alert('Failed to access microphone. Please check permissions.');
656
+ this.stopRecording();
657
+ }
658
+ }
659
+
660
+ stopRecording() {
661
+ if (this.isRecording) {
662
+ this.isRecording = false;
663
+ this.debugLog('🛑 Stopping recording...');
664
+
665
+ // Send STOP message
666
+ if (this.websocket && this.websocket.readyState === WebSocket.OPEN) {
667
+ const stopMessage = { type: "stop" };
668
+ this.websocket.send(JSON.stringify(stopMessage));
669
+ this.displayResponse('Sent STOP', stopMessage);
670
+ this.debugLog(`📤 Sent STOP message`);
671
+ }
672
+ }
673
+
674
+ // Clean up audio resources
675
+ if (this.processor) {
676
+ this.processor.disconnect();
677
+ this.processor = null;
678
+ }
679
+
680
+ if (this.audioContext) {
681
+ this.audioContext.close().then(() => {
682
+ this.audioContext = null;
683
+ this.debugLog('🔊 Audio context closed');
684
+ });
685
+ }
686
+
687
+ if (this.audioStream) {
688
+ this.audioStream.getTracks().forEach(track => track.stop());
689
+ this.audioStream = null;
690
+ this.debugLog('🎤 Microphone stream stopped');
691
+ }
692
+
693
+ // Update UI
694
+ this.elements.micBtn.classList.remove('recording');
695
+ this.elements.micBtn.disabled = false;
696
+ this.elements.stopBtn.disabled = true;
697
+
698
+ if (this.isConnected) {
699
+ this.updateStatus('Connected - Recording stopped', 'connected');
700
+ }
701
+
702
+ // Final stats update
703
+ this.updateStats();
704
+ }
705
+
706
+ displayResponse(messageType, response) {
707
+ const responseBox = this.elements.responseBox;
708
+ const timestamp = new Date().toLocaleTimeString();
709
+
710
+ let content = `[${timestamp}] ${messageType}:\n`;
711
+
712
+ if (typeof response === 'object') {
713
+ content += JSON.stringify(response, null, 2);
714
+ } else {
715
+ content += response;
716
+ }
717
+
718
+ responseBox.innerHTML += content + '\n\n';
719
+ responseBox.scrollTop = responseBox.scrollHeight;
720
+ }
721
+ }
722
+
723
+ // Initialize when page loads
724
+ document.addEventListener('DOMContentLoaded', () => {
725
+ new SampleRateAnalyzer();
726
+ });
727
+ </script>
728
+ </body>
729
+ </html>
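The two client-side helpers above (floatTo16BitPCM and resampleAudio) have a direct numpy counterpart, handy for verifying server-side what the browser produces. A sketch, assuming float32 input in [-1, 1]:

    import numpy as np

    def float_to_int16(x: np.ndarray) -> np.ndarray:
        # Clip to [-1, 1] and scale to the int16 range, as the JS client does.
        return (np.clip(x, -1.0, 1.0) * 0x7FFF).astype(np.int16)

    def resample_linear(x: np.ndarray, src: int, dst: int) -> np.ndarray:
        # Linear interpolation, matching resampleAudio() above.
        if src == dst:
            return x
        pos = np.arange(round(len(x) * dst / src)) * (src / dst)
        return np.interp(pos, np.arange(len(x)), x).astype(np.float32)

    # Example: a 48 kHz capture downsampled to the 8 kHz the server expects.
    capture = np.sin(2 * np.pi * 440 * np.arange(48000) / 48000).astype(np.float32)
    pcm = float_to_int16(resample_linear(capture, 48000, 8000))
    assert pcm.dtype == np.int16 and len(pcm) == 8000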
pretrained_models/asr-whisper-large-v2-commonvoice-ar/hyperparams.yaml ADDED
@@ -0,0 +1,58 @@
1
+ # ################################
2
+ # Model: Whisper (Encoder-Decoder) + NLL
3
+ # Augmentation: TimeDomainSpecAugment
4
+ # Authors: Pooneh Mousavi 2022
5
+ # ################################
6
+
7
+
8
+ # Hugging Face hub id of the OpenAI Whisper large-v2 model.
9
+ whisper_hub: openai/whisper-large-v2
10
+
11
+ # Normalize inputs with
12
+ # the same normalization done in the paper. Refer to Appendix C for further information.
13
+ normalized_transcripts: True
14
+
15
+
16
+ language: arabic
17
+
18
+ auto_mix_prec: False
19
+ sample_rate: 16000
20
+
21
+ # Decoding parameters
22
+ min_decode_ratio: 0.0
23
+ max_decode_ratio: 1.0
24
+ test_beam_size: 8
25
+
26
+ # Model parameters
27
+ freeze_whisper: True
28
+ freeze_encoder: True
29
+
30
+
31
+ whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
32
+ source: !ref <whisper_hub>
33
+ freeze: !ref <freeze_whisper>
34
+ freeze_encoder: !ref <freeze_encoder>
35
+ save_path: whisper_checkpoints
36
+ encoder_only: False
37
+
38
+ decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
39
+ model: !ref <whisper>
40
+ min_decode_ratio: !ref <min_decode_ratio>
41
+ max_decode_ratio: !ref <max_decode_ratio>
42
+
43
+ # test_beam_searcher: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
44
+ # module: [!ref <whisper>]
45
+ # min_decode_ratio: !ref <min_decode_ratio>
46
+ # max_decode_ratio: !ref <max_decode_ratio>
47
+ # beam_size: !ref <test_beam_size>
48
+
49
+
50
+ modules:
51
+ whisper: !ref <whisper>
52
+ decoder: !ref <decoder>
53
+
54
+
55
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
56
+ loadables:
57
+ whisper: !ref <whisper>
58
+
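These hyperparameters are consumed by SpeechBrain's inference interface: the pretrainer block maps whisper.ckpt onto the whisper module at load time, and the decoder entry wires up greedy search. A loading sketch (this mirrors the call in speech_brain_whisper_denoiser.py below; the wav filename is illustrative):

    import torch
    from speechbrain.inference.ASR import WhisperASR

    asr = WhisperASR.from_hparams(
        source="speechbrain/asr-whisper-large-v2-commonvoice-ar",
        savedir="pretrained_models/asr-whisper-large-v2-commonvoice-ar",
        run_opts={"device": "cuda"} if torch.cuda.is_available() else {},
    )
    print(asr.transcribe_file("example-ar.wav"))  # expects 16 kHz mono input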
pretrained_models/asr-whisper-large-v2-commonvoice-ar/whisper.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac653766f62d8701b6fe6177a77505f98564e6c5f4c03948f2c87ad21db18c4
3
+ size 6173767281
requirements_denoiser.txt ADDED
@@ -0,0 +1,3 @@
1
+ git+https://github.com/facebookresearch/denoiser
2
+
3
+ noisereduce
speech_brain_whisper_denoiser.py ADDED
@@ -0,0 +1,741 @@
1
+ # import torch
2
+ # import asyncio
3
+ # import websockets
4
+ # import json
5
+ # import threading
6
+ # import numpy as np
7
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline , WhisperForConditionalGeneration, WhisperProcessor
8
+ # import subprocess
9
+ # import logging
10
+ # import time
11
+ # from concurrent.futures import ThreadPoolExecutor
12
+ # import struct
13
+ # import re
14
+ # 3 - 10 - 2025
15
+ import torch
16
+ import asyncio
17
+ import websockets
18
+ import json
19
+ import threading
20
+ import numpy as np
21
+ from transformers import pipeline
22
+ import subprocess
23
+ import logging
24
+ import time
25
+ from concurrent.futures import ThreadPoolExecutor
26
+ import re
27
+ import tempfile
28
+ import os
29
+ import soundfile as sf
30
+ from pathlib import Path
31
+ # --- Denoiser added ---
32
+ try:
33
+ import noisereduce as nr
34
+ denoiser_available = True
35
+ print("Denoiser available (using noisereduce)")
36
+ except ImportError:
37
+ denoiser_available = False
38
+ print("noisereduce not available - install with: pip install noisereduce")
39
+ ##############################################################################################
40
+ # Arabic number conversion imports
41
+ try:
42
+ from pyarabic.number import text2number
43
+ arabic_numbers_available = True
44
+ print("Arabic number conversion available")
45
+ except ImportError:
46
+ arabic_numbers_available = False
47
+ print("pyarabic not available - install with: pip install pyarabic")
48
+ print("Arabic numbers will not be converted to digits")
49
+
50
+ # Set up logging
51
+ logging.basicConfig(level=logging.INFO)
52
+ logger = logging.getLogger(__name__)
53
+ # 3 - 10 - 2025
54
+ # def denoise_audio(audio_data, sample_rate=16000):
55
+ # """Apply noise reduction to audio using noisereduce."""
56
+ # if not denoiser_available or len(audio_data) == 0:
57
+ # return audio_data
58
+ # try:
59
+ # reduced = nr.reduce_noise(y=audio_data, sr=sample_rate)
60
+ # return reduced.astype(np.float32)
61
+ # except Exception as e:
62
+ # logger.warning(f"Denoiser failed: {e}")
63
+ # return audio_data
64
+ #############################################################################################
65
+ def convert_arabic_numbers_in_sentence(sentence: str) -> str:
66
+ """
67
+ Replace Arabic number words in a sentence with digits,
68
+ preserving all other words and punctuation.
69
+ Handles common spelling variants and zero explicitly.
70
+ """
71
+ try:
72
+ print("Fxn called--------------")
73
+
74
+ # --- Normalization step ---
75
+ replacements = {
76
+ "اربعة": "أربعة",
77
+ "اربع": "أربع",
78
+ "اثنين": "اثنان",
79
+ "اتنين": "اثنان", # Egyptian variant
80
+ "ثلاث": "ثلاثة",
81
+ "خمس": "خمسة",
82
+ "ست": "ستة",
83
+ "سبع": "سبعة",
84
+ "ثمان": "ثمانية",
85
+ "تسع": "تسعة",
86
+ "عشر": "عشرة",
87
+ }
88
+ for wrong, correct in replacements.items():
89
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
90
+
91
+ # --- Split by whitespace but keep spaces ---
92
+ words = re.split(r'(\s+)', sentence)
93
+ converted_words = []
94
+
95
+ for word in words:
96
+ stripped = word.strip()
97
+ if not stripped: # skip spaces
98
+ converted_words.append(word)
99
+ continue
100
+
101
+ try:
102
+ num = text2number(stripped)
103
+
104
+ # Accept valid numbers, including zero explicitly
105
+ if isinstance(num, int):
106
+ if num != 0 or stripped == "صفر":
107
+ converted_words.append(str(num))
108
+ else:
109
+ converted_words.append(word)
110
+ else:
111
+ converted_words.append(word)
112
+
113
+ except Exception:
114
+ converted_words.append(word)
115
+
116
+ return ''.join(converted_words)
117
+
118
+ except Exception as e:
119
+ logger.warning(f"Error converting Arabic numbers: {e}")
120
+ return sentence
121
+
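+ # Illustrative usage (editor's sketch, not in the original upload). Assuming
+ # pyarabic's text2number maps "خمسة" to 5, number words are swapped in place
+ # while every other token and the original whitespace survive:
+ #
+ # convert_arabic_numbers_in_sentence("عندي خمسة كتب") -> "عندي 5 كتب"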
122
+
123
+ # Try to install flash-attn if not available
124
+ try:
125
+ import flash_attn
126
+ use_flash_attn = True
127
+ except ImportError:
128
+ print("Flash attention not available, using standard attention")
129
+ use_flash_attn = False
130
+ try:
131
+ subprocess.run(
132
+ "pip install websockets",
133
+ shell=True,
134
+ check=False
135
+ )
136
+ subprocess.run(
137
+ "pip install flash-attn --no-build-isolation",
138
+ shell=True,
139
+ check=False
140
+ )
141
+ except:
142
+ pass
143
+
144
+ device = "cuda" if torch.cuda.is_available() else "cpu"
145
+ # --- Facebook Denoiser added ---
146
+ try:
147
+ import torchaudio
148
+ from denoiser import pretrained
149
+ # Load DNS64 pretrained model (auto-downloads if not cached)
150
+ denoiser_model = pretrained.dns64().to(device)
151
+ denoiser_model.eval()
152
+ denoiser_available = True
153
+ print("facebook/denoiser loaded successfully")
154
+ except ImportError as e:
155
+ denoiser_available = False
156
+ print("facebook/denoiser not available - install with: pip install denoiser torchaudio")
157
+ denoiser_model = None
158
+
159
+
160
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
161
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"#"openai/whisper-large-v3-turbo"
162
+
163
+ print(f"Using device: {device}")
164
+ print(f"CUDA available: {torch.cuda.is_available()}")
165
+ MODEL_NAME = "speechbrain/asr-whisper-large-v2-commonvoice-ar"
166
+ # Replace your pipeline definition
167
+ from speechbrain.inference.ASR import WhisperASR
168
+
169
+ # Load the SpeechBrain model
170
+ model = WhisperASR.from_hparams(
171
+ source="speechbrain/asr-whisper-large-v2-commonvoice-ar",
172
+ savedir="pretrained_models/asr-whisper-large-v2-commonvoice-ar",
173
+ run_opts={"device": "cuda"} if torch.cuda.is_available() else {}
174
+ )
175
+
176
+
177
+ def denoise_audio(audio_data, sample_rate=16000):
178
+ """Apply denoising using facebook/denoiser pretrained model."""
179
+ if denoiser_model is None or len(audio_data) == 0:
180
+ return audio_data
181
+ try:
182
+ audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device=device).unsqueeze(0)
183
+ with torch.no_grad():
184
+ denoised_tensor = denoiser_model(audio_tensor)[0] # no sample_rate arg
185
+ return denoised_tensor.squeeze().cpu().numpy().astype("float32")
186
+ except Exception as e:
187
+ print(f"[WARN] Denoiser failed: {e}")
188
+ return audio_data
189
+
190
+ # Thread pool for processing audio
191
+ executor = ThreadPoolExecutor(max_workers=4)
192
+
193
+ class JambonzAudioBuffer:
194
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
195
+ self.sample_rate = sample_rate
196
+ self.chunk_duration = chunk_duration
197
+ self.chunk_samples = int(chunk_duration * sample_rate)
198
+
199
+ self.buffer = np.array([], dtype=np.float32)
200
+ self.lock = threading.Lock()
201
+ self.total_audio = np.array([], dtype=np.float32)
202
+
203
+ # Voice Activity Detection (simple energy-based)
204
+ self.silence_threshold = 0.01
205
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
206
+
207
+ def add_audio(self, audio_data):
208
+ with self.lock:
209
+ self.buffer = np.concatenate([self.buffer, audio_data])
210
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
211
+
212
+ def has_chunk_ready(self):
213
+ with self.lock:
214
+ return len(self.buffer) >= self.chunk_samples
215
+
216
+ def is_speech(self, audio_chunk):
217
+ """Simple VAD based on energy"""
218
+ if len(audio_chunk) < self.min_speech_samples:
219
+ return False
220
+ energy = np.mean(np.abs(audio_chunk))
221
+ return energy > self.silence_threshold
222
+
223
+ def get_chunk_for_processing(self):
224
+ """Get audio chunk for processing - but don't remove it from buffer for interim results"""
225
+ with self.lock:
226
+ if len(self.buffer) < self.chunk_samples:
227
+ return None
228
+
229
+ # For interim results, we want to trigger processing but keep accumulating audio
230
+ # So we just return a signal that we have enough audio, but don't consume it
231
+ return np.array([1]) # Return a dummy array to signal chunk is ready
232
+
233
+ def get_all_audio(self):
234
+ """Get all accumulated audio for final transcription"""
235
+ with self.lock:
236
+ return self.total_audio.copy()
237
+
238
+ def clear(self):
239
+ with self.lock:
240
+ self.buffer = np.array([], dtype=np.float32)
241
+ self.total_audio = np.array([], dtype=np.float32)
242
+
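+ # Sanity check for the energy-based VAD above (editorial sketch, kept as
+ # comments so importing this module stays side-effect free): silence stays
+ # under the 0.01 threshold, a speech-level tone does not.
+ #
+ # buf = JambonzAudioBuffer(sample_rate=8000)
+ # silence = np.zeros(8000, dtype=np.float32)
+ # tone = 0.1 * np.sin(2 * np.pi * 200 * np.arange(8000) / 8000).astype(np.float32)
+ # assert not buf.is_speech(silence)
+ # assert buf.is_speech(tone)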
243
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
244
+ """Convert LINEAR16 PCM bytes to numpy array (jambonz format)"""
245
+ try:
246
+ # jambonz sends LINEAR16 PCM at 8kHz
247
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
248
+ # Convert to float32 and normalize
249
+ audio_array = audio_array.astype(np.float32) / 32768.0
250
+ return audio_array
251
+ except Exception as e:
252
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
253
+ return np.array([], dtype=np.float32)
254
+
255
+ def resample_audio(audio_data, source_rate, target_rate):
256
+ """Simple resampling from 8kHz to 16kHz"""
257
+ if source_rate == target_rate:
258
+ return audio_data
259
+ ratio = target_rate / source_rate
260
+ indices = np.arange(0, len(audio_data), 1/ratio)
261
+ indices = indices[indices < len(audio_data)]
262
+ resampled = np.interp(indices, np.arange(len(audio_data)), audio_data)
263
+ return resampled.astype(np.float32)
264
+
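+ # How the two helpers above compose (editorial sketch): jambonz delivers
+ # little-endian int16 PCM at 8 kHz, Whisper wants float32 at 16 kHz.
+ #
+ # pcm = (np.sin(2 * np.pi * 300 * np.arange(800) / 8000) * 32767).astype(np.int16)
+ # audio_8k = linear16_to_audio(pcm.tobytes())        # 800 float samples in [-1, 1]
+ # audio_16k = resample_audio(audio_8k, 8000, 16000)  # ~1600 samples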
265
+
266
+ import os
267
+ import tempfile
268
+ import soundfile as sf
269
+ import logging
270
+
271
+ logger = logging.getLogger(__name__)
272
+ from pathlib import Path
273
+ import uuid
274
+ import shutil
275
+ # Project-level temp folder
276
+ PROJECT_DIR = Path(__file__).parent.resolve()
277
+ AUDIO_TMP_DIR = PROJECT_DIR / "temp_audio"
278
+ AUDIO_TMP_DIR.mkdir(exist_ok=True)
279
+ def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
280
+ try:
281
+ if len(audio_data) == 0:
282
+ return ""
283
+
284
+ # Step 1: Resample
285
+ resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
286
+
287
+ # Step 2: Denoise
288
+ resampled_audio = denoise_audio(resampled_audio)
289
+
290
+ # Step 3: Check minimum length (100ms)
291
+ min_samples = int(0.1 * target_sample_rate)
292
+ if len(resampled_audio) < min_samples:
293
+ return ""
294
+
295
+ # Step 4: Convert numpy -> torch tensor
296
+ waveform = torch.tensor(resampled_audio, dtype=torch.float32).unsqueeze(0) # [1, T]
297
+
298
+ # Step 5: Create wav_lens (normalized length)
299
+ wav_lens = torch.tensor([1.0]) # full length, no padding
300
+
301
+ # Step 6: Transcribe
302
+ words, tokens = model.transcribe_batch(waveform, wav_lens)
303
+
304
+ # Step 7: Convert list of words to a sentence
305
+ transcription = " ".join(words[0]) if words and len(words) > 0 else ""
306
+
307
+ logger.info(f"SpeechBrain transcription: '{transcription}'")
308
+ return transcription.strip()
309
+
310
+ except Exception as e:
311
+ logger.error(f"Error during SpeechBrain transcription: {e}")
312
+ return ""
313
+
314
+
315
+ # def transcribe_chunk_direct(audio_data, source_sample_rate=8000, target_sample_rate=16000):
316
+ # """Transcribe audio chunk using model's generate method directly"""
317
+ # try:
318
+ # if len(audio_data) == 0:
319
+ # return ""
320
+
321
+ # # Resample from 8kHz to 16kHz for Whisper
322
+ # resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
323
+
324
+ # # Ensure minimum length for Whisper
325
+ # min_samples = int(0.1 * target_sample_rate) # 100ms minimum
326
+ # if len(resampled_audio) < min_samples:
327
+ # return ""
328
+
329
+ # start_time = time.time()
330
+
331
+ # # Prepare input features with proper dtype
332
+ # input_features = processor(
333
+ # resampled_audio,
334
+ # sampling_rate=target_sample_rate,
335
+ # return_tensors="pt"
336
+ # ).input_features
337
+
338
+ # # Ensure correct dtype and device
339
+ # input_features = input_features.to(device=device, dtype=torch_dtype)
340
+
341
+ # # Create attention mask to avoid warnings
342
+ # attention_mask = torch.ones(
343
+ # input_features.shape[:-1],
344
+ # dtype=torch.long,
345
+ # device=device
346
+ # )
347
+
348
+ # # Generate transcription using model directly
349
+ # with torch.no_grad():
350
+ # predicted_ids = model.generate(
351
+ # input_features,
352
+ # attention_mask=attention_mask,
353
+ # max_new_tokens=128,
354
+ # do_sample=False,
355
+ # temperature=0.0,
356
+ # num_beams=1,
357
+ # language="ar",
358
+ # task="transcribe",
359
+ # pad_token_id=tokenizer.pad_token_id,
360
+ # eos_token_id=tokenizer.eos_token_id
361
+ # )
362
+
363
+ # # Decode the transcription
364
+ # transcription = tokenizer.batch_decode(
365
+ # predicted_ids,
366
+ # skip_special_tokens=True
367
+ # )[0].strip()
368
+
369
+ # end_time = time.time()
370
+
371
+ # logger.info(f"Direct transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
372
+ # return transcription
373
+
374
+ # except Exception as e:
375
+ # logger.error(f"Error during direct transcription: {e}")
376
+ # return ""
377
+
378
+ class JambonzSTTHandler:
379
+ def __init__(self, websocket):
380
+ self.websocket = websocket
381
+ self.audio_buffer = None
382
+ self.config = {}
383
+ self.running = True
384
+ self.transcription_task = None
385
+ self.full_transcript = ""
386
+ self.last_partial = ""
387
+
388
+ # Auto-final detection variables
389
+ self.interim_count = 0
390
+ self.last_interim_time = None
391
+ self.silence_timeout = 1.5 # seconds of silence before a final result is forced
392
+ self.min_interim_count = 1 # Minimum interim results before considering final
393
+ self.auto_final_task = None
394
+ self.accumulated_transcript = ""
395
+ self.final_sent = False
396
+
397
+ async def start_processing(self, start_message):
398
+ """Initialize with start message from jambonz"""
399
+ self.config = {
400
+ "language": start_message.get("language", "ar-EG"),
401
+ "format": start_message.get("format", "raw"),
402
+ "encoding": start_message.get("encoding", "LINEAR16"),
403
+ "sample_rate": start_message.get("sampleRateHz", 8000),
404
+ "interim_results": start_message.get("interimResults", True),
405
+ "options": start_message.get("options", {})
406
+ }
407
+
408
+ logger.info(f"STT session started with config: {self.config}")
409
+
410
+ # Initialize audio buffer
411
+ self.audio_buffer = JambonzAudioBuffer(
412
+ sample_rate=self.config["sample_rate"],
413
+ chunk_duration=1.0 # Process every 1 second
414
+ )
415
+
416
+ # Reset auto-final detection variables
417
+ self.interim_count = 0
418
+ self.last_interim_time = None
419
+ self.accumulated_transcript = ""
420
+ self.final_sent = False
421
+
422
+ # Start background transcription task
423
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
424
+
425
+ # Start auto-final detection task
426
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
427
+
428
+ async def stop_processing(self):
429
+ """Stop processing and send final transcription"""
430
+ self.running = False
431
+
432
+ # Cancel background tasks
433
+ if self.transcription_task:
434
+ self.transcription_task.cancel()
435
+ try:
436
+ await self.transcription_task
437
+ except asyncio.CancelledError:
438
+ pass
439
+
440
+ if self.auto_final_task:
441
+ self.auto_final_task.cancel()
442
+ try:
443
+ await self.auto_final_task
444
+ except asyncio.CancelledError:
445
+ pass
446
+
447
+ # Send final transcription if not already sent
448
+ if not self.final_sent and self.accumulated_transcript.strip():
449
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
450
+
451
+ # Also process any remaining audio for comprehensive final transcription
452
+ if self.audio_buffer:
453
+ all_audio = self.audio_buffer.get_all_audio()
454
+ if len(all_audio) > 0 and not self.final_sent:
455
+ loop = asyncio.get_event_loop()
456
+ final_transcription = await loop.run_in_executor(
457
+ executor,
458
+ transcribe_chunk_direct,
459
+ all_audio,
460
+ self.config["sample_rate"]
461
+ )
462
+
463
+ if final_transcription.strip():
464
+ # Send comprehensive final transcription
465
+ await self.send_transcription(final_transcription, is_final=True)
466
+
467
+ logger.info("STT session ended")
468
+
469
+ async def add_audio_data(self, audio_bytes):
470
+ """Add audio data to buffer"""
471
+ if self.audio_buffer:
472
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
473
+ self.audio_buffer.add_audio(audio_data)
474
+
475
+ async def _process_audio_chunks(self):
476
+ """Process audio chunks for interim results"""
477
+ while self.running and self.config.get("interim_results", False):
478
+ try:
479
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
480
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
481
+ if chunk_signal is not None:
482
+ # Get all accumulated audio so far for complete transcription
483
+ all_audio = self.audio_buffer.get_all_audio()
484
+
485
+ # Only process if we have actual speech content
486
+ if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
487
+ # Run transcription on all accumulated audio
488
+ loop = asyncio.get_event_loop()
489
+ transcription = await loop.run_in_executor(
490
+ executor,
491
+ transcribe_chunk_direct,
492
+ all_audio,
493
+ self.config["sample_rate"]
494
+ )
495
+
496
+ if transcription.strip() and transcription != self.last_partial:
497
+ self.last_partial = transcription
498
+ self.accumulated_transcript = transcription # Update accumulated transcript
499
+ self.interim_count += 1
500
+ self.last_interim_time = time.time()
501
+
502
+ # Send interim result
503
+ await self.send_transcription(transcription, is_final=False)
504
+
505
+ logger.info(f"Interim #{self.interim_count}: '{transcription}'")
506
+
507
+ # Small delay to prevent excessive processing
508
+ await asyncio.sleep(0.1)
509
+
510
+ except Exception as e:
511
+ logger.error(f"Error in chunk processing: {e}")
512
+ await asyncio.sleep(1)
513
+
514
+ async def _monitor_for_auto_final(self):
515
+ """Monitor for auto-final conditions: 3 seconds silence after 3+ interim results"""
516
+ while self.running:
517
+ try:
518
+ current_time = time.time()
519
+
520
+ # Check if we should send auto-final transcription
521
+ if (self.interim_count >= self.min_interim_count and
522
+ self.last_interim_time is not None and
523
+ (current_time - self.last_interim_time) >= self.silence_timeout and
524
+ not self.final_sent and
525
+ self.accumulated_transcript.strip()):
526
+
527
+ logger.info(f"Auto-final triggered: {self.interim_count} interim results, "
528
+ f"{current_time - self.last_interim_time:.1f}s silence")
529
+
530
+ # Send the accumulated transcript as final
531
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
532
+ self.final_sent = True
533
+
534
+ # Reset counters for potential next utterance
535
+ self.interim_count = 0
536
+ self.last_interim_time = None
537
+ self.accumulated_transcript = ""
538
+
539
+ # Check every 0.5 seconds
540
+ await asyncio.sleep(0.5)
541
+
542
+ except Exception as e:
543
+ logger.error(f"Error in auto-final monitoring: {e}")
544
+ await asyncio.sleep(1)
545
+
546
+ # async def send_transcription(self, text, is_final=False, confidence=0.9):
547
+ # """Send transcription in jambonz format with Arabic number conversion"""
548
+ # try:
549
+ # # Convert Arabic numbers to digits before sending
550
+ # original_text = text
551
+ # converted_text = convert_arabic_numbers_in_sentence(text)
552
+
553
+ # # Log the conversion if numbers were found and converted
554
+ # if original_text != converted_text:
555
+ # logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
556
+
557
+ # message = {
558
+ # "type": "transcription",
559
+ # "is_final": is_final,
560
+ # "alternatives": [
561
+ # {
562
+ # "transcript": converted_text,
563
+ # "confidence": confidence
564
+ # }
565
+ # ],
566
+ # "language": self.config.get("language", "ar-EG"),
567
+ # "channel": 1
568
+ # }
569
+
570
+ # await self.websocket.send(json.dumps(message))
571
+ # logger.info(f"Sent {'FINAL' if is_final else 'interim'} transcription: '{converted_text}'")
572
+
573
+ # if is_final:
574
+ # self.final_sent = True
575
+
576
+ # except Exception as e:
577
+ # logger.error(f"Error sending transcription: {e}")
578
+
579
+
580
+
581
+ async def send_transcription(self, text, is_final=False, confidence=0.9):
582
+ """Send transcription in jambonz format with Arabic number conversion, only for final results"""
583
+ try:
584
+ if not is_final:
585
+ # Do nothing for interim results
586
+ logger.debug("Skipping interim transcription (not final).")
587
+ return
588
+
589
+ # Convert Arabic numbers only for final transcripts
590
+ original_text = text
591
+ converted_text = convert_arabic_numbers_in_sentence(text)
592
+
593
+ # Log the conversion if numbers were found and converted
594
+ if original_text != converted_text:
595
+ logger.info(f"Arabic numbers converted: '{original_text}' -> '{converted_text}'")
596
+
597
+ message = {
598
+ "type": "transcription",
599
+ "is_final": True,
600
+ "alternatives": [
601
+ {
602
+ "transcript": original_text,#converted_text,
603
+ "confidence": confidence
604
+ }
605
+ ],
606
+ "language": self.config.get("language", "ar-EG"),
607
+ "channel": 1
608
+ }
609
+
610
+ # Send only final messages
611
+ await self.websocket.send(json.dumps(message))
612
+ logger.info(f"Sent FINAL transcription: '{converted_text}'")
613
+
614
+ self.final_sent = True
615
+
616
+ except Exception as e:
617
+ logger.error(f"Error sending transcription: {e}")
618
+
619
+
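+ # Example of the final message emitted above (editorial sketch of the
+ # jambonz-style payload; values illustrative):
+ #
+ # {
+ #   "type": "transcription",
+ #   "is_final": true,
+ #   "alternatives": [{"transcript": "عندي 5 كتب", "confidence": 0.9}],
+ #   "language": "ar-EG",
+ #   "channel": 1
+ # }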
620
+
621
+
622
+ async def send_error(self, error_message):
623
+ """Send error message in jambonz format"""
624
+ try:
625
+ message = {
626
+ "type": "error",
627
+ "error": error_message
628
+ }
629
+ await self.websocket.send(json.dumps(message))
630
+ logger.error(f"Sent error: {error_message}")
631
+ except Exception as e:
632
+ logger.error(f"Error sending error message: {e}")
633
+
634
+ async def handle_jambonz_websocket(websocket):
635
+ """Handle jambonz WebSocket connections"""
636
+
637
+ client_id = f"jambonz_{id(websocket)}"
638
+ logger.info(f"New jambonz connection: {client_id}")
639
+
640
+ handler = JambonzSTTHandler(websocket)
641
+
642
+ try:
643
+ async for message in websocket:
644
+ try:
645
+ if isinstance(message, str):
646
+ # Handle JSON control messages
647
+ data = json.loads(message)
648
+ message_type = data.get("type")
649
+
650
+ if message_type == "start":
651
+ logger.info(f"Received start message: {data}")
652
+ await handler.start_processing(data)
653
+
654
+ elif message_type == "stop":
655
+ logger.info("Received stop message")
656
+ await handler.stop_processing()
657
+ # Close websocket after final transcription
658
+ await websocket.close(code=1000, reason="Session completed")
659
+ break
660
+
661
+ else:
662
+ logger.warning(f"Unknown message type: {message_type}")
663
+ await handler.send_error(f"Unknown message type: {message_type}")
664
+
665
+ else:
666
+ # Handle binary audio data (LINEAR16 PCM)
667
+ if handler.audio_buffer is None:
668
+ await handler.send_error("Received audio before start message")
669
+ continue
670
+
671
+ await handler.add_audio_data(message)
672
+
673
+ except json.JSONDecodeError as e:
674
+ logger.error(f"JSON decode error: {e}")
675
+ await handler.send_error(f"Invalid JSON: {str(e)}")
676
+ except Exception as e:
677
+ logger.error(f"Error processing message: {e}")
678
+ await handler.send_error(f"Processing error: {str(e)}")
679
+
680
+ except websockets.exceptions.ConnectionClosed:
681
+ logger.info(f"jambonz connection closed: {client_id}")
682
+ except Exception as e:
683
+ logger.error(f"jambonz WebSocket error: {e}")
684
+ try:
685
+ await handler.send_error(str(e))
686
+ except:
687
+ pass
688
+ finally:
689
+ if handler.running:
690
+ await handler.stop_processing()
691
+ logger.info(f"jambonz connection ended: {client_id}")
692
+
693
+ async def main():
694
+ """Start the jambonz STT WebSocket server"""
695
+ logger.info("Starting Jambonz Custom STT WebSocket server on port 3006...")
696
+
697
+ # Start WebSocket server
698
+ server = await websockets.serve(
699
+ handle_jambonz_websocket,
700
+ "0.0.0.0",
701
+ 3006,
702
+ ping_interval=20,
703
+ ping_timeout=10,
704
+ close_timeout=10
705
+ )
706
+
707
+ logger.info("Jambonz Custom STT WebSocket server started on ws://0.0.0.0:3006")
708
+ logger.info("Ready to handle jambonz STT requests")
709
+ logger.info("- Expects LINEAR16 PCM audio at 8kHz")
710
+ logger.info("- Supports interim results with auto-final detection")
711
+ logger.info("- Auto-final: 3+ interim results + 1.3s silence")
712
+ logger.info("- Resamples to 16kHz for Whisper processing")
713
+ logger.info("- Converts Arabic numbers to digits before sending")
714
+
715
+ # Wait for the server to close
716
+ await server.wait_closed()
717
+
718
+ if __name__ == "__main__":
719
+ print("=" * 60)
720
+ print("Jambonz Custom STT Server with Whisper + Arabic Numbers")
721
+ print("=" * 60)
722
+ print(f"Model: {MODEL_NAME}")
723
+ print(f"Device: {device}")
724
+ print("WebSocket Port: 3006")
725
+ print("Protocol: jambonz STT API")
726
+ print("Audio Format: LINEAR16 PCM @ 8kHz")
727
+ print("Auto-Final: 2+ speech activities + 1.3s silence")
728
+ print("Arabic Numbers: Converted to digits in FINAL transcriptions only")
729
+ print("Interim Results: DISABLED (final transcription only)")
730
+ if arabic_numbers_available:
731
+ print("✓ pyarabic library available for number conversion")
732
+ else:
733
+ print("✗ pyarabic library not available - install with: pip install pyarabic")
734
+ print("=" * 60)
735
+
736
+ try:
737
+ asyncio.run(main())
738
+ except KeyboardInterrupt:
739
+ print("\nShutting down server...")
740
+ except Exception as e:
741
+ print(f"Server error: {e}")
stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29d19d7c054a5fc010ac6815e9cbb0dd1b21a30e0a7f7f2982e1fecaf0c3e31
3
+ size 459233280
w_nemo.py ADDED
@@ -0,0 +1,1033 @@
1
+ import torch
2
+ import asyncio
3
+ import websockets
4
+ import json
5
+ import threading
6
+ import numpy as np
7
+ import logging
8
+ import time
9
+ import tempfile
10
+ import os
11
+ import re
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ import subprocess
14
+ import struct
15
+
16
+ # NeMo imports
17
+ import nemo.collections.asr as nemo_asr
18
+ import soundfile as sf
19
+
20
+ # Whisper imports
21
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
22
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
23
+
24
+
25
+ # Arabic number conversion imports for Whisper
26
+ try:
27
+ from pyarabic.number import text2number
28
+ arabic_numbers_available = True
29
+ print("✓ pyarabic library available for Whisper number conversion")
30
+ except ImportError:
31
+ arabic_numbers_available = False
32
+ print("✗ pyarabic not available - install with: pip install pyarabic")
33
+ print("Arabic numbers will not be converted to digits for Whisper")
34
+
35
+ # Set up logging
36
+ logging.basicConfig(level=logging.INFO)
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # ===== NeMo Arabic number mapping =====
40
+ arabic_numbers_nemo = {
41
+ # Basic digits
42
+ "سفر": "0", "فيرو": "0", "هيرو": "0","صفر": "0", "زيرو": "0", "٠": "0","زيو": "0","زير": "0","زير": "0","زر": "0","زروا": "0","زرا": "0","زيره ": "0","زرو ": "0",
43
+ "واحد": "1", "واحدة": "1", "١": "1",
44
+ "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
45
+ "تلاتة": "3", "ثلاثة": "3", "٣": "3","تلاته": "3","ثلاثه": "3","ثلاثا": "3","تلاتا": "3",
46
+ "اربعة": "4", "أربعة": "4", "٤": "4","اربعه": "4","أربعه": "4","أربع": "4","اربع": "4","اربعا": "4","أربعا": "4",
47
+ "خمسة": "5", "خمسه": "5", "٥": "5", "خمس": "5", "خمسا": "5",
48
+ "ستة": "6", "سته": "6", "٦": "6", "ست": "6", "ستّا": "6", "ستةً": "6",
49
+ "سبعة": "7", "سبعه": "7", "٧": "7", "سبع": "7", "سبعا": "7",
50
+ "ثمانية": "8", "ثمانيه": "8", "٨": "8", "ثمان": "8", "ثمنية": "8", "ثمنيه": "8", "ثمانيا": "8", "ثمن": "8",
51
+ "تسعة": "9", "تسعه": "9", "٩": "9", "تسع": "9", "تسعا": "9",
52
+
53
+ # Teens
54
+ "عشرة": "10", "١٠": "10",
55
+ "حداشر": "11", "احد عشر": "11","احداشر": "11",
56
+ "اتناشر": "12", "اثنا عشر": "12",
57
+ "تلتاشر": "13", "ثلاثة عشر": "13",
58
+ "اربعتاشر": "14", "أربعة عشر": "14",
59
+ "خمستاشر": "15", "خمسة عشر": "15",
60
+ "ستاشر": "16", "ستة عشر": "16",
61
+ "سبعتاشر": "17", "سبعة عشر": "17",
62
+ "طمنتاشر": "18", "ثمانية عشر": "18",
63
+ "تسعتاشر": "19", "تسعة عشر": "19",
64
+
65
+ # Tens
66
+ "عشرين": "20", "٢٠": "20",
67
+ "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
68
+ "اربعين": "40", "أربعين": "40", "٤٠": "40",
69
+ "خمسين": "50", "٥٠": "50",
70
+ "ستين": "60", "٦٠": "60",
71
+ "سبعين": "70", "٧٠": "70",
72
+ "تمانين": "80", "ثمانين": "80", "٨٠": "80","تمانون": "80","ثمانون": "80",
73
+ "تسعين": "90", "٩٠": "90",
74
+
75
+ # Hundreds
76
+ "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
77
+ "ميتين": "200", "مائتين": "200",
78
+ "تلاتمية": "300", "ثلاثمائة": "300",
79
+ "اربعمية": "400", "أربعمائة": "400",
80
+ "خمسمية": "500", "خمسمائة": "500",
81
+ "ستمية": "600", "ستمائة": "600",
82
+ "سبعمية": "700", "سبعمائة": "700",
83
+ "تمانمية": "800", "ثمانمائة": "800",
84
+ "تسعمية": "900", "تسعمائة": "900",
85
+
86
+ # Thousands
87
+ "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
88
+ "ألفين": "2000", "الفين": "2000",
89
+ "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
90
+ "اربعة آلاف": "4000", "أربعة آلاف": "4000",
91
+ "خمسة آلاف": "5000",
92
+ "ستة آلاف": "6000",
93
+ "سبعة آلاف": "7000",
94
+ "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
95
+ "تسعة آلاف": "9000",
96
+
97
+ # Large numbers
98
+ "عشرة آلاف": "10000",
99
+ "مية ألف": "100000", "مائة ألف": "100000",
100
+ "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
101
+ "ملايين": "1000000",
102
+ "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000"
103
+ }
104
+
105
+ def replace_arabic_numbers_nemo(text: str) -> str:
106
+ """Convert Arabic number words to digits for NeMo"""
107
+ # Replace longer phrases first so compounds like "ثلاثة عشر" map to "13" before "ثلاثة" alone can split them
+ for word, digit in sorted(arabic_numbers_nemo.items(), key=lambda kv: len(kv[0]), reverse=True):
108
+ text = re.sub(rf"\b{word}\b", digit, text)
109
+ return text
110
+
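+ # Minimal usage sketch (illustrative only; the sample strings are hypothetical):
+ # replace_arabic_numbers_nemo("عندي ثلاثة عشر جنيه")  # -> "عندي 13 جنيه" (longest phrase wins)
+ # replace_arabic_numbers_nemo("واحد اثنين تلاتة")      # -> "1 2 3"
+ # Each entry is applied with \b word boundaries, so substrings embedded in
+ # longer words (e.g. "وعشرين") are deliberately left untouched.
+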
111
+ def convert_arabic_numbers_whisper(sentence: str) -> str:
112
+ """
113
+ Replace Arabic number words in a sentence with digits for Whisper,
114
+ preserving all other words and punctuation.
115
+ """
116
+ if not arabic_numbers_available or not sentence.strip():
117
+ return sentence
118
+
119
+ try:
120
+ # Normalization step
121
+ replacements = {
122
+ "اربعة": "أربعة", "اربع": "أربع", "اثنين": "اثنان",
123
+ "اتنين": "اثنان", "ثلاث": "ثلاثة", "خمس": "خمسة",
124
+ "ست": "ستة", "سبع": "سبعة", "ثمان": "ثمانية",
125
+ "تسع": "تسعة", "عشر": "عشرة",
126
+ }
127
+ for wrong, correct in replacements.items():
128
+ sentence = re.sub(rf"\b{wrong}\b", correct, sentence)
129
+
130
+ # Split by whitespace but keep spaces
131
+ words = re.split(r'(\s+)', sentence)
132
+ converted_words = []
133
+
134
+ for word in words:
135
+ stripped = word.strip()
136
+ if not stripped: # skip spaces
137
+ converted_words.append(word)
138
+ continue
139
+
140
+ try:
141
+ num = text2number(stripped)
142
+ if isinstance(num, int):
143
+ if num != 0 or stripped == "صفر":
144
+ converted_words.append(str(num))
145
+ else:
146
+ converted_words.append(word)
147
+ else:
148
+ converted_words.append(word)
149
+ except Exception:
150
+ converted_words.append(word)
151
+
152
+ return ''.join(converted_words)
153
+
154
+ except Exception as e:
155
+ logger.warning(f"Error converting Arabic numbers: {e}")
156
+ return sentence
157
+
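+ # Minimal usage sketch (illustrative only; requires pyarabic, sample string hypothetical):
+ # convert_arabic_numbers_whisper("الرقم خمسة")  # expected -> "الرقم 5"
+ # Note: tokens are converted one at a time, so multi-word compounds are
+ # handled per word rather than as a single phrase.
+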
158
+ # Global models
159
+ asr_model_nemo = None
160
+ whisper_model = None
161
+ whisper_processor = None
162
+ whisper_tokenizer = None
163
+ device = None
164
+ torch_dtype = None
165
+
166
+ def initialize_models():
167
+ """Initialize both NeMo and Whisper models"""
168
+ global asr_model_nemo, whisper_model, whisper_processor, whisper_tokenizer, device, torch_dtype
169
+
170
+ # Initialize device settings
171
+ device = "cuda" if torch.cuda.is_available() else "cpu"
172
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
173
+
174
+ logger.info(f"Using device: {device}")
175
+ logger.info(f"CUDA available: {torch.cuda.is_available()}")
176
+
177
+ # Initialize NeMo model
178
+ logger.info("Loading NeMo FastConformer Arabic ASR model...")
179
+ model_path = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
180
+
181
+ if os.path.exists(model_path):
182
+ try:
183
+ asr_model_nemo = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
184
+ asr_model_nemo.eval()
185
+ logger.info("✓ NeMo FastConformer model loaded successfully")
186
+ except Exception as e:
187
+ logger.error(f"Failed to load NeMo model: {e}")
188
+ asr_model_nemo = None
189
+ else:
190
+ logger.warning(f"NeMo model not found at: {model_path}")
191
+ asr_model_nemo = None
192
+
193
+ # Initialize Whisper model
194
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
195
+
196
+ logger.info("Loading Whisper large-v3 model...")
197
+ MODEL_NAME = "alaatiger989/FT_Arabic_Whisper_V1_1"
198
+
199
+ try:
200
+ # Try with flash attention first
201
+ try:
202
+ import flash_attn
203
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
204
+ MODEL_NAME,
205
+ torch_dtype=torch_dtype,
206
+ low_cpu_mem_usage=True,
207
+ use_safetensors=True,
208
+ attn_implementation="flash_attention_2"
209
+ )
210
+ logger.info("✓ Whisper loaded with flash attention")
211
+ except Exception:  # flash_attn missing or incompatible; fall back to standard attention
212
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
213
+ MODEL_NAME,
214
+ torch_dtype=torch_dtype,
215
+ low_cpu_mem_usage=True,
216
+ use_safetensors=True
217
+ )
218
+ logger.info("✓ Whisper loaded with standard attention")
219
+
220
+ whisper_model.to(device)
221
+ whisper_processor = AutoProcessor.from_pretrained(MODEL_NAME)
222
+
223
+ # Use processor.tokenizer, don’t reload separately
224
+ whisper_tokenizer = whisper_processor.tokenizer
225
+
226
+ logger.info("✓ Whisper model + tokenizer loaded successfully")
227
+
228
+ except Exception as e:
229
+ logger.error(f"Failed to load Whisper model: {e}")
230
+ whisper_model = None
231
+
232
+ # Initialize models on startup
233
+ initialize_models()
234
+
235
+ # Thread pool for processing
236
+ executor = ThreadPoolExecutor(max_workers=4)
237
+
238
+
239
+
240
+ class JambonzAudioBuffer:
241
+ def __init__(self, sample_rate=8000, chunk_duration=1.0):
242
+ self.sample_rate = sample_rate
243
+ self.chunk_duration = chunk_duration
244
+ self.chunk_samples = int(chunk_duration * sample_rate)
245
+
246
+ self.buffer = np.array([], dtype=np.float32)
247
+ self.lock = threading.Lock()
248
+ self.total_audio = np.array([], dtype=np.float32)
249
+
250
+ # Voice Activity Detection - ADJUSTED FOR WHISPER
251
+ self.silence_threshold = 0.01 # Lower threshold for Whisper
252
+ self.min_speech_samples = int(0.3 * sample_rate) # 300ms minimum speech
253
+
254
+ def add_audio(self, audio_data):
255
+ with self.lock:
256
+ self.buffer = np.concatenate([self.buffer, audio_data])
257
+ self.total_audio = np.concatenate([self.total_audio, audio_data])
258
+
259
+ # Log audio addition for debugging
260
+ logger.debug(f"Added {len(audio_data)} audio samples, total: {len(self.total_audio)}")
261
+
262
+ def has_chunk_ready(self):
263
+ with self.lock:
264
+ ready = len(self.buffer) >= self.chunk_samples
265
+ if ready:
266
+ logger.debug(f"Chunk ready: {len(self.buffer)} >= {self.chunk_samples}")
267
+ return ready
268
+
269
+ def is_speech(self, audio_chunk):
270
+ """Enhanced VAD based on energy - better for Whisper"""
271
+ if len(audio_chunk) < self.min_speech_samples:
272
+ logger.debug(f"Audio too short for VAD: {len(audio_chunk)} < {self.min_speech_samples}")
273
+ return False
274
+
275
+ # Calculate RMS energy
276
+ rms_energy = np.sqrt(np.mean(audio_chunk ** 2))
277
+
278
+ # Also check peak amplitude
279
+ peak_amplitude = np.max(np.abs(audio_chunk))
280
+
281
+ is_speech = rms_energy > self.silence_threshold or peak_amplitude > (self.silence_threshold * 2)
282
+
283
+ logger.debug(f"VAD check - RMS: {rms_energy:.4f}, Peak: {peak_amplitude:.4f}, "
284
+ f"Threshold: {self.silence_threshold}, Speech: {is_speech}")
285
+
286
+ return is_speech
287
+
288
+ def get_chunk_for_processing(self):
289
+ """Get audio chunk for processing"""
290
+ with self.lock:
291
+ if len(self.buffer) < self.chunk_samples:
292
+ return None
293
+
294
+ logger.debug(f"Returning processing signal, buffer size: {len(self.buffer)}")
295
+ return np.array([1]) # Signal that chunk is ready
296
+
297
+ def get_all_audio(self):
298
+ """Get all accumulated audio"""
299
+ with self.lock:
300
+ audio_copy = self.total_audio.copy()
301
+ logger.debug(f"Returning {len(audio_copy)} total audio samples")
302
+ return audio_copy
303
+
304
+ def clear(self):
305
+ with self.lock:
306
+ self.buffer = np.array([], dtype=np.float32)
307
+ self.total_audio = np.array([], dtype=np.float32)
308
+ logger.debug("Audio buffer cleared")
309
+
310
+ def reset_for_new_segment(self):
311
+ """Reset buffers for new transcription segment"""
312
+ with self.lock:
313
+ self.buffer = np.array([], dtype=np.float32)
314
+ self.total_audio = np.array([], dtype=np.float32)
315
+ logger.debug("Audio buffer reset for new segment")
316
+
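+ # Sizing sketch (illustrative): at 8 kHz with chunk_duration=1.0,
+ # chunk_samples = 8000, so has_chunk_ready() turns true after one second
+ # of audio has accumulated:
+ # buf = JambonzAudioBuffer(sample_rate=8000, chunk_duration=1.0)
+ # buf.add_audio(np.zeros(8000, dtype=np.float32))
+ # assert buf.has_chunk_ready()
+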
317
+ def linear16_to_audio(audio_bytes, sample_rate=8000):
318
+ """Convert LINEAR16 PCM bytes to numpy array"""
319
+ try:
320
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
321
+ audio_array = audio_array.astype(np.float32) / 32768.0
322
+ return audio_array
323
+ except Exception as e:
324
+ logger.error(f"Error converting LINEAR16 to audio: {e}")
325
+ return np.array([], dtype=np.float32)
326
+
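+ # Round-trip sketch (illustrative; sample values hypothetical): 16-bit
+ # little-endian PCM bytes in, float32 in [-1.0, 1.0) out:
+ # pcm = np.array([0, 16384, -32768], dtype=np.int16).tobytes()
+ # linear16_to_audio(pcm)  # -> array([ 0. ,  0.5, -1. ], dtype=float32)
+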
327
+ from scipy.signal import resample_poly
328
+
329
+ # def resample_audio(audio_data, source_rate, target_rate):
330
+ # """High-quality resampling using polyphase resampler."""
331
+ # if source_rate == target_rate:
332
+ # return audio_data.astype(np.float32)
333
+ # # convert float32 [-1..1] to float32 still, but resample
334
+ # gcd = np.gcd(source_rate, target_rate)
335
+ # up = target_rate // gcd
336
+ # down = source_rate // gcd
337
+ # # resample_poly expects 1D numpy array
338
+ # try:
339
+ # resampled = resample_poly(audio_data, up, down).astype(np.float32)
340
+ # return resampled
341
+ # except Exception as e:
342
+ # logger.warning(f"resample_audio fallback: {e}")
343
+ # # last-resort simple repeat (keep previous behavior) but warn
344
+ # if source_rate == 8000 and target_rate == 16000:
345
+ # return np.repeat(audio_data, 2).astype(np.float32)
346
+ # return audio_data.astype(np.float32)
347
+
348
+ import numpy as np
349
+ from scipy.signal import resample_poly, butter, lfilter
350
+ import webrtcvad
351
+ import noisereduce as nr
352
+
353
+ # Initialize WebRTC VAD once (0..3, higher = more aggressive/noisy environments)
354
+ _vad = webrtcvad.Vad(2)
355
+
356
+ def resample_audio(audio_data, source_rate, target_rate=16000,
357
+ lowcut=80.0, highcut=7600.0,
358
+ frame_ms=30, required_ratio=0.55):
359
+ """
360
+ Resample -> Bandpass filter -> Noise reduction -> WebRTC VAD speech detection.
361
+
362
+ Returns:
363
+ processed_audio (np.ndarray float32): cleaned/resampled audio
364
+ is_speech (bool): True if VAD detects speech
365
+ """
366
+
367
+ # --- Resample ---
368
+ if source_rate != target_rate:
369
+ gcd = np.gcd(source_rate, target_rate)
370
+ up = target_rate // gcd
371
+ down = source_rate // gcd
372
+ try:
373
+ audio_data = resample_poly(audio_data, up, down).astype(np.float32)
374
+ except Exception:
375
+ audio_data = np.repeat(audio_data, int(target_rate/source_rate)).astype(np.float32)
376
+ else:
377
+ audio_data = audio_data.astype(np.float32)
378
+
379
+ # --- Bandpass filter (speech range) ---
380
+ try:
381
+ nyq = 0.5 * target_rate
382
+ low = lowcut / nyq
383
+ high = highcut / nyq
384
+ b, a = butter(4, [low, high], btype='band')
385
+ audio_data = lfilter(b, a, audio_data).astype(np.float32)
386
+ except Exception:
387
+ pass
388
+
389
+ # --- Noise reduction ---
390
+ try:
391
+ if len(audio_data) >= int(0.25 * target_rate):
392
+ noise_clip = audio_data[:int(0.25 * target_rate)]
393
+ audio_data = nr.reduce_noise(y=audio_data, y_noise=noise_clip, sr=target_rate).astype(np.float32)
394
+ except Exception:
395
+ pass
396
+
397
+ # --- WebRTC VAD ---
398
+ def frame_generator(frame_ms, audio, sample_rate):
399
+ n = int(sample_rate * (frame_ms / 1000.0))
400
+ if len(audio) < n:
401
+ return
402
+ offset = 0
403
+ while offset + n <= len(audio):
404
+ frame = audio[offset:offset+n]
405
+ yield (frame * 32767).astype(np.int16).tobytes()
406
+ offset += n
407
+
408
+ frames = list(frame_generator(frame_ms, audio_data, target_rate))
409
+ voiced = 0
410
+ for f in frames:
411
+ try:
412
+ if _vad.is_speech(f, target_rate):
413
+ voiced += 1
414
+ except Exception:
415
+ pass
416
+ ratio = voiced / max(1, len(frames))
417
+ is_speech = ratio >= required_ratio
418
+
419
+ return audio_data, is_speech
420
+
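+ # Worked example of the ratio above (illustrative): for 8 kHz -> 16 kHz,
+ # gcd(8000, 16000) = 8000, so up = 2 and down = 1 and resample_poly doubles
+ # the sample count with anti-aliasing. The VAD then votes over 30 ms frames
+ # (480 samples at 16 kHz) and declares speech when >= 55% of frames are voiced:
+ # audio_16k, speech = resample_audio(np.zeros(8000, dtype=np.float32), 8000, 16000)
+ # len(audio_16k)  # -> 16000
+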
421
+ def transcribe_with_nemo(audio_data, source_sample_rate=8000, target_sample_rate=16000):
422
+ """Transcribe audio using NeMo FastConformer"""
423
+ try:
424
+ if len(audio_data) == 0 or asr_model_nemo is None:
425
+ return ""
426
+
427
+ # Resample to 16kHz (NeMo models typically expect 16kHz)
428
+ resampled_audio, has_speech = resample_audio(audio_data, source_sample_rate, target_sample_rate)
429
+
430
+ if has_speech:
431
+ print("Speech detected, sending to ASR...")
432
+ # Skip very short audio
433
+ min_samples = int(0.3 * target_sample_rate)
434
+ if len(resampled_audio) < min_samples:
435
+ return ""
436
+
437
+ start_time = time.time()
438
+
439
+ # Save audio to temporary file (NeMo expects file path)
440
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
441
+ sf.write(tmp_file.name, resampled_audio, target_sample_rate)
442
+ tmp_path = tmp_file.name
443
+
444
+ try:
445
+ # Transcribe with NeMo
446
+ result = asr_model_nemo.transcribe([tmp_path])
447
+
448
+ if result and len(result) > 0:
449
+ # Handle different NeMo result formats
450
+ if hasattr(result[0], 'text'):
451
+ raw_text = result[0].text
452
+ elif isinstance(result[0], str):
453
+ raw_text = result[0]
454
+ else:
455
+ raw_text = str(result[0])
456
+
457
+ if not isinstance(raw_text, str):
458
+ raw_text = str(raw_text)
459
+
460
+ if raw_text and raw_text.strip():
461
+ # Convert Arabic numbers to digits for NeMo
462
+ cleaned_text = replace_arabic_numbers_nemo(raw_text)
463
+ end_time = time.time()
464
+
465
+ if cleaned_text.strip():
466
+ logger.info(f"NeMo transcription: '{cleaned_text}' (processed in {end_time - start_time:.2f}s)")
467
+
468
+ return cleaned_text.strip()
469
+
470
+ finally:
471
+ # Clean up temporary file
472
+ if os.path.exists(tmp_path):
473
+ os.remove(tmp_path)
474
+
475
+ return ""
476
+ else:
477
+ print("Silence/noise, skipping...")
478
+
479
+ except Exception as e:
480
+ logger.error(f"Error during NeMo transcription: {e}")
481
+ return ""
482
+
483
+ def transcribe_with_whisper(audio_data, source_sample_rate=8000, target_sample_rate=16000):
484
+ """Transcribe audio chunk using Whisper model directly"""
485
+ try:
486
+ if len(audio_data) == 0 or whisper_model is None:
487
+ return ""
488
+
489
+ # Resample from 8kHz to 16kHz for Whisper
490
+ resampled_audio, has_speech = resample_audio(audio_data, source_sample_rate, target_sample_rate)
491
+ if has_speech:
492
+ print("Speech detected, sending to ASR...")
493
+ # Ensure minimum length for Whisper
494
+ min_samples = int(0.1 * target_sample_rate) # 100ms minimum
495
+ if len(resampled_audio) < min_samples:
496
+ return ""
497
+
498
+ start_time = time.time()
499
+
500
+ # Prepare input features with proper dtype
501
+ input_features = whisper_processor(
502
+ resampled_audio,
503
+ sampling_rate=target_sample_rate,
504
+ return_tensors="pt"
505
+ ).input_features
506
+
507
+ # Ensure correct dtype and device
508
+ input_features = input_features.to(device=device, dtype=torch_dtype)
509
+
510
+ # Create attention mask to avoid warnings
511
+ attention_mask = torch.ones(
512
+ input_features.shape[:-1],
513
+ dtype=torch.long,
514
+ device=device
515
+ )
516
+
517
+ # Generate transcription using model directly
518
+ with torch.no_grad():
519
+ predicted_ids = whisper_model.generate(
520
+ input_features,
521
+ attention_mask=attention_mask,
522
+ max_new_tokens=128,
523
+ do_sample=False,
524
+ # temperature=0.0,
525
+ num_beams=1,
526
+ language="english",
527
+ task="translate",
528
+ pad_token_id=whisper_tokenizer.pad_token_id,
529
+ eos_token_id=whisper_tokenizer.eos_token_id
530
+ )
531
+
532
+ # Decode the transcription
533
+ transcription = whisper_tokenizer.batch_decode(
534
+ predicted_ids,
535
+ skip_special_tokens=True
536
+ )[0].strip()
537
+
538
+ end_time = time.time()
539
+
540
+ logger.info(f"Whisper transcription completed in {end_time - start_time:.2f}s: '{transcription}'")
541
+ return transcription
542
+ else:
543
+ print("Silence/noise, skipping...")
544
+ except Exception as e:
545
+ logger.error(f"Error during Whisper transcription: {e}")
546
+ return ""
547
+
548
+ class UnifiedSTTHandler:
549
+ def __init__(self, websocket):
550
+ self.websocket = websocket
551
+ self.audio_buffer = None
552
+ self.config = {}
553
+ self.running = False
554
+ self.transcription_task = None
555
+ self.use_nemo = False # Flag to determine which model to use
556
+
557
+ # Auto-final detection variables
558
+ self.interim_count = 0
559
+ self.last_interim_time = None
560
+ self.silence_timeout = 2.9
561
+ self.min_interim_count = 1
562
+ self.auto_final_task = None
563
+ self.accumulated_transcript = ""
564
+ self.final_sent = False
565
+ self.segment_number = 0
566
+ self.last_partial = ""
567
+
568
+ # Processing tracking
569
+ self.processing_count = 0
570
+
571
+ # Audio ingestion with per-second debug logging of accumulated audio
572
+
573
+ async def add_audio_data(self, audio_bytes):
574
+ """Add audio data to buffer with enhanced debugging"""
575
+ if self.audio_buffer and self.running:
576
+ audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
577
+ self.audio_buffer.add_audio(audio_data)
578
+
579
+ model_name = "NeMo" if self.use_nemo else "Whisper"
580
+
581
+ # Debug logging every few audio packets
582
+ if len(audio_data) > 0:
583
+ total_samples = len(self.audio_buffer.get_all_audio())
584
+ total_seconds = total_samples / self.config["sample_rate"]
585
+
586
+ # Log every second of audio
587
+ if int(total_seconds) != getattr(self, '_last_logged_second', -1):
588
+ logger.info(f"{model_name} - Accumulated {total_seconds:.1f}s of audio ({total_samples} samples)")
589
+ self._last_logged_second = int(total_seconds)
590
+
591
+ # Check if we should have chunks ready
592
+ chunk_ready = self.audio_buffer.has_chunk_ready()
593
+ logger.info(f"{model_name} - Chunk ready: {chunk_ready}")
594
+
595
+ async def start_processing(self, start_message):
596
+ """Initialize with start message from jambonz"""
597
+ self.config = {
598
+ "language": start_message.get("language", "ar-EG"),
599
+ "format": start_message.get("format", "raw"),
600
+ "encoding": start_message.get("encoding", "LINEAR16"),
601
+ "sample_rate": start_message.get("sampleRateHz", 8000),
602
+ "interim_results": True, # Always enable for internal processing
603
+ "options": start_message.get("options", {})
604
+ }
605
+
606
+ # Determine which model to use based on language parameter
607
+ language = self.config["language"]
608
+ if language == "ar-EG":
609
+ logger.info("Selected NeMo FastConformer")
610
+ self.use_nemo = True
611
+ model_name = "NeMo FastConformer"
612
+ elif language == "ar-EG-whis":
613
+ logger.info("Selected Whisper large-v3")
614
+ self.use_nemo = False
615
+ model_name = "Whisper large-v3"
616
+ else:
617
+ # Default to NeMo for any other Arabic variant
618
+ self.use_nemo = True
619
+ model_name = "NeMo FastConformer (default)"
620
+
621
+ logger.info(f"STT session started with {model_name} for language: {language}")
622
+ logger.info(f"Config: {self.config}")
623
+
624
+ # Check if selected model is available
625
+ if self.use_nemo and asr_model_nemo is None:
626
+ await self.send_error("NeMo model not available")
627
+ return
628
+ elif not self.use_nemo and whisper_model is None:
629
+ await self.send_error("Whisper model not available")
630
+ return
631
+
632
+ # Initialize audio buffer with model-specific settings
633
+ if self.use_nemo:
634
+ chunk_duration = 1.0 # NeMo processes every 1 second
635
+ else:
636
+ chunk_duration = 2.0 # Whisper processes every 2 seconds for better accuracy
637
+
638
+ self.audio_buffer = JambonzAudioBuffer(
639
+ sample_rate=self.config["sample_rate"],
640
+ chunk_duration=chunk_duration
641
+ )
642
+
643
+ # Adjust VAD threshold for Whisper
644
+ if not self.use_nemo:
645
+ self.audio_buffer.silence_threshold = 0.005 # Lower threshold for Whisper
646
+
647
+ # Reset session variables
648
+ self.running = True
649
+ self.interim_count = 0
650
+ self.last_interim_time = None
651
+ self.accumulated_transcript = ""
652
+ self.final_sent = False
653
+ self.segment_number = 0
654
+ self.processing_count = 0
655
+ self.last_partial = ""
656
+
657
+ # Start background transcription task
658
+ self.transcription_task = asyncio.create_task(self._process_audio_chunks())
659
+
660
+ # Start auto-final detection task
661
+ self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())
662
+
663
+ logger.info(f"Background tasks started for {model_name}")
664
+
665
+
666
+
667
+ async def stop_processing(self):
668
+ """Stop current processing session"""
669
+ logger.info("Stopping STT session...")
670
+ self.running = False
671
+
672
+ # Cancel background tasks
673
+ for task in [self.transcription_task, self.auto_final_task]:
674
+ if task:
675
+ task.cancel()
676
+ try:
677
+ await task
678
+ except asyncio.CancelledError:
679
+ pass
680
+
681
+ # Send final transcription if not already sent
682
+ if not self.final_sent and self.accumulated_transcript.strip():
683
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
684
+
685
+ # Process any remaining audio for comprehensive final transcription
686
+ if self.audio_buffer:
687
+ all_audio = self.audio_buffer.get_all_audio()
688
+ if len(all_audio) > 0 and not self.final_sent:
689
+ loop = asyncio.get_event_loop()
690
+
691
+ if self.use_nemo:
692
+ final_transcription = await loop.run_in_executor(
693
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
694
+ )
695
+ else:
696
+ final_transcription = await loop.run_in_executor(
697
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
698
+ )
699
+
700
+ if final_transcription.strip():
701
+ await self.send_transcription(final_transcription, is_final=True)
702
+
703
+ # Clear audio buffer
704
+ if self.audio_buffer:
705
+ self.audio_buffer.clear()
706
+
707
+ logger.info("STT session stopped")
708
+
709
+ async def start_new_segment(self):
710
+ """Start a new transcription segment"""
711
+ self.segment_number += 1
712
+ self.interim_count = 0
713
+ self.last_interim_time = None
714
+ self.accumulated_transcript = ""
715
+ self.final_sent = False
716
+ self.last_partial = ""
717
+ self.processing_count = 0
718
+
719
+ if self.audio_buffer:
720
+ self.audio_buffer.reset_for_new_segment()
721
+
722
+ logger.info(f"Started new transcription segment #{self.segment_number}")
723
+
724
+
730
+ async def _process_audio_chunks(self):
731
+ """Process audio chunks for interim results - with debugging"""
732
+ model_name = "NeMo" if self.use_nemo else "Whisper"
733
+ logger.info(f"Starting audio chunk processing for {model_name}")
734
+
735
+ chunk_count = 0
736
+
737
+ while self.running:
738
+ try:
739
+ if self.audio_buffer and self.audio_buffer.has_chunk_ready():
740
+ chunk_count += 1
741
+ logger.info(f"{model_name} - Processing chunk #{chunk_count}")
742
+
743
+ chunk_signal = self.audio_buffer.get_chunk_for_processing()
744
+ if chunk_signal is not None:
745
+ all_audio = self.audio_buffer.get_all_audio()
746
+
747
+ logger.info(f"{model_name} - Got {len(all_audio)} samples for processing")
748
+
749
+ if len(all_audio) > 0:
750
+ # Get the latest chunk for VAD check
751
+ latest_chunk_start = max(0, len(all_audio) - self.audio_buffer.chunk_samples)
752
+ latest_chunk = all_audio[latest_chunk_start:]
753
+
754
+ # Check for speech activity
755
+ has_speech = self.audio_buffer.is_speech(latest_chunk)
756
+ logger.info(f"{model_name} - Speech detected: {has_speech}")
757
+
758
+ if has_speech:
759
+ logger.info(f"{model_name} - Starting transcription...")
760
+
761
+ loop = asyncio.get_event_loop()
762
+ start_time = time.time()
763
+
764
+ try:
765
+ # Choose transcription method based on model selection
766
+ if self.use_nemo:
767
+ transcription = await loop.run_in_executor(
768
+ executor, transcribe_with_nemo, all_audio, self.config["sample_rate"]
769
+ )
770
+ else:
771
+ transcription = await loop.run_in_executor(
772
+ executor, transcribe_with_whisper, all_audio, self.config["sample_rate"]
773
+ )
774
+
775
+ process_time = time.time() - start_time
776
+ logger.info(f"{model_name} - Transcription completed in {process_time:.2f}s: '{transcription}'")
777
+
778
+ if transcription and transcription.strip():
779
+ self.processing_count += 1
780
+ self.accumulated_transcript = transcription
781
+
782
+ if transcription != self.last_partial or self.interim_count == 0:
783
+ self.last_partial = transcription
784
+ self.interim_count += 1
785
+ self.last_interim_time = time.time()
786
+ logger.info(f"{model_name} - Updated interim_count to {self.interim_count}")
787
+ else:
788
+ self.last_interim_time = time.time()
789
+ logger.info(f"{model_name} - Same transcription, updating time only")
790
+ else:
791
+ logger.info(f"{model_name} - No transcription result")
792
+
793
+ except Exception as e:
794
+ logger.error(f"{model_name} - Transcription error: {e}")
795
+ import traceback
796
+ traceback.print_exc()
797
+ else:
798
+ logger.debug(f"{model_name} - No speech in chunk")
799
+ else:
800
+ logger.warning(f"{model_name} - Chunk signal was None")
801
+ else:
802
+ # Log why chunk is not ready
803
+ if self.audio_buffer:
804
+ current_size = len(self.audio_buffer.buffer)
805
+ required_size = self.audio_buffer.chunk_samples
806
+ if current_size > 0:
807
+ logger.debug(f"{model_name} - Buffer: {current_size}/{required_size} samples")
808
+
809
+ await asyncio.sleep(0.1)
810
+
811
+ except Exception as e:
812
+ logger.error(f"{model_name} - Error in chunk processing: {e}")
813
+ import traceback
814
+ traceback.print_exc()
815
+ await asyncio.sleep(1)
816
+
817
+ async def _monitor_for_auto_final(self):
818
+ """Monitor for auto-final conditions with model-specific timeouts"""
819
+ model_name = "NeMo" if self.use_nemo else "Whisper"
820
+ timeout = 2.0 if self.use_nemo else 3.0 # Longer timeout for Whisper
821
+
822
+ logger.info(f"Starting auto-final monitoring for {model_name} (timeout: {timeout}s)")
823
+
824
+ while self.running:
825
+ try:
826
+ current_time = time.time()
827
+
828
+ if (self.interim_count >= self.min_interim_count and
829
+ self.last_interim_time is not None and
830
+ (current_time - self.last_interim_time) >= timeout and
831
+ not self.final_sent and
832
+ self.accumulated_transcript.strip()):
833
+
834
+ silence_duration = current_time - self.last_interim_time
835
+ logger.info(f"Auto-final triggered for segment #{self.segment_number} ({model_name}) - "
836
+ f"Interim count: {self.interim_count}, Silence: {silence_duration:.1f}s")
837
+
838
+ await self.send_transcription(self.accumulated_transcript, is_final=True)
839
+ await self.start_new_segment()
840
+
841
+ await asyncio.sleep(0.5) # Check every 500ms
842
+
843
+ except Exception as e:
844
+ logger.error(f"Error in auto-final monitoring: {e}")
845
+ await asyncio.sleep(0.5)
846
+
847
+
848
+
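+ # Worked timing example (illustrative): on the NeMo route, with
+ # min_interim_count = 1 and timeout = 2.0 s, a final fires once at least one
+ # interim exists and 2.0 s pass with no new interim; the 0.5 s polling sleep
+ # bounds detection latency to roughly timeout + 0.5 s.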
849
+ async def send_transcription(self, text, is_final=True, confidence=0.9):
850
+ """Send transcription in jambonz format"""
851
+ try:
852
+ # Apply number conversion only for Whisper
853
+ if not self.use_nemo and is_final:
854
+ original_text = text
855
+ converted_text = convert_arabic_numbers_whisper(text)
856
+
857
+ if original_text != converted_text:
858
+ logger.info(f"Whisper - Arabic numbers converted: '{original_text}' -> '{converted_text}'")
859
+ text = converted_text
860
+
861
+ message = {
862
+ "type": "transcription",
863
+ "is_final": True, # Always send as final
864
+ "alternatives": [
865
+ {
866
+ "transcript": text,
867
+ "confidence": confidence
868
+ }
869
+ ],
870
+ "language": self.config.get("language", "ar-EG"),
871
+ "channel": 1
872
+ }
873
+
874
+ await self.websocket.send(json.dumps(message))
875
+ self.final_sent = True
876
+
877
+ model_name = "NeMo" if self.use_nemo else "Whisper"
878
+ logger.info(f"Sent FINAL transcription ({model_name}): '{text}'")
879
+
880
+ except Exception as e:
881
+ logger.error(f"Error sending transcription: {e}")
882
+
883
+ async def send_error(self, error_message):
884
+ """Send error message in jambonz format"""
885
+ try:
886
+ message = {
887
+ "type": "error",
888
+ "error": error_message
889
+ }
890
+ await self.websocket.send(json.dumps(message))
891
+ logger.error(f"Sent error: {error_message}")
892
+ except Exception as e:
893
+ logger.error(f"Error sending error message: {e}")
894
+
895
+ async def handle_jambonz_websocket(websocket):
896
+ """Handle jambonz WebSocket connections"""
897
+
898
+ client_id = f"jambonz_{id(websocket)}"
899
+ logger.info(f"New unified STT connection: {client_id}")
900
+
901
+ handler = UnifiedSTTHandler(websocket)
902
+
903
+ try:
904
+ async for message in websocket:
905
+ try:
906
+ if isinstance(message, str):
907
+ data = json.loads(message)
908
+ message_type = data.get("type")
909
+
910
+ if message_type == "start":
911
+ logger.info(f"Received start message: {data}")
912
+ await handler.start_processing(data)
913
+
914
+ elif message_type == "stop":
915
+ logger.info("Received stop message - closing WebSocket")
916
+ await handler.stop_processing()
917
+ await websocket.close(code=1000, reason="Session stopped by client")
918
+ break
919
+
920
+ else:
921
+ logger.warning(f"Unknown message type: {message_type}")
922
+ await handler.send_error(f"Unknown message type: {message_type}")
923
+
924
+ else:
925
+ # Handle binary audio data
926
+ if not handler.running or handler.audio_buffer is None:
927
+ logger.warning("Received audio data outside of active session")
928
+ await handler.send_error("Received audio before start message or after stop")
929
+ continue
930
+
931
+ await handler.add_audio_data(message)
932
+
933
+ except json.JSONDecodeError as e:
934
+ logger.error(f"JSON decode error: {e}")
935
+ await handler.send_error(f"Invalid JSON: {str(e)}")
936
+ except Exception as e:
937
+ logger.error(f"Error processing message: {e}")
938
+ await handler.send_error(f"Processing error: {str(e)}")
939
+
940
+ except websockets.exceptions.ConnectionClosed:
941
+ logger.info(f"Unified STT connection closed: {client_id}")
942
+ except Exception as e:
943
+ logger.error(f"Unified STT WebSocket error: {e}")
944
+ try:
945
+ await handler.send_error(str(e))
946
+ except Exception:
947
+ pass
948
+ finally:
949
+ if handler.running:
950
+ await handler.stop_processing()
951
+ logger.info(f"Unified STT connection ended: {client_id}")
952
+
953
+ async def main():
954
+ """Start the Unified Arabic STT WebSocket server"""
955
+ logger.info("Starting Unified Arabic STT WebSocket server on port 3007...")
956
+
957
+ # Check model availability
958
+ models_available = []
959
+ if asr_model_nemo is not None:
960
+ models_available.append("NeMo FastConformer (ar-EG)")
961
+ if whisper_model is not None:
962
+ models_available.append("Whisper large-v3 (ar-EG-whis)")
963
+
964
+ if not models_available:
965
+ logger.error("No models available! Please check model paths and installations.")
966
+ return
967
+
968
+ # Start WebSocket server
969
+ server = await websockets.serve(
970
+ handle_jambonz_websocket,
971
+ "0.0.0.0",
972
+ 3007,
973
+ ping_interval=20,
974
+ ping_timeout=10,
975
+ close_timeout=10
976
+ )
977
+
978
+ logger.info("Unified Arabic STT WebSocket server started on ws://0.0.0.0:3007")
979
+ logger.info("Ready to handle jambonz STT requests with both models")
980
+ logger.info("ROUTING:")
981
+ logger.info("- language: 'ar-EG' → NeMo FastConformer (with built-in number conversion)")
982
+ logger.info("- language: 'ar-EG-whis' → Whisper large-v3 (with pyarabic number conversion)")
983
+ logger.info("FEATURES:")
984
+ logger.info("- Continuous transcription with segmentation")
985
+ logger.info("- Voice Activity Detection")
986
+ logger.info("- Auto-final detection (2s silence timeout)")
987
+ logger.info("- Model-specific number conversion")
988
+ logger.info(f"AVAILABLE MODELS: {', '.join(models_available)}")
989
+
990
+ # Wait for the server to close
991
+ await server.wait_closed()
992
+
993
+ if __name__ == "__main__":
994
+ print("=" * 80)
995
+ print("Unified Arabic STT Server (NeMo + Whisper)")
996
+ print("=" * 80)
997
+ print("WebSocket Port: 3007")
998
+ print("Protocol: jambonz STT API")
999
+ print("Audio Format: LINEAR16 PCM @ 8kHz → 16kHz")
1000
+ print()
1001
+ print("LANGUAGE ROUTING:")
1002
+ print("- 'ar-EG' → NeMo FastConformer")
1003
+ print(" • Built-in Arabic number word to digit conversion")
1004
+ print(" • Optimized for Arabic dialects")
1005
+ print("- 'ar-EG-whis' → Whisper large-v3")
1006
+ print(" • pyarabic library number conversion (final transcripts only)")
1007
+ print(" • OpenAI Whisper model")
1008
+ print()
1009
+ print("FEATURES:")
1010
+ print("- Automatic model selection based on language parameter")
1011
+ print("- Voice Activity Detection")
1012
+ print("- Auto-final detection (2 seconds silence)")
1013
+ print("- Model-specific number conversion strategies")
1014
+ print("- Continuous transcription with segmentation")
1015
+ print()
1016
+
1017
+ # Check model availability for startup info
1018
+ nemo_status = "✓ Available" if asr_model_nemo is not None else "✗ Not Available"
1019
+ whisper_status = "✓ Available" if whisper_model is not None else "✗ Not Available"
1020
+ arabic_numbers_status = "✓ Available" if arabic_numbers_available else "✗ Not Available (install pyarabic)"
1021
+
1022
+ print("MODEL STATUS:")
1023
+ print(f"- NeMo FastConformer: {nemo_status}")
1024
+ print(f"- Whisper large-v3: {whisper_status}")
1025
+ print(f"- pyarabic (Whisper numbers): {arabic_numbers_status}")
1026
+ print("=" * 80)
1027
+
1028
+ try:
1029
+ asyncio.run(main())
1030
+ except KeyboardInterrupt:
1031
+ print("\nShutting down unified server...")
1032
+ except Exception as e:
1033
+ print(f"Server error: {e}")
whisper_checkpoints/models--openai--whisper-large-v2/.no_exist/ae4642769ce2ad8fc292556ccea8e901f1530655/processor_config.json ADDED
File without changes
whisper_checkpoints/models--openai--whisper-large-v2/blobs/1ce74630ed587e80f3db2b3d434f7026327f131e ADDED
@@ -0,0 +1,144 @@
1
+ {
2
+ "_name_or_path": "openai/whisper-large-v2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1280,
15
+ "decoder_attention_heads": 20,
16
+ "decoder_ffn_dim": 5120,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 32,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 20,
22
+ "encoder_ffn_dim": 5120,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 32,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": [
27
+ [
28
+ 1,
29
+ 50259
30
+ ],
31
+ [
32
+ 2,
33
+ 50359
34
+ ],
35
+ [
36
+ 3,
37
+ 50363
38
+ ]
39
+ ],
40
+ "init_std": 0.02,
41
+ "is_encoder_decoder": true,
42
+ "max_length": 448,
43
+ "max_source_positions": 1500,
44
+ "max_target_positions": 448,
45
+ "model_type": "whisper",
46
+ "num_hidden_layers": 32,
47
+ "num_mel_bins": 80,
48
+ "pad_token_id": 50257,
49
+ "scale_embedding": false,
50
+ "suppress_tokens": [
51
+ 1,
52
+ 2,
53
+ 7,
54
+ 8,
55
+ 9,
56
+ 10,
57
+ 14,
58
+ 25,
59
+ 26,
60
+ 27,
61
+ 28,
62
+ 29,
63
+ 31,
64
+ 58,
65
+ 59,
66
+ 60,
67
+ 61,
68
+ 62,
69
+ 63,
70
+ 90,
71
+ 91,
72
+ 92,
73
+ 93,
74
+ 359,
75
+ 503,
76
+ 522,
77
+ 542,
78
+ 873,
79
+ 893,
80
+ 902,
81
+ 918,
82
+ 922,
83
+ 931,
84
+ 1350,
85
+ 1853,
86
+ 1982,
87
+ 2460,
88
+ 2627,
89
+ 3246,
90
+ 3253,
91
+ 3268,
92
+ 3536,
93
+ 3846,
94
+ 3961,
95
+ 4183,
96
+ 4667,
97
+ 6585,
98
+ 6647,
99
+ 7273,
100
+ 9061,
101
+ 9383,
102
+ 10428,
103
+ 10929,
104
+ 11938,
105
+ 12033,
106
+ 12331,
107
+ 12562,
108
+ 13793,
109
+ 14157,
110
+ 14635,
111
+ 15265,
112
+ 15618,
113
+ 16553,
114
+ 16604,
115
+ 18362,
116
+ 18956,
117
+ 20075,
118
+ 21675,
119
+ 22520,
120
+ 26130,
121
+ 26161,
122
+ 26435,
123
+ 28279,
124
+ 29464,
125
+ 31650,
126
+ 32302,
127
+ 32470,
128
+ 36865,
129
+ 42863,
130
+ 47425,
131
+ 49870,
132
+ 50254,
133
+ 50258,
134
+ 50358,
135
+ 50359,
136
+ 50360,
137
+ 50361,
138
+ 50362
139
+ ],
140
+ "torch_dtype": "float32",
141
+ "transformers_version": "4.27.0.dev0",
142
+ "use_cache": true,
143
+ "vocab_size": 51865
144
+ }
whisper_checkpoints/models--openai--whisper-large-v2/blobs/57a1ba2a82c093cabff2541409ae778c97145378b9ddfa722763cb1cb8f9020b ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57a1ba2a82c093cabff2541409ae778c97145378b9ddfa722763cb1cb8f9020b
3
+ size 6173370152
whisper_checkpoints/models--openai--whisper-large-v2/blobs/c2048dfa9fd94a052e62e908d2c4dfb18534b4d2 ADDED
The diff for this file is too large to render. See raw diff
whisper_checkpoints/models--openai--whisper-large-v2/refs/main ADDED
@@ -0,0 +1 @@
1
+ ae4642769ce2ad8fc292556ccea8e901f1530655
whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/config.json ADDED
@@ -0,0 +1,144 @@
1
+ {
2
+ "_name_or_path": "openai/whisper-large-v2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1280,
15
+ "decoder_attention_heads": 20,
16
+ "decoder_ffn_dim": 5120,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 32,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 20,
22
+ "encoder_ffn_dim": 5120,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 32,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": [
27
+ [
28
+ 1,
29
+ 50259
30
+ ],
31
+ [
32
+ 2,
33
+ 50359
34
+ ],
35
+ [
36
+ 3,
37
+ 50363
38
+ ]
39
+ ],
40
+ "init_std": 0.02,
41
+ "is_encoder_decoder": true,
42
+ "max_length": 448,
43
+ "max_source_positions": 1500,
44
+ "max_target_positions": 448,
45
+ "model_type": "whisper",
46
+ "num_hidden_layers": 32,
47
+ "num_mel_bins": 80,
48
+ "pad_token_id": 50257,
49
+ "scale_embedding": false,
50
+ "suppress_tokens": [
51
+ 1,
52
+ 2,
53
+ 7,
54
+ 8,
55
+ 9,
56
+ 10,
57
+ 14,
58
+ 25,
59
+ 26,
60
+ 27,
61
+ 28,
62
+ 29,
63
+ 31,
64
+ 58,
65
+ 59,
66
+ 60,
67
+ 61,
68
+ 62,
69
+ 63,
70
+ 90,
71
+ 91,
72
+ 92,
73
+ 93,
74
+ 359,
75
+ 503,
76
+ 522,
77
+ 542,
78
+ 873,
79
+ 893,
80
+ 902,
81
+ 918,
82
+ 922,
83
+ 931,
84
+ 1350,
85
+ 1853,
86
+ 1982,
87
+ 2460,
88
+ 2627,
89
+ 3246,
90
+ 3253,
91
+ 3268,
92
+ 3536,
93
+ 3846,
94
+ 3961,
95
+ 4183,
96
+ 4667,
97
+ 6585,
98
+ 6647,
99
+ 7273,
100
+ 9061,
101
+ 9383,
102
+ 10428,
103
+ 10929,
104
+ 11938,
105
+ 12033,
106
+ 12331,
107
+ 12562,
108
+ 13793,
109
+ 14157,
110
+ 14635,
111
+ 15265,
112
+ 15618,
113
+ 16553,
114
+ 16604,
115
+ 18362,
116
+ 18956,
117
+ 20075,
118
+ 21675,
119
+ 22520,
120
+ 26130,
121
+ 26161,
122
+ 26435,
123
+ 28279,
124
+ 29464,
125
+ 31650,
126
+ 32302,
127
+ 32470,
128
+ 36865,
129
+ 42863,
130
+ 47425,
131
+ 49870,
132
+ 50254,
133
+ 50258,
134
+ 50358,
135
+ 50359,
136
+ 50360,
137
+ 50361,
138
+ 50362
139
+ ],
140
+ "torch_dtype": "float32",
141
+ "transformers_version": "4.27.0.dev0",
142
+ "use_cache": true,
143
+ "vocab_size": 51865
144
+ }
whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57a1ba2a82c093cabff2541409ae778c97145378b9ddfa722763cb1cb8f9020b
3
+ size 6173370152
whisper_checkpoints/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff