MinaNasser commited on
Commit
3dcada4
·
1 Parent(s): 4395ef9

initial deploy

Browse files
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+
6
+ *.db
7
+
8
+ Whisper-Base-MN-EG-int8
9
+ Whisper-Small-MN-int8
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
3
+ "python-envs.defaultPackageManager": "ms-python.python:conda"
4
+ }
Client/client.html ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>TRANSCRIBE — Live Audio</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@300;400;600&family=Barlow+Condensed:wght@300;500;700&display=swap" rel="stylesheet" />
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+
11
+ :root {
12
+ --bg: #0a0a0a;
13
+ --surface: #111111;
14
+ --border: #222222;
15
+ --accent: #e8ff47;
16
+ --accent2: #ff4747;
17
+ --text: #d4d4d4;
18
+ --muted: #555;
19
+ --mono: 'IBM Plex Mono', monospace;
20
+ --display: 'Barlow Condensed', sans-serif;
21
+ }
22
+
23
+ body {
24
+ background: var(--bg);
25
+ color: var(--text);
26
+ font-family: var(--mono);
27
+ font-size: 13px;
28
+ min-height: 100vh;
29
+ display: flex;
30
+ flex-direction: column;
31
+ }
32
+
33
+ /* ── Header ── */
34
+ header {
35
+ display: flex;
36
+ align-items: center;
37
+ justify-content: space-between;
38
+ padding: 18px 32px;
39
+ border-bottom: 1px solid var(--border);
40
+ position: sticky;
41
+ top: 0;
42
+ background: var(--bg);
43
+ z-index: 10;
44
+ }
45
+
46
+ .logo {
47
+ font-family: var(--display);
48
+ font-weight: 700;
49
+ font-size: 26px;
50
+ letter-spacing: 6px;
51
+ color: #fff;
52
+ text-transform: uppercase;
53
+ }
54
+ .logo span { color: var(--accent); }
55
+
56
+ .status-pill {
57
+ display: flex;
58
+ align-items: center;
59
+ gap: 8px;
60
+ font-size: 11px;
61
+ letter-spacing: 2px;
62
+ text-transform: uppercase;
63
+ color: var(--muted);
64
+ }
65
+ .dot {
66
+ width: 8px; height: 8px;
67
+ border-radius: 50%;
68
+ background: var(--muted);
69
+ transition: background 0.3s, box-shadow 0.3s;
70
+ }
71
+ .dot.live { background: var(--accent2); box-shadow: 0 0 8px var(--accent2); }
72
+ .dot.ready { background: var(--accent); box-shadow: 0 0 8px var(--accent); }
73
+
74
+ /* ── Main layout ── */
75
+ main {
76
+ display: grid;
77
+ grid-template-columns: 340px 1fr;
78
+ flex: 1;
79
+ overflow: hidden;
80
+ }
81
+
82
+ /* ── Sidebar ── */
83
+ aside {
84
+ border-right: 1px solid var(--border);
85
+ padding: 28px 24px;
86
+ display: flex;
87
+ flex-direction: column;
88
+ gap: 28px;
89
+ overflow-y: auto;
90
+ }
91
+
92
+ .section-label {
93
+ font-size: 10px;
94
+ letter-spacing: 3px;
95
+ text-transform: uppercase;
96
+ color: var(--muted);
97
+ margin-bottom: 10px;
98
+ }
99
+
100
+ /* Input */
101
+ .field { display: flex; flex-direction: column; gap: 6px; }
102
+ label { font-size: 10px; letter-spacing: 2px; text-transform: uppercase; color: var(--muted); }
103
+
104
+ input[type="text"] {
105
+ background: var(--surface);
106
+ border: 1px solid var(--border);
107
+ color: var(--text);
108
+ font-family: var(--mono);
109
+ font-size: 13px;
110
+ padding: 10px 14px;
111
+ outline: none;
112
+ transition: border-color 0.2s;
113
+ width: 100%;
114
+ }
115
+ input[type="text"]:focus { border-color: var(--accent); }
116
+
117
+ /* Buttons */
118
+ .btn {
119
+ font-family: var(--display);
120
+ font-weight: 700;
121
+ font-size: 15px;
122
+ letter-spacing: 3px;
123
+ text-transform: uppercase;
124
+ border: none;
125
+ cursor: pointer;
126
+ padding: 12px 20px;
127
+ transition: all 0.15s;
128
+ width: 100%;
129
+ }
130
+ .btn-primary {
131
+ background: var(--accent);
132
+ color: #000;
133
+ }
134
+ .btn-primary:hover { background: #fff; }
135
+ .btn-primary:active { transform: scale(0.98); }
136
+
137
+ .btn-danger {
138
+ background: transparent;
139
+ color: var(--accent2);
140
+ border: 1px solid var(--accent2);
141
+ }
142
+ .btn-danger:hover { background: var(--accent2); color: #fff; }
143
+
144
+ .btn-ghost {
145
+ background: transparent;
146
+ color: var(--text);
147
+ border: 1px solid var(--border);
148
+ font-size: 12px;
149
+ }
150
+ .btn-ghost:hover { border-color: var(--text); }
151
+
152
+ .btn:disabled {
153
+ opacity: 0.35;
154
+ cursor: not-allowed;
155
+ }
156
+
157
+ /* Upload */
158
+ .upload-zone {
159
+ border: 1px dashed var(--border);
160
+ padding: 24px 16px;
161
+ text-align: center;
162
+ cursor: pointer;
163
+ transition: border-color 0.2s, background 0.2s;
164
+ position: relative;
165
+ }
166
+ .upload-zone:hover { border-color: var(--accent); background: rgba(232,255,71,0.03); }
167
+ .upload-zone input { position: absolute; inset: 0; opacity: 0; cursor: pointer; }
168
+ .upload-zone .icon { font-size: 28px; margin-bottom: 8px; }
169
+ .upload-zone p { color: var(--muted); font-size: 11px; line-height: 1.7; }
170
+ .upload-zone .filename { color: var(--accent); margin-top: 6px; font-size: 11px; }
171
+
172
+ /* Visualizer */
173
+ .viz-wrap {
174
+ border: 1px solid var(--border);
175
+ height: 64px;
176
+ display: flex;
177
+ align-items: center;
178
+ justify-content: center;
179
+ overflow: hidden;
180
+ background: var(--surface);
181
+ }
182
+ canvas#viz { width: 100%; height: 100%; display: block; }
183
+
184
+ /* Chunk counter */
185
+ .stats-row {
186
+ display: flex;
187
+ justify-content: space-between;
188
+ border: 1px solid var(--border);
189
+ padding: 12px 16px;
190
+ }
191
+ .stat { display: flex; flex-direction: column; gap: 2px; align-items: center; }
192
+ .stat-val { font-family: var(--display); font-size: 22px; font-weight: 700; color: var(--accent); }
193
+ .stat-key { font-size: 9px; letter-spacing: 2px; text-transform: uppercase; color: var(--muted); }
194
+
195
+ /* ── Transcript panel ── */
196
+ .transcript-panel {
197
+ display: flex;
198
+ flex-direction: column;
199
+ overflow: hidden;
200
+ }
201
+
202
+ .panel-toolbar {
203
+ display: flex;
204
+ align-items: center;
205
+ justify-content: space-between;
206
+ padding: 16px 28px;
207
+ border-bottom: 1px solid var(--border);
208
+ gap: 12px;
209
+ }
210
+ .panel-toolbar h2 {
211
+ font-family: var(--display);
212
+ font-size: 18px;
213
+ font-weight: 500;
214
+ letter-spacing: 4px;
215
+ text-transform: uppercase;
216
+ color: #fff;
217
+ }
218
+ .toolbar-actions { display: flex; gap: 8px; }
219
+ .toolbar-actions .btn { width: auto; padding: 8px 16px; font-size: 11px; }
220
+
221
+ #transcript-container {
222
+ flex: 1;
223
+ overflow-y: auto;
224
+ padding: 28px;
225
+ display: flex;
226
+ flex-direction: column;
227
+ gap: 14px;
228
+ }
229
+
230
+ .empty-state {
231
+ flex: 1;
232
+ display: flex;
233
+ flex-direction: column;
234
+ align-items: center;
235
+ justify-content: center;
236
+ gap: 12px;
237
+ color: var(--muted);
238
+ }
239
+ .empty-state .big-icon { font-size: 48px; opacity: 0.3; }
240
+ .empty-state p { font-size: 11px; letter-spacing: 2px; text-transform: uppercase; }
241
+
242
+ /* Transcript entry */
243
+ .entry {
244
+ display: grid;
245
+ grid-template-columns: 52px 1fr auto;
246
+ gap: 16px;
247
+ align-items: start;
248
+ padding: 16px;
249
+ border: 1px solid var(--border);
250
+ background: var(--surface);
251
+ animation: slideIn 0.25s ease;
252
+ transition: border-color 0.2s;
253
+ }
254
+ .entry:hover { border-color: #333; }
255
+
256
+ @keyframes slideIn {
257
+ from { opacity: 0; transform: translateY(6px); }
258
+ to { opacity: 1; transform: translateY(0); }
259
+ }
260
+
261
+ .entry-chunk {
262
+ font-family: var(--display);
263
+ font-size: 28px;
264
+ font-weight: 700;
265
+ color: var(--border);
266
+ line-height: 1;
267
+ text-align: right;
268
+ padding-top: 2px;
269
+ }
270
+
271
+ .entry-body { display: flex; flex-direction: column; gap: 4px; }
272
+ .entry-text { color: #e8e8e8; line-height: 1.65; font-size: 13px; }
273
+ .entry-meta { display: flex; gap: 10px; color: var(--muted); font-size: 10px; letter-spacing: 1px; }
274
+
275
+ .lang-badge {
276
+ background: #1a1a1a;
277
+ border: 1px solid var(--border);
278
+ padding: 2px 8px;
279
+ font-size: 10px;
280
+ letter-spacing: 1px;
281
+ text-transform: uppercase;
282
+ color: var(--accent);
283
+ }
284
+
285
+ .entry-time {
286
+ font-size: 10px;
287
+ color: var(--muted);
288
+ white-space: nowrap;
289
+ letter-spacing: 1px;
290
+ padding-top: 3px;
291
+ }
292
+
293
+ /* Error toast */
294
+ #toast {
295
+ position: fixed;
296
+ bottom: 28px;
297
+ right: 28px;
298
+ background: var(--accent2);
299
+ color: #fff;
300
+ padding: 12px 20px;
301
+ font-size: 12px;
302
+ letter-spacing: 1px;
303
+ display: none;
304
+ z-index: 100;
305
+ max-width: 340px;
306
+ }
307
+
308
+ /* Scrollbar */
309
+ ::-webkit-scrollbar { width: 4px; }
310
+ ::-webkit-scrollbar-track { background: transparent; }
311
+ ::-webkit-scrollbar-thumb { background: var(--border); }
312
+
313
+ /* REC pulse */
314
+ @keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.3} }
315
+ .pulsing { animation: pulse 1s infinite; }
316
+
317
+ @media (max-width: 768px) {
318
+ main { grid-template-columns: 1fr; }
319
+ aside { border-right: none; border-bottom: 1px solid var(--border); }
320
+ }
321
+ </style>
322
+ </head>
323
+ <body>
324
+
325
+ <header>
326
+ <div class="logo">Trans<span>·</span>cribe</div>
327
+ <div class="status-pill">
328
+ <div class="dot" id="status-dot"></div>
329
+ <span id="status-text">IDLE</span>
330
+ </div>
331
+ </header>
332
+
333
+ <main>
334
+ <!-- ── Sidebar ── -->
335
+ <aside>
336
+
337
+ <!-- Session -->
338
+ <div>
339
+ <div class="section-label">Session</div>
340
+ <div class="field" style="margin-bottom:10px">
341
+ <label for="session-input">Session ID</label>
342
+ <input type="text" id="session-input" placeholder="auto-generated" />
343
+ </div>
344
+ </div>
345
+
346
+ <!-- Live Recording -->
347
+ <div>
348
+ <div class="section-label">Live Recording</div>
349
+ <div class="viz-wrap" style="margin-bottom:10px">
350
+ <canvas id="viz"></canvas>
351
+ </div>
352
+ <div class="stats-row" style="margin-bottom:12px">
353
+ <div class="stat">
354
+ <div class="stat-val" id="chunk-count">0</div>
355
+ <div class="stat-key">Chunks</div>
356
+ </div>
357
+ <div class="stat">
358
+ <div class="stat-val" id="word-count">0</div>
359
+ <div class="stat-key">Words</div>
360
+ </div>
361
+ <div class="stat">
362
+ <div class="stat-val" id="duration">0s</div>
363
+ <div class="stat-key">Duration</div>
364
+ </div>
365
+ </div>
366
+ <button class="btn btn-primary" id="btn-record">▶ START RECORDING</button>
367
+ <div style="height:8px"></div>
368
+ <button class="btn btn-danger" id="btn-stop" disabled>■ STOP</button>
369
+ </div>
370
+
371
+ <!-- File Upload -->
372
+ <div>
373
+ <div class="section-label">File Upload</div>
374
+ <div class="upload-zone" id="upload-zone">
375
+ <input type="file" id="file-input" accept="audio/*,video/*" />
376
+ <div class="icon">⬆</div>
377
+ <p>Drop audio file here<br/>or click to browse</p>
378
+ <div class="filename" id="file-name"></div>
379
+ </div>
380
+ <div style="height:10px"></div>
381
+ <button class="btn btn-ghost" id="btn-upload" disabled>TRANSCRIBE FILE</button>
382
+ </div>
383
+
384
+ <!-- Clear -->
385
+ <div style="margin-top:auto">
386
+ <button class="btn btn-ghost" id="btn-clear">CLEAR TRANSCRIPT</button>
387
+ </div>
388
+
389
+ </aside>
390
+
391
+ <!-- ── Transcript Panel ── -->
392
+ <section class="transcript-panel">
393
+ <div class="panel-toolbar">
394
+ <h2>Transcript</h2>
395
+ <div class="toolbar-actions">
396
+ <button class="btn btn-ghost" id="btn-copy">COPY ALL</button>
397
+ <button class="btn btn-ghost" id="btn-export">EXPORT .TXT</button>
398
+ </div>
399
+ </div>
400
+ <div id="transcript-container">
401
+ <div class="empty-state" id="empty-state">
402
+ <div class="big-icon">🎙</div>
403
+ <p>Start recording or upload a file</p>
404
+ </div>
405
+ </div>
406
+ </section>
407
+ </main>
408
+
409
+ <div id="toast"></div>
410
+
411
+ <script>
412
// ── Config ──────────────────────────────────────────────────────────────
// Point these at your FastAPI server — change API_PORT if the server listens elsewhere.
// The scheme follows the page's own protocol so that a page served over HTTPS
// talks https:// / wss:// (browsers block http:// and ws:// from secure pages).
const API_PORT  = 6060;
const API_HOST  = `${window.location.hostname || 'localhost'}:${API_PORT}`; // hostname is '' under file://
const IS_SECURE = window.location.protocol === 'https:';
const BASE_URL  = `${IS_SECURE ? 'https' : 'http'}://${API_HOST}`;
const WS_BASE   = `${IS_SECURE ? 'wss' : 'ws'}://${API_HOST}`;
const APP_PATH  = '/IntegraAI/voiceapi';
418
+
419
+ // ── State ────────────────────────────────────────────────────────────────
420
+ let ws = null;
421
+ let mediaRecorder = null;
422
+ let audioCtx = null, analyser = null, animId = null;
423
+ let chunkNumber = 0;
424
+ let wordCount = 0;
425
+ let recSeconds = 0;
426
+ let timerHandle = null;
427
+ let entries = [];
428
+
429
+ // ── DOM refs ─────────────────────────────────────────────────────────────
430
+ const sessionInput = document.getElementById('session-input');
431
+ const btnRecord = document.getElementById('btn-record');
432
+ const btnStop = document.getElementById('btn-stop');
433
+ const btnUpload = document.getElementById('btn-upload');
434
+ const btnClear = document.getElementById('btn-clear');
435
+ const btnCopy = document.getElementById('btn-copy');
436
+ const btnExport = document.getElementById('btn-export');
437
+ const fileInput = document.getElementById('file-input');
438
+ const fileName = document.getElementById('file-name');
439
+ const container = document.getElementById('transcript-container');
440
+ const emptyState = document.getElementById('empty-state');
441
+ const statusDot = document.getElementById('status-dot');
442
+ const statusText = document.getElementById('status-text');
443
+ const chunkEl = document.getElementById('chunk-count');
444
+ const wordEl = document.getElementById('word-count');
445
+ const durationEl = document.getElementById('duration');
446
+ const canvas = document.getElementById('viz');
447
+ const ctx2d = canvas.getContext('2d');
448
+ const toast = document.getElementById('toast');
449
+
450
+ // ── Helpers ───────────────────────────────────────────────────────────────
451
/**
 * Build a short pseudo-random session identifier.
 * @returns {string} "ses_" followed by up to eight base-36 characters.
 */
function genSessionId() {
  const randomPart = Math.random().toString(36).substring(2, 10);
  return `ses_${randomPart}`;
}
454
+
455
/**
 * Update the header status pill (dot classes + label).
 * @param {string} state One of 'live', 'ready', 'idle' or 'busy'; any other
 *   value only resets the dot and leaves the label unchanged.
 */
function setStatus(state) {
  // Always start from the neutral dot, then layer the state-specific classes on top.
  statusDot.className = 'dot';

  switch (state) {
    case 'live':
      statusDot.classList.add('live');
      statusText.textContent = 'RECORDING';
      statusDot.classList.add('pulsing');
      break;
    case 'ready':
      statusDot.classList.add('ready');
      statusText.textContent = 'CONNECTED';
      break;
    case 'idle':
      statusText.textContent = 'IDLE';
      break;
    case 'busy':
      statusText.textContent = 'PROCESSING';
      break;
    default:
      break;
  }
}
462
+
463
/**
 * Show a transient message in the toast element.
 * @param {string} msg Text to display.
 * @param {number} [duration=4000] Milliseconds before the toast hides again.
 */
function showToast(msg, duration = 4000) {
  const hide = () => toast.style.display = 'none';

  toast.textContent = msg;
  toast.style.display = 'block';

  // Restart the hide timer so a new message is not cut short by the previous one.
  clearTimeout(toast._t);
  toast._t = setTimeout(hide, duration);
}
469
+
470
/**
 * Append one transcript entry to the panel and update the running word count.
 *
 * All values are written with textContent rather than interpolated into
 * innerHTML, so server-supplied strings (text, language, chunk) can never be
 * interpreted as markup.
 *
 * @param {number|string} chunk Chunk number shown in the left column (zero-padded to 2).
 * @param {string} text Transcribed text; null/undefined is treated as empty.
 * @param {string} language Language code for the badge; falsy values show '??'.
 * @param {string} time Timestamp label shown in the right column.
 */
function addEntry(chunk, text, language, time) {
  const body = text == null ? '' : String(text);

  emptyState.style.display = 'none';
  entries.push({ chunk, text: body, language, time });

  // ''.split(/\s+/) yields [''] (length 1), so empty text must be special-cased.
  const trimmed = body.trim();
  const words = trimmed ? trimmed.split(/\s+/).length : 0;
  wordCount += words;
  wordEl.textContent = wordCount;

  // Small element factory: tag + class + plain-text content.
  const make = (tag, className, content) => {
    const node = document.createElement(tag);
    if (className) node.className = className;
    if (content !== undefined) node.textContent = content;
    return node;
  };

  const meta = make('div', 'entry-meta');
  meta.appendChild(make('span', 'lang-badge', language || '??'));
  meta.appendChild(make('span', '', `${words} word${words !== 1 ? 's' : ''}`));

  const bodyEl = make('div', 'entry-body');
  bodyEl.appendChild(make('div', 'entry-text', body));
  bodyEl.appendChild(meta);

  const el = make('div', 'entry');
  el.appendChild(make('div', 'entry-chunk', String(chunk).padStart(2, '0')));
  el.appendChild(bodyEl);
  el.appendChild(make('div', 'entry-time', time));

  container.appendChild(el);
  container.scrollTop = container.scrollHeight; // keep the newest entry in view
}
494
+
495
/**
 * Escape a value for safe insertion into HTML text or attribute context.
 * Escapes & < > " and '; null/undefined become the empty string and any
 * other non-string value is stringified first.
 * @param {*} s Value to escape.
 * @returns {string} The escaped string.
 */
function escapeHtml(s) {
  const replacements = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const str = s == null ? '' : String(s);
  return str.replace(/[&<>"']/g, (ch) => replacements[ch]);
}
498
+
499
/**
 * Current wall-clock time formatted by the en-US locale in 24-hour form.
 * @returns {string} Time-of-day label used on transcript entries.
 */
function nowTime() {
  const formatOptions = { hour12: false };
  const now = new Date();
  return now.toLocaleTimeString('en-US', formatOptions);
}
502
+
503
+ // ── Visualizer ────────────────────────────────────────────────────────────
504
/**
 * Start drawing a live frequency-bar visualisation of `stream` on the canvas.
 * Lazily creates the shared AudioContext, attaches a fresh analyser to the
 * stream and schedules a requestAnimationFrame loop (handle kept in `animId`).
 * @param {MediaStream} stream Microphone stream to visualise.
 */
function startViz(stream) {
  if (!audioCtx) {
    const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
    audioCtx = new AudioContextCtor();
  }

  analyser = audioCtx.createAnalyser();
  analyser.fftSize = 128;

  const micSource = audioCtx.createMediaStreamSource(stream);
  micSource.connect(analyser);

  const levels = new Uint8Array(analyser.frequencyBinCount);

  const renderFrame = () => {
    // Schedule the next frame first so the loop keeps running until cancelled.
    animId = requestAnimationFrame(renderFrame);

    // Re-sync the drawing buffer with the element's CSS size on every frame.
    canvas.width = canvas.offsetWidth;
    canvas.height = canvas.offsetHeight;

    analyser.getByteFrequencyData(levels);
    ctx2d.clearRect(0, 0, canvas.width, canvas.height);

    const barWidth = canvas.width / levels.length;
    for (let i = 0; i < levels.length; i++) {
      const ratio = levels[i] / 255;
      const barHeight = ratio * canvas.height;
      ctx2d.fillStyle = `rgba(232,255,71,${0.3 + ratio * 0.7})`;
      ctx2d.fillRect(i * barWidth, canvas.height - barHeight, barWidth - 1, barHeight);
    }
  };

  renderFrame();
}
526
+
527
/**
 * Stop the visualiser: cancel the pending animation frame (if any) and
 * wipe the canvas.
 */
function stopViz() {
  if (animId) {
    cancelAnimationFrame(animId);
  }
  const { width, height } = canvas;
  ctx2d.clearRect(0, 0, width, height);
}
531
+
532
+ // ── WebSocket recording ────────────────────────────────────────────────────
533
/**
 * Start a live recording session.
 *
 * Acquires the microphone, opens a WebSocket to
 * `${WS_BASE}${APP_PATH}/ws/<sessionId>` and, once connected, records the
 * stream in back-to-back CHUNK_MS cycles. Each cycle uses a fresh
 * MediaRecorder so every blob sent is a complete, self-contained WebM file.
 * Transcripts arriving on the socket are appended via addEntry().
 *
 * Every callback closes over this session's own `socket`, `stream` and
 * recorder, so timers or events left over from a previous session can never
 * act on a newer one. The globals `ws`, `mediaRecorder` and `timerHandle`
 * are still assigned because stopRecording() relies on them.
 *
 * @returns {Promise<void>}
 */
async function startRecording() {
  const sessionId = sessionInput.value.trim() || genSessionId();
  sessionInput.value = sessionId;

  // ── Microphone ──
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    showToast('Microphone access denied or unavailable.');
    return;
  }
  // Stopping the tracks releases the device (and the browser's recording indicator).
  const releaseMic = () => stream.getTracks().forEach((track) => track.stop());

  // ── WebSocket ──
  let socket;
  try {
    // Encode the id: it is user-editable and would otherwise be able to alter the path.
    socket = new WebSocket(`${WS_BASE}${APP_PATH}/ws/${encodeURIComponent(sessionId)}`);
  } catch (err) {
    releaseMic();
    showToast('WebSocket error — check server.');
    return;
  }
  socket.binaryType = 'arraybuffer';
  ws = socket;

  let sessionTimer = null;

  socket.onopen = () => {
    setStatus('live');
    btnRecord.disabled = true;
    btnStop.disabled = false;
    chunkNumber = 0;
    recSeconds = 0;

    const CHUNK_MS = 10000; // 10 seconds per chunk

    function startChunk() {
      if (socket.readyState !== WebSocket.OPEN) return;

      let recorder;
      try {
        recorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
      } catch (err) {
        // The constructor throws when the browser cannot record this MIME type.
        showToast('Recording format audio/webm is not supported by this browser.');
        socket.close();
        return;
      }
      mediaRecorder = recorder;

      const parts = [];
      let cycleTimer = null;

      recorder.ondataavailable = (e) => {
        if (e.data.size > 0) parts.push(e.data);
      };

      recorder.onstop = () => {
        clearTimeout(cycleTimer);
        if (parts.length && socket.readyState === WebSocket.OPEN) {
          // Complete WebM per cycle: own EBML header, timestamps reset to 0
          socket.send(new Blob(parts, { type: 'audio/webm' }));
          chunkNumber++;
          chunkEl.textContent = chunkNumber;
        }
        if (socket.readyState === WebSocket.OPEN) startChunk();
      };

      recorder.start();
      // Stop *this* recorder (not whatever the global points at later) after one cycle.
      cycleTimer = setTimeout(() => {
        if (recorder.state === 'recording') recorder.stop();
      }, CHUNK_MS);
    }

    startChunk();

    sessionTimer = setInterval(() => {
      recSeconds++;
      durationEl.textContent = recSeconds < 60
        ? recSeconds + 's'
        : Math.floor(recSeconds / 60) + 'm' + (recSeconds % 60) + 's';
    }, 1000);
    timerHandle = sessionTimer;

    startViz(stream);
  };

  socket.onmessage = (e) => {
    let data;
    try {
      data = JSON.parse(e.data);
    } catch (err) {
      showToast('Server sent an unreadable message.');
      return;
    }
    if (data.error) { showToast('Server: ' + data.error); return; }
    addEntry(data.chunk_number, data.text, data.language, nowTime());
  };

  socket.onerror = () => showToast('WebSocket error — check server.');

  socket.onclose = () => {
    // Runs for user stop, server close and failed connects alike.
    releaseMic();
    clearInterval(sessionTimer);
    if (ws !== socket) return; // a newer session owns the UI now
    stopViz();
    setStatus('idle');
    btnRecord.disabled = false;
    btnStop.disabled = true;
  };
}
604
+
605
/**
 * Stop the current live session and return the UI to its idle state.
 * The socket is closed before the recorder is stopped so the recorder's
 * onstop handler sees a non-open socket and does not begin another chunk.
 */
function stopRecording() {
  // close WS first so onstop doesn't start a new chunk
  if (ws) {
    ws.close();
  }

  const recorderActive = mediaRecorder && mediaRecorder.state !== 'inactive';
  if (recorderActive) {
    mediaRecorder.stop();
  }

  clearInterval(timerHandle);
  stopViz();
  setStatus('idle');

  btnRecord.disabled = false;
  btnStop.disabled = true;
}
614
+
615
+ // ── File upload ────────────────────────────────────────────────────────────
616
// When a file is picked, show its name and enable the "TRANSCRIBE FILE" button.
fileInput.addEventListener('change', () => {
  if (fileInput.files.length === 0) return;

  const picked = fileInput.files[0];
  fileName.textContent = picked.name;
  btnUpload.disabled = false;
});
622
+
623
// Upload the selected file to the /transcribe endpoint and show the result
// as a single transcript entry (sent as chunk_number "0").
btnUpload.addEventListener('click', async () => {
  if (!fileInput.files.length) return;

  const file = fileInput.files[0];
  const sessionId = sessionInput.value.trim() || genSessionId();
  sessionInput.value = sessionId;

  // Multipart body: the file plus the session/chunk identifiers.
  const form = new FormData();
  const fields = [
    ['file', file],
    ['session_id', sessionId],
    ['chunk_number', '0'],
  ];
  for (const [key, value] of fields) {
    form.append(key, value);
  }

  btnUpload.disabled = true;
  btnUpload.textContent = 'UPLOADING…';
  setStatus('busy');

  const endpoint = `${BASE_URL}${APP_PATH}/transcribe`;
  try {
    const resp = await fetch(endpoint, { method: 'POST', body: form });
    if (!resp.ok) {
      throw new Error(`HTTP ${resp.status}`);
    }
    const data = await resp.json();
    addEntry(data.chunk_number, data.text, data.language, nowTime());
    setStatus('idle');
  } catch (err) {
    showToast('Upload failed: ' + err.message);
    setStatus('idle');
  } finally {
    // Restore the button whether the upload succeeded or failed.
    btnUpload.textContent = 'TRANSCRIBE FILE';
    btnUpload.disabled = false;
  }
});
652
+
653
+ // ── Controls ──────────────────────────────────────────────────────────────
654
+ btnRecord.addEventListener('click', startRecording);
655
+ btnStop.addEventListener('click', stopRecording);
656
+
657
// Reset all counters and remove every transcript entry, restoring the
// empty-state placeholder.
btnClear.addEventListener('click', () => {
  entries = [];
  wordCount = 0;
  chunkNumber = 0;
  recSeconds = 0;

  wordEl.textContent = '0';
  chunkEl.textContent = '0';
  durationEl.textContent = '0s';

  // Emptying the container also detaches the placeholder, so re-attach it.
  container.innerHTML = '';
  container.appendChild(emptyState);
  emptyState.style.display = '';
});
664
+
665
// Copy the whole transcript to the clipboard, one "[time][lang] text" line per entry.
// navigator.clipboard only exists in secure contexts (HTTPS or localhost), and
// writeText() can reject (e.g. permission denied), so both cases are reported
// to the user instead of failing silently.
btnCopy.addEventListener('click', async () => {
  const text = entries.map(e => `[${e.time}][${e.language}] ${e.text}`).join('\n');

  if (!navigator.clipboard || !navigator.clipboard.writeText) {
    showToast('Clipboard unavailable — open this page over HTTPS or localhost.');
    return;
  }

  try {
    await navigator.clipboard.writeText(text);
    showToast('Copied to clipboard!', 2000);
  } catch (err) {
    showToast('Copy failed: ' + err.message);
  }
});
669
+
670
// Download the transcript as a plain-text file named after the session.
btnExport.addEventListener('click', () => {
  const text = entries.map(e => `[Chunk ${e.chunk}][${e.time}][${e.language}]\n${e.text}\n`).join('\n');
  const blob = new Blob([text], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.href = url;
  a.download = `transcript_${sessionInput.value || 'session'}.txt`;

  // Some browsers ignore click() on an anchor that is not in the document.
  document.body.appendChild(a);
  a.click();
  a.remove();

  // Object URLs keep the blob alive until revoked; defer so the download can start.
  setTimeout(() => URL.revokeObjectURL(url), 0);
});
678
+
679
+ // ── Auto-generate session on load ─────────────────────────────────────────
680
+ sessionInput.value = genSessionId();
681
+ </script>
682
+
683
+ </body>
684
+ </html>
CustomSTT_COLAB.ipynb ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "## Run First two cells only"
23
+ ],
24
+ "metadata": {
25
+ "id": "jRmIrf0il4AC"
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "source": [
31
+ "# ── 1. Install dependencies ───────────────────────────────────────────────────\n",
32
+ "!pip install -q fastapi uvicorn python-multipart pyngrok faster-whisper\n",
33
+ "\n",
34
+ "# ── 2. Imports ────────────────────────────────────────────────────────────────\n",
35
+ "import os\n",
36
+ "import asyncio\n",
37
+ "import tempfile\n",
38
+ "import threading\n",
39
+ "import time\n",
40
+ "import functools\n",
41
+ "import logging\n",
42
+ "\n",
43
+ "from fastapi import FastAPI, File, UploadFile, HTTPException\n",
44
+ "from fastapi.responses import JSONResponse\n",
45
+ "import uvicorn\n",
46
+ "from faster_whisper import WhisperModel\n",
47
+ "from faster_whisper.audio import decode_audio\n",
48
+ "from pyngrok import ngrok\n",
49
+ "\n",
50
+ "# ── 3. Logging setup ─────────────────────────────────────────────────────────\n",
51
+ "LOG_FILE = \"/content/stt_server.log\"\n",
52
+ "\n",
53
+ "for handler in logging.root.handlers[:]:\n",
54
+ " logging.root.removeHandler(handler)\n",
55
+ "\n",
56
+ "logging.basicConfig(\n",
57
+ " level=logging.INFO,\n",
58
+ " format=\"%(asctime)s [%(levelname)s] %(message)s\",\n",
59
+ " handlers=[\n",
60
+ " logging.FileHandler(LOG_FILE),\n",
61
+ " logging.StreamHandler()\n",
62
+ " ],\n",
63
+ " force=True # ← overrides uvicorn's logger hijack\n",
64
+ ")\n",
65
+ "logger = logging.getLogger(__name__)\n",
66
+ "\n",
67
+ "# ── 4. Ngrok auth ─────────────────────────────────────────────────────────────\n",
68
+ "ngrok.set_auth_token(os.environ[\"NGROK_AUTHTOKEN\"])\n",
69
+ "\n",
70
+ "# ── 5. Load both models ───────────────────────────────────────────────────────\n",
71
+ "logger.info(\"Loading Arabic model...\")\n",
72
+ "arabic_model = WhisperModel(\"MinaNasser/Whisper-Small-MN-int8\", compute_type=\"int8\", device=\"cuda\")\n",
73
+ "\n",
74
+ "logger.info(\"Loading English model...\")\n",
75
+ "english_model = WhisperModel(\"Systran/faster-whisper-large-v3\", compute_type=\"int8\", device=\"cuda\")\n",
76
+ "\n",
77
+ "logger.info(\"✅ Both models loaded.\")\n",
78
+ "\n",
79
+ "# ── 6. FastAPI app ────────────────────────────────────────────────────────────\n",
80
+ "app = FastAPI(title=\"Custom Arabic/English STT\")\n",
81
+ "\n",
82
+ "@app.get(\"/health\")\n",
83
+ "async def health():\n",
84
+ " return {\"status\": \"ok\", \"models\": [\"arabic\", \"english\"]}\n",
85
+ "\n",
86
+ "@app.post(\"/transcribe\")\n",
87
+ "async def transcribe(file: UploadFile = File(...)):\n",
88
+ " with tempfile.NamedTemporaryFile(delete=False, suffix=\".wav\") as tmp:\n",
89
+ " tmp.write(await file.read())\n",
90
+ " tmp_path = tmp.name\n",
91
+ "\n",
92
+ " try:\n",
93
+ " loop = asyncio.get_event_loop()\n",
94
+ "\n",
95
+ " # ── Step 1: detect language\n",
96
+ " def detect():\n",
97
+ " waveform = decode_audio(tmp_path)\n",
98
+ " language, probability, _ = english_model.detect_language(waveform)\n",
99
+ " return language, probability\n",
100
+ "\n",
101
+ " language, probability = await loop.run_in_executor(None, detect)\n",
102
+ " logger.info(f\"Detected language: {language} ({probability:.2f})\")\n",
103
+ "\n",
104
+ " # ── Step 2: route to correct model\n",
105
+ " if language == \"ar\":\n",
106
+ " logger.info(f\"used OUR MODEL\")\n",
107
+ " fn = functools.partial(arabic_model.transcribe, tmp_path, language=\"ar\",vad_filter=True)\n",
108
+ " else:\n",
109
+ " logger.info(f\"used ENG MODEL\")\n",
110
+ " fn = functools.partial(english_model.transcribe, tmp_path, language=language,vad_filter=True)\n",
111
+ "\n",
112
+ " segments, info = await loop.run_in_executor(None, fn)\n",
113
+ " transcript = \" \".join(seg.text for seg in segments)\n",
114
+ "\n",
115
+ " logger.info(f\"Transcript [{info.language}]: {transcript[:80]}...\")\n",
116
+ "\n",
117
+ " return JSONResponse(content={\n",
118
+ " \"text\": transcript,\n",
119
+ " \"language\": info.language,\n",
120
+ " \"language_probability\": info.language_probability,\n",
121
+ " })\n",
122
+ " except Exception as e:\n",
123
+ " logger.error(f\"Transcription failed: {e}\")\n",
124
+ " raise HTTPException(status_code=500, detail=str(e))\n",
125
+ " finally:\n",
126
+ " if os.path.exists(tmp_path):\n",
127
+ " os.unlink(tmp_path)\n",
128
+ "\n",
129
+ "# ── 7. Cleanup & start server ─────────────────────────────────────────────────\n",
130
+ "try:\n",
131
+ " ngrok.kill()\n",
132
+ "except Exception:\n",
133
+ " pass\n",
134
+ "\n",
135
+ "!fuser -k 8000/tcp || true\n",
136
+ "time.sleep(1)\n",
137
+ "\n",
138
+ "def run_server():\n",
139
+ " uvicorn.run(app, host=\"0.0.0.0\", port=8000, log_level=\"info\", log_config=None)\n",
140
+ "\n",
141
+ "thread = threading.Thread(target=run_server, daemon=True)\n",
142
+ "thread.start()\n",
143
+ "time.sleep(5)\n",
144
+ "\n",
145
+ "# ── 8. Expose via ngrok ───────────────────────────────────────────────────────\n",
146
+ "public_url = ngrok.connect(8000).public_url\n",
147
+ "logger.info(f\"Public URL: {public_url}\")\n",
148
+ "print(f\"\\n Public URL : {public_url}\")\n",
149
+ "print(f\" Health check: {public_url}/health\")\n",
150
+ "print(f\" Transcribe : {public_url}/transcribe\")\n",
151
+ "print(f\"\\n Set CUSTOM_STT_URL={public_url}\")"
152
+ ],
153
+ "metadata": {
154
+ "id": "CETOGHw9ZIzs"
155
+ },
156
+ "execution_count": null,
157
+ "outputs": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "source": [
162
+ "!tail -f /content/stt_server.log"
163
+ ],
164
+ "metadata": {
165
+ "id": "INTTVax9ZLYO"
166
+ },
167
+ "execution_count": null,
168
+ "outputs": []
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "source": [
173
+ "# ── 1. Install dependencies ───────────────────────────────────────────────────\n",
174
+ "!pip install -q fastapi uvicorn python-multipart pyngrok faster-whisper\n",
175
+ "\n",
176
+ "# ── 2. Imports ────────────────────────────────────────────────────────────────\n",
177
+ "import os\n",
178
+ "import asyncio\n",
179
+ "import tempfile\n",
180
+ "import threading\n",
181
+ "import time\n",
182
+ "import functools\n",
183
+ "\n",
184
+ "from fastapi import FastAPI, File, UploadFile, HTTPException\n",
185
+ "from fastapi.responses import JSONResponse\n",
186
+ "import uvicorn\n",
187
+ "from faster_whisper import WhisperModel\n",
188
+ "from pyngrok import ngrok\n",
189
+ "\n",
190
+ "# ── 3. Ngrok auth ─────────────────────────────────────────────────────────────\n",
191
+ "ngrok.set_auth_token(os.environ[\"NGROK_AUTHTOKEN\"])\n",
192
+ "\n",
193
+ "# ── 4. Load model ─────────────────────────────────────────────────────────────\n",
194
+ "MODEL_NAME = \"Systran/faster-whisper-large-v3\"\n",
195
+ "#MODEL_NAME = \"MinaNasser/Whisper-Base-MN-EG-int8\"\n",
196
+ "#MODEL_NAME = \"MinaNasser/Whisper-Small-MN-int8\"\n",
197
+ "model = WhisperModel(MODEL_NAME, compute_type=\"int8\", device=\"cuda\")\n",
198
+ "\n",
199
+ "# ── 5. FastAPI app ────────────────────────────────────────────────────────────\n",
200
+ "app = FastAPI(title=\"Custom Arabic STT\")\n",
201
+ "\n",
202
+ "@app.get(\"/health\")\n",
203
+ "async def health():\n",
204
+ " return {\"status\": \"ok\"}\n",
205
+ "\n",
206
+ "@app.post(\"/transcribe\")\n",
207
+ "async def transcribe(file: UploadFile = File(...)):\n",
208
+ " with tempfile.NamedTemporaryFile(delete=False, suffix=\".wav\") as tmp:\n",
209
+ " tmp.write(await file.read())\n",
210
+ " tmp_path = tmp.name\n",
211
+ "\n",
212
+ " try:\n",
213
+ " loop = asyncio.get_event_loop()\n",
214
+ " # functools.partial lets us pass keyword args through run_in_executor\n",
215
+ " fn = functools.partial(model.transcribe, tmp_path) #, language=\"ar\"\n",
216
+ " segments, info = await loop.run_in_executor(None, fn)\n",
217
+ " transcript = \" \".join(seg.text for seg in segments)\n",
218
+ " return JSONResponse(content={\n",
219
+ " \"text\": transcript,\n",
220
+ " \"language\": info.language,\n",
221
+ " \"language_probability\": info.language_probability,\n",
222
+ " })\n",
223
+ " except Exception as e:\n",
224
+ " raise HTTPException(status_code=500, detail=str(e))\n",
225
+ " finally:\n",
226
+ " if os.path.exists(tmp_path):\n",
227
+ " os.unlink(tmp_path)\n",
228
+ "\n",
229
+ "# ── 6. Cleanup & start server ─────────────────────────────────────────────────\n",
230
+ "try:\n",
231
+ " ngrok.kill()\n",
232
+ "except Exception:\n",
233
+ " pass\n",
234
+ "\n",
235
+ "!fuser -k 8000/tcp || true\n",
236
+ "time.sleep(1)\n",
237
+ "\n",
238
+ "def run_server():\n",
239
+ " uvicorn.run(app, host=\"0.0.0.0\", port=8000, log_level=\"info\")\n",
240
+ "\n",
241
+ "thread = threading.Thread(target=run_server, daemon=True)\n",
242
+ "thread.start()\n",
243
+ "time.sleep(5) # wait for server to be ready\n",
244
+ "\n",
245
+ "# ── 7. Expose via ngrok ───────────────────────────────────────────────────────\n",
246
+ "public_url = ngrok.connect(8000).public_url\n",
247
+ "print(f\"\\n Public URL : {public_url}\")\n",
248
+ "print(f\" Health check: {public_url}/health\")\n",
249
+ "print(f\" Transcribe : {public_url}/transcribe\")\n",
250
+ "print(f\"\\n Set CUSTOM_STT_URL={public_url} (no trailing slash, no /transcribe)\")"
251
+ ],
252
+ "metadata": {
253
+ "id": "QupX525ER_Kw"
254
+ },
255
+ "execution_count": null,
256
+ "outputs": []
257
+ }
258
+ ]
259
+ }
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim Debian with CPython 3.11.
FROM python:3.11-slim

# Install system packages and drop the apt package lists in the same layer
# so they do not end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# HF Spaces runs as a non-root user, this avoids permission issues
RUN useradd -m -u 1000 user
USER user
# pip installs for this user land in ~/.local/bin; put it on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /home/user/app

# Copy and install requirements before the rest of the sources so the
# dependency layer can be reused when only application code changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user . .

# Port the uvicorn server below listens on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
configs.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+
3
class Settings(BaseSettings):
    """Application settings read from the environment and the ``.env`` file.

    Fields without a default are required; instantiation fails when they
    are missing from both the environment and ``.env``.
    """

    # pydantic-settings v2 configuration (replaces the deprecated inner
    # ``class Config``).
    model_config = SettingsConfigDict(env_file=".env")

    APP_NAME: str
    APP_VERSION: str
    APP_VARIENT: str

    INFERENCE_TYPE: str = "local"  # local or remote

    DATABASE_URL: str = "sqlite+aiosqlite:///./app.db"

    LOCAL_INFERENCE_MODEL_SIZE: str = "small"  # small , base

    # One of the keys registered in STTFactory:
    # mistral , groq , deepgram , assemblyai , custom
    REMOTE_INFERENCE_PROVIDER: str = "mistral"
    MISTRAL_MODEL: str = "voxtral-mini-latest"
    MISTRAL_API_KEY: str
    GROQ_API_KEY: str
    GROQ_MODEL: str = "whisper-large-v3-turbo"
    # SECURITY: credentials must never be committed as defaults. Supply
    # these through the environment / .env. Keys that were previously
    # hard-coded here should be treated as leaked and rotated.
    DEEPGRAM_API_KEY: str = ""
    DEEPGRAM_MODEL: str = "nova-3"
    ASSEMBLYAI_API_KEY: str = ""
    # Base URL of the Colab-hosted STT server: no trailing slash and no
    # "/transcribe" suffix, because CustomSTT appends "/transcribe" itself.
    CUSTOM_STT_URL: str = "https://elwanda-agnathous-tragically.ngrok-free.dev"
26
+
27
def get_settings() -> Settings:
    """Build and return a new ``Settings`` instance.

    Callers read values as attributes, e.g. ``get_settings().APP_NAME``.
    """
    return Settings()
controllers/TranscriptionController.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from configs import get_settings
2
+ import asyncio
3
+ # import librosa
4
+ # import numpy as np
5
+ from stores.sttremotes import STTRemoteManager
6
+ from faster_whisper.audio import decode_audio # handles webm natively
7
+
8
class TranscriptionController:
    """Dispatch an audio file to local Whisper models or a remote STT provider.

    Every public ``transcribe_*`` coroutine resolves to a ``(text, language)``
    pair; ``text`` is ``None`` when the audio was skipped.
    """

    # Timestamp of the most recent remote request slot. It lives on the class
    # because the routes build a new controller for every request, so a
    # per-instance timestamp would never throttle anything.
    _last_request_time = 0.0

    def __init__(self, models, logger, remotename):
        """Store collaborators.

        Args:
            models: mapping of ``"<size>_<language>"`` to loaded Whisper models.
            logger: logger used for progress and skip messages.
            remotename: remote provider name; falsy disables remote inference.
        """
        self.settings = get_settings()
        self.models = models
        self.logger = logger
        self.remote_max_request_rate = 60
        self.remotename = remotename
        self.remote = STTRemoteManager(default_provider=remotename) if remotename else None

    async def transcribe_audio(self, audio_path: str):
        """Transcribe ``audio_path`` according to ``settings.INFERENCE_TYPE``.

        Raises:
            ValueError: if ``INFERENCE_TYPE`` is neither "local" nor "remote".
        """
        inference_type = self.settings.INFERENCE_TYPE
        if inference_type == "local":
            return await self.transcribe_local(audio_path)
        if inference_type == "remote":
            return await self.transcribe_remote(audio_path)
        raise ValueError(f"Unsupported INFERENCE_TYPE: {inference_type}")

    def _get_local_model(self, language_suffix: str):
        """Return ``(model_size, model)`` for the configured size and suffix.

        Raises:
            ValueError: if no model is loaded under ``"<size>_<suffix>"``.
        """
        model_size = self.settings.LOCAL_INFERENCE_MODEL_SIZE
        key = f"{model_size}_{language_suffix}"
        model = self.models.get(key)
        if not model:
            raise ValueError(f"Model {key} not available")
        return model_size, model

    async def language_detection(self, audio_path: str):
        """Detect the spoken language with the ``<size>_english`` model.

        Returns:
            ``(language, probability)`` as reported by ``detect_language``.
        """
        model_size, model = self._get_local_model("english")
        self.logger.info(f"Detecting language for {audio_path} with {model_size} model...")

        def process():
            waveform = decode_audio(audio_path)
            language, probability, _ = model.detect_language(waveform)
            return language, probability

        # Decoding and inference are blocking; keep them off the event loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, process)

    async def transcribe_local(self, audio_path: str):
        """Detect the language, then transcribe with the matching local model.

        Returns ``(None, language)`` for anything other than "ar" or "en".
        """
        language, probability = await self.language_detection(audio_path)
        if language == "ar":
            self.logger.info(f"Processing Arabic audio with probability {probability:.2f}")
            return await self.transcribe_local_arabic(audio_path)
        if language == "en":
            self.logger.info(f"Processing English audio with probability {probability:.2f}")
            return await self.transcribe_local_english(audio_path)
        self.logger.warning(f"Unsupported language detected: {language}. Skipping transcription.")
        return None, language

    async def _transcribe_with_model(self, audio_path: str, language_suffix: str, language: str):
        """Transcribe with the ``<size>_<language_suffix>`` model, forcing ``language``."""
        model_size, model = self._get_local_model(language_suffix)
        self.logger.info(f"Transcribing {audio_path} with {model_size} model...")

        def process_with_filter():
            segments, info = model.transcribe(
                audio_path,
                beam_size=5,
                best_of=5,
                language=language,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500, threshold=0.3),
            )
            if info.language != language:
                self.logger.info(f"Skipping: Detected {info.language} with prob {info.language_probability:.2f}")
                return None, info.language

            # Iterating the segments happens inside the worker thread as well.
            full_text = " ".join(segment.text for segment in segments)
            return full_text.strip(), info.language

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, process_with_filter)

    async def transcribe_local_arabic(self, audio_path: str):
        """Transcribe ``audio_path`` as Arabic with the ``<size>_arabic`` model."""
        return await self._transcribe_with_model(audio_path, "arabic", "ar")

    async def transcribe_local_english(self, audio_path: str):
        """Transcribe ``audio_path`` as English with the ``<size>_english`` model."""
        return await self._transcribe_with_model(audio_path, "english", "en")

    async def transcribe_remote(self, audio_path: str):
        """Send ``audio_path`` to the remote provider, spacing requests apart.

        Requests are spaced at least ``1 / remote_max_request_rate`` seconds
        apart across all controller instances.

        Raises:
            ValueError: if no remote provider was configured.
        """
        if not self.remote:
            raise ValueError("Remote STT provider not configured")

        min_interval = 1 / self.remote_max_request_rate
        now = asyncio.get_running_loop().time()
        # Reserve the slot before awaiting: there is no suspension point
        # between the read and the write, so concurrent coroutines each get
        # their own slot.
        slot = max(now, TranscriptionController._last_request_time + min_interval)
        TranscriptionController._last_request_time = slot
        if slot > now:
            await asyncio.sleep(slot - now)

        return await self.remote.transcribe_remote(audio_path, self.remotename)
141
+
controllers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .TranscriptionController import TranscriptionController
main.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ import logging
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from contextlib import asynccontextmanager
5
+ from routes import base , db , transcripe
6
+
7
# Root logging configuration for the whole process.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Audio Transcription API")
# NOTE(review): CORS accepts any origin, method and header - confirm this is
# intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the routers defined in the routes package.
app.include_router(base.base_router)
app.include_router(db.db_router)
app.include_router(transcripe.transcripe_router)

# Register DB startup event
db.register_startup_events(app)
# Register the transcription module's startup and shutdown handlers.
transcripe.register_startup_events(app)
transcripe.register_shutdown_events(app)
models/__init__.py ADDED
File without changes
models/database.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
2
+ from sqlalchemy.orm import sessionmaker, declarative_base
3
+ from sqlalchemy import event
4
+ from sqlalchemy.engine import Engine
5
+ from configs import get_settings
6
+
7
+ import os
8
+
9
+
10
# Connection string comes from Settings.DATABASE_URL.
DB_URL= get_settings().DATABASE_URL

# echo=True makes the engine log the SQL it emits.
engine = create_async_engine(DB_URL, echo=True, future=True)
# Factory for AsyncSession objects bound to the engine; expire_on_commit=False
# keeps loaded attribute values readable after a commit.
AsyncSessionLocal = sessionmaker(
    bind=engine, class_=AsyncSession, expire_on_commit=False
)

# Declarative base class for the ORM models.
Base = declarative_base()
19
+
20
async def get_db():
    """Yield an ``AsyncSession``; the ``async with`` block closes it afterwards."""
    async with AsyncSessionLocal() as session:
        yield session
23
+
24
+
25
# Only for SQLite URLs: switch on foreign-key enforcement for every new
# connection, since the pragma is a per-connection setting.
if DB_URL.startswith("sqlite"):
    @event.listens_for(engine.sync_engine, "connect")
    def enable_sqlite_fk(dbapi_connection, connection_record):
        """Run ``PRAGMA foreign_keys=ON`` on a newly opened DBAPI connection."""
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()
models/sessions.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, String, TIMESTAMP
2
+ from sqlalchemy.sql import func
3
+ from .database import Base
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+ from sqlalchemy.future import select
6
+ from sqlalchemy.orm import relationship
7
+
8
+
9
class Session(Base):
    """ORM model for the ``sessions`` table; one session owns many transcriptions."""
    __tablename__ = "sessions"

    # Caller-supplied identifier used as the primary key.
    session_id = Column(String, primary_key=True, index=True)
    # Filled in by the database at insert time.
    created_at = Column(TIMESTAMP(timezone=True), server_default=func.now())
    status = Column(String, default="active")

    # Child rows follow the session's lifecycle; passive_deletes=True pairs
    # with the ON DELETE CASCADE declared on transcriptions.session_id.
    transcriptions = relationship(
        "Transcription",
        back_populates="session",
        cascade="all, delete-orphan",
        passive_deletes=True
    )
22
+
23
+ # CRUD operations
24
async def create_session(db: AsyncSession, session_id: str, status: str = "active"):
    """Insert a session row, commit, and return the refreshed ORM object."""
    record = Session(session_id=session_id, status=status)
    db.add(record)
    await db.commit()
    # Reload so server-side defaults such as created_at are populated.
    await db.refresh(record)
    return record
30
+
31
async def get_session(db: AsyncSession, session_id: str):
    """Return the session with ``session_id``, or ``None`` when it is absent."""
    query = select(Session).where(Session.session_id == session_id)
    rows = await db.execute(query)
    return rows.scalar_one_or_none()
34
+
35
async def get_all_sessions(db: AsyncSession):
    """Return every row of the ``sessions`` table as ORM objects."""
    rows = await db.execute(select(Session))
    every_session = rows.scalars().all()
    return every_session
38
+
39
async def update_session_status(db: AsyncSession, session_id: str, status: str):
    """Set a session's status and return it; returns ``None`` if it is missing."""
    target = await get_session(db, session_id)
    if not target:
        # Nothing to update; hand back the lookup result unchanged.
        return target
    target.status = status
    await db.commit()
    await db.refresh(target)
    return target
46
+
47
async def delete_session(db: AsyncSession, session_id: str):
    """Delete a session and return the deleted object; ``None`` if it is missing."""
    target = await get_session(db, session_id)
    if not target:
        return target
    await db.delete(target)
    await db.commit()
    return target
models/transcriptions.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, TIMESTAMP, ForeignKey
2
+ from sqlalchemy.sql import func
3
+ from sqlalchemy.ext.asyncio import AsyncSession
4
+ from sqlalchemy.future import select
5
+ from .database import Base
6
+ from sqlalchemy.orm import relationship
7
+
8
class Transcription(Base):
    """ORM model for the ``transcriptions`` table: one transcribed audio chunk."""
    __tablename__ = "transcriptions"

    id = Column(Integer, primary_key=True, index=True)
    # Owning session; key updates and deletes cascade from sessions.session_id.
    session_id = Column(String, ForeignKey("sessions.session_id",onupdate='CASCADE',ondelete='CASCADE'))
    chunk_number = Column(Integer)
    text = Column(String)
    language = Column(String)
    # Filled in by the database at insert time.
    created_at = Column(TIMESTAMP(timezone=True), server_default=func.now())

    session = relationship("Session", back_populates="transcriptions")
19
+
20
+
21
+ # CRUD operations
22
async def create_transcription(db: AsyncSession, session_id: str, chunk_number: int, text: str, language: str):
    """Insert a transcription row, commit, and return the refreshed ORM object."""
    fields = {
        "session_id": session_id,
        "chunk_number": chunk_number,
        "text": text,
        "language": language,
    }
    record = Transcription(**fields)
    db.add(record)
    await db.commit()
    # Reload so the generated id and created_at are populated.
    await db.refresh(record)
    return record
33
+
34
async def get_transcriptions_by_session(db: AsyncSession, session_id: str):
    """Return a session's transcriptions in chunk order.

    Without an explicit ORDER BY the database may return rows in any order;
    sorting by ``chunk_number`` (then ``id`` as a tie-breaker) keeps the
    transcript readable and the result deterministic.
    """
    query = (
        select(Transcription)
        .where(Transcription.session_id == session_id)
        .order_by(Transcription.chunk_number, Transcription.id)
    )
    result = await db.execute(query)
    return result.scalars().all()
37
+
38
async def update_transcription_text(db: AsyncSession, transcription_id: int, new_text: str):
    """Replace a transcription's text; returns the row, or ``None`` if missing."""
    lookup = select(Transcription).where(Transcription.id == transcription_id)
    row = (await db.execute(lookup)).scalar_one_or_none()
    if not row:
        return row
    row.text = new_text
    await db.commit()
    await db.refresh(row)
    return row
46
+
47
async def delete_transcription(db: AsyncSession, transcription_id: int):
    """Delete a transcription; returns the deleted row, or ``None`` if missing."""
    lookup = select(Transcription).where(Transcription.id == transcription_id)
    row = (await db.execute(lookup)).scalar_one_or_none()
    if not row:
        return row
    await db.delete(row)
    await db.commit()
    return row
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
2
+ uvicorn[standard]==0.27.0
3
+ python-multipart==0.0.6
4
+ faster-whisper==1.2.1
5
+ websockets==12.0
6
+ pydantic-settings==2.13.0
7
+ SQLAlchemy==2.0.46
8
+ aiosqlite==0.22.1
9
+ librosa==0.11.0
10
+ mistralai==1.12.3
11
+ groq==1.0.0
12
+ aiohttp==3.13.3
13
+ assemblyai==0.58.0
14
+ deepgram-sdk==6.0.1
15
+ asyncpg==0.31.0
routes/__init__.py ADDED
File without changes
routes/base.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter , Depends
2
+ from configs import Settings , get_settings
3
+
4
+
5
# All base routes are served under /<APP_NAME>/<APP_VARIENT>.
base_router = APIRouter(
    prefix=f"/{get_settings().APP_NAME}/{get_settings().APP_VARIENT}",
    tags=["base"])
8
+
9
@base_router.get("/")
async def welcome(app_settings: Settings = Depends(get_settings)):
    """Return the app name, version and the non-secret configuration.

    The settings object carries API keys and the database URL; returning it
    as-is would expose them to any caller, so those fields are left out of
    the ``conf`` payload.
    """
    secret_fields = {
        field_name
        for field_name in type(app_settings).model_fields
        if field_name.endswith("_API_KEY")
    }
    # A database URL may embed credentials.
    secret_fields.add("DATABASE_URL")

    return {
        "app_name": app_settings.APP_NAME,
        "app_version": app_settings.APP_VERSION,
        "conf": app_settings.model_dump(exclude=secret_fields),
    }
15
+
16
@base_router.get("/health")
async def health(app_settings: Settings = Depends(get_settings)):
    """Report the app name and version together with a fixed healthy status."""
    return {
        "app_name": app_settings.APP_NAME,
        "app_version": app_settings.APP_VERSION,
        "status": "healthy",
    }
routes/db.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException
2
+ from sqlalchemy.ext.asyncio import AsyncSession
3
+ from typing import List
4
+ from fastapi import FastAPI
5
+ from configs import get_settings
6
+ from models.database import get_db, Base, engine
7
+ from models.sessions import create_session, get_session, update_session_status, delete_session,get_all_sessions
8
+ from models.transcriptions import create_transcription, get_transcriptions_by_session, update_transcription_text, delete_transcription
9
+
10
# All database routes are served under /<APP_NAME>/<APP_VARIENT>.
db_router = APIRouter(
    prefix=f"/{get_settings().APP_NAME}/{get_settings().APP_VARIENT}",
    tags=["db"])
13
+
14
@db_router.post("/getallsessions/", response_model=List)
async def api_get_all_session(db: AsyncSession = Depends(get_db)):
    """Return every stored session as a list of plain dicts."""
    stored = await get_all_sessions(db)
    return [
        {
            "session_id": item.session_id,
            "created_at": item.created_at,
            "status": item.status,
        }
        for item in stored
    ]
28
+
29
+
30
+
31
@db_router.post("/sessions/", response_model=dict)
async def api_create_session(session_id: str, db: AsyncSession = Depends(get_db)):
    """Create a session.

    Raises:
        HTTPException: 409 when ``session_id`` is already taken; otherwise
            the duplicate primary key would surface as an unhandled database
            error.
    """
    if await get_session(db, session_id):
        raise HTTPException(status_code=409, detail="Session already exists")

    session = await create_session(db, session_id)
    return {"session_id": session.session_id, "status": session.status, "created_at": session.created_at}
35
+
36
@db_router.get("/sessions/{session_id}", response_model=dict)
async def api_get_session(session_id: str, db: AsyncSession = Depends(get_db)):
    """Return one session's id, status and creation time, or 404."""
    found = await get_session(db, session_id)
    if found:
        return {
            "session_id": found.session_id,
            "status": found.status,
            "created_at": found.created_at,
        }
    raise HTTPException(status_code=404, detail="Session not found")
42
+
43
@db_router.patch("/sessions/{session_id}", response_model=dict)
async def api_update_session_status(session_id: str, status: str, db: AsyncSession = Depends(get_db)):
    """Change a session's status and echo the new state, or 404."""
    updated = await update_session_status(db, session_id, status)
    if updated:
        return {"session_id": updated.session_id, "status": updated.status}
    raise HTTPException(status_code=404, detail="Session not found")
49
+
50
@db_router.delete("/sessions/{session_id}", response_model=dict)
async def api_delete_session(session_id: str, db: AsyncSession = Depends(get_db)):
    """Delete a session and confirm, or 404 when it does not exist."""
    removed = await delete_session(db, session_id)
    if removed:
        return {"detail": "Session deleted successfully"}
    raise HTTPException(status_code=404, detail="Session not found")
56
+
57
+
58
@db_router.post("/transcriptions/", response_model=dict)
async def api_create_transcription(session_id: str, chunk_number: int, text: str, language: str, db: AsyncSession = Depends(get_db)):
    """Store one transcription chunk for an existing session.

    Raises:
        HTTPException: 404 when ``session_id`` does not exist; otherwise the
            insert would violate the foreign key and fail with an unhandled
            database error.
    """
    if not await get_session(db, session_id):
        raise HTTPException(status_code=404, detail="Session not found")

    transcription = await create_transcription(db, session_id, chunk_number, text, language)

    return {
        "id": transcription.id,
        "session_id": transcription.session_id,
        "chunk_number": transcription.chunk_number,
        "text": transcription.text,
        "language": transcription.language,
        "created_at": transcription.created_at
    }
70
+
71
@db_router.get("/transcriptions/{session_id}", response_model=List[dict])
async def api_get_transcriptions(session_id: str, db: AsyncSession = Depends(get_db)):
    """List a session's transcriptions; 404 when the lookup returns none."""
    rows = await get_transcriptions_by_session(db, session_id)
    if not rows:
        raise HTTPException(status_code=404, detail="Session not found")

    payload = []
    for row in rows:
        payload.append({
            "id": row.id,
            "session_id": row.session_id,
            "chunk_number": row.chunk_number,
            "text": row.text,
            "language": row.language,
            "created_at": row.created_at,
        })
    return payload
86
+
87
@db_router.patch("/transcriptions/{transcription_id}", response_model=dict)
async def api_update_transcription(transcription_id: int, new_text: str, db: AsyncSession = Depends(get_db)):
    """Replace a transcription's text and echo id and text, or 404."""
    updated = await update_transcription_text(db, transcription_id, new_text)
    if updated:
        return {"id": updated.id, "text": updated.text}
    raise HTTPException(status_code=404, detail="Transcription not found")
96
+
97
@db_router.delete("/transcriptions/{transcription_id}", response_model=dict)
async def api_delete_transcription(transcription_id: int, db: AsyncSession = Depends(get_db)):
    """Delete a transcription and confirm, or 404 when it does not exist."""
    removed = await delete_transcription(db, transcription_id)
    if removed:
        return {"detail": "Transcription deleted successfully"}
    raise HTTPException(status_code=404, detail="Transcription not found")
103
+
104
+
105
async def init_models():
    """Create the tables registered on ``Base.metadata`` using the engine."""
    async with engine.begin() as connection:
        # create_all is synchronous, so it goes through run_sync.
        await connection.run_sync(Base.metadata.create_all)
108
+
109
def register_startup_events(app: FastAPI):
    """Attach a startup handler to ``app`` that runs ``init_models``."""

    async def _create_tables_on_startup():
        await init_models()

    app.on_event("startup")(_create_tables_on_startup)
routes/transcripe.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter ,Depends ,FastAPI ,WebSocket, WebSocketDisconnect, UploadFile, File
2
+ from configs import get_settings
3
+ import os
4
+ import tempfile
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+ from faster_whisper import WhisperModel
7
+ import logging
8
+ from controllers.TranscriptionController import TranscriptionController
9
+ import uuid
10
+ from models.sessions import create_session, get_session
11
+ from models.transcriptions import create_transcription
12
+ from models.database import get_db
13
+
14
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# All transcription routes are served under /<APP_NAME>/<APP_VARIENT>.
transcripe_router = APIRouter(
    prefix=f"/{get_settings().APP_NAME}/{get_settings().APP_VARIENT}",
    tags=["transcripe"])
20
+
21
+
22
@transcripe_router.post("/transcribe")
async def transcribe_endpoint(file: UploadFile = File(...),session_id: str = None,chunk_number: int = 0, db: AsyncSession = Depends(get_db)):
    """Transcribe one uploaded audio file and store the result.

    A session id is generated when none is supplied, and the session row is
    created when it does not exist yet. When no text is produced (silence or
    an unsupported language) nothing is stored and the response carries
    ``text: None`` together with the detected language.
    """
    if not session_id or session_id.strip() == "":
        session_id = f"ses_{uuid.uuid4().hex[:8]}"

    if not await get_session(db, session_id):
        await create_session(db, session_id)

    # An upload may arrive without a filename; fall back to no suffix
    # instead of failing inside os.path.splitext.
    suffix = os.path.splitext(file.filename or "")[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(await file.read())
        temp_path = temp_file.name

    try:
        controller = TranscriptionController(models, logger, remotename=get_settings().REMOTE_INFERENCE_PROVIDER)
        text, language = await controller.transcribe_audio(temp_path)
    finally:
        # The temp file was created with delete=False; remove it ourselves.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    if not text:
        return {
            "id": None,
            "session_id": session_id,
            "chunk_number": chunk_number,
            "text": None,
            "language": language,
            "created_at": None
        }

    transcription = await create_transcription(db, session_id, chunk_number, text, language)
    return {
        "id": transcription.id,
        "session_id": transcription.session_id,
        "chunk_number": transcription.chunk_number,
        "text": transcription.text,
        "language": transcription.language,
        "created_at": transcription.created_at
    }
54
+
55
@transcripe_router.websocket("/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str, db: AsyncSession = Depends(get_db)):
    """Transcribe binary audio chunks received over a WebSocket.

    Each binary message is written to a temporary ``.webm`` file, transcribed,
    stored when text was produced, and answered with a JSON message. Chunks
    are numbered from 0 in arrival order for the lifetime of the connection.
    """
    await websocket.accept()

    result = await get_session(db, session_id)
    if not result:
        result = await create_session(db, session_id)

    if not result:
        await websocket.send_json({"error": "Session not found nor created"})
        await websocket.close()
        return

    chunk_number = 0
    logger.info(f"WebSocket connection established for session {session_id}")

    try:
        while True:
            data = await websocket.receive_bytes()
            logger.info(f"Received audio chunk {chunk_number} ({len(data)} bytes)")

            with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_file:
                temp_file.write(data)
                temp_path = temp_file.name

            try:
                controller = TranscriptionController(models, logger, remotename=get_settings().REMOTE_INFERENCE_PROVIDER)
                text, language = await controller.transcribe_audio(temp_path)

                if text:
                    await create_transcription(db, session_id, chunk_number, text, language)
                    await websocket.send_json({
                        "chunk_number": chunk_number,
                        "text": text,
                        "language": language,
                        "session_id": session_id
                    })
                    logger.info(f"Transcribed chunk {chunk_number} ({language}): {text[:50]}...")
                else:
                    logger.info(f"Chunk {chunk_number} ignored (Detected: {language})")

                chunk_number += 1
            finally:
                # The temp file was created with delete=False; remove it ourselves.
                if os.path.exists(temp_path):
                    os.remove(temp_path)

    except WebSocketDisconnect:
        logger.info(f"WebSocket disconnected for session {session_id}")
    except Exception as e:
        logger.error(f"Error in WebSocket: {str(e)}")
        # The failure may have left the socket unusable; reporting the error
        # must not raise a second exception out of the handler.
        try:
            await websocket.send_json({"error": str(e)})
            await websocket.close()
        except (RuntimeError, WebSocketDisconnect):
            logger.info(f"WebSocket for session {session_id} already closed")
107
+
108
# Loaded Whisper models keyed by "<size>_<language>".
models={}

async def load_models():
    """Load the Arabic and English Whisper models for the configured size.

    Nothing is loaded unless INFERENCE_TYPE is "local" and the model size is
    "small" or "base".
    """
    print("Loading Whisper models...")

    # size -> (Arabic model path, English model name)
    checkpoints = {
        "small": ("Whisper-Small-MN-int8", "small"),
        "base": ("Whisper-Base-MN-EG-int8", "base"),
    }
    if get_settings().INFERENCE_TYPE == "local":
        size = get_settings().LOCAL_INFERENCE_MODEL_SIZE
        if size in checkpoints:
            arabic_path, english_name = checkpoints[size]
            models[f"{size}_arabic"] = WhisperModel(arabic_path, device="cpu", compute_type="int8")
            models[f"{size}_english"] = WhisperModel(english_name, device="cpu", compute_type="int8")

    print("Models loaded successfully ")
    print(f"Loaded models: {list(models.keys())}")
122
+
123
+
124
def register_startup_events(app: FastAPI):
    """Attach a startup handler to ``app`` that runs ``load_models``."""

    async def _load_models_on_startup():
        await load_models()

    app.on_event("startup")(_load_models_on_startup)
128
+
129
def register_shutdown_events(app: FastAPI):
    """Attach a shutdown handler to ``app`` that empties the model registry."""

    async def _unload_models_on_shutdown():
        models.clear()
        print("Models unloaded")

    app.on_event("shutdown")(_unload_models_on_shutdown)
stores/providers/aaistt.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import assemblyai as aai
3
+ from configs import get_settings
4
+
5
settings = get_settings()
# The AssemblyAI SDK reads its API key from this module-level setting.
aai.settings.api_key = settings.ASSEMBLYAI_API_KEY
# Transcription options shared by every AssemblyAISTT.transcribe call.
config = aai.TranscriptionConfig(
    speech_models=["universal-2","universal-3-pro"],
    language_detection=True, # auto-detect language
    speaker_labels=True, # diarization
)
12
+
13
class AssemblyAISTT:
    """Speech-to-text provider backed by the AssemblyAI SDK."""

    def __init__(self):
        self.client = aai.Transcriber()

    async def transcribe(self, audio_path: str):
        """Transcribe ``audio_path`` and return ``(text, language)``.

        The SDK call is synchronous, so it runs in the default executor.
        ``language`` is "unknown" when the transcript has no language code.

        Raises:
            Exception: if the transcript comes back with an error status.
        """
        event_loop = asyncio.get_event_loop()

        def _blocking_call():
            result = self.client.transcribe(audio_path, config=config)
            if result.status == aai.TranscriptStatus.error:
                raise Exception(f"Transcription failed: {result.error}")
            detected = result.language_code or "unknown"
            return result.text, detected

        return await event_loop.run_in_executor(None, _blocking_call)
stores/providers/customstt.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiohttp
3
+ from configs import get_settings
4
+
5
# Settings snapshot taken when this module is imported.
settings = get_settings()
6
+
7
class CustomSTT:
    """Client for the self-hosted STT server exposed from the Colab notebook."""

    _ENDPOINT_PATH = "/transcribe"

    def __init__(self):
        # The base URL from the Colab notebook (set in your environment/config)
        self.base_url = self._normalize_base_url(settings.CUSTOM_STT_URL)
        self.endpoint = f"{self.base_url}{self._ENDPOINT_PATH}"
        # Optional API key if you added one to the endpoint
        self.api_key = getattr(settings, "CUSTOM_STT_API_KEY", None)

    @staticmethod
    def _normalize_base_url(url: str) -> str:
        """Return ``url`` without a trailing slash or ``/transcribe`` suffix.

        The configured value may be either the server base URL or the full
        endpoint; accepting both prevents ".../transcribe/transcribe".
        """
        base = url.rstrip("/")
        suffix = CustomSTT._ENDPOINT_PATH
        if base.endswith(suffix):
            base = base[: -len(suffix)].rstrip("/")
        return base

    async def transcribe(self, audio_path: str):
        """Send audio file to custom STT server and return (text, language).

        Raises:
            Exception: if the server answers with a non-200 status.
        """
        headers = {"X-API-Key": self.api_key} if self.api_key else {}

        async with aiohttp.ClientSession() as session:
            with open(audio_path, "rb") as f:
                form_data = aiohttp.FormData()
                form_data.add_field(
                    "file",
                    f,
                    filename=os.path.basename(audio_path),
                    content_type="audio/wav"
                )

                # The file must stay open until the request has been sent.
                async with session.post(self.endpoint, data=form_data, headers=headers) as resp:
                    if resp.status != 200:
                        error_text = await resp.text()
                        raise Exception(f"Custom STT error {resp.status}: {error_text}")
                    data = await resp.json()
                    return data["text"], data["language"]
stores/providers/deepgramstt.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ from deepgram import DeepgramClient
4
+ from configs import get_settings
5
+
6
# Settings snapshot taken when this module is imported.
settings = get_settings()
7
+
8
class DeepgramSTT:
    """Speech-to-text provider backed by the Deepgram SDK."""

    def __init__(self):
        self.client = DeepgramClient(api_key=settings.DEEPGRAM_API_KEY)
        self.model = settings.DEEPGRAM_MODEL

    async def transcribe(self, audio_path: str):
        """Transcribe ``audio_path`` and return ``(transcript, language)``.

        The blocking SDK call runs in the default executor. ``language`` is
        read from the response metadata and defaults to "unknown".
        """
        event_loop = asyncio.get_event_loop()

        def _blocking_call():
            with open(audio_path, "rb") as audio_file:
                payload = audio_file.read()

            reply = self.client.listen.v1.media.transcribe_file(
                request=payload,
                model=self.model,
                smart_format=True,
                language=None, # auto-detect
            )

            # First alternative of the first channel holds the transcript.
            first_channel = reply.results.channels[0]
            spoken_text = first_channel.alternatives[0].transcript
            detected = getattr(reply.metadata, "language", "unknown")
            return spoken_text, detected

        return await event_loop.run_in_executor(None, _blocking_call)
stores/providers/groqstt.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ from groq import Groq
4
+ from configs import get_settings
5
+
6
# Settings snapshot and Groq client created once when this module is imported.
settings = get_settings()
client = Groq(api_key=settings.GROQ_API_KEY)
8
+
9
+
10
class GroqSTT:
    """Speech-to-text provider backed by the Groq audio transcription API."""

    def __init__(self):
        self.client = client
        self.model = settings.GROQ_MODEL

    async def transcribe(self, audio_path: str) -> tuple[str, str]:
        """Transcribe ``audio_path`` and return ``(text, language)``.

        The language is always reported as "unknown" because only the text
        is read from the response. The blocking SDK call runs in the default
        executor.
        """

        def sync_transcribe():
            with open(audio_path, "rb") as file:
                transcription = self.client.audio.transcriptions.create(
                    file=(os.path.basename(audio_path), file.read()),
                    model=self.model,
                    response_format="json",
                )
            return transcription.text, "unknown"

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, sync_transcribe)
29
+
stores/providers/mistralstt.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ from mistralai import Mistral
4
+ from configs import get_settings
5
+
6
# Settings snapshot and Mistral client created once when this module is imported.
settings = get_settings()
client = Mistral(api_key=settings.MISTRAL_API_KEY)
8
+
9
class MistralSTT:
    """Speech-to-text provider backed by the Mistral audio transcription API."""

    def __init__(self):
        self.client = client
        self.model = settings.MISTRAL_MODEL

    async def transcribe(self, audio_path: str) -> tuple[str, str]:
        """Transcribe ``audio_path`` and return ``(text, language)``.

        The language is always reported as "unknown" because only the text
        is read from the response. The blocking SDK call runs in the default
        executor.
        """

        def sync_transcribe():
            # The open file handle is passed to the SDK, so the request is
            # made while the file is still open.
            with open(audio_path, "rb") as f:
                response = self.client.audio.transcriptions.complete(
                    model=self.model,
                    file={"content": f, "file_name": os.path.basename(audio_path)}
                )
            return response.text, "unknown"

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, sync_transcribe)
stores/sttremotes.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Type
2
+ from .providers.mistralstt import MistralSTT
3
+ from .providers.groqstt import GroqSTT
4
+ from .providers.deepgramstt import DeepgramSTT
5
+ from .providers.aaistt import AssemblyAISTT
6
+ from .providers.customstt import CustomSTT
7
+
8
class STTFactory:
    """Registry that maps provider names to STT provider classes."""

    _providers: Dict[str, Type] = {
        "mistral": MistralSTT,
        "groq": GroqSTT,
        "deepgram": DeepgramSTT,
        "assemblyai": AssemblyAISTT,
        "custom": CustomSTT,
    }

    @classmethod
    def get_provider(cls, provider_name: str):
        """Return a new instance of the provider registered under ``provider_name``.

        The lookup is case-insensitive.

        Raises:
            ValueError: if no provider is registered under that name.
        """
        key = provider_name.lower()
        if key in cls._providers:
            return cls._providers[key]()
        raise ValueError(f"STT provider '{provider_name}' not found")
23
+
24
+ class STTRemoteManager:
25
+ def __init__(self, default_provider: str = "mistral"):
26
+ self.default_provider = default_provider
27
+
28
+ async def transcribe_remote(self, audio_path: str, provider_name: str = None) -> str:
29
+ provider_name = provider_name or self.default_provider
30
+ provider = STTFactory.get_provider(provider_name)
31
+ return await provider.transcribe(audio_path)