romizone committed on
Commit
6280e64
·
verified ·
1 Parent(s): 11a9e9a

Upload app_gradio.py with huggingface_hub

Files changed (1): app_gradio.py (+1158, -0)
app_gradio.py ADDED
@@ -0,0 +1,1158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
TranscribeAI - Transcription with Speaker Diarization (ZeroGPU)
================================================================
Engine : openai/whisper via transformers pipeline (CUDA ZeroGPU H200)
Speaker : MFCC + Agglomerative Clustering
Language: Indonesian, English, Auto-detect (99 languages)
Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM
Output : SRT, TXT, DOCX
"""

import time
import tempfile
import threading
import torch
import spaces
import gradio as gr
import numpy as np
from datetime import datetime
from pathlib import Path
from transformers import pipeline

# ============================================================
# Config - Single model (small) for fastest startup & simplicity
# ============================================================
MODEL_ID = 'openai/whisper-small'
MODEL_NAME = 'small'

LANGUAGE_MAP = {
    'Auto-detect': None,
    'Indonesian': 'id',
    'English': 'en',
    'Japanese': 'ja',
    'Korean': 'ko',
    'Chinese': 'zh',
    'Arabic': 'ar',
    'French': 'fr',
    'German': 'de',
    'Spanish': 'es',
    'Portuguese': 'pt',
    'Russian': 'ru',
    'Thai': 'th',
    'Vietnamese': 'vi',
    'Malay': 'ms',
    'Hindi': 'hi',
    'Turkish': 'tr',
    'Dutch': 'nl',
    'Italian': 'it',
}

BATCH_SIZE = 16  # A10G 24GB VRAM - safe for whisper-small float16
OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
OUTPUT_DIR.mkdir(exist_ok=True)

# ============================================================
# Load pipeline at MODULE LEVEL (ZeroGPU requirement!)
# Single model = faster startup, no on-demand loading delay
# ============================================================
device = 0 if torch.cuda.is_available() else "cpu"

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    chunk_length_s=30,
    device=device,
    torch_dtype=torch_dtype,
)
print(f" {MODEL_NAME} ready!")


# ============================================================
# Helpers
# ============================================================
def fmt_timestamp(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def fmt_time(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    if h > 0:
        return f"{h:02d}:{m:02d}:{s:02d}"
    return f"{m:02d}:{s:02d}"

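# Quick sanity examples for the two helpers above (illustrative, not executed):
#   fmt_timestamp(3725.5) -> "01:02:05,500"   (SRT style, comma before milliseconds)
#   fmt_time(3725.5)      -> "01:02:05"       fmt_time(65) -> "01:05"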

# ============================================================
# Speaker Diarization (MFCC + Clustering) - CPU
# ============================================================
def perform_diarization(audio_path, segments, num_speakers):
    import librosa
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.preprocessing import StandardScaler

    if not segments or len(segments) < 2:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0
        return segments

    y, sr = librosa.load(str(audio_path), sr=16000, mono=True)

    features = []
    valid_indices = []

    for i, seg in enumerate(segments):
        s0 = int(seg['start'] * sr)
        s1 = min(int(seg['end'] * sr), len(y))
        if s1 <= s0 or s0 >= len(y):
            continue
        chunk = y[s0:s1]
        if len(chunk) < int(sr * 0.3):
            continue

        try:
            # Cap analysis to 3s per segment for speed
            max_samples = int(sr * 3)
            analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk

            # MFCC (13 = industry standard) + delta - sufficient for speaker ID
            mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
            delta = librosa.feature.delta(mfcc)

            # F0 (pitch) - key differentiator between speakers
            f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
            f0c = f0[f0 > 0]
            f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
            f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0

            combined = np.vstack([mfcc, delta])
            vec = np.concatenate([
                np.mean(combined, axis=1),
                np.std(combined, axis=1),
                [f0_mean, f0_std]
            ])
            features.append(vec)
            valid_indices.append(i)
        except Exception:
            continue

    if len(features) < 2:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0
        return segments

    X = np.array(features)
    X_scaled = StandardScaler().fit_transform(X)

    if num_speakers <= 0:
        from sklearn.metrics import silhouette_score
        best_score, best_n = -1, 2
        max_n = min(6, len(X_scaled) - 1)
        for n in range(2, max_n + 1):
            try:
                lbls = AgglomerativeClustering(
                    n_clusters=n, metric='cosine', linkage='average'
                ).fit_predict(X_scaled)
                score = silhouette_score(X_scaled, lbls, metric='cosine')
                if score > best_score:
                    best_score, best_n = score, n
            except Exception:
                pass
        num_speakers = best_n
    else:
        num_speakers = min(num_speakers, len(X_scaled))

    if num_speakers >= 2 and len(X_scaled) >= num_speakers:
        labels = AgglomerativeClustering(
            n_clusters=num_speakers, metric='cosine', linkage='average'
        ).fit_predict(X_scaled)
    else:
        labels = np.zeros(len(X_scaled), dtype=int)

    label_map = {}
    for lbl in labels:
        if lbl not in label_map:
            label_map[lbl] = len(label_map) + 1

    assigns = {}
    for idx, seg_idx in enumerate(valid_indices):
        assigns[seg_idx] = label_map[labels[idx]]

    for i, seg in enumerate(segments):
        if i in assigns:
            seg['speaker'] = f'Speaker {assigns[i]}'
            seg['speaker_id'] = assigns[i] - 1
        else:
            nearest = min(valid_indices, key=lambda x: abs(x - i)) if valid_indices else 0
            seg['speaker'] = f'Speaker {assigns.get(nearest, 1)}'
            seg['speaker_id'] = assigns.get(nearest, 1) - 1

    return segments

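# Each clustered segment is described by a 54-dim voice vector: mean and std of
# 13 MFCCs + 13 deltas (2 * 26 = 52) plus F0 mean/std. A typical output segment
# (illustrative) looks like:
#   {'start': 0.0, 'end': 4.2, 'text': '...', 'speaker': 'Speaker 2', 'speaker_id': 1}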

def merge_consecutive(segments):
    if not segments:
        return segments
    merged = [segments[0].copy()]
    for seg in segments[1:]:
        if seg.get('speaker') == merged[-1].get('speaker'):
            merged[-1]['end'] = seg['end']
            merged[-1]['text'] += ' ' + seg['text']
        else:
            merged.append(seg.copy())
    return merged

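# Example (illustrative): two back-to-back "Speaker 1" segments,
#   [0.0-2.0] "Hello" and [2.0-3.5] "everyone",
# are merged into a single segment [0.0-3.5] "Hello everyone"; a change of
# speaker always starts a new segment.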

# ============================================================
# Export Functions
# ============================================================
def generate_srt(segments, path):
    with open(path, 'w', encoding='utf-8') as f:
        for i, seg in enumerate(segments, 1):
            f.write(f"{i}\n")
            f.write(f"{fmt_timestamp(seg['start'])} --> {fmt_timestamp(seg['end'])}\n")
            sp = seg.get('speaker', '')
            f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")

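# A single SRT cue produced by generate_srt looks like this (illustrative):
#   1
#   00:00:01,000 --> 00:00:03,500
#   [Speaker 1] Hello everyone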

LANG_NAMES = {
    'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
    'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
    'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
    'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
    'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
}


def generate_txt(segments, path, filename='', language='', duration=0):
    with open(path, 'w', encoding='utf-8') as f:
        f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
        if filename:
            f.write(f"File: {filename}\n")
        f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
        f.write(f"Duration: {fmt_time(duration)}\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        speakers = sorted(set(s.get('speaker', '') for s in segments))
        f.write(f"Speakers: {', '.join(speakers)}\n")
        f.write("=" * 60 + "\n\n")
        cur_speaker = None
        for seg in segments:
            sp = seg.get('speaker', '')
            if sp != cur_speaker:
                cur_speaker = sp
                f.write(f"\n[{fmt_time(seg['start'])}] {sp}:\n")
            f.write(f"{seg['text']}\n")


def generate_docx(segments, path, filename='', language='', duration=0):
    from docx import Document
    from docx.shared import Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    colors = {
        0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
        2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
        4: RGBColor(124, 58, 237), 5: RGBColor(219, 39, 119),
    }

    doc = Document()
    style = doc.styles['Normal']
    style.font.name = 'Calibri'
    style.font.size = Pt(11)

    title = doc.add_heading('Transcript', level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    meta = []
    if filename:
        meta.append(('File', filename))
    meta.append(('Language', LANG_NAMES.get(language, language)))
    meta.append(('Duration', fmt_time(duration)))
    meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
    meta.append(('Speakers', ', '.join(speakers)))

    for label, val in meta:
        p = doc.add_paragraph()
        r = p.add_run(f'{label}: ')
        r.bold = True
        r.font.size = Pt(10)
        r.font.color.rgb = RGBColor(100, 100, 100)
        r = p.add_run(val)
        r.font.size = Pt(10)
        p.paragraph_format.space_after = Pt(2)

    doc.add_paragraph('_' * 70)

    for seg in segments:
        p = doc.add_paragraph()
        r = p.add_run(f'[{fmt_time(seg["start"])}] ')
        r.font.size = Pt(9)
        r.font.color.rgb = RGBColor(150, 150, 150)

        sp_id = seg.get('speaker_id', 0)
        sp = seg.get('speaker', 'Speaker 1')
        color = colors.get(sp_id, RGBColor(79, 70, 229))
        r = p.add_run(f'{sp}: ')
        r.bold = True
        r.font.size = Pt(11)
        r.font.color.rgb = color

        r = p.add_run(seg['text'])
        r.font.size = Pt(11)
        p.paragraph_format.space_after = Pt(6)

    doc.save(path)


# ============================================================
# GPU Transcription (ZeroGPU - proven pattern)
# ============================================================
@spaces.GPU(duration=120)
def transcribe_with_gpu(audio_path, language):
    """Run Whisper inference on GPU. Single model, always ready."""
    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language

    result = pipe(
        str(audio_path),
        batch_size=BATCH_SIZE,
        return_timestamps=True,
        generate_kwargs=generate_kwargs,
    )

    # Parse segments
    raw_segments = []
    duration = 0.0

    chunks = result.get("chunks", [])
    if chunks:
        for chunk in chunks:
            text = chunk.get("text", "").strip()
            ts = chunk.get("timestamp", (0, 0))
            start = ts[0] if ts[0] is not None else 0
            end = ts[1] if ts[1] is not None else start + 1
            if end > duration:
                duration = end
            if text:
                raw_segments.append({
                    'start': round(start, 2),
                    'end': round(end, 2),
                    'text': text,
                })
    else:
        full_text = result.get("text", "").strip()
        if full_text:
            raw_segments.append({'start': 0, 'end': 1, 'text': full_text})

    detected_lang = language or "auto"
    return raw_segments, detected_lang, duration

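# transcribe_with_gpu returns (segments, lang, duration); segments is a list of
# dicts shaped like {'start': 12.34, 'end': 15.6, 'text': '...'} (illustrative),
# and duration is taken from the largest chunk end time.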

def apply_vad_filter(segments):
    """Filter out segments that are likely silence/noise (very short + filler)."""
    FILLER = {'', '.', '..', '...', '…', '-', '–', '[Music]', '[music]',
              '(music)', '[Musik]', '[musik]', '♪', '♪♪', '♫'}
    MIN_DURATION = 0.3  # segments shorter than 0.3s are likely noise
    filtered = []
    for seg in segments:
        text = seg['text'].strip()
        seg_dur = seg['end'] - seg['start']
        if text in FILLER:
            continue
        if seg_dur < MIN_DURATION and len(text.split()) <= 1:
            continue
        filtered.append(seg)
    return filtered if filtered else segments  # fallback: return original if all filtered

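# Example (illustrative): a segment whose text is just "♪" or "..." is dropped,
# as is a 0.2 s segment containing a single word; if everything would be dropped,
# the original list is returned unchanged.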

# ============================================================
# Full Pipeline (wired to Gradio)
# ============================================================
def transcribe_full(audio_file, language_name, num_speakers,
                    enable_diarization, enable_vad, progress=gr.Progress()):
    if audio_file is None:
        raise gr.Error("Please upload an audio file first!")

    audio_path = audio_file
    filename = Path(audio_path).name
    lang_code = LANGUAGE_MAP.get(language_name, None)
    num_speakers = int(num_speakers)  # Gradio slider returns float

    t0 = time.time()  # Start timing from here - matches JS timer
    progress(0.05, desc="⏳ Waiting for GPU & processing audio... (may take 30-90 seconds)")

    # 1. Transcribe on GPU
    try:
        segments, detected_lang, duration = transcribe_with_gpu(
            audio_path, lang_code
        )
    except Exception as e:
        raise gr.Error(f"Transcription failed: {str(e)}")

    if not segments:
        raise gr.Error("No text detected from the audio.")

    # 1b. VAD filter - remove silence/filler segments
    if enable_vad:
        segments = apply_vad_filter(segments)

    transcribe_time = time.time() - t0
    progress(0.60, desc=f"✅ Transcription complete ({transcribe_time:.0f}s) - {len(segments)} segments")

    # 2. Speaker Diarization (CPU)
    diarization_note = ""
    if enable_diarization and len(segments) >= 2:
        progress(0.65, desc="🔍 Identifying speakers...")
        try:
            segments = perform_diarization(audio_path, segments, num_speakers)
            segments = merge_consecutive(segments)
        except Exception as e:
            print(f" [Diarization] Error: {e}")
            diarization_note = " ⚠️ (diarization failed, fallback to 1 speaker)"
            for seg in segments:
                seg['speaker'] = 'Speaker 1'
                seg['speaker_id'] = 0
    else:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0

    progress(0.85, desc="📄 Generating output files...")

    # 3. Export
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    base_name = Path(filename).stem

    srt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.srt")
    txt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.txt")
    docx_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.docx")

    generate_srt(segments, srt_path)
    generate_txt(segments, txt_path, filename, detected_lang, duration)
    generate_docx(segments, docx_path, filename, detected_lang, duration)

    progress(0.95, desc="📦 Preparing results...")

    # Build display text
    transcript_lines = []
    speakers_found = set()
    for seg in segments:
        sp = seg.get('speaker', 'Speaker 1')
        speakers_found.add(sp)
        transcript_lines.append(f"[{fmt_time(seg['start'])}] {sp}: {seg['text']}")

    transcript_text = "\n\n".join(transcript_lines)

    total_time = time.time() - t0
    lang_display = detected_lang.upper() if detected_lang else 'AUTO'
    summary = (
        f"**Transcription Complete!**\n\n"
        f"| Info | Details |\n"
        f"|------|--------|\n"
        f"| File | {filename} |\n"
        f"| Audio Duration | {fmt_time(duration)} |\n"
        f"| Language | {lang_display} |\n"
        f"| Model | {MODEL_NAME} (244M) |\n"
        f"| Speakers | {len(speakers_found)} ({', '.join(sorted(speakers_found))}){diarization_note} |\n"
        f"| Segments | {len(segments)} |\n"
        f"| Processing Time | {total_time:.0f} seconds |\n"
        f"| Engine | Whisper + ZeroGPU H200 |"
    )

    progress(1.0, desc="🎉 Done!")
    return summary, transcript_text, srt_path, txt_path, docx_path


# ============================================================
# Cleanup old files (>1 hour)
# ============================================================
def cleanup_loop():
    while True:
        try:
            now = time.time()
            if OUTPUT_DIR.exists():
                for f in OUTPUT_DIR.iterdir():
                    if f.is_file() and (now - f.stat().st_mtime) > 3600:
                        f.unlink(missing_ok=True)
                        print(f" [Cleanup] Deleted: {f.name}")
        except Exception as e:
            print(f" [Cleanup] Error: {e}")
        time.sleep(300)

threading.Thread(target=cleanup_loop, daemon=True).start()


# ============================================================
# Gradio UI
# ============================================================
THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
    font=gr.themes.GoogleFont("Inter"),
).set(
    body_background_fill="#0f0f11",
    body_background_fill_dark="#0f0f11",
    block_background_fill="#1a1a1f",
    block_background_fill_dark="#1a1a1f",
    block_border_color="#333340",
    block_border_color_dark="#333340",
    block_label_text_color="#a0a0b0",
    block_title_text_color="#e8e8ed",
    body_text_color="#e8e8ed",
    body_text_color_dark="#e8e8ed",
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_dark="#6366f1",
    button_primary_text_color="#ffffff",
    input_background_fill="#222228",
    input_background_fill_dark="#222228",
    input_border_color="#333340",
    input_border_color_dark="#333340",
)

CUSTOM_CSS = """
/* Global */
.gradio-container {
    max-width: 960px !important;
    margin: 0 auto !important;
}
footer { display: none !important; }

/* Header */
.header-wrap {
    text-align: center;
    padding: 32px 0 20px;
}
.header-wrap h1 {
    font-size: 32px !important;
    font-weight: 800 !important;
    background: linear-gradient(135deg, #818cf8, #8b5cf6) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    letter-spacing: -0.5px;
    margin-bottom: 6px !important;
}
.header-wrap p {
    color: #a0a0b0 !important;
    font-size: 14px !important;
}
.badge-gpu {
    display: inline-flex;
    align-items: center;
    gap: 6px;
    background: rgba(99,102,241,.12);
    color: #818cf8;
    font-size: 12px;
    padding: 4px 14px;
    border-radius: 20px;
    font-weight: 600;
    margin-top: 8px;
}
.badge-gpu::before {
    content: '';
    width: 7px;
    height: 7px;
    background: #10b981;
    border-radius: 50%;
    display: inline-block;
}

/* Cards */
.card-section {
    background: #1a1a1f !important;
    border: 1px solid #333340 !important;
    border-radius: 14px !important;
    padding: 20px 24px !important;
    margin-bottom: 12px !important;
}
.card-title {
    font-size: 14px !important;
    font-weight: 700 !important;
    color: #e8e8ed !important;
    margin-bottom: 12px !important;
    display: flex;
    align-items: center;
    gap: 8px;
}

/* Primary button */
.btn-start {
    background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
    border: none !important;
    border-radius: 12px !important;
    font-size: 16px !important;
    font-weight: 700 !important;
    padding: 14px 32px !important;
    transition: all 0.2s !important;
    box-shadow: 0 4px 15px rgba(99,102,241,.3) !important;
}
.btn-start:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 6px 20px rgba(99,102,241,.4) !important;
}

/* Settings grid */
.settings-row {
    gap: 8px !important;
}

/* Transcript output */
.transcript-box textarea {
    font-family: 'Inter', 'SF Mono', monospace !important;
    font-size: 13px !important;
    line-height: 1.7 !important;
    background: #16161a !important;
    border-radius: 10px !important;
}

/* Download cards - labels (dark bg) */
.download-row label span,
.download-row .label-wrap span {
    color: #e8e8ed !important;
    font-weight: 700 !important;
}
/* Download cards - file items (white bg -> black bold text) */
.download-row .file-preview,
.download-row .download-file,
.download-row .file-component {
    border-radius: 10px !important;
}
.download-row .file-preview *,
.download-row .download-file *,
.download-row .file-component *,
.download-row a,
.download-row .file-name,
.download-row .file-size {
    color: #111 !important;
    font-weight: 700 !important;
}

/* Result summary */
.summary-box {
    background: #1a1a1f !important;
    border: 1px solid #2a2a35 !important;
    border-radius: 12px !important;
    padding: 16px !important;
}
.summary-box table {
    width: 100% !important;
}
.summary-box td, .summary-box th {
    padding: 6px 12px !important;
    font-size: 13px !important;
    border-bottom: 1px solid #222230 !important;
}

/* Toggle checkboxes */
.toggle-row {
    gap: 24px !important;
}

/* Audio upload area */
.audio-upload {
    border: 2px dashed #333340 !important;
    border-radius: 14px !important;
    transition: all 0.2s !important;
}
.audio-upload:hover {
    border-color: #6366f1 !important;
}

/* How-to steps */
.howto {
    display: flex;
    gap: 16px;
    margin: 12px 0 4px;
    flex-wrap: wrap;
}
.howto-step {
    display: flex;
    align-items: center;
    gap: 8px;
    font-size: 13px;
    color: #a0a0b0;
}
.howto-num {
    width: 24px;
    height: 24px;
    border-radius: 50%;
    background: linear-gradient(135deg, #6366f1, #8b5cf6);
    color: #fff;
    font-size: 12px;
    font-weight: 700;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-shrink: 0;
}

/* Feature tags */
.features {
    display: flex;
    gap: 8px;
    flex-wrap: wrap;
    justify-content: center;
    margin-top: 12px;
}
.feat-tag {
    font-size: 11px;
    padding: 4px 10px;
    border-radius: 6px;
    background: #1a1a1f;
    border: 1px solid #333340;
    color: #a0a0b0;
}

/* Footer */
.footer-text {
    text-align: center;
    padding: 20px 0 8px;
    color: #6a6a7a;
    font-size: 12px;
}
.footer-text a {
    color: #818cf8;
    text-decoration: none;
}

/* ===== FIX: Dropdown text visibility ===== */
/* Selected value text */
.gr-dropdown .wrap .wrap-inner .secondary-wrap,
.gr-dropdown .wrap .wrap-inner .secondary-wrap span,
.gr-dropdown .wrap .wrap-inner input,
.gr-dropdown input,
.dropdown .wrap span,
.dropdown input[type="text"],
div[data-testid="dropdown"] span,
div[data-testid="dropdown"] input {
    color: #e8e8ed !important;
}

/* Dropdown options list */
.gr-dropdown ul[role="listbox"],
.gr-dropdown .options,
.dropdown ul, .dropdown li,
ul[role="listbox"],
li[role="option"],
div[role="option"] {
    color: #e8e8ed !important;
    background-color: #1a1a1f !important;
}
li[role="option"]:hover,
div[role="option"]:hover,
li[role="option"].selected,
li[role="option"][aria-selected="true"] {
    background-color: rgba(99,102,241,.2) !important;
    color: #c7c7ff !important;
}

/* Dropdown container border */
.gr-dropdown .wrap, .dropdown .wrap {
    background: #222228 !important;
    border-color: #333340 !important;
}

/* Dropdown info text */
.gr-dropdown .info-text, .dropdown .info-text,
span[data-testid="info-text"] {
    color: #8888a0 !important;
}

/* ===== FIX: Upload progress visibility ===== */
/* Gradio upload progress bar */
.upload-container .progress-bar,
.uploading .progress-bar,
.file-upload .progress-bar {
    background: #333340 !important;
    border-radius: 6px !important;
    overflow: hidden !important;
}
.upload-container .progress-bar .progress,
.uploading .progress-bar .progress,
.file-upload .progress-bar .progress {
    background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
}

/* Upload progress text */
.upload-container .progress-text,
.uploading .progress-text,
.file-upload-text,
.upload-text,
.eta-bar {
    color: #e8e8ed !important;
    font-weight: 600 !important;
}

/* Gradio's built-in ETA bar */
.eta-bar {
    background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
    opacity: 0.3 !important;
}

/* Progress level / status text */
.progress-level, .progress-level span,
.progress-level .progress-level-inner {
    color: #e8e8ed !important;
    font-size: 13px !important;
}

/* Upload button area */
.upload-button, .upload-button span {
    color: #e8e8ed !important;
    border-color: #6366f1 !important;
}

/* Audio component loading state */
.audio-upload .uploading,
.audio-upload .loading {
    color: #e8e8ed !important;
}

/* Spinner / loading indicator */
.audio-upload .loading svg,
.audio-upload .spinner {
    color: #818cf8 !important;
}

/* ===== Live Timer ===== */
.live-timer {
    display: none;
    align-items: center;
    justify-content: center;
    gap: 10px;
    background: rgba(99,102,241,.08);
    border: 1px solid rgba(99,102,241,.3);
    color: #c7c7ff;
    padding: 12px 24px;
    border-radius: 12px;
    font-size: 15px;
    font-weight: 700;
    font-family: 'Inter', 'SF Mono', monospace;
    margin-bottom: 12px;
    letter-spacing: 0.5px;
}
.live-timer.active {
    display: flex !important;
}
.live-timer.done {
    background: rgba(16,185,129,.08) !important;
    border-color: rgba(16,185,129,.3) !important;
    color: #6ee7b7 !important;
}
.live-timer.error {
    background: rgba(239,68,68,.08) !important;
    border-color: rgba(239,68,68,.3) !important;
    color: #fca5a5 !important;
}
.pulse-dot {
    width: 10px;
    height: 10px;
    border-radius: 50%;
    background: #818cf8;
    animation: pulse-blink 1s ease-in-out infinite;
    flex-shrink: 0;
}
.live-timer.done .pulse-dot { display: none; }
.live-timer.error .pulse-dot { display: none; }
@keyframes pulse-blink {
    0%, 100% { opacity: 1; transform: scale(1); }
    50% { opacity: 0.3; transform: scale(0.7); }
}
.timer-clock {
    font-variant-numeric: tabular-nums;
    min-width: 52px;
    text-align: center;
}

/* Responsive */
@media (max-width: 640px) {
    .howto { flex-direction: column; gap: 8px; }
    .features { gap: 4px; }
    .header-wrap h1 { font-size: 26px !important; }
}
"""

UPLOAD_PROGRESS_JS = """
<style>
#upload-bar-wrap{display:none;position:fixed;top:0;left:0;right:0;z-index:99999;height:5px;background:#222228}
#upload-bar{height:100%;width:0%;background:linear-gradient(90deg,#6366f1,#a78bfa);transition:width .2s;border-radius:0 3px 3px 0}
#upload-pct{display:none;position:fixed;top:12px;right:16px;z-index:99999;background:#1a1a1f;border:1px solid #6366f1;
color:#c7c7ff;padding:7px 16px;border-radius:10px;font-size:13px;font-weight:700;font-family:Inter,sans-serif;
box-shadow:0 4px 20px rgba(99,102,241,.3)}
</style>
<script>
(function(){
var barW=document.createElement('div');barW.id='upload-bar-wrap';
barW.innerHTML='<div id="upload-bar"></div>';document.body.appendChild(barW);
var pctEl=document.createElement('div');pctEl.id='upload-pct';document.body.appendChild(pctEl);

function show(p){
barW.style.display='block';pctEl.style.display='block';
document.getElementById('upload-bar').style.width=p+'%';
pctEl.textContent='\\u{1F4E4} Uploading... '+p+'%';
}
function hide(){
show(100);
setTimeout(function(){
barW.style.display='none';pctEl.style.display='none';
document.getElementById('upload-bar').style.width='0%';
},800);
}

var _fetch=window.fetch;
window.fetch=function(input,init){
var url=typeof input==='string'?input:(input&&input.url?input.url:'');
if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){
return new Promise(function(resolve,reject){
var xhr=new XMLHttpRequest();
xhr.open('POST',url,true);
xhr.responseType='text';
if(init.headers){
try{
var h=init.headers instanceof Headers?init.headers:new Headers(init.headers);
h.forEach(function(v,k){
if(k.toLowerCase()!=='content-type')xhr.setRequestHeader(k,v);
});
}catch(e){}
}
xhr.upload.onprogress=function(e){
if(e.lengthComputable)show(Math.round(e.loaded/e.total*100));
};
xhr.onload=function(){
hide();
var headers=new Headers();
try{
xhr.getAllResponseHeaders().trim().split('\\r\\n').forEach(function(line){
var i=line.indexOf(':');
if(i>0)headers.append(line.slice(0,i).trim(),line.slice(i+1).trim());
});
}catch(e){}
resolve(new Response(xhr.responseText,{status:xhr.status,statusText:xhr.statusText,headers:headers}));
};
xhr.onerror=function(){hide();reject(new TypeError('Network request failed'));};
xhr.onabort=function(){hide();reject(new DOMException('Aborted','AbortError'));};
xhr.send(init.body);
});
}
return _fetch.apply(this,arguments);
};
})();

/* ===== Live Timer ===== */
window._timerInterval=null;
window._timerStart=0;
window._timerHideTimeout=null;
window.startTranscribeTimer=function(){
var el=document.getElementById('live-timer');
if(!el)return;
/* Clear previous timer & auto-hide timeout */
if(window._timerInterval){clearInterval(window._timerInterval);window._timerInterval=null;}
if(window._timerHideTimeout){clearTimeout(window._timerHideTimeout);window._timerHideTimeout=null;}
window._timerStart=Date.now();
el.className='live-timer active';
el.innerHTML='<span class="pulse-dot"></span><span>Processing...</span><span class="timer-clock">00:00</span>';
window._timerInterval=setInterval(function(){
var sec=Math.floor((Date.now()-window._timerStart)/1000);
var m=Math.floor(sec/60);var s=sec%60;
var clock=el.querySelector('.timer-clock');
if(clock)clock.textContent=String(m).padStart(2,'0')+':'+String(s).padStart(2,'0');
},1000);
};
window.stopTranscribeTimer=function(ok){
if(!window._timerInterval)return; /* Already stopped - prevent double-stop */
clearInterval(window._timerInterval);
window._timerInterval=null; /* Null it so MutationObserver won't re-trigger */
var el=document.getElementById('live-timer');
if(!el)return;
var sec=Math.floor((Date.now()-window._timerStart)/1000);
var m=Math.floor(sec/60);var s=sec%60;
var t=String(m).padStart(2,'0')+':'+String(s).padStart(2,'0');
if(ok!==false){
el.className='live-timer active done';
el.innerHTML='\\u2705 Completed in <strong>'+t+'</strong>';
}else{
el.className='live-timer active error';
el.innerHTML='\\u274C Error after <strong>'+t+'</strong>';
}
window._timerHideTimeout=setTimeout(function(){
el.className='live-timer';
window._timerHideTimeout=null;
},60000);
};

/* Auto-start timer when EXPLICIT progress() text appears (contains ⏳).
   Gradio StatusTracker (.eta-bar, .progress-level) appears on ALL fn calls,
   but our ⏳ marker only appears when progress(0.05, "⏳ Waiting for GPU...") is called,
   which happens AFTER the audio_file validation passes.
   - No file -> gr.Error() before progress() -> no ⏳ -> timer never starts
   - File OK -> progress(0.05, "⏳...") -> ⏳ detected -> timer starts
   Auto-stop on error toast. */
new MutationObserver(function(muts){
muts.forEach(function(m){
if(m.type==='childList'){
m.addedNodes.forEach(function(n){
/* Element node: check text for ⏳ marker */
if(n.nodeType===1){
if(!window._timerInterval&&n.textContent&&n.textContent.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
/* Detect error toast -> stop timer */
var isToast=n.classList&&(n.classList.contains('toast-wrap')||n.classList.contains('error'));
var hasError=n.querySelector&&n.querySelector('.error,.toast-body');
if((isToast||hasError)&&window._timerInterval){
window.stopTranscribeTimer(false);
}
}
/* Text node with ⏳ */
if(n.nodeType===3&&!window._timerInterval&&n.nodeValue&&n.nodeValue.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
});
}
/* Text content change containing ⏳ (progress update on existing node) */
if(m.type==='characterData'&&!window._timerInterval&&m.target.nodeValue&&m.target.nodeValue.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
});
}).observe(document.body,{childList:true,subtree:true,characterData:true});
</script>
"""

with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS, head=UPLOAD_PROGRESS_JS) as demo:

    # ---- Header ----
    gr.HTML("""
    <div class="header-wrap">
        <h1>TranscribeAI</h1>
        <p>Audio Transcription with Speaker Diarization &mdash; Free & Fast</p>
        <div class="badge-gpu">ZeroGPU H200 &bull; Whisper &bull; No API Key</div>
        <div class="features">
            <span class="feat-tag">99+ Languages</span>
            <span class="feat-tag">Speaker ID</span>
            <span class="feat-tag">SRT / TXT / DOCX</span>
            <span class="feat-tag">GPU Accelerated</span>
            <span class="feat-tag">Auto Language Detection</span>
        </div>
        <div class="howto">
            <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
            <div class="howto-step"><div class="howto-num">2</div> Click Start</div>
            <div class="howto-step"><div class="howto-num">3</div> Download results</div>
        </div>
    </div>
    """)

    # ---- Upload ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">🎵 Upload Audio</div>')
        audio_input = gr.Audio(
            label="Drag & drop audio/video file, or click to browse. You can also record directly.",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-upload",
        )
        gr.HTML('<div style="font-size:11px;color:#6a6a7a;margin-top:6px;">Formats: MP3, MP4, WAV, M4A, OGG, FLAC, WEBM &bull; Max ~1 hour audio</div>')

    # ---- Settings ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">⚙️ Settings</div>')
        gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) &mdash; auto-loaded, ready to use</div>')
        with gr.Row():
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_MAP.keys()),
                value="Auto-detect",
                label="Language",
                info="Auto-detect or select a specific language",
                scale=2,
            )
            speaker_count = gr.Slider(
                minimum=0, maximum=10, step=1, value=0,
                label="Number of Speakers",
                info="0 = auto-detect",
                scale=1,
            )
        with gr.Row(elem_classes="toggle-row"):
            enable_diarization = gr.Checkbox(
                value=True,
                label="Speaker Diarization",
                info="Identify who is speaking"
            )
            enable_vad = gr.Checkbox(
                value=True,
                label="VAD Filter",
                info="Skip silent parts for cleaner results"
            )

    # ---- Start Button ----
    btn_start = gr.Button(
        "🚀 Start Transcription",
        variant="primary",
        size="lg",
        elem_classes="btn-start",
    )

    # ---- Live Timer ----
    gr.HTML('<div id="live-timer" class="live-timer"></div>')

    # ---- Results ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📊 Transcription Results</div>')
        summary_output = gr.Markdown(
            elem_classes="summary-box",
            value="*Upload audio and click 'Start Transcription' to begin.*"
        )
        transcript_output = gr.Textbox(
            label="Transcript Text",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=False,
            elem_classes="transcript-box",
            placeholder="Transcription results with timestamps and speaker labels will appear here...\n\n[00:00] Speaker 1: example transcription text...",
        )

    # ---- Downloads ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📥 Download Files</div>')
        gr.HTML('<div style="font-size:12px;color:#6a6a7a;margin-bottom:8px;">Files are automatically deleted after 1 hour.</div>')
        with gr.Row(elem_classes="download-row"):
            srt_file = gr.File(label="SRT - Subtitles for video players")
            txt_file = gr.File(label="TXT - Text with speaker labels")
            docx_file = gr.File(label="DOCX - Colored Word document")

    # ---- Connect ----
    # Timer is started by MutationObserver when Gradio progress() appears in DOM.
    # This ensures timer ONLY starts after validation passes (no file -> no progress).
    # Timer success-stop via .then(); error-stop via MutationObserver on error toast.
    btn_start.click(
        fn=transcribe_full,
        inputs=[audio_input, language_choice, speaker_count,
                enable_diarization, enable_vad],
        outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
    ).then(
        fn=lambda: None,
        inputs=None,
        outputs=None,
        js="() => { window.stopTranscribeTimer(true); }",
    )

    # ---- Footer ----
    gr.HTML("""
    <div class="footer-text">
        <strong>TranscribeAI</strong> by <a href="https://huggingface.co/romizone">romizone</a>
        &bull; <a href="https://github.com/romizone/transcribeAI">GitHub</a>
        &bull; ZeroGPU H200 &bull; Whisper + PyTorch
    </div>
    """)

demo.queue().launch(ssr_mode=False)