""" TranscribeAI - Transcription with Speaker Diarization (ZeroGPU) ================================================================ Engine : openai/whisper via transformers pipeline (CUDA ZeroGPU H200) Speaker : MFCC + Agglomerative Clustering Language: Indonesian, English, Auto-detect (99 languages) Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM Output : SRT, TXT, DOCX """ import time import tempfile import threading import torch import spaces import gradio as gr import numpy as np from datetime import datetime from pathlib import Path from transformers import pipeline # ============================================================ # Config — Single model (small) for fastest startup & simplicity # ============================================================ MODEL_ID = 'openai/whisper-small' MODEL_NAME = 'small' LANGUAGE_MAP = { 'Auto-detect': None, 'Indonesian': 'id', 'English': 'en', 'Japanese': 'ja', 'Korean': 'ko', 'Chinese': 'zh', 'Arabic': 'ar', 'French': 'fr', 'German': 'de', 'Spanish': 'es', 'Portuguese': 'pt', 'Russian': 'ru', 'Thai': 'th', 'Vietnamese': 'vi', 'Malay': 'ms', 'Hindi': 'hi', 'Turkish': 'tr', 'Dutch': 'nl', 'Italian': 'it', } BATCH_SIZE = 16 # A10G 24GB VRAM — safe for whisper-small float16 OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output' OUTPUT_DIR.mkdir(exist_ok=True) # ============================================================ # Load pipeline at MODULE LEVEL (ZeroGPU requirement!) 
# Single model = faster startup, no on-demand loading delay
# ============================================================
device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    chunk_length_s=30,
    device=device,
    torch_dtype=torch_dtype,
)
print(f" {MODEL_NAME} ready!")


# ============================================================
# Helpers
# ============================================================
def fmt_timestamp(seconds):
    """Format a float second count as an SRT timestamp: HH:MM:SS,mmm."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def fmt_time(seconds):
    """Format seconds for display: 'HH:MM:SS' when >= 1 hour, else 'MM:SS'."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    if h > 0:
        return f"{h:02d}:{m:02d}:{s:02d}"
    return f"{m:02d}:{s:02d}"


# ============================================================
# Speaker Diarization (MFCC + Clustering) — CPU
# ============================================================
def perform_diarization(audio_path, segments, num_speakers):
    """Assign a speaker label to every transcript segment.

    Extracts per-segment features (13 MFCCs + deltas, mean/std pooled, plus
    YIN F0 mean/std) and clusters them with cosine agglomerative clustering.
    When num_speakers <= 0, the cluster count is auto-selected (2..6) by
    silhouette score.

    Mutates each dict in `segments` in place, adding:
      - 'speaker'    : display label 'Speaker N' (1-based, in order of appearance)
      - 'speaker_id' : 0-based index used for e.g. DOCX colors
    Returns the same `segments` list. Falls back to a single speaker when
    there are fewer than 2 usable segments.
    """
    import librosa
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.preprocessing import StandardScaler

    # Trivial case: nothing to cluster — everyone is Speaker 1.
    if not segments or len(segments) < 2:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0
        return segments

    y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
    features = []
    valid_indices = []  # indices into `segments` that yielded a feature vector
    for i, seg in enumerate(segments):
        s0 = int(seg['start'] * sr)
        s1 = min(int(seg['end'] * sr), len(y))
        if s1 <= s0 or s0 >= len(y):
            continue
        chunk = y[s0:s1]
        # Segments shorter than 0.3 s carry too little voice to featurize.
        if len(chunk) < int(sr * 0.3):
            continue
        try:
            # Cap analysis to 3s per segment for speed
            max_samples = int(sr * 3)
            analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk
            # MFCC (13 = industry standard) + delta — sufficient for speaker ID
            mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
            delta = librosa.feature.delta(mfcc)
            # F0 (pitch) — key differentiator between speakers
            f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
            f0c = f0[f0 > 0]
            f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
            f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
            combined = np.vstack([mfcc, delta])
            # Pool frame-level features into one fixed-length vector per segment.
            vec = np.concatenate([
                np.mean(combined, axis=1),
                np.std(combined, axis=1),
                [f0_mean, f0_std]
            ])
            features.append(vec)
            valid_indices.append(i)
        except Exception:
            # Best-effort: skip segments whose feature extraction fails.
            continue

    if len(features) < 2:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0
        return segments

    X = np.array(features)
    X_scaled = StandardScaler().fit_transform(X)

    if num_speakers <= 0:
        # Auto-detect cluster count via silhouette score over 2..6 speakers.
        from sklearn.metrics import silhouette_score
        best_score, best_n = -1, 2
        max_n = min(6, len(X_scaled) - 1)
        for n in range(2, max_n + 1):
            try:
                lbls = AgglomerativeClustering(
                    n_clusters=n, metric='cosine', linkage='average'
                ).fit_predict(X_scaled)
                score = silhouette_score(X_scaled, lbls, metric='cosine')
                if score > best_score:
                    best_score, best_n = score, n
            except Exception:
                pass
        num_speakers = best_n
    else:
        # Can't have more clusters than samples.
        num_speakers = min(num_speakers, len(X_scaled))

    if num_speakers >= 2 and len(X_scaled) >= num_speakers:
        labels = AgglomerativeClustering(
            n_clusters=num_speakers, metric='cosine', linkage='average'
        ).fit_predict(X_scaled)
    else:
        labels = np.zeros(len(X_scaled), dtype=int)

    # Renumber raw cluster labels to 1, 2, ... in order of first appearance,
    # so 'Speaker 1' is always the first voice heard.
    label_map = {}
    for lbl in labels:
        if lbl not in label_map:
            label_map[lbl] = len(label_map) + 1
    assigns = {}
    for idx, seg_idx in enumerate(valid_indices):
        assigns[seg_idx] = label_map[labels[idx]]

    for i, seg in enumerate(segments):
        if i in assigns:
            seg['speaker'] = f'Speaker {assigns[i]}'
            seg['speaker_id'] = assigns[i] - 1
        else:
            # Segment had no features — inherit from the nearest clustered segment.
            nearest = min(valid_indices, key=lambda x: abs(x - i)) if valid_indices else 0
            seg['speaker'] = f'Speaker {assigns.get(nearest, 1)}'
            seg['speaker_id'] = assigns.get(nearest, 1) - 1
    return segments


def merge_consecutive(segments):
    """Merge adjacent segments that share the same speaker into one segment.

    The merged segment keeps the first start time, extends the end time, and
    joins texts with a space. Input dicts are copied, not mutated.
    """
    if not segments:
        return segments
    merged = [segments[0].copy()]
    for seg in segments[1:]:
        if seg.get('speaker') == merged[-1].get('speaker'):
            merged[-1]['end'] = seg['end']
            merged[-1]['text'] += ' ' + seg['text']
        else:
            merged.append(seg.copy())
    return merged


# ============================================================
# Export Functions
# ============================================================
def generate_srt(segments, path):
    """Write segments to `path` as an SRT subtitle file (UTF-8).

    Each cue is prefixed with '[Speaker N] ' when a speaker label is present.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for i, seg in enumerate(segments, 1):
            f.write(f"{i}\n")
            f.write(f"{fmt_timestamp(seg['start'])} --> {fmt_timestamp(seg['end'])}\n")
            sp = seg.get('speaker', '')
            f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")


# Whisper language code -> human-readable name for the export headers.
LANG_NAMES = {
    'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
    'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
    'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
    'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
    'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
}


def generate_txt(segments, path, filename='', language='', duration=0):
    """Write a plain-text transcript with a metadata header to `path`.

    Speaker turns are grouped: a '[MM:SS] Speaker N:' line is emitted only
    when the speaker changes, followed by that speaker's text lines.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
        if filename:
            # FIX: was a placeholder-free f-string writing the literal
            # "(unknown)" — the filename parameter was never used.
            f.write(f"File: {filename}\n")
        f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
        f.write(f"Duration: {fmt_time(duration)}\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        speakers = sorted(set(s.get('speaker', '') for s in segments))
        f.write(f"Speakers: {', '.join(speakers)}\n")
        f.write("=" * 60 + "\n\n")
        cur_speaker = None
        for seg in segments:
            sp = seg.get('speaker', '')
            if sp != cur_speaker:
                cur_speaker = sp
                f.write(f"\n[{fmt_time(seg['start'])}] {sp}:\n")
            f.write(f"{seg['text']}\n")


def generate_docx(segments, path, filename='', language='', duration=0):
    """Write a styled Word document transcript to `path`.

    Metadata header, then one paragraph per segment with a gray timestamp,
    a bold speaker name colored by speaker_id (6-color palette, cycling
    falls back to indigo), and the segment text.
    """
    from docx import Document
    from docx.shared import Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    # Per-speaker accent colors, keyed by 0-based speaker_id.
    colors = {
        0: RGBColor(79, 70, 229),
        1: RGBColor(220, 38, 38),
        2: RGBColor(5, 150, 105),
        3: RGBColor(217, 119, 6),
        4: RGBColor(124, 58, 237),
        5: RGBColor(219, 39, 119),
    }
    doc = Document()
    style = doc.styles['Normal']
    style.font.name = 'Calibri'
    style.font.size = Pt(11)
    title = doc.add_heading('Transcript', level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    meta = []
    if filename:
        meta.append(('File', filename))
    meta.append(('Language', LANG_NAMES.get(language, language)))
    meta.append(('Duration', fmt_time(duration)))
    meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
    meta.append(('Speakers', ', '.join(speakers)))
    for label, val in meta:
        p = doc.add_paragraph()
        r = p.add_run(f'{label}: ')
        r.bold = True
        r.font.size = Pt(10)
        r.font.color.rgb = RGBColor(100, 100, 100)
        r = p.add_run(val)
        r.font.size = Pt(10)
        p.paragraph_format.space_after = Pt(2)
    doc.add_paragraph('_' * 70)

    for seg in segments:
        p = doc.add_paragraph()
        r = p.add_run(f'[{fmt_time(seg["start"])}] ')
        r.font.size = Pt(9)
        r.font.color.rgb = RGBColor(150, 150, 150)
        sp_id = seg.get('speaker_id', 0)
        sp = seg.get('speaker', 'Speaker 1')
        color = colors.get(sp_id, RGBColor(79, 70, 229))
        r = p.add_run(f'{sp}: ')
        r.bold = True
        r.font.size = Pt(11)
        r.font.color.rgb = color
        r = p.add_run(seg['text'])
        r.font.size = Pt(11)
        p.paragraph_format.space_after = Pt(6)
    doc.save(path)


# ============================================================
# GPU Transcription (ZeroGPU — proven pattern)
# ============================================================
@spaces.GPU(duration=120)
def transcribe_with_gpu(audio_path, language):
    """Run Whisper inference on GPU. Single model, always ready.

    Returns (segments, detected_lang, duration):
      segments      : list of {'start', 'end', 'text'} dicts (times rounded to 2dp)
      detected_lang : the forced language code, or 'auto' when none was given
                      (the pipeline result is not inspected for the detected code)
      duration      : max end timestamp seen, in seconds
    """
    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language
    result = pipe(
        str(audio_path),
        batch_size=BATCH_SIZE,
        return_timestamps=True,
        generate_kwargs=generate_kwargs,
    )
    # Parse timestamped chunks into segment dicts.
    raw_segments = []
    duration = 0.0
    chunks = result.get("chunks", [])
    if chunks:
        for chunk in chunks:
            text = chunk.get("text", "").strip()
            ts = chunk.get("timestamp", (0, 0))
            # End timestamp can be None on the final chunk — synthesize one.
            start = ts[0] if ts[0] is not None else 0
            end = ts[1] if ts[1] is not None else start + 1
            if end > duration:
                duration = end
            if text:
                raw_segments.append({
                    'start': round(start, 2),
                    'end': round(end, 2),
                    'text': text,
                })
    else:
        # No chunk-level timestamps: fall back to one whole-file segment.
        full_text = result.get("text", "").strip()
        if full_text:
            raw_segments.append({'start': 0, 'end': 1, 'text': full_text})
    detected_lang = language or "auto"
    return raw_segments, detected_lang, duration


def apply_vad_filter(segments):
    """Filter out segments that are likely silence/noise (very short + filler).

    Drops segments whose stripped text is a known filler token, and segments
    shorter than 0.3s containing at most one word. Returns the original list
    unchanged if filtering would remove everything.
    """
    FILLER = {'', '.', '..', '...', '…', '-', '–', '[Music]', '[music]',
              '(music)', '[Musik]', '[musik]', '♪', '♪♪', '♫'}
    MIN_DURATION = 0.3  # segments shorter than 0.3s are likely noise
    filtered = []
    for seg in segments:
        text = seg['text'].strip()
        seg_dur = seg['end'] - seg['start']
        if text in FILLER:
            continue
        if seg_dur < MIN_DURATION and len(text.split()) <= 1:
            continue
        filtered.append(seg)
    return filtered if filtered else segments  # fallback: return original if all filtered


# ============================================================
# Full Pipeline (wired to Gradio)
# ============================================================
def transcribe_full(audio_file, language_name, num_speakers, enable_diarization,
                    enable_vad, progress=gr.Progress()):
    """End-to-end handler for the 'Mulai Transkripsi' button.

    Steps: GPU transcription -> optional VAD filter -> optional diarization
    (with 1-speaker fallback on failure) -> SRT/TXT/DOCX export -> summary.
    Returns (summary_markdown, transcript_text, srt_path, txt_path, docx_path).
    Raises gr.Error for missing input, transcription failure, or empty result.
    """
    if audio_file is None:
        raise gr.Error("Upload file audio terlebih dahulu!")
    audio_path = audio_file
    filename = Path(audio_path).name
    lang_code = LANGUAGE_MAP.get(language_name, None)
    num_speakers = int(num_speakers)  # Gradio slider returns float
    t0 = time.time()  # Start timing from here — matches JS timer
    progress(0.05, desc="⏳ Menunggu GPU & memproses audio... (bisa 30-90 detik)")

    # 1. Transcribe on GPU
    try:
        segments, detected_lang, duration = transcribe_with_gpu(
            audio_path, lang_code
        )
    except Exception as e:
        raise gr.Error(f"Gagal transkripsi: {str(e)}")
    if not segments:
        raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")

    # 1b. VAD filter — remove silence/filler segments
    if enable_vad:
        segments = apply_vad_filter(segments)
    transcribe_time = time.time() - t0
    progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s) — {len(segments)} segmen")

    # 2. Speaker Diarization (CPU)
    diarization_note = ""
    if enable_diarization and len(segments) >= 2:
        progress(0.65, desc="🔍 Mengidentifikasi pembicara...")
        try:
            segments = perform_diarization(audio_path, segments, num_speakers)
            segments = merge_consecutive(segments)
        except Exception as e:
            # Best-effort: diarization failure degrades to a single speaker.
            print(f" [Diarization] Error: {e}")
            diarization_note = " ⚠️ (diarization gagal, fallback 1 speaker)"
            for seg in segments:
                seg['speaker'] = 'Speaker 1'
                seg['speaker_id'] = 0
    else:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0

    progress(0.85, desc="📄 Membuat file output...")

    # 3. Export
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    base_name = Path(filename).stem
    srt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.srt")
    txt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.txt")
    docx_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.docx")
    generate_srt(segments, srt_path)
    generate_txt(segments, txt_path, filename, detected_lang, duration)
    generate_docx(segments, docx_path, filename, detected_lang, duration)
    progress(0.95, desc="📦 Menyiapkan hasil...")

    # Build display text
    transcript_lines = []
    speakers_found = set()
    for seg in segments:
        sp = seg.get('speaker', 'Speaker 1')
        speakers_found.add(sp)
        transcript_lines.append(f"[{fmt_time(seg['start'])}] {sp}: {seg['text']}")
    transcript_text = "\n\n".join(transcript_lines)

    total_time = time.time() - t0
    lang_display = detected_lang.upper() if detected_lang else 'AUTO'
    summary = (
        f"**Transkripsi Selesai!**\n\n"
        f"| Info | Detail |\n"
        f"|------|--------|\n"
        # FIX: this row previously showed the literal "(unknown)" instead of
        # the uploaded file's name.
        f"| File | {filename} |\n"
        f"| Durasi Audio | {fmt_time(duration)} |\n"
        f"| Bahasa | {lang_display} |\n"
        f"| Model | {MODEL_NAME} (244M) |\n"
        f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}){diarization_note} |\n"
        f"| Segmen | {len(segments)} |\n"
        f"| Waktu Proses | {total_time:.0f} detik |\n"
        f"| Engine | Whisper + ZeroGPU H200 |"
    )
    progress(1.0, desc="🎉 Selesai!")
    return summary, transcript_text, srt_path, txt_path, docx_path


# ============================================================
# Cleanup old files (>1 hour)
# ============================================================
def cleanup_loop():
    """Background daemon: every 5 minutes, delete output files older than 1h."""
    while True:
        try:
            now = time.time()
            if OUTPUT_DIR.exists():
                for f in OUTPUT_DIR.iterdir():
                    if f.is_file() and (now - f.stat().st_mtime) > 3600:
                        f.unlink(missing_ok=True)
                        print(f" [Cleanup] Deleted: {f.name}")
        except Exception as e:
            print(f" [Cleanup] Error: {e}")
        time.sleep(300)


threading.Thread(target=cleanup_loop, daemon=True).start()

# ============================================================
# Gradio UI
# ============================================================

# Dark theme built on gr.themes.Base with indigo/purple accents.
THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
    font=gr.themes.GoogleFont("Inter"),
).set(
    body_background_fill="#0f0f11",
    body_background_fill_dark="#0f0f11",
    block_background_fill="#1a1a1f",
    block_background_fill_dark="#1a1a1f",
    block_border_color="#333340",
    block_border_color_dark="#333340",
    block_label_text_color="#a0a0b0",
    block_title_text_color="#e8e8ed",
    body_text_color="#e8e8ed",
    body_text_color_dark="#e8e8ed",
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_dark="#6366f1",
    button_primary_text_color="#ffffff",
    input_background_fill="#222228",
    input_background_fill_dark="#222228",
    input_border_color="#333340",
    input_border_color_dark="#333340",
)

# Custom CSS injected into the Blocks app (header, cards, buttons,
# dropdown/upload-progress visibility fixes, live-timer styles).
CUSTOM_CSS = """
/* Global */
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
footer { display: none !important; }
/* Header */
.header-wrap { text-align: center; padding: 32px 0 20px; }
.header-wrap h1 { font-size: 32px !important; font-weight: 800 !important; background: linear-gradient(135deg, #818cf8, #8b5cf6) !important; -webkit-background-clip: text !important; -webkit-text-fill-color: transparent !important; background-clip: text !important; letter-spacing: -0.5px; margin-bottom: 6px !important; }
.header-wrap p { color: #a0a0b0 !important; font-size: 14px !important; }
.badge-gpu { display: inline-flex; align-items: center; gap: 6px; background: rgba(99,102,241,.12); color: #818cf8; font-size: 12px; padding: 4px 14px; border-radius: 20px; font-weight: 600; margin-top: 8px; }
.badge-gpu::before { content: ''; width: 7px; height: 7px; background: #10b981; border-radius: 50%; display: inline-block; }
/* Cards */
.card-section { background: #1a1a1f !important; border: 1px solid #333340 !important; border-radius: 14px !important; padding: 20px 24px !important; margin-bottom: 12px !important; }
.card-title { font-size: 14px !important; font-weight: 700 !important; color: #e8e8ed !important; margin-bottom: 12px !important; display: flex; align-items: center; gap: 8px; }
/* Primary button */
.btn-start { background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; border: none !important; border-radius: 12px !important; font-size: 16px !important; font-weight: 700 !important; padding: 14px 32px !important; transition: all 0.2s !important; box-shadow: 0 4px 15px rgba(99,102,241,.3) !important; }
.btn-start:hover { transform: translateY(-1px) !important; box-shadow: 0 6px 20px rgba(99,102,241,.4) !important; }
/* Settings grid */
.settings-row { gap: 8px !important; }
/* Transcript output */
.transcript-box textarea { font-family: 'Inter', 'SF Mono', monospace !important; font-size: 13px !important; line-height: 1.7 !important; background: #16161a !important; border-radius: 10px !important; }
/* Download cards — labels (dark bg) */
.download-row label span, .download-row .label-wrap span { color: #e8e8ed !important; font-weight: 700 !important; }
/* Download cards — file items (white bg → black bold text) */
.download-row .file-preview, .download-row .download-file, .download-row .file-component { border-radius: 10px !important; }
.download-row .file-preview *, .download-row .download-file *, .download-row .file-component *, .download-row a, .download-row .file-name, .download-row .file-size { color: #111 !important; font-weight: 700 !important; }
/* Result summary */
.summary-box { background: #1a1a1f !important; border: 1px solid #2a2a35 !important; border-radius: 12px !important; padding: 16px !important; }
.summary-box table { width: 100% !important; }
.summary-box td, .summary-box th { padding: 6px 12px !important; font-size: 13px !important; border-bottom: 1px solid #222230 !important; }
/* Toggle checkboxes */
.toggle-row { gap: 24px !important; }
/* Audio upload area */
.audio-upload { border: 2px dashed #333340 !important; border-radius: 14px !important; transition: all 0.2s !important; }
.audio-upload:hover { border-color: #6366f1 !important; }
/* How-to steps */
.howto { display: flex; gap: 16px; margin: 12px 0 4px; flex-wrap: wrap; }
.howto-step { display: flex; align-items: center; gap: 8px; font-size: 13px; color: #a0a0b0; }
.howto-num { width: 24px; height: 24px; border-radius: 50%; background: linear-gradient(135deg, #6366f1, #8b5cf6); color: #fff; font-size: 12px; font-weight: 700; display: flex; align-items: center; justify-content: center; flex-shrink: 0; }
/* Feature tags */
.features { display: flex; gap: 8px; flex-wrap: wrap; justify-content: center; margin-top: 12px; }
.feat-tag { font-size: 11px; padding: 4px 10px; border-radius: 6px; background: #1a1a1f; border: 1px solid #333340; color: #a0a0b0; }
/* Footer */
.footer-text { text-align: center; padding: 20px 0 8px; color: #6a6a7a; font-size: 12px; }
.footer-text a { color: #818cf8; text-decoration: none; }
/* ===== FIX: Dropdown text visibility ===== */
/* Selected value text */
.gr-dropdown .wrap .wrap-inner .secondary-wrap, .gr-dropdown .wrap .wrap-inner .secondary-wrap span, .gr-dropdown .wrap .wrap-inner input, .gr-dropdown input, .dropdown .wrap span, .dropdown input[type="text"], div[data-testid="dropdown"] span, div[data-testid="dropdown"] input { color: #e8e8ed !important; }
/* Dropdown options list */
.gr-dropdown ul[role="listbox"], .gr-dropdown .options, .dropdown ul, .dropdown li, ul[role="listbox"], li[role="option"], div[role="option"] { color: #e8e8ed !important; background-color: #1a1a1f !important; }
li[role="option"]:hover, div[role="option"]:hover, li[role="option"].selected, li[role="option"][aria-selected="true"] { background-color: rgba(99,102,241,.2) !important; color: #c7c7ff !important; }
/* Dropdown container border */
.gr-dropdown .wrap, .dropdown .wrap { background: #222228 !important; border-color: #333340 !important; }
/* Dropdown info text */
.gr-dropdown .info-text, .dropdown .info-text, span[data-testid="info-text"] { color: #8888a0 !important; }
/* ===== FIX: Upload progress visibility ===== */
/* Gradio upload progress bar */
.upload-container .progress-bar, .uploading .progress-bar, .file-upload .progress-bar { background: #333340 !important; border-radius: 6px !important; overflow: hidden !important; }
.upload-container .progress-bar .progress, .uploading .progress-bar .progress, .file-upload .progress-bar .progress { background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; }
/* Upload progress text */
.upload-container .progress-text, .uploading .progress-text, .file-upload-text, .upload-text, .eta-bar { color: #e8e8ed !important; font-weight: 600 !important; }
/* Gradio's built-in ETA bar */
.eta-bar { background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; opacity: 0.3 !important; }
/* Progress level / status text */
.progress-level, .progress-level span, .progress-level .progress-level-inner { color: #e8e8ed !important; font-size: 13px !important; }
/* Upload button area */
.upload-button, .upload-button span { color: #e8e8ed !important; border-color: #6366f1 !important; }
/* Audio component loading state */
.audio-upload .uploading, .audio-upload .loading { color: #e8e8ed !important; }
/* Spinner / loading indicator */
.audio-upload .loading svg, .audio-upload .spinner { color: #818cf8 !important; }
/* ===== Live Timer ===== */
.live-timer { display: none; align-items: center; justify-content: center; gap: 10px; background: rgba(99,102,241,.08); border: 1px solid rgba(99,102,241,.3); color: #c7c7ff; padding: 12px 24px; border-radius: 12px; font-size: 15px; font-weight: 700; font-family: 'Inter', 'SF Mono', monospace; margin-bottom: 12px; letter-spacing: 0.5px; }
.live-timer.active { display: flex !important; }
.live-timer.done { background: rgba(16,185,129,.08) !important; border-color: rgba(16,185,129,.3) !important; color: #6ee7b7 !important; }
.live-timer.error { background: rgba(239,68,68,.08) !important; border-color: rgba(239,68,68,.3) !important; color: #fca5a5 !important; }
.pulse-dot { width: 10px; height: 10px; border-radius: 50%; background: #818cf8; animation: pulse-blink 1s ease-in-out infinite; flex-shrink: 0; }
.live-timer.done .pulse-dot { display: none; }
.live-timer.error .pulse-dot { display: none; }
@keyframes pulse-blink { 0%, 100% { opacity: 1; transform: scale(1); } 50% { opacity: 0.3; transform: scale(0.7); } }
.timer-clock { font-variant-numeric: tabular-nums; min-width: 52px; text-align: center; }
/* Responsive */
@media (max-width: 640px) { .howto { flex-direction: column; gap: 8px; } .features { gap: 4px; } .header-wrap h1 { font-size: 26px !important; } }
"""

# NOTE(review): this string appears empty in this copy of the file, yet the
# click handler below calls window.stopTranscribeTimer and the comments
# describe a MutationObserver-driven timer — the <script> body that defines
# them seems to have been stripped. Confirm against the original source.
UPLOAD_PROGRESS_JS = """ """

# NOTE(review): the HTML markup inside the gr.HTML(...) literals below appears
# to have been stripped in this copy (only text content remains). The visible
# text is preserved as-is; confirm the tag structure against the original.
with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS, head=UPLOAD_PROGRESS_JS) as demo:
    # ---- Header ----
    gr.HTML("""
    TranscribeAI
    Transkripsi Audio dengan Speaker Diarization — Gratis & Cepat
    ZeroGPU H200 • Whisper • Tanpa API Key
    99+ Bahasa Speaker ID SRT / TXT / DOCX GPU Accelerated Auto-detect Bahasa
    1
    Upload audio
    2
    Klik Mulai
    3
    Download hasil
    """)

    # ---- Upload ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('🎵 Upload Audio')
        audio_input = gr.Audio(
            label="Drag & drop file audio/video, atau klik untuk pilih file. Bisa juga rekam langsung.",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-upload",
        )
        gr.HTML('Format: MP3, MP4, WAV, M4A, OGG, FLAC, WEBM • Maks ~1 jam audio')

    # ---- Settings ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('⚙️ Pengaturan')
        gr.HTML('Model: Whisper Small (244M) — auto-loaded, siap pakai')
        with gr.Row():
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_MAP.keys()),
                value="Auto-detect",
                label="Bahasa",
                info="Auto-detect atau pilih bahasa spesifik",
                scale=2,
            )
            speaker_count = gr.Slider(
                minimum=0,
                maximum=10,
                step=1,
                value=0,
                label="Jumlah Pembicara",
                info="0 = auto-detect",
                scale=1,
            )
        with gr.Row(elem_classes="toggle-row"):
            enable_diarization = gr.Checkbox(
                value=True,
                label="Speaker Diarization",
                info="Identifikasi siapa yang berbicara"
            )
            enable_vad = gr.Checkbox(
                value=True,
                label="VAD Filter",
                info="Lewati bagian hening untuk hasil lebih bersih"
            )

    # ---- Start Button ----
    btn_start = gr.Button(
        "🚀 Mulai Transkripsi",
        variant="primary",
        size="lg",
        elem_classes="btn-start",
    )

    # ---- Live Timer ----
    gr.HTML('')

    # ---- Results ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('📊 Hasil Transkripsi')
        summary_output = gr.Markdown(
            elem_classes="summary-box",
            value="*Upload audio dan klik 'Mulai Transkripsi' untuk memulai.*"
        )
        transcript_output = gr.Textbox(
            label="Teks Transkripsi",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=False,
            elem_classes="transcript-box",
            placeholder="Hasil transkripsi dengan timestamp dan speaker label akan muncul di sini...\n\n[00:00] Speaker 1: contoh teks transkripsi...",
        )

    # ---- Downloads ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('📥 Download File')
        gr.HTML('File otomatis dihapus setelah 1 jam.')
        with gr.Row(elem_classes="download-row"):
            srt_file = gr.File(label="SRT — Subtitle untuk video player")
            txt_file = gr.File(label="TXT — Teks dengan speaker label")
            docx_file = gr.File(label="DOCX — Dokumen Word berwarna")

    # ---- Connect ----
    # Timer is started by MutationObserver when Gradio progress() appears in DOM.
    # This ensures timer ONLY starts after validation passes (no file → no progress).
    # Timer success-stop via .then(); error-stop via MutationObserver on error toast.
    btn_start.click(
        fn=transcribe_full,
        inputs=[audio_input, language_choice, speaker_count, enable_diarization, enable_vad],
        outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
    ).then(
        fn=lambda: None,
        inputs=None,
        outputs=None,
        js="() => { window.stopTranscribeTimer(true); }",
    )

    # ---- Footer ----
    gr.HTML(""" """)

demo.queue().launch(ssr_mode=False)