Spaces:
Running on Zero
Running on Zero
| """ | |
| TranscribeAI - Transcription with Speaker Diarization (ZeroGPU) | |
| ================================================================ | |
| Engine : openai/whisper via transformers pipeline (CUDA ZeroGPU H200) | |
| Speaker : MFCC + Agglomerative Clustering | |
| Language: Indonesian, English, Auto-detect (99 languages) | |
| Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM | |
| Output : SRT, TXT, DOCX | |
| """ | |
| import time | |
| import tempfile | |
| import threading | |
| import torch | |
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| from datetime import datetime | |
| from pathlib import Path | |
| from transformers import pipeline | |
| # ============================================================ | |
# Config — single model (small) for fastest startup and simplicity
| # ============================================================ | |
# Whisper checkpoint handed to the transformers ASR pipeline below.
MODEL_ID = 'openai/whisper-small'
# Short model label printed at startup and shown in the result summary.
MODEL_NAME = 'small'
# Display name -> language code forwarded to Whisper's generate() call.
# None means no language is forced on the model.
LANGUAGE_MAP = {
    'Auto-detect': None,
    'Indonesian': 'id',
    'English': 'en',
    'Japanese': 'ja',
    'Korean': 'ko',
    'Chinese': 'zh',
    'Arabic': 'ar',
    'French': 'fr',
    'German': 'de',
    'Spanish': 'es',
    'Portuguese': 'pt',
    'Russian': 'ru',
    'Thai': 'th',
    'Vietnamese': 'vi',
    'Malay': 'ms',
    'Hindi': 'hi',
    'Turkish': 'tr',
    'Dutch': 'nl',
    'Italian': 'it',
}
# NOTE(review): the module docstring names a ZeroGPU H200 while this comment
# cites an A10G — confirm which hardware the batch size was tuned for.
BATCH_SIZE = 16  # A10G 24GB VRAM — safe for whisper-small float16
# Export files (SRT/TXT/DOCX) are written here; cleanup_loop() removes files
# older than one hour.
OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
OUTPUT_DIR.mkdir(exist_ok=True)
| # ============================================================ | |
| # Load pipeline at MODULE LEVEL (ZeroGPU requirement!) | |
| # Single model = faster startup, no on-demand loading delay | |
| # ============================================================ | |
# Use CUDA device 0 with float16 when CUDA is reported available, otherwise
# fall back to CPU with float32.
device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
# Built once at import time and reused by transcribe_with_gpu(); audio is
# processed in 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    chunk_length_s=30,
    device=device,
    torch_dtype=torch_dtype,
)
print(f" {MODEL_NAME} ready!")
| # ============================================================ | |
| # Helpers | |
| # ============================================================ | |
def fmt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The offset is rounded to the nearest whole millisecond *before* it is
    split into fields. Truncating the fractional part instead loses a
    millisecond to binary floating-point error (2.07 would render as
    ``00:00:02,069``). Negative offsets are clamped to zero.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def fmt_time(seconds):
    """Render seconds as ``MM:SS``, or ``HH:MM:SS`` once an hour is reached.

    Fractions of a second are discarded.
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    clock = f"{minutes:02d}:{secs:02d}"
    return f"{hours:02d}:{clock}" if hours > 0 else clock
| # ============================================================ | |
# Speaker Diarization (MFCC + Clustering) — CPU
| # ============================================================ | |
def _assign_single_speaker(segments):
    """Label every segment 'Speaker 1' (speaker_id 0) in place; return the list."""
    for seg in segments:
        seg['speaker'] = 'Speaker 1'
        seg['speaker_id'] = 0
    return segments


def perform_diarization(audio_path, segments, num_speakers):
    """Attach ``speaker`` / ``speaker_id`` to each segment via feature clustering.

    For every segment of at least 0.3 s, the first 3 s of audio are summarised
    as the mean and standard deviation of 13 MFCCs plus their deltas, together
    with the mean and standard deviation of the YIN pitch track. The
    standardised vectors are grouped with average-linkage agglomerative
    clustering on cosine distance.

    Args:
        audio_path: Path of the audio file; loaded as 16 kHz mono.
        segments: List of dicts with ``start`` and ``end`` in seconds. The
            dicts are modified in place.
        num_speakers: Desired speaker count. A value <= 0 selects the count
            in 2..min(6, n - 1) with the best silhouette score, where n is
            the number of analysable segments.

    Returns:
        The same ``segments`` list. Speakers are numbered from 1 in order of
        first appearance; ``speaker_id`` is that number minus one. Segments
        that could not be analysed take the label of the nearest analysed
        segment by index. With fewer than two segments, or fewer than two
        analysable segments, everything is labelled 'Speaker 1'.
    """
    # Nothing to cluster: return before paying for the heavy imports and the
    # audio decode.
    if not segments or len(segments) < 2:
        return _assign_single_speaker(segments)

    import librosa
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.preprocessing import StandardScaler

    y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
    min_samples = int(sr * 0.3)
    max_samples = int(sr * 3)  # cap analysis to 3 s per segment for speed

    features = []
    valid_indices = []
    for i, seg in enumerate(segments):
        s0 = int(seg['start'] * sr)
        s1 = min(int(seg['end'] * sr), len(y))
        if s1 <= s0 or s0 >= len(y):
            continue
        chunk = y[s0:s1]
        if len(chunk) < min_samples:
            continue
        try:
            analysis_chunk = chunk[:max_samples]
            mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
            delta = librosa.feature.delta(mfcc)
            # Pitch statistics complement the spectral features.
            f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
            f0c = f0[f0 > 0]
            f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
            f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
            combined = np.vstack([mfcc, delta])
            vec = np.concatenate([
                np.mean(combined, axis=1),
                np.std(combined, axis=1),
                [f0_mean, f0_std],
            ])
        except Exception as e:
            # Best effort: one bad segment must not abort diarization, but
            # record why it was skipped instead of hiding the failure.
            print(f" [Diarization] Skipping segment {i}: {e}")
            continue
        features.append(vec)
        valid_indices.append(i)

    if len(features) < 2:
        return _assign_single_speaker(segments)

    X = np.array(features)
    X_scaled = StandardScaler().fit_transform(X)

    if num_speakers <= 0:
        from sklearn.metrics import silhouette_score
        best_score, best_n = -1, 2
        # Silhouette needs at most n_samples - 1 clusters.
        max_n = min(6, len(X_scaled) - 1)
        for n in range(2, max_n + 1):
            try:
                lbls = AgglomerativeClustering(
                    n_clusters=n, metric='cosine', linkage='average'
                ).fit_predict(X_scaled)
                score = silhouette_score(X_scaled, lbls, metric='cosine')
            except Exception as e:
                print(f" [Diarization] Could not score n={n}: {e}")
                continue
            if score > best_score:
                best_score, best_n = score, n
        num_speakers = best_n
    else:
        num_speakers = min(num_speakers, len(X_scaled))

    if num_speakers >= 2 and len(X_scaled) >= num_speakers:
        labels = AgglomerativeClustering(
            n_clusters=num_speakers, metric='cosine', linkage='average'
        ).fit_predict(X_scaled)
    else:
        labels = np.zeros(len(X_scaled), dtype=int)

    # Renumber cluster labels 1, 2, ... in order of first appearance.
    label_map = {}
    for lbl in labels:
        if lbl not in label_map:
            label_map[lbl] = len(label_map) + 1

    assigns = {
        seg_idx: label_map[labels[idx]]
        for idx, seg_idx in enumerate(valid_indices)
    }

    for i, seg in enumerate(segments):
        if i in assigns:
            speaker_no = assigns[i]
        else:
            # Unanalysed segment: borrow the label of the closest analysed one.
            nearest = min(valid_indices, key=lambda v: abs(v - i)) if valid_indices else 0
            speaker_no = assigns.get(nearest, 1)
        seg['speaker'] = f'Speaker {speaker_no}'
        seg['speaker_id'] = speaker_no - 1
    return segments
def merge_consecutive(segments):
    """Collapse runs of adjacent segments that share the same speaker.

    A merged segment keeps the start of the first member, takes the end of
    the last, and joins the texts with a single space. The input dicts are
    copied, not modified. An empty input is returned as-is.
    """
    if not segments:
        return segments
    result = [segments[0].copy()]
    for current in segments[1:]:
        previous = result[-1]
        if current.get('speaker') != previous.get('speaker'):
            result.append(current.copy())
            continue
        previous['end'] = current['end']
        previous['text'] += ' ' + current['text']
    return result
| # ============================================================ | |
| # Export Functions | |
| # ============================================================ | |
def generate_srt(segments, path):
    """Write ``segments`` to ``path`` as a UTF-8 SubRip (.srt) file.

    Cues are numbered from 1. When a segment has a non-empty ``speaker``,
    the cue text is prefixed with ``[speaker]``.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for cue_no, seg in enumerate(segments, 1):
            out.write(f"{cue_no}\n")
            begin = fmt_timestamp(seg['start'])
            finish = fmt_timestamp(seg['end'])
            out.write(f"{begin} --> {finish}\n")
            speaker = seg.get('speaker', '')
            if speaker:
                out.write(f"[{speaker}] {seg['text']}\n\n")
            else:
                out.write(f"{seg['text']}\n\n")
# Language code -> display name used in the TXT and DOCX headers.
# 'auto' is the placeholder used when no language was forced.
LANG_NAMES = {
    'id': 'Indonesian',
    'en': 'English',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh': 'Chinese',
    'ar': 'Arabic',
    'fr': 'French',
    'de': 'German',
    'es': 'Spanish',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'th': 'Thai',
    'vi': 'Vietnamese',
    'ms': 'Malay',
    'hi': 'Hindi',
    'tr': 'Turkish',
    'nl': 'Dutch',
    'it': 'Italian',
    'auto': 'Auto-detected',
}
def generate_txt(segments, path, filename='', language='', duration=0):
    """Write a plain-text transcript with a metadata header to ``path``.

    The header lists the file name (only when given), language, duration,
    generation time and the sorted set of speaker labels. In the body, a
    ``[time] speaker:`` heading is written each time the speaker changes,
    followed by one line of text per segment.
    """
    rule = "=" * 60
    with open(path, 'w', encoding='utf-8') as out:
        out.write("TRANSCRIPT\n" + rule + "\n")
        if filename:
            out.write(f"File: {filename}\n")
        out.write(f"Language: {LANG_NAMES.get(language, language)}\n")
        out.write(f"Duration: {fmt_time(duration)}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        speaker_names = sorted({seg.get('speaker', '') for seg in segments})
        out.write(f"Speakers: {', '.join(speaker_names)}\n")
        out.write(rule + "\n\n")

        previous_speaker = None
        for seg in segments:
            speaker = seg.get('speaker', '')
            if speaker != previous_speaker:
                previous_speaker = speaker
                out.write(f"\n[{fmt_time(seg['start'])}] {speaker}:\n")
            out.write(f"{seg['text']}\n")
def generate_docx(segments, path, filename='', language='', duration=0):
    """Write a Word (.docx) transcript with colour-coded speaker labels.

    The document has a centred 'Transcript' title, one metadata line per
    field (file name only when given), a rule of underscores, and then one
    paragraph per segment: grey timestamp, bold coloured speaker label, text.
    Speaker colours are looked up by ``speaker_id``; ids outside the palette
    use the first colour.
    """
    from docx import Document
    from docx.shared import Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    palette = {
        0: RGBColor(79, 70, 229),
        1: RGBColor(220, 38, 38),
        2: RGBColor(5, 150, 105),
        3: RGBColor(217, 119, 6),
        4: RGBColor(124, 58, 237),
        5: RGBColor(219, 39, 119),
    }

    doc = Document()
    base_style = doc.styles['Normal']
    base_style.font.name = 'Calibri'
    base_style.font.size = Pt(11)

    heading = doc.add_heading('Transcript', level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Metadata header lines, in display order.
    meta = [('File', filename)] if filename else []
    meta.append(('Language', LANG_NAMES.get(language, language)))
    meta.append(('Duration', fmt_time(duration)))
    meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    speaker_names = sorted({seg.get('speaker', 'Speaker 1') for seg in segments})
    meta.append(('Speakers', ', '.join(speaker_names)))

    for label, value in meta:
        para = doc.add_paragraph()
        label_run = para.add_run(f'{label}: ')
        label_run.bold = True
        label_run.font.size = Pt(10)
        label_run.font.color.rgb = RGBColor(100, 100, 100)
        value_run = para.add_run(value)
        value_run.font.size = Pt(10)
        para.paragraph_format.space_after = Pt(2)

    doc.add_paragraph('_' * 70)

    for seg in segments:
        para = doc.add_paragraph()

        stamp_run = para.add_run(f'[{fmt_time(seg["start"])}] ')
        stamp_run.font.size = Pt(9)
        stamp_run.font.color.rgb = RGBColor(150, 150, 150)

        speaker_id = seg.get('speaker_id', 0)
        speaker = seg.get('speaker', 'Speaker 1')
        speaker_run = para.add_run(f'{speaker}: ')
        speaker_run.bold = True
        speaker_run.font.size = Pt(11)
        speaker_run.font.color.rgb = palette.get(speaker_id, RGBColor(79, 70, 229))

        text_run = para.add_run(seg['text'])
        text_run.font.size = Pt(11)
        para.paragraph_format.space_after = Pt(6)

    doc.save(path)
| # ============================================================ | |
# GPU Transcription (ZeroGPU — proven pattern)
| # ============================================================ | |
def transcribe_with_gpu(audio_path, language):
    """Run the module-level Whisper pipeline and normalise its output.

    Args:
        audio_path: Path of the audio file to transcribe.
        language: Language code to force, or a falsy value to let the model
            decide.

    Returns:
        ``(segments, detected_lang, duration)``. ``segments`` is a list of
        ``{'start', 'end', 'text'}`` dicts with times rounded to 2 decimals.
        ``detected_lang`` is simply ``language`` or ``"auto"``; nothing is
        read back from the model. ``duration`` is the largest chunk end time
        seen, and stays 0.0 when the pipeline returned no timestamped chunks.
    """
    # NOTE(review): no @spaces.GPU decorator is visible on this function in
    # this view — confirm the ZeroGPU allocation is set up where intended.
    gen_opts = {"task": "transcribe"}
    if language:
        gen_opts["language"] = language

    output = pipe(
        str(audio_path),
        batch_size=BATCH_SIZE,
        return_timestamps=True,
        generate_kwargs=gen_opts,
    )

    parsed = []
    longest_end = 0.0
    timed_chunks = output.get("chunks", [])
    if not timed_chunks:
        # No timestamped chunks: fall back to a single segment with a
        # placeholder 0-1 s span.
        whole_text = output.get("text", "").strip()
        if whole_text:
            parsed.append({'start': 0, 'end': 1, 'text': whole_text})
    else:
        for item in timed_chunks:
            text = item.get("text", "").strip()
            stamp = item.get("timestamp", (0, 0))
            # A missing start becomes 0; a missing end becomes start + 1.
            begin = 0 if stamp[0] is None else stamp[0]
            finish = begin + 1 if stamp[1] is None else stamp[1]
            if finish > longest_end:
                longest_end = finish
            if not text:
                continue
            parsed.append({
                'start': round(begin, 2),
                'end': round(finish, 2),
                'text': text,
            })

    return parsed, language or "auto", longest_end
def apply_vad_filter(segments):
    """Drop segments that are likely silence or noise rather than speech.

    A segment is removed when its stripped text is a known filler token
    (empty, dots, ellipsis, dashes, music tags or note symbols), or when it
    lasts less than 0.3 s and contains at most one word.

    The non-ASCII filler tokens are written as Unicode escapes so they match
    the real characters regardless of how this file is re-encoded.

    Args:
        segments: List of dicts with ``start``, ``end`` and ``text``.

    Returns:
        The filtered list, or the original ``segments`` unchanged when the
        filter would remove everything.
    """
    filler = {
        '', '.', '..', '...',
        '\u2026',                 # horizontal ellipsis
        '-', '\u2013', '\u2014',  # hyphen, en dash, em dash
        '[Music]', '[music]', '(music)', '[Musik]', '[musik]',
        '\u266a', '\u266a\u266a', '\u266b',  # musical note symbols
    }
    min_duration = 0.3  # segments shorter than 0.3s are likely noise

    kept = []
    for seg in segments:
        text = seg['text'].strip()
        if text in filler:
            continue
        too_short = (seg['end'] - seg['start']) < min_duration
        if too_short and len(text.split()) <= 1:
            continue
        kept.append(seg)
    # Fallback: never return an empty transcript just because of filtering.
    return kept if kept else segments
| # ============================================================ | |
| # Full Pipeline (wired to Gradio) | |
| # ============================================================ | |
def transcribe_full(audio_file, language_name, num_speakers,
                    enable_diarization, enable_vad, progress=gr.Progress()):
    """Run the whole pipeline: transcribe, filter, diarize, export, summarise.

    Args:
        audio_file: Path of the uploaded audio file, or None.
        language_name: Key of LANGUAGE_MAP selecting the language to force.
        num_speakers: Speaker count for diarization (converted to int).
        enable_diarization: Whether to run speaker diarization.
        enable_vad: Whether to drop filler/noise segments.
        progress: Gradio progress callback.

    Returns:
        ``(summary_markdown, transcript_text, srt_path, txt_path, docx_path)``.

    Raises:
        gr.Error: If no file was given, transcription failed, or no text was
            recognised.
    """
    if audio_file is None:
        raise gr.Error("Upload file audio terlebih dahulu!")
    audio_path = audio_file
    filename = Path(audio_path).name
    lang_code = LANGUAGE_MAP.get(language_name, None)
    num_speakers = int(num_speakers)  # Gradio slider returns float
    t0 = time.time()  # Start timing from here — matches JS timer

    # The hourglass (U+23F3) is a functional marker: the front-end script
    # starts its live timer when it sees this character in the page. It is
    # written as an escape so a re-encoding of this file cannot break it.
    progress(0.05, desc="\u23f3 Menunggu GPU & memproses audio... (bisa 30-90 detik)")

    # 1. Transcribe on GPU
    try:
        segments, detected_lang, duration = transcribe_with_gpu(
            audio_path, lang_code
        )
    except Exception as e:
        raise gr.Error(f"Gagal transkripsi: {str(e)}") from e
    if not segments:
        raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")

    # 1b. VAD filter — remove silence/filler segments
    if enable_vad:
        segments = apply_vad_filter(segments)
    transcribe_time = time.time() - t0
    progress(
        0.60,
        desc=f"\u2705 Transkripsi selesai ({transcribe_time:.0f}s) \u2014 {len(segments)} segmen",
    )

    # NOTE(review): the leading status glyphs on the 0.65, 0.85 and 1.0
    # progress messages were unrecoverable mojibake; the emoji below are
    # best-effort replacements — confirm against the intended UI text.

    # 2. Speaker Diarization (CPU)
    diarization_note = ""
    if enable_diarization and len(segments) >= 2:
        progress(0.65, desc="\U0001F50D Mengidentifikasi pembicara...")
        try:
            segments = perform_diarization(audio_path, segments, num_speakers)
            segments = merge_consecutive(segments)
        except Exception as e:
            # Diarization is optional: log, flag it in the summary, and fall
            # back to a single speaker instead of failing the whole request.
            print(f" [Diarization] Error: {e}")
            diarization_note = " \u26a0\ufe0f (diarization gagal, fallback 1 speaker)"
            for seg in segments:
                seg['speaker'] = 'Speaker 1'
                seg['speaker_id'] = 0
    else:
        for seg in segments:
            seg['speaker'] = 'Speaker 1'
            seg['speaker_id'] = 0

    progress(0.85, desc="\U0001F4DD Membuat file output...")

    # 3. Export
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    base_name = Path(filename).stem
    srt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.srt")
    txt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.txt")
    docx_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.docx")
    generate_srt(segments, srt_path)
    generate_txt(segments, txt_path, filename, detected_lang, duration)
    generate_docx(segments, docx_path, filename, detected_lang, duration)

    progress(0.95, desc="\U0001F4E6 Menyiapkan hasil...")

    # Build display text
    transcript_lines = []
    speakers_found = set()
    for seg in segments:
        sp = seg.get('speaker', 'Speaker 1')
        speakers_found.add(sp)
        transcript_lines.append(f"[{fmt_time(seg['start'])}] {sp}: {seg['text']}")
    transcript_text = "\n\n".join(transcript_lines)

    total_time = time.time() - t0
    lang_display = detected_lang.upper() if detected_lang else 'AUTO'
    summary = (
        f"**Transkripsi Selesai!**\n\n"
        f"| Info | Detail |\n"
        f"|------|--------|\n"
        f"| File | {filename} |\n"
        f"| Durasi Audio | {fmt_time(duration)} |\n"
        f"| Bahasa | {lang_display} |\n"
        f"| Model | {MODEL_NAME} (244M) |\n"
        f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}){diarization_note} |\n"
        f"| Segmen | {len(segments)} |\n"
        f"| Waktu Proses | {total_time:.0f} detik |\n"
        f"| Engine | Whisper + ZeroGPU H200 |"
    )
    progress(1.0, desc="\U0001F389 Selesai!")
    return summary, transcript_text, srt_path, txt_path, docx_path
| # ============================================================ | |
| # Cleanup old files (>1 hour) | |
| # ============================================================ | |
def cleanup_loop():
    """Endlessly purge stale export files from OUTPUT_DIR.

    Every 300 seconds, regular files in OUTPUT_DIR whose modification time is
    more than 3600 seconds old are deleted. Any error during a pass is printed
    and the loop carries on. This function never returns.
    """
    max_age_s = 3600
    pause_s = 300
    while True:
        try:
            now = time.time()
            if OUTPUT_DIR.exists():
                for entry in OUTPUT_DIR.iterdir():
                    if not entry.is_file():
                        continue
                    if (now - entry.stat().st_mtime) > max_age_s:
                        entry.unlink(missing_ok=True)
                        print(f" [Cleanup] Deleted: {entry.name}")
        except Exception as e:
            print(f" [Cleanup] Error: {e}")
        time.sleep(pause_s)
# Start the cleanup loop in a daemon thread so it does not keep the process
# alive at shutdown.
threading.Thread(target=cleanup_loop, daemon=True).start()
# ============================================================
# Gradio UI
# ============================================================
# Dark theme: the light and *_dark variants are set to the same colours.
THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
    font=gr.themes.GoogleFont("Inter"),
).set(
    body_background_fill="#0f0f11",
    body_background_fill_dark="#0f0f11",
    block_background_fill="#1a1a1f",
    block_background_fill_dark="#1a1a1f",
    block_border_color="#333340",
    block_border_color_dark="#333340",
    block_label_text_color="#a0a0b0",
    block_title_text_color="#e8e8ed",
    body_text_color="#e8e8ed",
    body_text_color_dark="#e8e8ed",
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_dark="#6366f1",
    button_primary_text_color="#ffffff",
    input_background_fill="#222228",
    input_background_fill_dark="#222228",
    input_border_color="#333340",
    input_border_color_dark="#333340",
)
| CUSTOM_CSS = """ | |
| /* Global */ | |
| .gradio-container { | |
| max-width: 960px !important; | |
| margin: 0 auto !important; | |
| } | |
| footer { display: none !important; } | |
| /* Header */ | |
| .header-wrap { | |
| text-align: center; | |
| padding: 32px 0 20px; | |
| } | |
| .header-wrap h1 { | |
| font-size: 32px !important; | |
| font-weight: 800 !important; | |
| background: linear-gradient(135deg, #818cf8, #8b5cf6) !important; | |
| -webkit-background-clip: text !important; | |
| -webkit-text-fill-color: transparent !important; | |
| background-clip: text !important; | |
| letter-spacing: -0.5px; | |
| margin-bottom: 6px !important; | |
| } | |
| .header-wrap p { | |
| color: #a0a0b0 !important; | |
| font-size: 14px !important; | |
| } | |
| .badge-gpu { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 6px; | |
| background: rgba(99,102,241,.12); | |
| color: #818cf8; | |
| font-size: 12px; | |
| padding: 4px 14px; | |
| border-radius: 20px; | |
| font-weight: 600; | |
| margin-top: 8px; | |
| } | |
| .badge-gpu::before { | |
| content: ''; | |
| width: 7px; | |
| height: 7px; | |
| background: #10b981; | |
| border-radius: 50%; | |
| display: inline-block; | |
| } | |
| /* Cards */ | |
| .card-section { | |
| background: #1a1a1f !important; | |
| border: 1px solid #333340 !important; | |
| border-radius: 14px !important; | |
| padding: 20px 24px !important; | |
| margin-bottom: 12px !important; | |
| } | |
| .card-title { | |
| font-size: 14px !important; | |
| font-weight: 700 !important; | |
| color: #e8e8ed !important; | |
| margin-bottom: 12px !important; | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| } | |
| /* Primary button */ | |
| .btn-start { | |
| background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; | |
| border: none !important; | |
| border-radius: 12px !important; | |
| font-size: 16px !important; | |
| font-weight: 700 !important; | |
| padding: 14px 32px !important; | |
| transition: all 0.2s !important; | |
| box-shadow: 0 4px 15px rgba(99,102,241,.3) !important; | |
| } | |
| .btn-start:hover { | |
| transform: translateY(-1px) !important; | |
| box-shadow: 0 6px 20px rgba(99,102,241,.4) !important; | |
| } | |
| /* Settings grid */ | |
| .settings-row { | |
| gap: 8px !important; | |
| } | |
| /* Transcript output */ | |
| .transcript-box textarea { | |
| font-family: 'Inter', 'SF Mono', monospace !important; | |
| font-size: 13px !important; | |
| line-height: 1.7 !important; | |
| background: #16161a !important; | |
| border-radius: 10px !important; | |
| } | |
/* Download cards — labels (dark bg) */
| .download-row label span, | |
| .download-row .label-wrap span { | |
| color: #e8e8ed !important; | |
| font-weight: 700 !important; | |
| } | |
/* Download cards — file items (white bg → black bold text) */
| .download-row .file-preview, | |
| .download-row .download-file, | |
| .download-row .file-component { | |
| border-radius: 10px !important; | |
| } | |
| .download-row .file-preview *, | |
| .download-row .download-file *, | |
| .download-row .file-component *, | |
| .download-row a, | |
| .download-row .file-name, | |
| .download-row .file-size { | |
| color: #111 !important; | |
| font-weight: 700 !important; | |
| } | |
| /* Result summary */ | |
| .summary-box { | |
| background: #1a1a1f !important; | |
| border: 1px solid #2a2a35 !important; | |
| border-radius: 12px !important; | |
| padding: 16px !important; | |
| } | |
| .summary-box table { | |
| width: 100% !important; | |
| } | |
| .summary-box td, .summary-box th { | |
| padding: 6px 12px !important; | |
| font-size: 13px !important; | |
| border-bottom: 1px solid #222230 !important; | |
| } | |
| /* Toggle checkboxes */ | |
| .toggle-row { | |
| gap: 24px !important; | |
| } | |
| /* Audio upload area */ | |
| .audio-upload { | |
| border: 2px dashed #333340 !important; | |
| border-radius: 14px !important; | |
| transition: all 0.2s !important; | |
| } | |
| .audio-upload:hover { | |
| border-color: #6366f1 !important; | |
| } | |
| /* How-to steps */ | |
| .howto { | |
| display: flex; | |
| gap: 16px; | |
| margin: 12px 0 4px; | |
| flex-wrap: wrap; | |
| } | |
| .howto-step { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| font-size: 13px; | |
| color: #a0a0b0; | |
| } | |
| .howto-num { | |
| width: 24px; | |
| height: 24px; | |
| border-radius: 50%; | |
| background: linear-gradient(135deg, #6366f1, #8b5cf6); | |
| color: #fff; | |
| font-size: 12px; | |
| font-weight: 700; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| flex-shrink: 0; | |
| } | |
| /* Feature tags */ | |
| .features { | |
| display: flex; | |
| gap: 8px; | |
| flex-wrap: wrap; | |
| justify-content: center; | |
| margin-top: 12px; | |
| } | |
| .feat-tag { | |
| font-size: 11px; | |
| padding: 4px 10px; | |
| border-radius: 6px; | |
| background: #1a1a1f; | |
| border: 1px solid #333340; | |
| color: #a0a0b0; | |
| } | |
| /* Footer */ | |
| .footer-text { | |
| text-align: center; | |
| padding: 20px 0 8px; | |
| color: #6a6a7a; | |
| font-size: 12px; | |
| } | |
| .footer-text a { | |
| color: #818cf8; | |
| text-decoration: none; | |
| } | |
| /* ===== FIX: Dropdown text visibility ===== */ | |
| /* Selected value text */ | |
| .gr-dropdown .wrap .wrap-inner .secondary-wrap, | |
| .gr-dropdown .wrap .wrap-inner .secondary-wrap span, | |
| .gr-dropdown .wrap .wrap-inner input, | |
| .gr-dropdown input, | |
| .dropdown .wrap span, | |
| .dropdown input[type="text"], | |
| div[data-testid="dropdown"] span, | |
| div[data-testid="dropdown"] input { | |
| color: #e8e8ed !important; | |
| } | |
| /* Dropdown options list */ | |
| .gr-dropdown ul[role="listbox"], | |
| .gr-dropdown .options, | |
| .dropdown ul, .dropdown li, | |
| ul[role="listbox"], | |
| li[role="option"], | |
| div[role="option"] { | |
| color: #e8e8ed !important; | |
| background-color: #1a1a1f !important; | |
| } | |
| li[role="option"]:hover, | |
| div[role="option"]:hover, | |
| li[role="option"].selected, | |
| li[role="option"][aria-selected="true"] { | |
| background-color: rgba(99,102,241,.2) !important; | |
| color: #c7c7ff !important; | |
| } | |
| /* Dropdown container border */ | |
| .gr-dropdown .wrap, .dropdown .wrap { | |
| background: #222228 !important; | |
| border-color: #333340 !important; | |
| } | |
| /* Dropdown info text */ | |
| .gr-dropdown .info-text, .dropdown .info-text, | |
| span[data-testid="info-text"] { | |
| color: #8888a0 !important; | |
| } | |
| /* ===== FIX: Upload progress visibility ===== */ | |
| /* Gradio upload progress bar */ | |
| .upload-container .progress-bar, | |
| .uploading .progress-bar, | |
| .file-upload .progress-bar { | |
| background: #333340 !important; | |
| border-radius: 6px !important; | |
| overflow: hidden !important; | |
| } | |
| .upload-container .progress-bar .progress, | |
| .uploading .progress-bar .progress, | |
| .file-upload .progress-bar .progress { | |
| background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; | |
| } | |
| /* Upload progress text */ | |
| .upload-container .progress-text, | |
| .uploading .progress-text, | |
| .file-upload-text, | |
| .upload-text, | |
| .eta-bar { | |
| color: #e8e8ed !important; | |
| font-weight: 600 !important; | |
| } | |
| /* Gradio's built-in ETA bar */ | |
| .eta-bar { | |
| background: linear-gradient(135deg, #6366f1, #8b5cf6) !important; | |
| opacity: 0.3 !important; | |
| } | |
| /* Progress level / status text */ | |
| .progress-level, .progress-level span, | |
| .progress-level .progress-level-inner { | |
| color: #e8e8ed !important; | |
| font-size: 13px !important; | |
| } | |
| /* Upload button area */ | |
| .upload-button, .upload-button span { | |
| color: #e8e8ed !important; | |
| border-color: #6366f1 !important; | |
| } | |
| /* Audio component loading state */ | |
| .audio-upload .uploading, | |
| .audio-upload .loading { | |
| color: #e8e8ed !important; | |
| } | |
| /* Spinner / loading indicator */ | |
| .audio-upload .loading svg, | |
| .audio-upload .spinner { | |
| color: #818cf8 !important; | |
| } | |
| /* ===== Live Timer ===== */ | |
| .live-timer { | |
| display: none; | |
| align-items: center; | |
| justify-content: center; | |
| gap: 10px; | |
| background: rgba(99,102,241,.08); | |
| border: 1px solid rgba(99,102,241,.3); | |
| color: #c7c7ff; | |
| padding: 12px 24px; | |
| border-radius: 12px; | |
| font-size: 15px; | |
| font-weight: 700; | |
| font-family: 'Inter', 'SF Mono', monospace; | |
| margin-bottom: 12px; | |
| letter-spacing: 0.5px; | |
| } | |
| .live-timer.active { | |
| display: flex !important; | |
| } | |
| .live-timer.done { | |
| background: rgba(16,185,129,.08) !important; | |
| border-color: rgba(16,185,129,.3) !important; | |
| color: #6ee7b7 !important; | |
| } | |
| .live-timer.error { | |
| background: rgba(239,68,68,.08) !important; | |
| border-color: rgba(239,68,68,.3) !important; | |
| color: #fca5a5 !important; | |
| } | |
| .pulse-dot { | |
| width: 10px; | |
| height: 10px; | |
| border-radius: 50%; | |
| background: #818cf8; | |
| animation: pulse-blink 1s ease-in-out infinite; | |
| flex-shrink: 0; | |
| } | |
| .live-timer.done .pulse-dot { display: none; } | |
| .live-timer.error .pulse-dot { display: none; } | |
| @keyframes pulse-blink { | |
| 0%, 100% { opacity: 1; transform: scale(1); } | |
| 50% { opacity: 0.3; transform: scale(0.7); } | |
| } | |
| .timer-clock { | |
| font-variant-numeric: tabular-nums; | |
| min-width: 52px; | |
| text-align: center; | |
| } | |
| /* Responsive */ | |
| @media (max-width: 640px) { | |
| .howto { flex-direction: column; gap: 8px; } | |
| .features { gap: 4px; } | |
| .header-wrap h1 { font-size: 26px !important; } | |
| } | |
| """ | |
| UPLOAD_PROGRESS_JS = """ | |
| <style> | |
| #upload-bar-wrap{display:none;position:fixed;top:0;left:0;right:0;z-index:99999;height:5px;background:#222228} | |
| #upload-bar{height:100%;width:0%;background:linear-gradient(90deg,#6366f1,#a78bfa);transition:width .2s;border-radius:0 3px 3px 0} | |
| #upload-pct{display:none;position:fixed;top:12px;right:16px;z-index:99999;background:#1a1a1f;border:1px solid #6366f1; | |
| color:#c7c7ff;padding:7px 16px;border-radius:10px;font-size:13px;font-weight:700;font-family:Inter,sans-serif; | |
| box-shadow:0 4px 20px rgba(99,102,241,.3)} | |
| </style> | |
| <script> | |
| (function(){ | |
| var barW=document.createElement('div');barW.id='upload-bar-wrap'; | |
| barW.innerHTML='<div id="upload-bar"></div>';document.body.appendChild(barW); | |
| var pctEl=document.createElement('div');pctEl.id='upload-pct';document.body.appendChild(pctEl); | |
| function show(p){ | |
| barW.style.display='block';pctEl.style.display='block'; | |
| document.getElementById('upload-bar').style.width=p+'%'; | |
| pctEl.textContent='\\u{1F4E4} Uploading... '+p+'%'; | |
| } | |
| function hide(){ | |
| show(100); | |
| setTimeout(function(){ | |
| barW.style.display='none';pctEl.style.display='none'; | |
| document.getElementById('upload-bar').style.width='0%'; | |
| },800); | |
| } | |
| var _fetch=window.fetch; | |
| window.fetch=function(input,init){ | |
| var url=typeof input==='string'?input:(input&&input.url?input.url:''); | |
| if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){ | |
| return new Promise(function(resolve,reject){ | |
| var xhr=new XMLHttpRequest(); | |
| xhr.open('POST',url,true); | |
| xhr.responseType='text'; | |
| if(init.headers){ | |
| try{ | |
| var h=init.headers instanceof Headers?init.headers:new Headers(init.headers); | |
| h.forEach(function(v,k){ | |
| if(k.toLowerCase()!=='content-type')xhr.setRequestHeader(k,v); | |
| }); | |
| }catch(e){} | |
| } | |
| xhr.upload.onprogress=function(e){ | |
| if(e.lengthComputable)show(Math.round(e.loaded/e.total*100)); | |
| }; | |
| xhr.onload=function(){ | |
| hide(); | |
| var headers=new Headers(); | |
| try{ | |
| xhr.getAllResponseHeaders().trim().split('\\r\\n').forEach(function(line){ | |
| var i=line.indexOf(':'); | |
| if(i>0)headers.append(line.slice(0,i).trim(),line.slice(i+1).trim()); | |
| }); | |
| }catch(e){} | |
| resolve(new Response(xhr.responseText,{status:xhr.status,statusText:xhr.statusText,headers:headers})); | |
| }; | |
| xhr.onerror=function(){hide();reject(new TypeError('Network request failed'));}; | |
| xhr.onabort=function(){hide();reject(new DOMException('Aborted','AbortError'));}; | |
| xhr.send(init.body); | |
| }); | |
| } | |
| return _fetch.apply(this,arguments); | |
| }; | |
| })(); | |
| /* ===== Live Timer ===== */ | |
| window._timerInterval=null; | |
| window._timerStart=0; | |
| window._timerHideTimeout=null; | |
| window.startTranscribeTimer=function(){ | |
| var el=document.getElementById('live-timer'); | |
| if(!el)return; | |
| /* Clear previous timer & auto-hide timeout */ | |
| if(window._timerInterval){clearInterval(window._timerInterval);window._timerInterval=null;} | |
| if(window._timerHideTimeout){clearTimeout(window._timerHideTimeout);window._timerHideTimeout=null;} | |
| window._timerStart=Date.now(); | |
| el.className='live-timer active'; | |
| el.innerHTML='<span class="pulse-dot"></span><span>Memproses...</span><span class="timer-clock">00:00</span>'; | |
| window._timerInterval=setInterval(function(){ | |
| var sec=Math.floor((Date.now()-window._timerStart)/1000); | |
| var m=Math.floor(sec/60);var s=sec%60; | |
| var clock=el.querySelector('.timer-clock'); | |
| if(clock)clock.textContent=String(m).padStart(2,'0')+':'+String(s).padStart(2,'0'); | |
| },1000); | |
| }; | |
/* Stop the live timer and show the final elapsed time.
   ok === false renders the error state; any other value (including
   undefined) renders the success state. The banner is hidden again
   60 seconds later. */
window.stopTranscribeTimer = function (ok) {
  /* Not running (never started or already stopped): ignore the call. */
  if (!window._timerInterval) {
    return;
  }
  clearInterval(window._timerInterval);
  /* Reset the handle so any later stop request hits the guard above. */
  window._timerInterval = null;

  var timerEl = document.getElementById('live-timer');
  if (!timerEl) {
    return;
  }

  var elapsed = Math.floor((Date.now() - window._timerStart) / 1000);
  var minutes = Math.floor(elapsed / 60);
  var seconds = elapsed % 60;
  var stamp = String(minutes).padStart(2, '0') + ':' + String(seconds).padStart(2, '0');

  if (ok === false) {
    timerEl.className = 'live-timer active error';
    timerEl.innerHTML = '\\u274C Error setelah <strong>' + stamp + '</strong>';
  } else {
    timerEl.className = 'live-timer active done';
    timerEl.innerHTML = '\\u2705 Selesai dalam <strong>' + stamp + '</strong>';
  }

  /* Collapse the banner after one minute. */
  window._timerHideTimeout = setTimeout(function () {
    timerEl.className = 'live-timer';
    window._timerHideTimeout = null;
  }, 60000);
};
/* Auto-start the timer when EXPLICIT progress() text appears (contains the ⏳ marker).
   Gradio StatusTracker (.eta-bar, .progress-level) appears on ALL fn calls,
   but our ⏳ marker only appears when progress(0.05,"⏳ Menunggu GPU...") is called,
   which happens AFTER the audio_file validation passes.
   - No file -> gr.Error() before progress() -> no ⏳ -> timer never starts
   - File OK -> progress(0.05,"⏳...") -> ⏳ detected -> timer starts
   Auto-stop on error toast. */
new MutationObserver(function (records) {
  var HOURGLASS = '\u23f3';

  /* Start the timer if it is idle and the given text carries the marker. */
  function startIfMarked(text) {
    if (window._timerInterval) {
      return;
    }
    if (text && text.indexOf(HOURGLASS) !== -1) {
      window.startTranscribeTimer();
    }
  }

  /* True when an added element is, or contains, an error/toast node. */
  function looksLikeErrorToast(node) {
    var ownClass = node.classList &&
      (node.classList.contains('toast-wrap') || node.classList.contains('error'));
    var nested = node.querySelector && node.querySelector('.error,.toast-body');
    return Boolean(ownClass || nested);
  }

  for (var i = 0; i < records.length; i++) {
    var rec = records[i];
    if (rec.type === 'childList') {
      for (var j = 0; j < rec.addedNodes.length; j++) {
        var added = rec.addedNodes[j];
        if (added.nodeType === 1) {
          /* Element node: check its text for the marker first ... */
          startIfMarked(added.textContent);
          /* ... then stop a running timer if this is an error toast. */
          if (window._timerInterval && looksLikeErrorToast(added)) {
            window.stopTranscribeTimer(false);
          }
        } else if (added.nodeType === 3) {
          /* Text node carrying the marker. */
          startIfMarked(added.nodeValue);
        }
      }
    } else if (rec.type === 'characterData') {
      /* Text change on an existing node (progress update). */
      startIfMarked(rec.target.nodeValue);
    }
  }
}).observe(document.body, {childList: true, subtree: true, characterData: true});
| </script> | |
| """ | |
# ============================================================
# Gradio UI: header, upload, settings, start button, live timer,
# results, downloads, event wiring and footer.
# NOTE(review): emoji and dashes in the user-facing strings below were
# restored from mis-decoded UTF-8; the start-button emoji, the results-title
# emoji and the em dashes are best guesses -- confirm against the intended text.
# ============================================================
with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS, head=UPLOAD_PROGRESS_JS) as demo:
    # ---- Header ----
    gr.HTML("""
    <div class="header-wrap">
        <h1>TranscribeAI</h1>
        <p>Transkripsi Audio dengan Speaker Diarization — Gratis & Cepat</p>
        <div class="badge-gpu">ZeroGPU H200 • Whisper • Tanpa API Key</div>
        <div class="features">
            <span class="feat-tag">99+ Bahasa</span>
            <span class="feat-tag">Speaker ID</span>
            <span class="feat-tag">SRT / TXT / DOCX</span>
            <span class="feat-tag">GPU Accelerated</span>
            <span class="feat-tag">Auto-detect Bahasa</span>
        </div>
        <div class="howto">
            <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
            <div class="howto-step"><div class="howto-num">2</div> Klik Mulai</div>
            <div class="howto-step"><div class="howto-num">3</div> Download hasil</div>
        </div>
    </div>
    """)

    # ---- Upload ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">🎵 Upload Audio</div>')
        audio_input = gr.Audio(
            label="Drag & drop file audio/video, atau klik untuk pilih file. Bisa juga rekam langsung.",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-upload",
        )
        gr.HTML('<div style="font-size:11px;color:#6a6a7a;margin-top:6px;">Format: MP3, MP4, WAV, M4A, OGG, FLAC, WEBM • Maks ~1 jam audio</div>')

    # ---- Settings ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
        gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) — auto-loaded, siap pakai</div>')
        with gr.Row():
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_MAP.keys()),
                value="Auto-detect",
                label="Bahasa",
                info="Auto-detect atau pilih bahasa spesifik",
                scale=2,
            )
            speaker_count = gr.Slider(
                minimum=0, maximum=10, step=1, value=0,
                label="Jumlah Pembicara",
                info="0 = auto-detect",
                scale=1,
            )
        with gr.Row(elem_classes="toggle-row"):
            enable_diarization = gr.Checkbox(
                value=True,
                label="Speaker Diarization",
                info="Identifikasi siapa yang berbicara"
            )
            enable_vad = gr.Checkbox(
                value=True,
                label="VAD Filter",
                info="Lewati bagian hening untuk hasil lebih bersih"
            )

    # ---- Start Button ----
    btn_start = gr.Button(
        "🚀 Mulai Transkripsi",
        variant="primary",
        size="lg",
        elem_classes="btn-start",
    )

    # ---- Live Timer ----
    # Empty placeholder; the script injected via `head=` looks it up by id.
    gr.HTML('<div id="live-timer" class="live-timer"></div>')

    # ---- Results ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📝 Hasil Transkripsi</div>')
        summary_output = gr.Markdown(
            elem_classes="summary-box",
            value="*Upload audio dan klik 'Mulai Transkripsi' untuk memulai.*"
        )
        transcript_output = gr.Textbox(
            label="Teks Transkripsi",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=False,
            elem_classes="transcript-box",
            placeholder="Hasil transkripsi dengan timestamp dan speaker label akan muncul di sini...\n\n[00:00] Speaker 1: contoh teks transkripsi...",
        )

    # ---- Downloads ----
    with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📥 Download File</div>')
        gr.HTML('<div style="font-size:12px;color:#6a6a7a;margin-bottom:8px;">File otomatis dihapus setelah 1 jam.</div>')
        with gr.Row(elem_classes="download-row"):
            srt_file = gr.File(label="SRT — Subtitle untuk video player")
            txt_file = gr.File(label="TXT — Teks dengan speaker label")
            docx_file = gr.File(label="DOCX — Dokumen Word berwarna")

    # ---- Connect ----
    # Timer is started by MutationObserver when Gradio progress() appears in DOM.
    # This ensures timer ONLY starts after validation passes (no file -> no progress).
    # Timer success-stop via .success(); error-stop via MutationObserver on error toast.
    # .success() is used instead of .then() because .then() also fires when
    # transcribe_full raises, which could mark a failed run as finished before
    # the error toast is observed.
    btn_start.click(
        fn=transcribe_full,
        inputs=[audio_input, language_choice, speaker_count,
                enable_diarization, enable_vad],
        outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
    ).success(
        fn=lambda: None,
        inputs=None,
        outputs=None,
        js="() => { window.stopTranscribeTimer(true); }",
    )

    # ---- Footer ----
    gr.HTML("""
    <div class="footer-text">
        <strong>TranscribeAI</strong> by <a href="https://huggingface.co/romizone">romizone</a>
        • <a href="https://github.com/romizone/transcribeAI">GitHub</a>
        • ZeroGPU H200 • Whisper + PyTorch
    </div>
    """)

demo.queue().launch(ssr_mode=False)