"""
TranscribeAI - Transcription with Speaker Diarization (ZeroGPU)
================================================================
Engine : openai/whisper via transformers pipeline (CUDA ZeroGPU H200)
Speaker : MFCC + Agglomerative Clustering
Language: Indonesian, English, Auto-detect (99 languages)
Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM
Output : SRT, TXT, DOCX
"""
import time
import tempfile
import threading
import torch
import spaces
import gradio as gr
import numpy as np
from datetime import datetime
from pathlib import Path
from transformers import pipeline
# ============================================================
# Config — Single model (small) for fastest startup & simplicity
# ============================================================
MODEL_ID = 'openai/whisper-small'
MODEL_NAME = 'small'
LANGUAGE_MAP = {
'Auto-detect': None,
'Indonesian': 'id',
'English': 'en',
'Japanese': 'ja',
'Korean': 'ko',
'Chinese': 'zh',
'Arabic': 'ar',
'French': 'fr',
'German': 'de',
'Spanish': 'es',
'Portuguese': 'pt',
'Russian': 'ru',
'Thai': 'th',
'Vietnamese': 'vi',
'Malay': 'ms',
'Hindi': 'hi',
'Turkish': 'tr',
'Dutch': 'nl',
'Italian': 'it',
}
BATCH_SIZE = 16  # safe for whisper-small in float16 on the ZeroGPU hardware
OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
OUTPUT_DIR.mkdir(exist_ok=True)
# ============================================================
# Load pipeline at MODULE LEVEL (ZeroGPU requirement!)
# Single model = faster startup, no on-demand loading delay
# ============================================================
device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_ID,
chunk_length_s=30,
device=device,
torch_dtype=torch_dtype,
)
print(f" {MODEL_NAME} ready!")
# ============================================================
# Helpers
# ============================================================
def fmt_timestamp(seconds):
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
ms = int((seconds % 1) * 1000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def fmt_time(seconds):
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
if h > 0:
return f"{h:02d}:{m:02d}:{s:02d}"
return f"{m:02d}:{s:02d}"
# ============================================================
# Speaker Diarization (MFCC + Clustering) — CPU
# ============================================================
def perform_diarization(audio_path, segments, num_speakers):
import librosa
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
if not segments or len(segments) < 2:
for seg in segments:
seg['speaker'] = 'Speaker 1'
seg['speaker_id'] = 0
return segments
y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
features = []
valid_indices = []
for i, seg in enumerate(segments):
s0 = int(seg['start'] * sr)
s1 = min(int(seg['end'] * sr), len(y))
if s1 <= s0 or s0 >= len(y):
continue
chunk = y[s0:s1]
if len(chunk) < int(sr * 0.3):
continue
try:
# Cap analysis to 3s per segment for speed
max_samples = int(sr * 3)
analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk
            # MFCC (13 = industry standard) + delta — sufficient for speaker ID
mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
delta = librosa.feature.delta(mfcc)
            # F0 (pitch) — key differentiator between speakers
f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
f0c = f0[f0 > 0]
f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
combined = np.vstack([mfcc, delta])
vec = np.concatenate([
np.mean(combined, axis=1),
np.std(combined, axis=1),
[f0_mean, f0_std]
])
features.append(vec)
valid_indices.append(i)
except Exception:
continue
if len(features) < 2:
for seg in segments:
seg['speaker'] = 'Speaker 1'
seg['speaker_id'] = 0
return segments
X = np.array(features)
X_scaled = StandardScaler().fit_transform(X)
if num_speakers <= 0:
from sklearn.metrics import silhouette_score
best_score, best_n = -1, 2
max_n = min(6, len(X_scaled) - 1)
for n in range(2, max_n + 1):
try:
lbls = AgglomerativeClustering(
n_clusters=n, metric='cosine', linkage='average'
).fit_predict(X_scaled)
score = silhouette_score(X_scaled, lbls, metric='cosine')
if score > best_score:
best_score, best_n = score, n
except Exception:
pass
num_speakers = best_n
else:
num_speakers = min(num_speakers, len(X_scaled))
if num_speakers >= 2 and len(X_scaled) >= num_speakers:
labels = AgglomerativeClustering(
n_clusters=num_speakers, metric='cosine', linkage='average'
).fit_predict(X_scaled)
else:
labels = np.zeros(len(X_scaled), dtype=int)
label_map = {}
for lbl in labels:
if lbl not in label_map:
label_map[lbl] = len(label_map) + 1
assigns = {}
for idx, seg_idx in enumerate(valid_indices):
assigns[seg_idx] = label_map[labels[idx]]
for i, seg in enumerate(segments):
if i in assigns:
seg['speaker'] = f'Speaker {assigns[i]}'
seg['speaker_id'] = assigns[i] - 1
else:
nearest = min(valid_indices, key=lambda x: abs(x - i)) if valid_indices else 0
seg['speaker'] = f'Speaker {assigns.get(nearest, 1)}'
seg['speaker_id'] = assigns.get(nearest, 1) - 1
return segments
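# Minimal usage sketch (not wired into the app); the segment values and the
# 'meeting.wav' path are hypothetical:
#   segs = [
#       {'start': 0.0, 'end': 4.2, 'text': 'Halo, selamat pagi.'},
#       {'start': 4.5, 'end': 9.8, 'text': 'Pagi, apa kabar?'},
#   ]
#   segs = perform_diarization('meeting.wav', segs, num_speakers=0)  # 0 = auto
#   print([(s['speaker'], s['text']) for s in segs])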
def merge_consecutive(segments):
if not segments:
return segments
merged = [segments[0].copy()]
for seg in segments[1:]:
if seg.get('speaker') == merged[-1].get('speaker'):
merged[-1]['end'] = seg['end']
merged[-1]['text'] += ' ' + seg['text']
else:
merged.append(seg.copy())
return merged
# ============================================================
# Export Functions
# ============================================================
def generate_srt(segments, path):
with open(path, 'w', encoding='utf-8') as f:
for i, seg in enumerate(segments, 1):
f.write(f"{i}\n")
f.write(f"{fmt_timestamp(seg['start'])} --> {fmt_timestamp(seg['end'])}\n")
sp = seg.get('speaker', '')
f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
LANG_NAMES = {
'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
}
def generate_txt(segments, path, filename='', language='', duration=0):
with open(path, 'w', encoding='utf-8') as f:
f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
if filename:
f.write(f"File: {filename}\n")
f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
f.write(f"Duration: {fmt_time(duration)}\n")
f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
speakers = sorted(set(s.get('speaker', '') for s in segments))
f.write(f"Speakers: {', '.join(speakers)}\n")
f.write("=" * 60 + "\n\n")
cur_speaker = None
for seg in segments:
sp = seg.get('speaker', '')
if sp != cur_speaker:
cur_speaker = sp
f.write(f"\n[{fmt_time(seg['start'])}] {sp}:\n")
f.write(f"{seg['text']}\n")
def generate_docx(segments, path, filename='', language='', duration=0):
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
colors = {
0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
4: RGBColor(124, 58, 237), 5: RGBColor(219, 39, 119),
}
doc = Document()
style = doc.styles['Normal']
style.font.name = 'Calibri'
style.font.size = Pt(11)
title = doc.add_heading('Transcript', level=0)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
meta = []
if filename:
meta.append(('File', filename))
meta.append(('Language', LANG_NAMES.get(language, language)))
meta.append(('Duration', fmt_time(duration)))
meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
meta.append(('Speakers', ', '.join(speakers)))
for label, val in meta:
p = doc.add_paragraph()
r = p.add_run(f'{label}: ')
r.bold = True
r.font.size = Pt(10)
r.font.color.rgb = RGBColor(100, 100, 100)
r = p.add_run(val)
r.font.size = Pt(10)
p.paragraph_format.space_after = Pt(2)
doc.add_paragraph('_' * 70)
for seg in segments:
p = doc.add_paragraph()
r = p.add_run(f'[{fmt_time(seg["start"])}] ')
r.font.size = Pt(9)
r.font.color.rgb = RGBColor(150, 150, 150)
sp_id = seg.get('speaker_id', 0)
sp = seg.get('speaker', 'Speaker 1')
color = colors.get(sp_id, RGBColor(79, 70, 229))
r = p.add_run(f'{sp}: ')
r.bold = True
r.font.size = Pt(11)
r.font.color.rgb = color
r = p.add_run(seg['text'])
r.font.size = Pt(11)
p.paragraph_format.space_after = Pt(6)
doc.save(path)
# ============================================================
# GPU Transcription (ZeroGPU — proven pattern)
# ============================================================
@spaces.GPU(duration=120)
def transcribe_with_gpu(audio_path, language):
"""Run Whisper inference on GPU. Single model, always ready."""
generate_kwargs = {"task": "transcribe"}
if language:
generate_kwargs["language"] = language
result = pipe(
str(audio_path),
batch_size=BATCH_SIZE,
return_timestamps=True,
generate_kwargs=generate_kwargs,
)
# Parse segments
raw_segments = []
duration = 0.0
chunks = result.get("chunks", [])
if chunks:
for chunk in chunks:
text = chunk.get("text", "").strip()
ts = chunk.get("timestamp", (0, 0))
start = ts[0] if ts[0] is not None else 0
end = ts[1] if ts[1] is not None else start + 1
if end > duration:
duration = end
if text:
raw_segments.append({
'start': round(start, 2),
'end': round(end, 2),
'text': text,
})
else:
full_text = result.get("text", "").strip()
if full_text:
raw_segments.append({'start': 0, 'end': 1, 'text': full_text})
    detected_lang = language or "auto"  # pipeline does not expose the detected language; echo the request (or "auto")
return raw_segments, detected_lang, duration
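# For reference, with return_timestamps=True the ASR pipeline returns roughly
# this structure (values illustrative), which the parsing loop above consumes:
#   {'text': ' Halo semuanya selamat datang',
#    'chunks': [{'timestamp': (0.0, 4.5), 'text': ' Halo semuanya'},
#               {'timestamp': (4.5, 9.0), 'text': ' selamat datang'}]}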
def apply_vad_filter(segments):
"""Filter out segments that are likely silence/noise (very short + filler)."""
    FILLER = {'', '.', '..', '...', '…', '-', '–', '[Music]', '[music]',
              '(music)', '[Musik]', '[musik]', '♪', '♪♪', '♫'}
MIN_DURATION = 0.3 # segments shorter than 0.3s are likely noise
filtered = []
for seg in segments:
text = seg['text'].strip()
seg_dur = seg['end'] - seg['start']
if text in FILLER:
continue
if seg_dur < MIN_DURATION and len(text.split()) <= 1:
continue
filtered.append(seg)
return filtered if filtered else segments # fallback: return original if all filtered
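# Illustrative: apply_vad_filter([{'start': 0.0, 'end': 0.2, 'text': '...'},
#                                 {'start': 0.2, 'end': 3.0, 'text': 'Halo semua'}])
# keeps only the second segment; if every segment were filtered out, the
# original list would be returned unchanged.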
# ============================================================
# Full Pipeline (wired to Gradio)
# ============================================================
def transcribe_full(audio_file, language_name, num_speakers,
enable_diarization, enable_vad, progress=gr.Progress()):
if audio_file is None:
raise gr.Error("Upload file audio terlebih dahulu!")
audio_path = audio_file
filename = Path(audio_path).name
lang_code = LANGUAGE_MAP.get(language_name, None)
num_speakers = int(num_speakers) # Gradio slider returns float
    t0 = time.time()  # Start timing from here — matches JS timer
    progress(0.05, desc="⏳ Menunggu GPU & memproses audio... (bisa 30-90 detik)")
# 1. Transcribe on GPU
try:
segments, detected_lang, duration = transcribe_with_gpu(
audio_path, lang_code
)
except Exception as e:
raise gr.Error(f"Gagal transkripsi: {str(e)}")
if not segments:
raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
    # 1b. VAD filter — remove silence/filler segments
if enable_vad:
segments = apply_vad_filter(segments)
transcribe_time = time.time() - t0
    progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s) — {len(segments)} segmen")
# 2. Speaker Diarization (CPU)
diarization_note = ""
if enable_diarization and len(segments) >= 2:
        progress(0.65, desc="🔍 Mengidentifikasi pembicara...")
try:
segments = perform_diarization(audio_path, segments, num_speakers)
segments = merge_consecutive(segments)
except Exception as e:
print(f" [Diarization] Error: {e}")
            diarization_note = " ⚠️ (diarization gagal, fallback 1 speaker)"
for seg in segments:
seg['speaker'] = 'Speaker 1'
seg['speaker_id'] = 0
else:
for seg in segments:
seg['speaker'] = 'Speaker 1'
seg['speaker_id'] = 0
    progress(0.85, desc="📄 Membuat file output...")
# 3. Export
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
base_name = Path(filename).stem
srt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.srt")
txt_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.txt")
docx_path = str(OUTPUT_DIR / f"{base_name}_{timestamp}.docx")
generate_srt(segments, srt_path)
generate_txt(segments, txt_path, filename, detected_lang, duration)
generate_docx(segments, docx_path, filename, detected_lang, duration)
    progress(0.95, desc="📦 Menyiapkan hasil...")
# Build display text
transcript_lines = []
speakers_found = set()
for seg in segments:
sp = seg.get('speaker', 'Speaker 1')
speakers_found.add(sp)
transcript_lines.append(f"[{fmt_time(seg['start'])}] {sp}: {seg['text']}")
transcript_text = "\n\n".join(transcript_lines)
total_time = time.time() - t0
lang_display = detected_lang.upper() if detected_lang else 'AUTO'
summary = (
f"**Transkripsi Selesai!**\n\n"
f"| Info | Detail |\n"
f"|------|--------|\n"
f"| File | {filename} |\n"
f"| Durasi Audio | {fmt_time(duration)} |\n"
f"| Bahasa | {lang_display} |\n"
f"| Model | {MODEL_NAME} (244M) |\n"
f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}){diarization_note} |\n"
f"| Segmen | {len(segments)} |\n"
f"| Waktu Proses | {total_time:.0f} detik |\n"
f"| Engine | Whisper + ZeroGPU H200 |"
)
    progress(1.0, desc="🎉 Selesai!")
return summary, transcript_text, srt_path, txt_path, docx_path
# ============================================================
# Cleanup old files (>1 hour)
# ============================================================
def cleanup_loop():
while True:
try:
now = time.time()
if OUTPUT_DIR.exists():
for f in OUTPUT_DIR.iterdir():
if f.is_file() and (now - f.stat().st_mtime) > 3600:
f.unlink(missing_ok=True)
print(f" [Cleanup] Deleted: {f.name}")
except Exception as e:
print(f" [Cleanup] Error: {e}")
time.sleep(300)
threading.Thread(target=cleanup_loop, daemon=True).start()
# ============================================================
# Gradio UI
# ============================================================
THEME = gr.themes.Base(
primary_hue=gr.themes.colors.indigo,
secondary_hue=gr.themes.colors.purple,
neutral_hue=gr.themes.colors.gray,
font=gr.themes.GoogleFont("Inter"),
).set(
body_background_fill="#0f0f11",
body_background_fill_dark="#0f0f11",
block_background_fill="#1a1a1f",
block_background_fill_dark="#1a1a1f",
block_border_color="#333340",
block_border_color_dark="#333340",
block_label_text_color="#a0a0b0",
block_title_text_color="#e8e8ed",
body_text_color="#e8e8ed",
body_text_color_dark="#e8e8ed",
button_primary_background_fill="#6366f1",
button_primary_background_fill_dark="#6366f1",
button_primary_text_color="#ffffff",
input_background_fill="#222228",
input_background_fill_dark="#222228",
input_border_color="#333340",
input_border_color_dark="#333340",
)
CUSTOM_CSS = """
/* Global */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
}
footer { display: none !important; }
/* Header */
.header-wrap {
text-align: center;
padding: 32px 0 20px;
}
.header-wrap h1 {
font-size: 32px !important;
font-weight: 800 !important;
background: linear-gradient(135deg, #818cf8, #8b5cf6) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
background-clip: text !important;
letter-spacing: -0.5px;
margin-bottom: 6px !important;
}
.header-wrap p {
color: #a0a0b0 !important;
font-size: 14px !important;
}
.badge-gpu {
display: inline-flex;
align-items: center;
gap: 6px;
background: rgba(99,102,241,.12);
color: #818cf8;
font-size: 12px;
padding: 4px 14px;
border-radius: 20px;
font-weight: 600;
margin-top: 8px;
}
.badge-gpu::before {
content: '';
width: 7px;
height: 7px;
background: #10b981;
border-radius: 50%;
display: inline-block;
}
/* Cards */
.card-section {
background: #1a1a1f !important;
border: 1px solid #333340 !important;
border-radius: 14px !important;
padding: 20px 24px !important;
margin-bottom: 12px !important;
}
.card-title {
font-size: 14px !important;
font-weight: 700 !important;
color: #e8e8ed !important;
margin-bottom: 12px !important;
display: flex;
align-items: center;
gap: 8px;
}
/* Primary button */
.btn-start {
background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
border: none !important;
border-radius: 12px !important;
font-size: 16px !important;
font-weight: 700 !important;
padding: 14px 32px !important;
transition: all 0.2s !important;
box-shadow: 0 4px 15px rgba(99,102,241,.3) !important;
}
.btn-start:hover {
transform: translateY(-1px) !important;
box-shadow: 0 6px 20px rgba(99,102,241,.4) !important;
}
/* Settings grid */
.settings-row {
gap: 8px !important;
}
/* Transcript output */
.transcript-box textarea {
font-family: 'Inter', 'SF Mono', monospace !important;
font-size: 13px !important;
line-height: 1.7 !important;
background: #16161a !important;
border-radius: 10px !important;
}
/* Download cards β€” labels (dark bg) */
.download-row label span,
.download-row .label-wrap span {
color: #e8e8ed !important;
font-weight: 700 !important;
}
/* Download cards β€” file items (white bg β†’ black bold text) */
.download-row .file-preview,
.download-row .download-file,
.download-row .file-component {
border-radius: 10px !important;
}
.download-row .file-preview *,
.download-row .download-file *,
.download-row .file-component *,
.download-row a,
.download-row .file-name,
.download-row .file-size {
color: #111 !important;
font-weight: 700 !important;
}
/* Result summary */
.summary-box {
background: #1a1a1f !important;
border: 1px solid #2a2a35 !important;
border-radius: 12px !important;
padding: 16px !important;
}
.summary-box table {
width: 100% !important;
}
.summary-box td, .summary-box th {
padding: 6px 12px !important;
font-size: 13px !important;
border-bottom: 1px solid #222230 !important;
}
/* Toggle checkboxes */
.toggle-row {
gap: 24px !important;
}
/* Audio upload area */
.audio-upload {
border: 2px dashed #333340 !important;
border-radius: 14px !important;
transition: all 0.2s !important;
}
.audio-upload:hover {
border-color: #6366f1 !important;
}
/* How-to steps */
.howto {
display: flex;
gap: 16px;
margin: 12px 0 4px;
flex-wrap: wrap;
}
.howto-step {
display: flex;
align-items: center;
gap: 8px;
font-size: 13px;
color: #a0a0b0;
}
.howto-num {
width: 24px;
height: 24px;
border-radius: 50%;
background: linear-gradient(135deg, #6366f1, #8b5cf6);
color: #fff;
font-size: 12px;
font-weight: 700;
display: flex;
align-items: center;
justify-content: center;
flex-shrink: 0;
}
/* Feature tags */
.features {
display: flex;
gap: 8px;
flex-wrap: wrap;
justify-content: center;
margin-top: 12px;
}
.feat-tag {
font-size: 11px;
padding: 4px 10px;
border-radius: 6px;
background: #1a1a1f;
border: 1px solid #333340;
color: #a0a0b0;
}
/* Footer */
.footer-text {
text-align: center;
padding: 20px 0 8px;
color: #6a6a7a;
font-size: 12px;
}
.footer-text a {
color: #818cf8;
text-decoration: none;
}
/* ===== FIX: Dropdown text visibility ===== */
/* Selected value text */
.gr-dropdown .wrap .wrap-inner .secondary-wrap,
.gr-dropdown .wrap .wrap-inner .secondary-wrap span,
.gr-dropdown .wrap .wrap-inner input,
.gr-dropdown input,
.dropdown .wrap span,
.dropdown input[type="text"],
div[data-testid="dropdown"] span,
div[data-testid="dropdown"] input {
color: #e8e8ed !important;
}
/* Dropdown options list */
.gr-dropdown ul[role="listbox"],
.gr-dropdown .options,
.dropdown ul, .dropdown li,
ul[role="listbox"],
li[role="option"],
div[role="option"] {
color: #e8e8ed !important;
background-color: #1a1a1f !important;
}
li[role="option"]:hover,
div[role="option"]:hover,
li[role="option"].selected,
li[role="option"][aria-selected="true"] {
background-color: rgba(99,102,241,.2) !important;
color: #c7c7ff !important;
}
/* Dropdown container border */
.gr-dropdown .wrap, .dropdown .wrap {
background: #222228 !important;
border-color: #333340 !important;
}
/* Dropdown info text */
.gr-dropdown .info-text, .dropdown .info-text,
span[data-testid="info-text"] {
color: #8888a0 !important;
}
/* ===== FIX: Upload progress visibility ===== */
/* Gradio upload progress bar */
.upload-container .progress-bar,
.uploading .progress-bar,
.file-upload .progress-bar {
background: #333340 !important;
border-radius: 6px !important;
overflow: hidden !important;
}
.upload-container .progress-bar .progress,
.uploading .progress-bar .progress,
.file-upload .progress-bar .progress {
background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
}
/* Upload progress text */
.upload-container .progress-text,
.uploading .progress-text,
.file-upload-text,
.upload-text,
.eta-bar {
color: #e8e8ed !important;
font-weight: 600 !important;
}
/* Gradio's built-in ETA bar */
.eta-bar {
background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
opacity: 0.3 !important;
}
/* Progress level / status text */
.progress-level, .progress-level span,
.progress-level .progress-level-inner {
color: #e8e8ed !important;
font-size: 13px !important;
}
/* Upload button area */
.upload-button, .upload-button span {
color: #e8e8ed !important;
border-color: #6366f1 !important;
}
/* Audio component loading state */
.audio-upload .uploading,
.audio-upload .loading {
color: #e8e8ed !important;
}
/* Spinner / loading indicator */
.audio-upload .loading svg,
.audio-upload .spinner {
color: #818cf8 !important;
}
/* ===== Live Timer ===== */
.live-timer {
display: none;
align-items: center;
justify-content: center;
gap: 10px;
background: rgba(99,102,241,.08);
border: 1px solid rgba(99,102,241,.3);
color: #c7c7ff;
padding: 12px 24px;
border-radius: 12px;
font-size: 15px;
font-weight: 700;
font-family: 'Inter', 'SF Mono', monospace;
margin-bottom: 12px;
letter-spacing: 0.5px;
}
.live-timer.active {
display: flex !important;
}
.live-timer.done {
background: rgba(16,185,129,.08) !important;
border-color: rgba(16,185,129,.3) !important;
color: #6ee7b7 !important;
}
.live-timer.error {
background: rgba(239,68,68,.08) !important;
border-color: rgba(239,68,68,.3) !important;
color: #fca5a5 !important;
}
.pulse-dot {
width: 10px;
height: 10px;
border-radius: 50%;
background: #818cf8;
animation: pulse-blink 1s ease-in-out infinite;
flex-shrink: 0;
}
.live-timer.done .pulse-dot { display: none; }
.live-timer.error .pulse-dot { display: none; }
@keyframes pulse-blink {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.3; transform: scale(0.7); }
}
.timer-clock {
font-variant-numeric: tabular-nums;
min-width: 52px;
text-align: center;
}
/* Responsive */
@media (max-width: 640px) {
.howto { flex-direction: column; gap: 8px; }
.features { gap: 4px; }
.header-wrap h1 { font-size: 26px !important; }
}
"""
UPLOAD_PROGRESS_JS = """
<style>
#upload-bar-wrap{display:none;position:fixed;top:0;left:0;right:0;z-index:99999;height:5px;background:#222228}
#upload-bar{height:100%;width:0%;background:linear-gradient(90deg,#6366f1,#a78bfa);transition:width .2s;border-radius:0 3px 3px 0}
#upload-pct{display:none;position:fixed;top:12px;right:16px;z-index:99999;background:#1a1a1f;border:1px solid #6366f1;
color:#c7c7ff;padding:7px 16px;border-radius:10px;font-size:13px;font-weight:700;font-family:Inter,sans-serif;
box-shadow:0 4px 20px rgba(99,102,241,.3)}
</style>
<script>
(function(){
var barW=document.createElement('div');barW.id='upload-bar-wrap';
barW.innerHTML='<div id="upload-bar"></div>';document.body.appendChild(barW);
var pctEl=document.createElement('div');pctEl.id='upload-pct';document.body.appendChild(pctEl);
function show(p){
barW.style.display='block';pctEl.style.display='block';
document.getElementById('upload-bar').style.width=p+'%';
pctEl.textContent='\\u{1F4E4} Uploading... '+p+'%';
}
function hide(){
show(100);
setTimeout(function(){
barW.style.display='none';pctEl.style.display='none';
document.getElementById('upload-bar').style.width='0%';
},800);
}
var _fetch=window.fetch;
window.fetch=function(input,init){
var url=typeof input==='string'?input:(input&&input.url?input.url:'');
if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){
return new Promise(function(resolve,reject){
var xhr=new XMLHttpRequest();
xhr.open('POST',url,true);
xhr.responseType='text';
if(init.headers){
try{
var h=init.headers instanceof Headers?init.headers:new Headers(init.headers);
h.forEach(function(v,k){
if(k.toLowerCase()!=='content-type')xhr.setRequestHeader(k,v);
});
}catch(e){}
}
xhr.upload.onprogress=function(e){
if(e.lengthComputable)show(Math.round(e.loaded/e.total*100));
};
xhr.onload=function(){
hide();
var headers=new Headers();
try{
xhr.getAllResponseHeaders().trim().split('\\r\\n').forEach(function(line){
var i=line.indexOf(':');
if(i>0)headers.append(line.slice(0,i).trim(),line.slice(i+1).trim());
});
}catch(e){}
resolve(new Response(xhr.responseText,{status:xhr.status,statusText:xhr.statusText,headers:headers}));
};
xhr.onerror=function(){hide();reject(new TypeError('Network request failed'));};
xhr.onabort=function(){hide();reject(new DOMException('Aborted','AbortError'));};
xhr.send(init.body);
});
}
return _fetch.apply(this,arguments);
};
})();
/* ===== Live Timer ===== */
window._timerInterval=null;
window._timerStart=0;
window._timerHideTimeout=null;
window.startTranscribeTimer=function(){
var el=document.getElementById('live-timer');
if(!el)return;
/* Clear previous timer & auto-hide timeout */
if(window._timerInterval){clearInterval(window._timerInterval);window._timerInterval=null;}
if(window._timerHideTimeout){clearTimeout(window._timerHideTimeout);window._timerHideTimeout=null;}
window._timerStart=Date.now();
el.className='live-timer active';
el.innerHTML='<span class="pulse-dot"></span><span>Memproses...</span><span class="timer-clock">00:00</span>';
window._timerInterval=setInterval(function(){
var sec=Math.floor((Date.now()-window._timerStart)/1000);
var m=Math.floor(sec/60);var s=sec%60;
var clock=el.querySelector('.timer-clock');
if(clock)clock.textContent=String(m).padStart(2,'0')+':'+String(s).padStart(2,'0');
},1000);
};
window.stopTranscribeTimer=function(ok){
if(!window._timerInterval)return; /* Already stopped β€” prevent double-stop */
clearInterval(window._timerInterval);
window._timerInterval=null; /* Null it so MutationObserver won't re-trigger */
var el=document.getElementById('live-timer');
if(!el)return;
var sec=Math.floor((Date.now()-window._timerStart)/1000);
var m=Math.floor(sec/60);var s=sec%60;
var t=String(m).padStart(2,'0')+':'+String(s).padStart(2,'0');
if(ok!==false){
el.className='live-timer active done';
el.innerHTML='\\u2705 Selesai dalam <strong>'+t+'</strong>';
}else{
el.className='live-timer active error';
el.innerHTML='\\u274C Error setelah <strong>'+t+'</strong>';
}
window._timerHideTimeout=setTimeout(function(){
el.className='live-timer';
window._timerHideTimeout=null;
},60000);
};
/* Auto-start timer when EXPLICIT progress() text appears (contains ⏳).
Gradio StatusTracker (.eta-bar, .progress-level) appears on ALL fn calls,
but our ⏳ marker only appears when progress(0.05,"⏳ Menunggu GPU...") is called,
which happens AFTER the audio_file validation passes.
- No file → gr.Error() before progress() → no ⏳ → timer never starts
- File OK → progress(0.05,"⏳...") → ⏳ detected → timer starts
Auto-stop on error toast. */
new MutationObserver(function(muts){
muts.forEach(function(m){
if(m.type==='childList'){
m.addedNodes.forEach(function(n){
/* Element node: check text for ⏳ marker */
if(n.nodeType===1){
if(!window._timerInterval&&n.textContent&&n.textContent.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
/* Detect error toast → stop timer */
var isToast=n.classList&&(n.classList.contains('toast-wrap')||n.classList.contains('error'));
var hasError=n.querySelector&&n.querySelector('.error,.toast-body');
if((isToast||hasError)&&window._timerInterval){
window.stopTranscribeTimer(false);
}
}
/* Text node with ⏳ */
if(n.nodeType===3&&!window._timerInterval&&n.nodeValue&&n.nodeValue.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
});
}
/* Text content change containing ⏳ (progress update on existing node) */
if(m.type==='characterData'&&!window._timerInterval&&m.target.nodeValue&&m.target.nodeValue.indexOf('\u23f3')!==-1){
window.startTranscribeTimer();
}
});
}).observe(document.body,{childList:true,subtree:true,characterData:true});
</script>
"""
with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS, head=UPLOAD_PROGRESS_JS) as demo:
# ---- Header ----
gr.HTML("""
<div class="header-wrap">
<h1>TranscribeAI</h1>
<p>Transkripsi Audio dengan Speaker Diarization &mdash; Gratis & Cepat</p>
<div class="badge-gpu">ZeroGPU H200 &bull; Whisper &bull; Tanpa API Key</div>
<div class="features">
<span class="feat-tag">99+ Bahasa</span>
<span class="feat-tag">Speaker ID</span>
<span class="feat-tag">SRT / TXT / DOCX</span>
<span class="feat-tag">GPU Accelerated</span>
<span class="feat-tag">Auto-detect Bahasa</span>
</div>
<div class="howto">
<div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
<div class="howto-step"><div class="howto-num">2</div> Klik Mulai</div>
<div class="howto-step"><div class="howto-num">3</div> Download hasil</div>
</div>
</div>
""")
# ---- Upload ----
with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">🎵 Upload Audio</div>')
audio_input = gr.Audio(
label="Drag & drop file audio/video, atau klik untuk pilih file. Bisa juga rekam langsung.",
type="filepath",
sources=["upload", "microphone"],
elem_classes="audio-upload",
)
gr.HTML('<div style="font-size:11px;color:#6a6a7a;margin-top:6px;">Format: MP3, MP4, WAV, M4A, OGG, FLAC, WEBM &bull; Maks ~1 jam audio</div>')
# ---- Settings ----
with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) &mdash; auto-loaded, siap pakai</div>')
with gr.Row():
language_choice = gr.Dropdown(
choices=list(LANGUAGE_MAP.keys()),
value="Auto-detect",
label="Bahasa",
info="Auto-detect atau pilih bahasa spesifik",
scale=2,
)
speaker_count = gr.Slider(
minimum=0, maximum=10, step=1, value=0,
label="Jumlah Pembicara",
info="0 = auto-detect",
scale=1,
)
with gr.Row(elem_classes="toggle-row"):
enable_diarization = gr.Checkbox(
value=True,
label="Speaker Diarization",
info="Identifikasi siapa yang berbicara"
)
enable_vad = gr.Checkbox(
value=True,
label="VAD Filter",
info="Lewati bagian hening untuk hasil lebih bersih"
)
# ---- Start Button ----
btn_start = gr.Button(
"πŸš€ Mulai Transkripsi",
variant="primary",
size="lg",
elem_classes="btn-start",
)
# ---- Live Timer ----
gr.HTML('<div id="live-timer" class="live-timer"></div>')
# ---- Results ----
with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📊 Hasil Transkripsi</div>')
summary_output = gr.Markdown(
elem_classes="summary-box",
value="*Upload audio dan klik 'Mulai Transkripsi' untuk memulai.*"
)
transcript_output = gr.Textbox(
label="Teks Transkripsi",
lines=20,
max_lines=50,
show_copy_button=True,
interactive=False,
elem_classes="transcript-box",
placeholder="Hasil transkripsi dengan timestamp dan speaker label akan muncul di sini...\n\n[00:00] Speaker 1: contoh teks transkripsi...",
)
# ---- Downloads ----
with gr.Group(elem_classes="card-section"):
        gr.HTML('<div class="card-title">📥 Download File</div>')
gr.HTML('<div style="font-size:12px;color:#6a6a7a;margin-bottom:8px;">File otomatis dihapus setelah 1 jam.</div>')
with gr.Row(elem_classes="download-row"):
srt_file = gr.File(label="SRT β€” Subtitle untuk video player")
txt_file = gr.File(label="TXT β€” Teks dengan speaker label")
docx_file = gr.File(label="DOCX β€” Dokumen Word berwarna")
# ---- Connect ----
# Timer is started by MutationObserver when Gradio progress() appears in DOM.
# This ensures timer ONLY starts after validation passes (no file β†’ no progress).
# Timer success-stop via .then(); error-stop via MutationObserver on error toast.
btn_start.click(
fn=transcribe_full,
inputs=[audio_input, language_choice, speaker_count,
enable_diarization, enable_vad],
outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
).then(
fn=lambda: None,
inputs=None,
outputs=None,
js="() => { window.stopTranscribeTimer(true); }",
)
# ---- Footer ----
gr.HTML("""
<div class="footer-text">
<strong>TranscribeAI</strong> by <a href="https://huggingface.co/romizone">romizone</a>
&bull; <a href="https://github.com/romizone/transcribeAI">GitHub</a>
&bull; ZeroGPU H200 &bull; Whisper + PyTorch
</div>
""")
demo.queue().launch(ssr_mode=False)