Spaces:
Running on Zero
Running on Zero
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -8,9 +8,9 @@ Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM
|
|
| 8 |
Output : SRT, TXT, DOCX
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
import os
|
| 12 |
import time
|
| 13 |
import tempfile
|
|
|
|
| 14 |
import torch
|
| 15 |
import spaces
|
| 16 |
import gradio as gr
|
|
@@ -47,7 +47,7 @@ LANGUAGE_MAP = {
|
|
| 47 |
'Italian': 'it',
|
| 48 |
}
|
| 49 |
|
| 50 |
-
BATCH_SIZE =
|
| 51 |
OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
|
| 52 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 53 |
|
|
@@ -57,12 +57,15 @@ OUTPUT_DIR.mkdir(exist_ok=True)
|
|
| 57 |
# ============================================================
|
| 58 |
device = 0 if torch.cuda.is_available() else "cpu"
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
pipe = pipeline(
|
| 62 |
task="automatic-speech-recognition",
|
| 63 |
model=MODEL_ID,
|
| 64 |
chunk_length_s=30,
|
| 65 |
device=device,
|
|
|
|
| 66 |
)
|
| 67 |
print(f" {MODEL_NAME} ready!")
|
| 68 |
|
|
@@ -79,8 +82,11 @@ def fmt_timestamp(seconds):
|
|
| 79 |
|
| 80 |
|
| 81 |
def fmt_time(seconds):
|
| 82 |
-
|
|
|
|
| 83 |
s = int(seconds % 60)
|
|
|
|
|
|
|
| 84 |
return f"{m:02d}:{s:02d}"
|
| 85 |
|
| 86 |
|
|
@@ -113,20 +119,21 @@ def perform_diarization(audio_path, segments, num_speakers):
|
|
| 113 |
continue
|
| 114 |
|
| 115 |
try:
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
delta = librosa.feature.delta(mfcc)
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
sb = librosa.feature.spectral_bandwidth(y=analysis_chunk, sr=sr)
|
| 122 |
-
ro = librosa.feature.spectral_rolloff(y=analysis_chunk, sr=sr)
|
| 123 |
-
zcr = librosa.feature.zero_crossing_rate(analysis_chunk)
|
| 124 |
f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
|
| 125 |
f0c = f0[f0 > 0]
|
| 126 |
f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
|
| 127 |
f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
|
| 128 |
|
| 129 |
-
combined = np.vstack([mfcc, delta
|
| 130 |
vec = np.concatenate([
|
| 131 |
np.mean(combined, axis=1),
|
| 132 |
np.std(combined, axis=1),
|
|
@@ -217,13 +224,21 @@ def generate_srt(segments, path):
|
|
| 217 |
f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
|
| 218 |
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
def generate_txt(segments, path, filename='', language='', duration=0):
|
| 221 |
-
lang_names = {'id': 'Indonesian', 'en': 'English'}
|
| 222 |
with open(path, 'w', encoding='utf-8') as f:
|
| 223 |
f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
|
| 224 |
if filename:
|
| 225 |
f.write(f"File: {filename}\n")
|
| 226 |
-
f.write(f"Language: {
|
| 227 |
f.write(f"Duration: {fmt_time(duration)}\n")
|
| 228 |
f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 229 |
speakers = sorted(set(s.get('speaker', '') for s in segments))
|
|
@@ -242,8 +257,6 @@ def generate_docx(segments, path, filename='', language='', duration=0):
|
|
| 242 |
from docx import Document
|
| 243 |
from docx.shared import Pt, RGBColor
|
| 244 |
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 245 |
-
|
| 246 |
-
lang_names = {'id': 'Indonesian', 'en': 'English'}
|
| 247 |
colors = {
|
| 248 |
0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
|
| 249 |
2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
|
|
@@ -261,7 +274,7 @@ def generate_docx(segments, path, filename='', language='', duration=0):
|
|
| 261 |
meta = []
|
| 262 |
if filename:
|
| 263 |
meta.append(('File', filename))
|
| 264 |
-
meta.append(('Language',
|
| 265 |
meta.append(('Duration', fmt_time(duration)))
|
| 266 |
meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
|
| 267 |
speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
|
|
@@ -345,6 +358,23 @@ def transcribe_with_gpu(audio_path, language):
|
|
| 345 |
return raw_segments, detected_lang, duration
|
| 346 |
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
# ============================================================
|
| 349 |
# Full Pipeline (wired to Gradio)
|
| 350 |
# ============================================================
|
|
@@ -371,8 +401,12 @@ def transcribe_full(audio_file, language_name, num_speakers,
|
|
| 371 |
if not segments:
|
| 372 |
raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
transcribe_time = time.time() - t0
|
| 375 |
-
progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s)")
|
| 376 |
|
| 377 |
# 2. Speaker Diarization (CPU)
|
| 378 |
if enable_diarization and len(segments) >= 2:
|
|
@@ -439,8 +473,6 @@ def transcribe_full(audio_file, language_name, num_speakers,
|
|
| 439 |
# ============================================================
|
| 440 |
# Cleanup old files (>1 hour)
|
| 441 |
# ============================================================
|
| 442 |
-
import threading
|
| 443 |
-
|
| 444 |
def cleanup_loop():
|
| 445 |
while True:
|
| 446 |
try:
|
|
@@ -626,30 +658,6 @@ footer { display: none !important; }
|
|
| 626 |
border-color: #6366f1 !important;
|
| 627 |
}
|
| 628 |
|
| 629 |
-
/* Model info chips */
|
| 630 |
-
.model-chips {
|
| 631 |
-
display: flex;
|
| 632 |
-
gap: 6px;
|
| 633 |
-
flex-wrap: wrap;
|
| 634 |
-
margin-top: 8px;
|
| 635 |
-
}
|
| 636 |
-
.model-chip {
|
| 637 |
-
display: inline-flex;
|
| 638 |
-
align-items: center;
|
| 639 |
-
gap: 4px;
|
| 640 |
-
background: #222228;
|
| 641 |
-
color: #a0a0b0;
|
| 642 |
-
font-size: 11px;
|
| 643 |
-
padding: 3px 10px;
|
| 644 |
-
border-radius: 6px;
|
| 645 |
-
border: 1px solid #333340;
|
| 646 |
-
}
|
| 647 |
-
.model-chip.active {
|
| 648 |
-
background: rgba(99,102,241,.12);
|
| 649 |
-
color: #818cf8;
|
| 650 |
-
border-color: #6366f1;
|
| 651 |
-
}
|
| 652 |
-
|
| 653 |
/* How-to steps */
|
| 654 |
.howto {
|
| 655 |
display: flex;
|
|
@@ -844,7 +852,7 @@ box-shadow:0 4px 20px rgba(99,102,241,.3)}
|
|
| 844 |
var _fetch=window.fetch;
|
| 845 |
window.fetch=function(input,init){
|
| 846 |
var url=typeof input==='string'?input:(input&&input.url?input.url:'');
|
| 847 |
-
if(url.indexOf('/upload')!==-1 && init && init.method==='POST' && init.body){
|
| 848 |
return new Promise(function(resolve,reject){
|
| 849 |
var xhr=new XMLHttpRequest();
|
| 850 |
xhr.open('POST',url,true);
|
|
|
|
| 8 |
Output : SRT, TXT, DOCX
|
| 9 |
"""
|
| 10 |
|
|
|
|
| 11 |
import time
|
| 12 |
import tempfile
|
| 13 |
+
import threading
|
| 14 |
import torch
|
| 15 |
import spaces
|
| 16 |
import gradio as gr
|
|
|
|
| 47 |
'Italian': 'it',
|
| 48 |
}
|
| 49 |
|
| 50 |
+
BATCH_SIZE = 16 # A10G 24GB VRAM — safe for whisper-small float16
|
| 51 |
OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
|
| 52 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 53 |
|
|
|
|
| 57 |
# ============================================================
|
| 58 |
device = 0 if torch.cuda.is_available() else "cpu"
|
| 59 |
|
| 60 |
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 61 |
+
|
| 62 |
+
print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
|
| 63 |
pipe = pipeline(
|
| 64 |
task="automatic-speech-recognition",
|
| 65 |
model=MODEL_ID,
|
| 66 |
chunk_length_s=30,
|
| 67 |
device=device,
|
| 68 |
+
torch_dtype=torch_dtype,
|
| 69 |
)
|
| 70 |
print(f" {MODEL_NAME} ready!")
|
| 71 |
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
def fmt_time(seconds):
    """Format a duration in seconds as ``MM:SS``, or ``HH:MM:SS`` from one hour up.

    Args:
        seconds: Duration in seconds (int or float). Fractional seconds are
            truncated, not rounded.

    Returns:
        A zero-padded ``"MM:SS"`` string, or ``"HH:MM:SS"`` when the duration
        is at least one hour.
    """
    # Clamp anything that is not a positive number (None, 0, negatives, NaN)
    # to zero. Without this, Python's floor-modulo wraps a negative input
    # into a bogus positive clock value (e.g. -1 -> "59:59").
    total = int(seconds) if seconds and seconds > 0 else 0

    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
|
| 91 |
|
| 92 |
|
|
|
|
| 119 |
continue
|
| 120 |
|
| 121 |
try:
|
| 122 |
+
# Cap analysis to 3s per segment for speed
|
| 123 |
+
max_samples = int(sr * 3)
|
| 124 |
+
analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk
|
| 125 |
+
|
| 126 |
+
# MFCC (13 = industry standard) + delta — sufficient for speaker ID
|
| 127 |
+
mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
|
| 128 |
delta = librosa.feature.delta(mfcc)
|
| 129 |
+
|
| 130 |
+
# F0 (pitch) — key differentiator between speakers
|
|
|
|
|
|
|
|
|
|
| 131 |
f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
|
| 132 |
f0c = f0[f0 > 0]
|
| 133 |
f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
|
| 134 |
f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
|
| 135 |
|
| 136 |
+
combined = np.vstack([mfcc, delta])
|
| 137 |
vec = np.concatenate([
|
| 138 |
np.mean(combined, axis=1),
|
| 139 |
np.std(combined, axis=1),
|
|
|
|
| 224 |
f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
|
| 225 |
|
| 226 |
|
| 227 |
+
LANG_NAMES = {
|
| 228 |
+
'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
|
| 229 |
+
'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
|
| 230 |
+
'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
|
| 231 |
+
'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
|
| 232 |
+
'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
|
| 236 |
def generate_txt(segments, path, filename='', language='', duration=0):
|
|
|
|
| 237 |
with open(path, 'w', encoding='utf-8') as f:
|
| 238 |
f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
|
| 239 |
if filename:
|
| 240 |
f.write(f"File: {filename}\n")
|
| 241 |
+
f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
|
| 242 |
f.write(f"Duration: {fmt_time(duration)}\n")
|
| 243 |
f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 244 |
speakers = sorted(set(s.get('speaker', '') for s in segments))
|
|
|
|
| 257 |
from docx import Document
|
| 258 |
from docx.shared import Pt, RGBColor
|
| 259 |
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
|
|
|
|
|
| 260 |
colors = {
|
| 261 |
0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
|
| 262 |
2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
|
|
|
|
| 274 |
meta = []
|
| 275 |
if filename:
|
| 276 |
meta.append(('File', filename))
|
| 277 |
+
meta.append(('Language', LANG_NAMES.get(language, language)))
|
| 278 |
meta.append(('Duration', fmt_time(duration)))
|
| 279 |
meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
|
| 280 |
speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
|
|
|
|
| 358 |
return raw_segments, detected_lang, duration
|
| 359 |
|
| 360 |
|
| 361 |
+
# Segment texts that carry no speech content. Stored case-folded so that
# variants such as "[MUSIC]" or "(Music)" match as well.
_VAD_FILLER_TEXTS = frozenset({
    '', '.', '..', '...', '…', '-', '–',
    '[music]', '(music)', '[musik]', '♪', '♪♪', '♫',
})

# Segments of at most one word that are shorter than this (in seconds) are
# treated as noise.
_VAD_MIN_DURATION = 0.3


def apply_vad_filter(segments):
    """Drop transcript segments that are likely silence, music or noise.

    This is a text/duration heuristic applied after transcription; it does
    not inspect the audio. A segment is removed when either:

    * its stripped text is a known filler token (punctuation only, a music
      marker, or empty), compared case-insensitively; or
    * it lasts less than ``_VAD_MIN_DURATION`` seconds and contains at most
      one word.

    Args:
        segments: List of dicts with ``'text'``, ``'start'`` and ``'end'``
            keys (times in seconds).

    Returns:
        A new list with the surviving segments, in their original order. If
        every segment would be removed, the original ``segments`` list is
        returned unchanged so the caller never ends up with an empty result
        purely because of filtering.
    """
    kept = []
    for seg in segments:
        # Tolerate a missing/None text instead of crashing the whole run.
        text = (seg.get('text') or '').strip()
        if text.casefold() in _VAD_FILLER_TEXTS:
            continue

        start, end = seg.get('start'), seg.get('end')
        # Without both timestamps the duration is unknown, so the segment
        # cannot be classified as "too short" and is kept.
        has_span = start is not None and end is not None
        if has_span and end - start < _VAD_MIN_DURATION and len(text.split()) <= 1:
            continue

        kept.append(seg)

    # Fallback: hand back the original input if the filter removed everything.
    return kept or segments
|
| 376 |
+
|
| 377 |
+
|
| 378 |
# ============================================================
|
| 379 |
# Full Pipeline (wired to Gradio)
|
| 380 |
# ============================================================
|
|
|
|
| 401 |
if not segments:
|
| 402 |
raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
|
| 403 |
|
| 404 |
+
# 1b. VAD filter — remove silence/filler segments
|
| 405 |
+
if enable_vad:
|
| 406 |
+
segments = apply_vad_filter(segments)
|
| 407 |
+
|
| 408 |
transcribe_time = time.time() - t0
|
| 409 |
+
progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s) — {len(segments)} segmen")
|
| 410 |
|
| 411 |
# 2. Speaker Diarization (CPU)
|
| 412 |
if enable_diarization and len(segments) >= 2:
|
|
|
|
| 473 |
# ============================================================
|
| 474 |
# Cleanup old files (>1 hour)
|
| 475 |
# ============================================================
|
|
|
|
|
|
|
| 476 |
def cleanup_loop():
|
| 477 |
while True:
|
| 478 |
try:
|
|
|
|
| 658 |
border-color: #6366f1 !important;
|
| 659 |
}
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
/* How-to steps */
|
| 662 |
.howto {
|
| 663 |
display: flex;
|
|
|
|
| 852 |
var _fetch=window.fetch;
|
| 853 |
window.fetch=function(input,init){
|
| 854 |
var url=typeof input==='string'?input:(input&&input.url?input.url:'');
|
| 855 |
+
if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){
|
| 856 |
return new Promise(function(resolve,reject){
|
| 857 |
var xhr=new XMLHttpRequest();
|
| 858 |
xhr.open('POST',url,true);
|