Spaces:

romizone
/

TranscribeAI

Running on Zero

App Files Community

romizone committed on Feb 22

Commit

d3cd3c5

verified ·

1 Parent(s): 6183223

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +53 -45

app.py CHANGED Viewed

@@ -8,9 +8,9 @@ Input   : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM
 Output  : SRT, TXT, DOCX
 """
-import os
 import time
 import tempfile
 import torch
 import spaces
 import gradio as gr
@@ -47,7 +47,7 @@ LANGUAGE_MAP = {
     'Italian': 'it',
 }
-BATCH_SIZE = 8
 OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
 OUTPUT_DIR.mkdir(exist_ok=True)
@@ -57,12 +57,15 @@ OUTPUT_DIR.mkdir(exist_ok=True)
 # ============================================================
 device = 0 if torch.cuda.is_available() else "cpu"
-print(f"  Loading pipeline: {MODEL_ID}...")
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_ID,
     chunk_length_s=30,
     device=device,
 )
 print(f"  {MODEL_NAME} ready!")
@@ -79,8 +82,11 @@ def fmt_timestamp(seconds):
 def fmt_time(seconds):
-    m = int(seconds // 60)
     s = int(seconds % 60)
     return f"{m:02d}:{s:02d}"
@@ -113,20 +119,21 @@ def perform_diarization(audio_path, segments, num_speakers):
             continue
         try:
-            analysis_chunk = chunk[:sr * 3] if len(chunk) > sr * 3 else chunk
-            mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=20)
             delta = librosa.feature.delta(mfcc)
-            delta2 = librosa.feature.delta(mfcc, order=2)
-            sc = librosa.feature.spectral_centroid(y=analysis_chunk, sr=sr)
-            sb = librosa.feature.spectral_bandwidth(y=analysis_chunk, sr=sr)
-            ro = librosa.feature.spectral_rolloff(y=analysis_chunk, sr=sr)
-            zcr = librosa.feature.zero_crossing_rate(analysis_chunk)
             f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
             f0c = f0[f0 > 0]
             f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
             f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
-            combined = np.vstack([mfcc, delta, delta2, sc, sb, ro, zcr])
             vec = np.concatenate([
                 np.mean(combined, axis=1),
                 np.std(combined, axis=1),
@@ -217,13 +224,21 @@ def generate_srt(segments, path):
             f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
 def generate_txt(segments, path, filename='', language='', duration=0):
-    lang_names = {'id': 'Indonesian', 'en': 'English'}
     with open(path, 'w', encoding='utf-8') as f:
         f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
         if filename:
             f.write(f"File: {filename}\n")
-        f.write(f"Language: {lang_names.get(language, language)}\n")
         f.write(f"Duration: {fmt_time(duration)}\n")
         f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
         speakers = sorted(set(s.get('speaker', '') for s in segments))
@@ -242,8 +257,6 @@ def generate_docx(segments, path, filename='', language='', duration=0):
     from docx import Document
     from docx.shared import Pt, RGBColor
     from docx.enum.text import WD_ALIGN_PARAGRAPH
-    lang_names = {'id': 'Indonesian', 'en': 'English'}
     colors = {
         0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
         2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
@@ -261,7 +274,7 @@ def generate_docx(segments, path, filename='', language='', duration=0):
     meta = []
     if filename:
         meta.append(('File', filename))
-    meta.append(('Language', lang_names.get(language, language)))
     meta.append(('Duration', fmt_time(duration)))
     meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
     speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
@@ -345,6 +358,23 @@ def transcribe_with_gpu(audio_path, language):
     return raw_segments, detected_lang, duration
 # ============================================================
 # Full Pipeline (wired to Gradio)
 # ============================================================
@@ -371,8 +401,12 @@ def transcribe_full(audio_file, language_name, num_speakers,
     if not segments:
         raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
     transcribe_time = time.time() - t0
-    progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s)")
     # 2. Speaker Diarization (CPU)
     if enable_diarization and len(segments) >= 2:
@@ -439,8 +473,6 @@ def transcribe_full(audio_file, language_name, num_speakers,
 # ============================================================
 # Cleanup old files (>1 hour)
 # ============================================================
-import threading
 def cleanup_loop():
     while True:
         try:
@@ -626,30 +658,6 @@ footer { display: none !important; }
     border-color: #6366f1 !important;
 }
-/* Model info chips */
-.model-chips {
-    display: flex;
-    gap: 6px;
-    flex-wrap: wrap;
-    margin-top: 8px;
-}
-.model-chip {
-    display: inline-flex;
-    align-items: center;
-    gap: 4px;
-    background: #222228;
-    color: #a0a0b0;
-    font-size: 11px;
-    padding: 3px 10px;
-    border-radius: 6px;
-    border: 1px solid #333340;
-}
-.model-chip.active {
-    background: rgba(99,102,241,.12);
-    color: #818cf8;
-    border-color: #6366f1;
-}
 /* How-to steps */
 .howto {
     display: flex;
@@ -844,7 +852,7 @@ box-shadow:0 4px 20px rgba(99,102,241,.3)}
   var _fetch=window.fetch;
   window.fetch=function(input,init){
     var url=typeof input==='string'?input:(input&&input.url?input.url:'');
-    if(url.indexOf('/upload')!==-1 && init && init.method==='POST' && init.body){
       return new Promise(function(resolve,reject){
         var xhr=new XMLHttpRequest();
         xhr.open('POST',url,true);

 Output  : SRT, TXT, DOCX
 """
 import time
 import tempfile
+import threading
 import torch
 import spaces
 import gradio as gr
     'Italian': 'it',
 }
+BATCH_SIZE = 16  # A10G 24GB VRAM — safe for whisper-small float16
 OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
 OUTPUT_DIR.mkdir(exist_ok=True)
 # ============================================================
 device = 0 if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+print(f"  Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_ID,
     chunk_length_s=30,
     device=device,
+    torch_dtype=torch_dtype,
 )
 print(f"  {MODEL_NAME} ready!")
 def fmt_time(seconds):
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
     s = int(seconds % 60)
+    if h > 0:
+        return f"{h:02d}:{m:02d}:{s:02d}"
     return f"{m:02d}:{s:02d}"
             continue
         try:
+            # Cap analysis to 3s per segment for speed
+            max_samples = int(sr * 3)
+            analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk
+            # MFCC (13 = industry standard) + delta — sufficient for speaker ID
+            mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
             delta = librosa.feature.delta(mfcc)
+            # F0 (pitch) — key differentiator between speakers
             f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
             f0c = f0[f0 > 0]
             f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
             f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
+            combined = np.vstack([mfcc, delta])
             vec = np.concatenate([
                 np.mean(combined, axis=1),
                 np.std(combined, axis=1),
             f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
+LANG_NAMES = {
+    'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
+    'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
+    'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
+    'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
+    'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
+}
 def generate_txt(segments, path, filename='', language='', duration=0):
     with open(path, 'w', encoding='utf-8') as f:
         f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
         if filename:
             f.write(f"File: {filename}\n")
+        f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
         f.write(f"Duration: {fmt_time(duration)}\n")
         f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
         speakers = sorted(set(s.get('speaker', '') for s in segments))
     from docx import Document
     from docx.shared import Pt, RGBColor
     from docx.enum.text import WD_ALIGN_PARAGRAPH
     colors = {
         0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
         2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
     meta = []
     if filename:
         meta.append(('File', filename))
+    meta.append(('Language', LANG_NAMES.get(language, language)))
     meta.append(('Duration', fmt_time(duration)))
     meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
     speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
     return raw_segments, detected_lang, duration
+def apply_vad_filter(segments):
+    """Filter out segments that are likely silence/noise (very short + filler)."""
+    FILLER = {'', '.', '..', '...', '…', '-', '–', '[Music]', '[music]',
+              '(music)', '[Musik]', '[musik]', '♪', '♪♪', '♫'}
+    MIN_DURATION = 0.3  # segments shorter than 0.3s are likely noise
+    filtered = []
+    for seg in segments:
+        text = seg['text'].strip()
+        seg_dur = seg['end'] - seg['start']
+        if text in FILLER:
+            continue
+        if seg_dur < MIN_DURATION and len(text.split()) <= 1:
+            continue
+        filtered.append(seg)
+    return filtered if filtered else segments  # fallback: return original if all filtered
 # ============================================================
 # Full Pipeline (wired to Gradio)
 # ============================================================
     if not segments:
         raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
+    # 1b. VAD filter — remove silence/filler segments
+    if enable_vad:
+        segments = apply_vad_filter(segments)
     transcribe_time = time.time() - t0
+    progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s) — {len(segments)} segmen")
     # 2. Speaker Diarization (CPU)
     if enable_diarization and len(segments) >= 2:
 # ============================================================
 # Cleanup old files (>1 hour)
 # ============================================================
 def cleanup_loop():
     while True:
         try:
     border-color: #6366f1 !important;
 }
 /* How-to steps */
 .howto {
     display: flex;
   var _fetch=window.fetch;
   window.fetch=function(input,init){
     var url=typeof input==='string'?input:(input&&input.url?input.url:'');
+    if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){
       return new Promise(function(resolve,reject){
         var xhr=new XMLHttpRequest();
         xhr.open('POST',url,true);