romizone committed on
Commit
d3cd3c5
·
verified ·
1 Parent(s): 6183223

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +53 -45
app.py CHANGED
@@ -8,9 +8,9 @@ Input : MP3, MP4, WAV, M4A, OGG, FLAC, WEBM
8
  Output : SRT, TXT, DOCX
9
  """
10
 
11
- import os
12
  import time
13
  import tempfile
 
14
  import torch
15
  import spaces
16
  import gradio as gr
@@ -47,7 +47,7 @@ LANGUAGE_MAP = {
47
  'Italian': 'it',
48
  }
49
 
50
- BATCH_SIZE = 8
51
  OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
52
  OUTPUT_DIR.mkdir(exist_ok=True)
53
 
@@ -57,12 +57,15 @@ OUTPUT_DIR.mkdir(exist_ok=True)
57
  # ============================================================
58
  device = 0 if torch.cuda.is_available() else "cpu"
59
 
60
- print(f" Loading pipeline: {MODEL_ID}...")
 
 
61
  pipe = pipeline(
62
  task="automatic-speech-recognition",
63
  model=MODEL_ID,
64
  chunk_length_s=30,
65
  device=device,
 
66
  )
67
  print(f" {MODEL_NAME} ready!")
68
 
@@ -79,8 +82,11 @@ def fmt_timestamp(seconds):
79
 
80
 
81
  def fmt_time(seconds):
82
- m = int(seconds // 60)
 
83
  s = int(seconds % 60)
 
 
84
  return f"{m:02d}:{s:02d}"
85
 
86
 
@@ -113,20 +119,21 @@ def perform_diarization(audio_path, segments, num_speakers):
113
  continue
114
 
115
  try:
116
- analysis_chunk = chunk[:sr * 3] if len(chunk) > sr * 3 else chunk
117
- mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=20)
 
 
 
 
118
  delta = librosa.feature.delta(mfcc)
119
- delta2 = librosa.feature.delta(mfcc, order=2)
120
- sc = librosa.feature.spectral_centroid(y=analysis_chunk, sr=sr)
121
- sb = librosa.feature.spectral_bandwidth(y=analysis_chunk, sr=sr)
122
- ro = librosa.feature.spectral_rolloff(y=analysis_chunk, sr=sr)
123
- zcr = librosa.feature.zero_crossing_rate(analysis_chunk)
124
  f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
125
  f0c = f0[f0 > 0]
126
  f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
127
  f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
128
 
129
- combined = np.vstack([mfcc, delta, delta2, sc, sb, ro, zcr])
130
  vec = np.concatenate([
131
  np.mean(combined, axis=1),
132
  np.std(combined, axis=1),
@@ -217,13 +224,21 @@ def generate_srt(segments, path):
217
  f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
218
 
219
 
 
 
 
 
 
 
 
 
 
220
  def generate_txt(segments, path, filename='', language='', duration=0):
221
- lang_names = {'id': 'Indonesian', 'en': 'English'}
222
  with open(path, 'w', encoding='utf-8') as f:
223
  f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
224
  if filename:
225
  f.write(f"File: {filename}\n")
226
- f.write(f"Language: {lang_names.get(language, language)}\n")
227
  f.write(f"Duration: {fmt_time(duration)}\n")
228
  f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
229
  speakers = sorted(set(s.get('speaker', '') for s in segments))
@@ -242,8 +257,6 @@ def generate_docx(segments, path, filename='', language='', duration=0):
242
  from docx import Document
243
  from docx.shared import Pt, RGBColor
244
  from docx.enum.text import WD_ALIGN_PARAGRAPH
245
-
246
- lang_names = {'id': 'Indonesian', 'en': 'English'}
247
  colors = {
248
  0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
249
  2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
@@ -261,7 +274,7 @@ def generate_docx(segments, path, filename='', language='', duration=0):
261
  meta = []
262
  if filename:
263
  meta.append(('File', filename))
264
- meta.append(('Language', lang_names.get(language, language)))
265
  meta.append(('Duration', fmt_time(duration)))
266
  meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
267
  speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
@@ -345,6 +358,23 @@ def transcribe_with_gpu(audio_path, language):
345
  return raw_segments, detected_lang, duration
346
 
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  # ============================================================
349
  # Full Pipeline (wired to Gradio)
350
  # ============================================================
@@ -371,8 +401,12 @@ def transcribe_full(audio_file, language_name, num_speakers,
371
  if not segments:
372
  raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
373
 
 
 
 
 
374
  transcribe_time = time.time() - t0
375
- progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s)")
376
 
377
  # 2. Speaker Diarization (CPU)
378
  if enable_diarization and len(segments) >= 2:
@@ -439,8 +473,6 @@ def transcribe_full(audio_file, language_name, num_speakers,
439
  # ============================================================
440
  # Cleanup old files (>1 hour)
441
  # ============================================================
442
- import threading
443
-
444
  def cleanup_loop():
445
  while True:
446
  try:
@@ -626,30 +658,6 @@ footer { display: none !important; }
626
  border-color: #6366f1 !important;
627
  }
628
 
629
- /* Model info chips */
630
- .model-chips {
631
- display: flex;
632
- gap: 6px;
633
- flex-wrap: wrap;
634
- margin-top: 8px;
635
- }
636
- .model-chip {
637
- display: inline-flex;
638
- align-items: center;
639
- gap: 4px;
640
- background: #222228;
641
- color: #a0a0b0;
642
- font-size: 11px;
643
- padding: 3px 10px;
644
- border-radius: 6px;
645
- border: 1px solid #333340;
646
- }
647
- .model-chip.active {
648
- background: rgba(99,102,241,.12);
649
- color: #818cf8;
650
- border-color: #6366f1;
651
- }
652
-
653
  /* How-to steps */
654
  .howto {
655
  display: flex;
@@ -844,7 +852,7 @@ box-shadow:0 4px 20px rgba(99,102,241,.3)}
844
  var _fetch=window.fetch;
845
  window.fetch=function(input,init){
846
  var url=typeof input==='string'?input:(input&&input.url?input.url:'');
847
- if(url.indexOf('/upload')!==-1 && init && init.method==='POST' && init.body){
848
  return new Promise(function(resolve,reject){
849
  var xhr=new XMLHttpRequest();
850
  xhr.open('POST',url,true);
 
8
  Output : SRT, TXT, DOCX
9
  """
10
 
 
11
  import time
12
  import tempfile
13
+ import threading
14
  import torch
15
  import spaces
16
  import gradio as gr
 
47
  'Italian': 'it',
48
  }
49
 
50
+ BATCH_SIZE = 16 # A10G 24GB VRAM — safe for whisper-small float16
51
  OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
52
  OUTPUT_DIR.mkdir(exist_ok=True)
53
 
 
57
  # ============================================================
58
  device = 0 if torch.cuda.is_available() else "cpu"
59
 
60
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
61
+
62
+ print(f" Loading pipeline: {MODEL_ID} (dtype={torch_dtype})...")
63
  pipe = pipeline(
64
  task="automatic-speech-recognition",
65
  model=MODEL_ID,
66
  chunk_length_s=30,
67
  device=device,
68
+ torch_dtype=torch_dtype,
69
  )
70
  print(f" {MODEL_NAME} ready!")
71
 
 
82
 
83
 
84
  def fmt_time(seconds):
85
+ h = int(seconds // 3600)
86
+ m = int((seconds % 3600) // 60)
87
  s = int(seconds % 60)
88
+ if h > 0:
89
+ return f"{h:02d}:{m:02d}:{s:02d}"
90
  return f"{m:02d}:{s:02d}"
91
 
92
 
 
119
  continue
120
 
121
  try:
122
+ # Cap analysis to 3s per segment for speed
123
+ max_samples = int(sr * 3)
124
+ analysis_chunk = chunk[:max_samples] if len(chunk) > max_samples else chunk
125
+
126
+ # MFCC (13 = industry standard) + delta — sufficient for speaker ID
127
+ mfcc = librosa.feature.mfcc(y=analysis_chunk, sr=sr, n_mfcc=13)
128
  delta = librosa.feature.delta(mfcc)
129
+
130
+ # F0 (pitch) — key differentiator between speakers
 
 
 
131
  f0 = librosa.yin(analysis_chunk, fmin=50, fmax=500, sr=sr)
132
  f0c = f0[f0 > 0]
133
  f0_mean = float(np.mean(f0c)) if len(f0c) > 0 else 0.0
134
  f0_std = float(np.std(f0c)) if len(f0c) > 0 else 0.0
135
 
136
+ combined = np.vstack([mfcc, delta])
137
  vec = np.concatenate([
138
  np.mean(combined, axis=1),
139
  np.std(combined, axis=1),
 
224
  f.write(f"[{sp}] {seg['text']}\n\n" if sp else f"{seg['text']}\n\n")
225
 
226
 
227
+ LANG_NAMES = {
228
+ 'id': 'Indonesian', 'en': 'English', 'ja': 'Japanese', 'ko': 'Korean',
229
+ 'zh': 'Chinese', 'ar': 'Arabic', 'fr': 'French', 'de': 'German',
230
+ 'es': 'Spanish', 'pt': 'Portuguese', 'ru': 'Russian', 'th': 'Thai',
231
+ 'vi': 'Vietnamese', 'ms': 'Malay', 'hi': 'Hindi', 'tr': 'Turkish',
232
+ 'nl': 'Dutch', 'it': 'Italian', 'auto': 'Auto-detected',
233
+ }
234
+
235
+
236
  def generate_txt(segments, path, filename='', language='', duration=0):
 
237
  with open(path, 'w', encoding='utf-8') as f:
238
  f.write("TRANSCRIPT\n" + "=" * 60 + "\n")
239
  if filename:
240
  f.write(f"File: {filename}\n")
241
+ f.write(f"Language: {LANG_NAMES.get(language, language)}\n")
242
  f.write(f"Duration: {fmt_time(duration)}\n")
243
  f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
244
  speakers = sorted(set(s.get('speaker', '') for s in segments))
 
257
  from docx import Document
258
  from docx.shared import Pt, RGBColor
259
  from docx.enum.text import WD_ALIGN_PARAGRAPH
 
 
260
  colors = {
261
  0: RGBColor(79, 70, 229), 1: RGBColor(220, 38, 38),
262
  2: RGBColor(5, 150, 105), 3: RGBColor(217, 119, 6),
 
274
  meta = []
275
  if filename:
276
  meta.append(('File', filename))
277
+ meta.append(('Language', LANG_NAMES.get(language, language)))
278
  meta.append(('Duration', fmt_time(duration)))
279
  meta.append(('Generated', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
280
  speakers = sorted(set(s.get('speaker', 'Speaker 1') for s in segments))
 
358
  return raw_segments, detected_lang, duration
359
 
360
 
361
+ def apply_vad_filter(segments):
362
+ """Filter out segments that are likely silence/noise (very short + filler)."""
363
+ FILLER = {'', '.', '..', '...', '…', '-', '–', '[Music]', '[music]',
364
+ '(music)', '[Musik]', '[musik]', '♪', '♪♪', '♫'}
365
+ MIN_DURATION = 0.3 # segments shorter than 0.3s are likely noise
366
+ filtered = []
367
+ for seg in segments:
368
+ text = seg['text'].strip()
369
+ seg_dur = seg['end'] - seg['start']
370
+ if text in FILLER:
371
+ continue
372
+ if seg_dur < MIN_DURATION and len(text.split()) <= 1:
373
+ continue
374
+ filtered.append(seg)
375
+ return filtered if filtered else segments # fallback: return original if all filtered
376
+
377
+
378
  # ============================================================
379
  # Full Pipeline (wired to Gradio)
380
  # ============================================================
 
401
  if not segments:
402
  raise gr.Error("Tidak ada teks yang terdeteksi dari audio.")
403
 
404
+ # 1b. VAD filter — remove silence/filler segments
405
+ if enable_vad:
406
+ segments = apply_vad_filter(segments)
407
+
408
  transcribe_time = time.time() - t0
409
+ progress(0.60, desc=f"✅ Transkripsi selesai ({transcribe_time:.0f}s) — {len(segments)} segmen")
410
 
411
  # 2. Speaker Diarization (CPU)
412
  if enable_diarization and len(segments) >= 2:
 
473
  # ============================================================
474
  # Cleanup old files (>1 hour)
475
  # ============================================================
 
 
476
  def cleanup_loop():
477
  while True:
478
  try:
 
658
  border-color: #6366f1 !important;
659
  }
660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  /* How-to steps */
662
  .howto {
663
  display: flex;
 
852
  var _fetch=window.fetch;
853
  window.fetch=function(input,init){
854
  var url=typeof input==='string'?input:(input&&input.url?input.url:'');
855
+ if(url.indexOf('/upload')!==-1 && url.indexOf('/upload_progress')===-1 && init && init.method==='POST' && init.body){
856
  return new Promise(function(resolve,reject){
857
  var xhr=new XMLHttpRequest();
858
  xhr.open('POST',url,true);