romizone committed on
Commit
59cdb6f
·
verified ·
1 Parent(s): 31789de

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +19 -50
app.py CHANGED
@@ -20,15 +20,10 @@ from pathlib import Path
20
  from transformers import pipeline
21
 
22
  # ============================================================
23
- # Config
24
  # ============================================================
25
- WHISPER_MODELS = {
26
- 'tiny': 'openai/whisper-tiny',
27
- 'base': 'openai/whisper-base',
28
- 'small': 'openai/whisper-small',
29
- 'medium': 'openai/whisper-medium',
30
- 'large-v3': 'openai/whisper-large-v3',
31
- }
32
 
33
  LANGUAGE_MAP = {
34
  'Auto-detect': None,
@@ -57,36 +52,19 @@ OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
57
  OUTPUT_DIR.mkdir(exist_ok=True)
58
 
59
  # ============================================================
60
- # Load default pipeline at MODULE LEVEL (ZeroGPU requirement!)
61
- # Only load 'small' at startup. Other models loaded on-demand.
62
  # ============================================================
63
  device = 0 if torch.cuda.is_available() else "cpu"
64
- DEFAULT_MODEL = "small"
65
 
66
- pipes = {}
67
- print(f" Loading default pipeline: {WHISPER_MODELS[DEFAULT_MODEL]}...")
68
- pipes[DEFAULT_MODEL] = pipeline(
69
  task="automatic-speech-recognition",
70
- model=WHISPER_MODELS[DEFAULT_MODEL],
71
  chunk_length_s=30,
72
  device=device,
73
  )
74
- print(f" {DEFAULT_MODEL} ready!")
75
-
76
-
77
- def get_pipe(model_size):
78
- """Get pipeline, load on-demand if not cached."""
79
- if model_size not in pipes:
80
- model_id = WHISPER_MODELS.get(model_size, WHISPER_MODELS[DEFAULT_MODEL])
81
- print(f" Loading pipeline on-demand: {model_id}...")
82
- pipes[model_size] = pipeline(
83
- task="automatic-speech-recognition",
84
- model=model_id,
85
- chunk_length_s=30,
86
- device=device,
87
- )
88
- print(f" {model_size} ready!")
89
- return pipes[model_size]
90
 
91
 
92
  # ============================================================
@@ -326,10 +304,8 @@ def generate_docx(segments, path, filename='', language='', duration=0):
326
  # GPU Transcription (ZeroGPU — proven pattern)
327
  # ============================================================
328
  @spaces.GPU(duration=120)
329
- def transcribe_with_gpu(audio_path, model_size, language):
330
- """Run Whisper inference on GPU. Default model pre-loaded, others on-demand."""
331
- pipe = get_pipe(model_size)
332
-
333
  generate_kwargs = {"task": "transcribe"}
334
  if language:
335
  generate_kwargs["language"] = language
@@ -372,7 +348,7 @@ def transcribe_with_gpu(audio_path, model_size, language):
372
  # ============================================================
373
  # Full Pipeline (wired to Gradio)
374
  # ============================================================
375
- def transcribe_full(audio_file, model_size, language_name, num_speakers,
376
  enable_diarization, enable_vad, progress=gr.Progress()):
377
  if audio_file is None:
378
  raise gr.Error("Upload file audio terlebih dahulu!")
@@ -387,7 +363,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
387
  t0 = time.time()
388
  try:
389
  segments, detected_lang, duration = transcribe_with_gpu(
390
- audio_path, model_size, lang_code
391
  )
392
  except Exception as e:
393
  raise gr.Error(f"Gagal transkripsi: {str(e)}")
@@ -449,7 +425,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
449
  f"| File | {filename} |\n"
450
  f"| Durasi Audio | {fmt_time(duration)} |\n"
451
  f"| Bahasa | {lang_display} |\n"
452
- f"| Model | {model_size} |\n"
453
  f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
454
  f"| Segmen | {len(segments)} |\n"
455
  f"| Waktu Proses | {total_time:.0f} detik |\n"
@@ -848,9 +824,8 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
848
  </div>
849
  <div class="howto">
850
  <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
851
- <div class="howto-step"><div class="howto-num">2</div> Pilih model & bahasa</div>
852
- <div class="howto-step"><div class="howto-num">3</div> Klik Mulai</div>
853
- <div class="howto-step"><div class="howto-num">4</div> Download hasil</div>
854
  </div>
855
  </div>
856
  """)
@@ -869,20 +844,14 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
869
  # ---- Settings ----
870
  with gr.Group(elem_classes="card-section"):
871
  gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
 
872
  with gr.Row():
873
- model_choice = gr.Dropdown(
874
- choices=list(WHISPER_MODELS.keys()),
875
- value="small",
876
- label="Model Whisper",
877
- info="tiny (39M, cepat) • base (74M) • small (244M, rekomendasi) • medium (769M) • large-v3 (1.5B, paling akurat)",
878
- scale=2,
879
- )
880
  language_choice = gr.Dropdown(
881
  choices=list(LANGUAGE_MAP.keys()),
882
  value="Auto-detect",
883
  label="Bahasa",
884
  info="Auto-detect atau pilih bahasa spesifik",
885
- scale=1,
886
  )
887
  speaker_count = gr.Slider(
888
  minimum=0, maximum=10, step=1, value=0,
@@ -939,7 +908,7 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
939
  # ---- Connect ----
940
  btn_start.click(
941
  fn=transcribe_full,
942
- inputs=[audio_input, model_choice, language_choice, speaker_count,
943
  enable_diarization, enable_vad],
944
  outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
945
  )
 
20
  from transformers import pipeline
21
 
22
  # ============================================================
23
+ # Config — Single model (small) for fastest startup & simplicity
24
  # ============================================================
25
+ MODEL_ID = 'openai/whisper-small'
26
+ MODEL_NAME = 'small'
 
 
 
 
 
27
 
28
  LANGUAGE_MAP = {
29
  'Auto-detect': None,
 
52
  OUTPUT_DIR.mkdir(exist_ok=True)
53
 
54
  # ============================================================
55
+ # Load pipeline at MODULE LEVEL (ZeroGPU requirement!)
56
+ # Single model = faster startup, no on-demand loading delay
57
  # ============================================================
58
  device = 0 if torch.cuda.is_available() else "cpu"
 
59
 
60
+ print(f" Loading pipeline: {MODEL_ID}...")
61
+ pipe = pipeline(
 
62
  task="automatic-speech-recognition",
63
+ model=MODEL_ID,
64
  chunk_length_s=30,
65
  device=device,
66
  )
67
+ print(f" {MODEL_NAME} ready!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
 
70
  # ============================================================
 
304
  # GPU Transcription (ZeroGPU — proven pattern)
305
  # ============================================================
306
  @spaces.GPU(duration=120)
307
+ def transcribe_with_gpu(audio_path, language):
308
+ """Run Whisper inference on GPU. Single model, always ready."""
 
 
309
  generate_kwargs = {"task": "transcribe"}
310
  if language:
311
  generate_kwargs["language"] = language
 
348
  # ============================================================
349
  # Full Pipeline (wired to Gradio)
350
  # ============================================================
351
+ def transcribe_full(audio_file, language_name, num_speakers,
352
  enable_diarization, enable_vad, progress=gr.Progress()):
353
  if audio_file is None:
354
  raise gr.Error("Upload file audio terlebih dahulu!")
 
363
  t0 = time.time()
364
  try:
365
  segments, detected_lang, duration = transcribe_with_gpu(
366
+ audio_path, lang_code
367
  )
368
  except Exception as e:
369
  raise gr.Error(f"Gagal transkripsi: {str(e)}")
 
425
  f"| File | {filename} |\n"
426
  f"| Durasi Audio | {fmt_time(duration)} |\n"
427
  f"| Bahasa | {lang_display} |\n"
428
+ f"| Model | {MODEL_NAME} (244M) |\n"
429
  f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
430
  f"| Segmen | {len(segments)} |\n"
431
  f"| Waktu Proses | {total_time:.0f} detik |\n"
 
824
  </div>
825
  <div class="howto">
826
  <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
827
+ <div class="howto-step"><div class="howto-num">2</div> Klik Mulai</div>
828
+ <div class="howto-step"><div class="howto-num">3</div> Download hasil</div>
 
829
  </div>
830
  </div>
831
  """)
 
844
  # ---- Settings ----
845
  with gr.Group(elem_classes="card-section"):
846
  gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
847
+ gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) &mdash; auto-loaded, siap pakai</div>')
848
  with gr.Row():
 
 
 
 
 
 
 
849
  language_choice = gr.Dropdown(
850
  choices=list(LANGUAGE_MAP.keys()),
851
  value="Auto-detect",
852
  label="Bahasa",
853
  info="Auto-detect atau pilih bahasa spesifik",
854
+ scale=2,
855
  )
856
  speaker_count = gr.Slider(
857
  minimum=0, maximum=10, step=1, value=0,
 
908
  # ---- Connect ----
909
  btn_start.click(
910
  fn=transcribe_full,
911
+ inputs=[audio_input, language_choice, speaker_count,
912
  enable_diarization, enable_vad],
913
  outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
914
  )