Spaces:

romizone
/

TranscribeAI

Running on Zero

App Files Files Community

romizone committed on Feb 22

Commit

59cdb6f

verified ·

1 Parent(s): 31789de

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +19 -50

app.py CHANGED Viewed

@@ -20,15 +20,10 @@ from pathlib import Path
 from transformers import pipeline
 # ============================================================
-# Config
 # ============================================================
-WHISPER_MODELS = {
-    'tiny':     'openai/whisper-tiny',
-    'base':     'openai/whisper-base',
-    'small':    'openai/whisper-small',
-    'medium':   'openai/whisper-medium',
-    'large-v3': 'openai/whisper-large-v3',
-}
 LANGUAGE_MAP = {
     'Auto-detect': None,
@@ -57,36 +52,19 @@ OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
 OUTPUT_DIR.mkdir(exist_ok=True)
 # ============================================================
-# Load default pipeline at MODULE LEVEL (ZeroGPU requirement!)
-# Only load 'small' at startup. Other models loaded on-demand.
 # ============================================================
 device = 0 if torch.cuda.is_available() else "cpu"
-DEFAULT_MODEL = "small"
-pipes = {}
-print(f"  Loading default pipeline: {WHISPER_MODELS[DEFAULT_MODEL]}...")
-pipes[DEFAULT_MODEL] = pipeline(
     task="automatic-speech-recognition",
-    model=WHISPER_MODELS[DEFAULT_MODEL],
     chunk_length_s=30,
     device=device,
 )
-print(f"  {DEFAULT_MODEL} ready!")
-def get_pipe(model_size):
-    """Get pipeline, load on-demand if not cached."""
-    if model_size not in pipes:
-        model_id = WHISPER_MODELS.get(model_size, WHISPER_MODELS[DEFAULT_MODEL])
-        print(f"  Loading pipeline on-demand: {model_id}...")
-        pipes[model_size] = pipeline(
-            task="automatic-speech-recognition",
-            model=model_id,
-            chunk_length_s=30,
-            device=device,
-        )
-        print(f"  {model_size} ready!")
-    return pipes[model_size]
 # ============================================================
@@ -326,10 +304,8 @@ def generate_docx(segments, path, filename='', language='', duration=0):
 # GPU Transcription (ZeroGPU — proven pattern)
 # ============================================================
 @spaces.GPU(duration=120)
-def transcribe_with_gpu(audio_path, model_size, language):
-    """Run Whisper inference on GPU. Default model pre-loaded, others on-demand."""
-    pipe = get_pipe(model_size)
     generate_kwargs = {"task": "transcribe"}
     if language:
         generate_kwargs["language"] = language
@@ -372,7 +348,7 @@ def transcribe_with_gpu(audio_path, model_size, language):
 # ============================================================
 # Full Pipeline (wired to Gradio)
 # ============================================================
-def transcribe_full(audio_file, model_size, language_name, num_speakers,
                     enable_diarization, enable_vad, progress=gr.Progress()):
     if audio_file is None:
         raise gr.Error("Upload file audio terlebih dahulu!")
@@ -387,7 +363,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
     t0 = time.time()
     try:
         segments, detected_lang, duration = transcribe_with_gpu(
-            audio_path, model_size, lang_code
         )
     except Exception as e:
         raise gr.Error(f"Gagal transkripsi: {str(e)}")
@@ -449,7 +425,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
         f"| File | {filename} |\n"
         f"| Durasi Audio | {fmt_time(duration)} |\n"
         f"| Bahasa | {lang_display} |\n"
-        f"| Model | {model_size} |\n"
         f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
         f"| Segmen | {len(segments)} |\n"
         f"| Waktu Proses | {total_time:.0f} detik |\n"
@@ -848,9 +824,8 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
         </div>
         <div class="howto">
             <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
-            <div class="howto-step"><div class="howto-num">2</div> Pilih model & bahasa</div>
-            <div class="howto-step"><div class="howto-num">3</div> Klik Mulai</div>
-            <div class="howto-step"><div class="howto-num">4</div> Download hasil</div>
         </div>
     </div>
     """)
@@ -869,20 +844,14 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
     # ---- Settings ----
     with gr.Group(elem_classes="card-section"):
         gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
         with gr.Row():
-            model_choice = gr.Dropdown(
-                choices=list(WHISPER_MODELS.keys()),
-                value="small",
-                label="Model Whisper",
-                info="tiny (39M, cepat) • base (74M) • small (244M, rekomendasi) • medium (769M) • large-v3 (1.5B, paling akurat)",
-                scale=2,
-            )
             language_choice = gr.Dropdown(
                 choices=list(LANGUAGE_MAP.keys()),
                 value="Auto-detect",
                 label="Bahasa",
                 info="Auto-detect atau pilih bahasa spesifik",
-                scale=1,
             )
             speaker_count = gr.Slider(
                 minimum=0, maximum=10, step=1, value=0,
@@ -939,7 +908,7 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
     # ---- Connect ----
     btn_start.click(
         fn=transcribe_full,
-        inputs=[audio_input, model_choice, language_choice, speaker_count,
                 enable_diarization, enable_vad],
         outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
     )

 from transformers import pipeline
 # ============================================================
+# Config — Single model (small) for fastest startup & simplicity
 # ============================================================
+MODEL_ID = 'openai/whisper-small'
+MODEL_NAME = 'small'
 LANGUAGE_MAP = {
     'Auto-detect': None,
 OUTPUT_DIR.mkdir(exist_ok=True)
 # ============================================================
+# Load pipeline at MODULE LEVEL (ZeroGPU requirement!)
+# Single model = faster startup, no on-demand loading delay
 # ============================================================
 device = 0 if torch.cuda.is_available() else "cpu"
+print(f"  Loading pipeline: {MODEL_ID}...")
+pipe = pipeline(
     task="automatic-speech-recognition",
+    model=MODEL_ID,
     chunk_length_s=30,
     device=device,
 )
+print(f"  {MODEL_NAME} ready!")
 # ============================================================
 # GPU Transcription (ZeroGPU — proven pattern)
 # ============================================================
 @spaces.GPU(duration=120)
+def transcribe_with_gpu(audio_path, language):
+    """Run Whisper inference on GPU. Single model, always ready."""
     generate_kwargs = {"task": "transcribe"}
     if language:
         generate_kwargs["language"] = language
 # ============================================================
 # Full Pipeline (wired to Gradio)
 # ============================================================
+def transcribe_full(audio_file, language_name, num_speakers,
                     enable_diarization, enable_vad, progress=gr.Progress()):
     if audio_file is None:
         raise gr.Error("Upload file audio terlebih dahulu!")
     t0 = time.time()
     try:
         segments, detected_lang, duration = transcribe_with_gpu(
+            audio_path, lang_code
         )
     except Exception as e:
         raise gr.Error(f"Gagal transkripsi: {str(e)}")
         f"| File | {filename} |\n"
         f"| Durasi Audio | {fmt_time(duration)} |\n"
         f"| Bahasa | {lang_display} |\n"
+        f"| Model | {MODEL_NAME} (244M) |\n"
         f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
         f"| Segmen | {len(segments)} |\n"
         f"| Waktu Proses | {total_time:.0f} detik |\n"
         </div>
         <div class="howto">
             <div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
+            <div class="howto-step"><div class="howto-num">2</div> Klik Mulai</div>
+            <div class="howto-step"><div class="howto-num">3</div> Download hasil</div>
         </div>
     </div>
     """)
     # ---- Settings ----
     with gr.Group(elem_classes="card-section"):
         gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
+        gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) &mdash; auto-loaded, siap pakai</div>')
         with gr.Row():
             language_choice = gr.Dropdown(
                 choices=list(LANGUAGE_MAP.keys()),
                 value="Auto-detect",
                 label="Bahasa",
                 info="Auto-detect atau pilih bahasa spesifik",
+                scale=2,
             )
             speaker_count = gr.Slider(
                 minimum=0, maximum=10, step=1, value=0,
     # ---- Connect ----
     btn_start.click(
         fn=transcribe_full,
+        inputs=[audio_input, language_choice, speaker_count,
                 enable_diarization, enable_vad],
         outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
     )