Spaces:
Running on Zero
Running on Zero
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -20,15 +20,10 @@ from pathlib import Path
|
|
| 20 |
from transformers import pipeline
|
| 21 |
|
| 22 |
# ============================================================
|
| 23 |
-
# Config
|
| 24 |
# ============================================================
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
'base': 'openai/whisper-base',
|
| 28 |
-
'small': 'openai/whisper-small',
|
| 29 |
-
'medium': 'openai/whisper-medium',
|
| 30 |
-
'large-v3': 'openai/whisper-large-v3',
|
| 31 |
-
}
|
| 32 |
|
| 33 |
LANGUAGE_MAP = {
|
| 34 |
'Auto-detect': None,
|
|
@@ -57,36 +52,19 @@ OUTPUT_DIR = Path(tempfile.gettempdir()) / 'transcribeai_output'
|
|
| 57 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 58 |
|
| 59 |
# ============================================================
|
| 60 |
-
# Load
|
| 61 |
-
#
|
| 62 |
# ============================================================
|
| 63 |
device = 0 if torch.cuda.is_available() else "cpu"
|
| 64 |
-
DEFAULT_MODEL = "small"
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
pipes[DEFAULT_MODEL] = pipeline(
|
| 69 |
task="automatic-speech-recognition",
|
| 70 |
-
model=
|
| 71 |
chunk_length_s=30,
|
| 72 |
device=device,
|
| 73 |
)
|
| 74 |
-
print(f" {
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def get_pipe(model_size):
|
| 78 |
-
"""Get pipeline, load on-demand if not cached."""
|
| 79 |
-
if model_size not in pipes:
|
| 80 |
-
model_id = WHISPER_MODELS.get(model_size, WHISPER_MODELS[DEFAULT_MODEL])
|
| 81 |
-
print(f" Loading pipeline on-demand: {model_id}...")
|
| 82 |
-
pipes[model_size] = pipeline(
|
| 83 |
-
task="automatic-speech-recognition",
|
| 84 |
-
model=model_id,
|
| 85 |
-
chunk_length_s=30,
|
| 86 |
-
device=device,
|
| 87 |
-
)
|
| 88 |
-
print(f" {model_size} ready!")
|
| 89 |
-
return pipes[model_size]
|
| 90 |
|
| 91 |
|
| 92 |
# ============================================================
|
|
@@ -326,10 +304,8 @@ def generate_docx(segments, path, filename='', language='', duration=0):
|
|
| 326 |
# GPU Transcription (ZeroGPU — proven pattern)
|
| 327 |
# ============================================================
|
| 328 |
@spaces.GPU(duration=120)
|
| 329 |
-
def transcribe_with_gpu(audio_path, model_size, language):
|
| 330 |
-
"""Run Whisper inference on GPU.
|
| 331 |
-
pipe = get_pipe(model_size)
|
| 332 |
-
|
| 333 |
generate_kwargs = {"task": "transcribe"}
|
| 334 |
if language:
|
| 335 |
generate_kwargs["language"] = language
|
|
@@ -372,7 +348,7 @@ def transcribe_with_gpu(audio_path, model_size, language):
|
|
| 372 |
# ============================================================
|
| 373 |
# Full Pipeline (wired to Gradio)
|
| 374 |
# ============================================================
|
| 375 |
-
def transcribe_full(audio_file, model_size, language_name, num_speakers,
|
| 376 |
enable_diarization, enable_vad, progress=gr.Progress()):
|
| 377 |
if audio_file is None:
|
| 378 |
raise gr.Error("Upload file audio terlebih dahulu!")
|
|
@@ -387,7 +363,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
|
|
| 387 |
t0 = time.time()
|
| 388 |
try:
|
| 389 |
segments, detected_lang, duration = transcribe_with_gpu(
|
| 390 |
-
audio_path, model_size, lang_code
|
| 391 |
)
|
| 392 |
except Exception as e:
|
| 393 |
raise gr.Error(f"Gagal transkripsi: {str(e)}")
|
|
@@ -449,7 +425,7 @@ def transcribe_full(audio_file, model_size, language_name, num_speakers,
|
|
| 449 |
f"| File | {filename} |\n"
|
| 450 |
f"| Durasi Audio | {fmt_time(duration)} |\n"
|
| 451 |
f"| Bahasa | {lang_display} |\n"
|
| 452 |
-
f"| Model | {
|
| 453 |
f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
|
| 454 |
f"| Segmen | {len(segments)} |\n"
|
| 455 |
f"| Waktu Proses | {total_time:.0f} detik |\n"
|
|
@@ -848,9 +824,8 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
|
|
| 848 |
</div>
|
| 849 |
<div class="howto">
|
| 850 |
<div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
|
| 851 |
-
<div class="howto-step"><div class="howto-num">2</div>
|
| 852 |
-
<div class="howto-step"><div class="howto-num">3</div>
|
| 853 |
-
<div class="howto-step"><div class="howto-num">4</div> Download hasil</div>
|
| 854 |
</div>
|
| 855 |
</div>
|
| 856 |
""")
|
|
@@ -869,20 +844,14 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
|
|
| 869 |
# ---- Settings ----
|
| 870 |
with gr.Group(elem_classes="card-section"):
|
| 871 |
gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
|
|
|
|
| 872 |
with gr.Row():
|
| 873 |
-
model_choice = gr.Dropdown(
|
| 874 |
-
choices=list(WHISPER_MODELS.keys()),
|
| 875 |
-
value="small",
|
| 876 |
-
label="Model Whisper",
|
| 877 |
-
info="tiny (39M, cepat) • base (74M) • small (244M, rekomendasi) • medium (769M) • large-v3 (1.5B, paling akurat)",
|
| 878 |
-
scale=2,
|
| 879 |
-
)
|
| 880 |
language_choice = gr.Dropdown(
|
| 881 |
choices=list(LANGUAGE_MAP.keys()),
|
| 882 |
value="Auto-detect",
|
| 883 |
label="Bahasa",
|
| 884 |
info="Auto-detect atau pilih bahasa spesifik",
|
| 885 |
-
scale=
|
| 886 |
)
|
| 887 |
speaker_count = gr.Slider(
|
| 888 |
minimum=0, maximum=10, step=1, value=0,
|
|
@@ -939,7 +908,7 @@ with gr.Blocks(theme=THEME, title="TranscribeAI", css=CUSTOM_CSS) as demo:
|
|
| 939 |
# ---- Connect ----
|
| 940 |
btn_start.click(
|
| 941 |
fn=transcribe_full,
|
| 942 |
-
inputs=[audio_input, model_choice, language_choice, speaker_count,
|
| 943 |
enable_diarization, enable_vad],
|
| 944 |
outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
|
| 945 |
)
|
|
|
|
| 20 |
from transformers import pipeline
|
| 21 |
|
| 22 |
# ============================================================
|
| 23 |
+
# Config — Single model (small) for fastest startup & simplicity
|
| 24 |
# ============================================================
|
| 25 |
+
MODEL_ID = 'openai/whisper-small'
|
| 26 |
+
MODEL_NAME = 'small'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
LANGUAGE_MAP = {
|
| 29 |
'Auto-detect': None,
|
|
|
|
| 52 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 53 |
|
| 54 |
# ============================================================
|
| 55 |
+
# Load pipeline at MODULE LEVEL (ZeroGPU requirement!)
|
| 56 |
+
# Single model = faster startup, no on-demand loading delay
|
| 57 |
# ============================================================
|
| 58 |
device = 0 if torch.cuda.is_available() else "cpu"
|
|
|
|
| 59 |
|
| 60 |
+
print(f" Loading pipeline: {MODEL_ID}...")
|
| 61 |
+
pipe = pipeline(
|
|
|
|
| 62 |
task="automatic-speech-recognition",
|
| 63 |
+
model=MODEL_ID,
|
| 64 |
chunk_length_s=30,
|
| 65 |
device=device,
|
| 66 |
)
|
| 67 |
+
print(f" {MODEL_NAME} ready!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
# ============================================================
|
|
|
|
| 304 |
# GPU Transcription (ZeroGPU — proven pattern)
|
| 305 |
# ============================================================
|
| 306 |
@spaces.GPU(duration=120)
|
| 307 |
+
def transcribe_with_gpu(audio_path, language):
|
| 308 |
+
"""Run Whisper inference on GPU. Single model, always ready."""
|
|
|
|
|
|
|
| 309 |
generate_kwargs = {"task": "transcribe"}
|
| 310 |
if language:
|
| 311 |
generate_kwargs["language"] = language
|
|
|
|
| 348 |
# ============================================================
|
| 349 |
# Full Pipeline (wired to Gradio)
|
| 350 |
# ============================================================
|
| 351 |
+
def transcribe_full(audio_file, language_name, num_speakers,
|
| 352 |
enable_diarization, enable_vad, progress=gr.Progress()):
|
| 353 |
if audio_file is None:
|
| 354 |
raise gr.Error("Upload file audio terlebih dahulu!")
|
|
|
|
| 363 |
t0 = time.time()
|
| 364 |
try:
|
| 365 |
segments, detected_lang, duration = transcribe_with_gpu(
|
| 366 |
+
audio_path, lang_code
|
| 367 |
)
|
| 368 |
except Exception as e:
|
| 369 |
raise gr.Error(f"Gagal transkripsi: {str(e)}")
|
|
|
|
| 425 |
f"| File | {filename} |\n"
|
| 426 |
f"| Durasi Audio | {fmt_time(duration)} |\n"
|
| 427 |
f"| Bahasa | {lang_display} |\n"
|
| 428 |
+
f"| Model | {MODEL_NAME} (244M) |\n"
|
| 429 |
f"| Pembicara | {len(speakers_found)} ({', '.join(sorted(speakers_found))}) |\n"
|
| 430 |
f"| Segmen | {len(segments)} |\n"
|
| 431 |
f"| Waktu Proses | {total_time:.0f} detik |\n"
|
|
|
|
| 824 |
</div>
|
| 825 |
<div class="howto">
|
| 826 |
<div class="howto-step"><div class="howto-num">1</div> Upload audio</div>
|
| 827 |
+
<div class="howto-step"><div class="howto-num">2</div> Klik Mulai</div>
|
| 828 |
+
<div class="howto-step"><div class="howto-num">3</div> Download hasil</div>
|
|
|
|
| 829 |
</div>
|
| 830 |
</div>
|
| 831 |
""")
|
|
|
|
| 844 |
# ---- Settings ----
|
| 845 |
with gr.Group(elem_classes="card-section"):
|
| 846 |
gr.HTML('<div class="card-title">⚙️ Pengaturan</div>')
|
| 847 |
+
gr.HTML('<div style="font-size:12px;color:#818cf8;margin-bottom:8px;">Model: Whisper Small (244M) — auto-loaded, siap pakai</div>')
|
| 848 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
language_choice = gr.Dropdown(
|
| 850 |
choices=list(LANGUAGE_MAP.keys()),
|
| 851 |
value="Auto-detect",
|
| 852 |
label="Bahasa",
|
| 853 |
info="Auto-detect atau pilih bahasa spesifik",
|
| 854 |
+
scale=2,
|
| 855 |
)
|
| 856 |
speaker_count = gr.Slider(
|
| 857 |
minimum=0, maximum=10, step=1, value=0,
|
|
|
|
| 908 |
# ---- Connect ----
|
| 909 |
btn_start.click(
|
| 910 |
fn=transcribe_full,
|
| 911 |
+
inputs=[audio_input, language_choice, speaker_count,
|
| 912 |
enable_diarization, enable_vad],
|
| 913 |
outputs=[summary_output, transcript_output, srt_file, txt_file, docx_file],
|
| 914 |
)
|