Realtime-whisper-demo

Sleeping

App Files Files Community

hyungjoochae committed on Apr 19

Commit

cf76f12

verified ·

1 Parent(s): 7e6df1e

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -24

app.py CHANGED Viewed

@@ -10,14 +10,14 @@ import numpy as np
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
 import subprocess
-# Install flash-attn without building CUDA part
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
-# Available models
 MODEL_OPTIONS = [
     "openai/whisper-tiny",
     "openai/whisper-base",
@@ -26,17 +26,19 @@ MODEL_OPTIONS = [
     "openai/whisper-large-v3-turbo"
 ]
-# Set device and dtype
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16
-# Default model
 current_model_name = MODEL_OPTIONS[-1]
-# Load pipeline for selected model
 def load_pipeline(model_name):
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_name, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
         attn_implementation="flash_attention_2"
     ).to(device)
@@ -53,19 +55,19 @@ def load_pipeline(model_name):
         device=device,
     )
-# Initialize pipeline
 pipe = load_pipeline(current_model_name)
-# Function to update model
-def update_model_and_return_status(model_name):
-    global pipe, current_model_name
     current_model_name = model_name
     pipe = load_pipeline(model_name)
-    return f"✅ Loaded model: {model_name}"
 @spaces.GPU
 def stream_transcribe(stream, new_chunk):
-    start_time = time.time()
     try:
         sr, y = new_chunk
         if y.ndim > 1:
@@ -81,7 +83,7 @@ def stream_transcribe(stream, new_chunk):
 @spaces.GPU
 def transcribe(inputs, previous_transcription):
-    start_time = time.time()
     try:
         filename = f"{uuid.uuid4().hex}.wav"
         sample_rate, audio_data = inputs
@@ -96,14 +98,15 @@ def transcribe(inputs, previous_transcription):
 def clear(): return ""
 def clear_state(): return None
-# Microphone Interface
 with gr.Blocks() as microphone:
     with gr.Column():
         model_dropdown = gr.Dropdown(label="Select Whisper Model", choices=MODEL_OPTIONS, value=current_model_name)
         model_status = gr.Textbox(label="Model Load Status", value=f"✅ Loaded model: {current_model_name}")
-        model_dropdown.change(fn=update_model_and_return_status, inputs=model_dropdown, outputs=model_status)
-        gr.Markdown("# 🎤 Realtime Whisper ASR (Streaming)")
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
             output = gr.Textbox(label="Transcription", value="")
@@ -112,22 +115,23 @@ with gr.Blocks() as microphone:
             clear_button = gr.Button("Clear Output")
         state = gr.State()
         input_audio_microphone.stream(
-            stream_transcribe,
-            [state, input_audio_microphone],
-            [state, output, latency_textbox],
-            time_limit=30,
             stream_every=2
         )
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
-# File Upload Interface
 with gr.Blocks() as file:
     with gr.Column():
         model_dropdown_file = gr.Dropdown(label="Select Whisper Model", choices=MODEL_OPTIONS, value=current_model_name)
         model_status_file = gr.Textbox(label="Model Load Status", value=f"✅ Loaded model: {current_model_name}")
-        model_dropdown_file.change(fn=update_model_and_return_status, inputs=model_dropdown_file, outputs=model_status_file)
-        gr.Markdown("# 📁 Upload Audio File for Transcription")
         with gr.Row():
             input_audio_file = gr.Audio(sources="upload", type="numpy")
             output = gr.Textbox(label="Transcription", value="")
@@ -138,7 +142,7 @@ with gr.Blocks() as file:
         submit_button.click(transcribe, [input_audio_file, output], [output, latency_textbox])
         clear_button.click(clear, outputs=[output])
-# Combine into demo
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
 import subprocess
+# Install flash-attn
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
+# Whisper 모델 리스트
 MODEL_OPTIONS = [
     "openai/whisper-tiny",
     "openai/whisper-base",
     "openai/whisper-large-v3-turbo"
 ]
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16
+# 초기 모델 설정
 current_model_name = MODEL_OPTIONS[-1]
+# 모델 불러오기 함수
 def load_pipeline(model_name):
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_name,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True,
         attn_implementation="flash_attention_2"
     ).to(device)
         device=device,
     )
+# 전역 상태
 pipe = load_pipeline(current_model_name)
+# 모델 로딩 버튼 함수
+def update_model_with_button(model_name):
+    global current_model_name, pipe
     current_model_name = model_name
     pipe = load_pipeline(model_name)
+    return f"✅ Model loaded: {model_name}"
 @spaces.GPU
 def stream_transcribe(stream, new_chunk):
+    start_time = time.time()
     try:
         sr, y = new_chunk
         if y.ndim > 1:
 @spaces.GPU
 def transcribe(inputs, previous_transcription):
+    start_time = time.time()
     try:
         filename = f"{uuid.uuid4().hex}.wav"
         sample_rate, audio_data = inputs
 def clear(): return ""
 def clear_state(): return None
+# 마이크 입력 탭
 with gr.Blocks() as microphone:
     with gr.Column():
+        gr.Markdown("### 🎙️ Realtime Whisper Transcription")
         model_dropdown = gr.Dropdown(label="Select Whisper Model", choices=MODEL_OPTIONS, value=current_model_name)
+        model_load_button = gr.Button("Load Model")
         model_status = gr.Textbox(label="Model Load Status", value=f"✅ Loaded model: {current_model_name}")
+        model_load_button.click(fn=update_model_with_button, inputs=[model_dropdown], outputs=[model_status])
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
             output = gr.Textbox(label="Transcription", value="")
             clear_button = gr.Button("Clear Output")
         state = gr.State()
         input_audio_microphone.stream(
+            stream_transcribe,
+            [state, input_audio_microphone],
+            [state, output, latency_textbox],
+            time_limit=30,
             stream_every=2
         )
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
+# 파일 업로드 탭
 with gr.Blocks() as file:
     with gr.Column():
+        gr.Markdown("### 📁 Upload Audio File for Transcription")
         model_dropdown_file = gr.Dropdown(label="Select Whisper Model", choices=MODEL_OPTIONS, value=current_model_name)
+        model_load_button_file = gr.Button("Load Model")
         model_status_file = gr.Textbox(label="Model Load Status", value=f"✅ Loaded model: {current_model_name}")
+        model_load_button_file.click(fn=update_model_with_button, inputs=[model_dropdown_file], outputs=[model_status_file])
         with gr.Row():
             input_audio_file = gr.Audio(sources="upload", type="numpy")
             output = gr.Textbox(label="Transcription", value="")
         submit_button.click(transcribe, [input_audio_file, output], [output, latency_textbox])
         clear_button.click(clear, outputs=[output])
+# 통합된 데모 UI
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])