Spaces:

mic3333
/

asr

Sleeping

App Files Community

michaeltangz committed on Dec 8, 2025

Commit

721ab04

1 Parent(s): 20eeccd

Refactor app.py: remove the flash-attention installation logic and simplify the attention implementation; improve error handling in the transcription functions

Browse files

Files changed (1) hide show

app.py +12 -65

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import spaces
 import torch
 import gradio as gr
-import tempfile
 import os
 import uuid
 import scipy.io.wavfile
@@ -13,25 +12,12 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16
 MODEL_NAME = "openai/whisper-large-v3-turbo"
-# Try to use flash attention, fall back to sdpa if not available
-try:
-    import subprocess
-    subprocess.run(
-        "pip install flash-attn --no-build-isolation",
-        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-        shell=True,
-    )
-    from flash_attn import flash_attn_func
-    attn_implementation = "flash_attention_2"
-except Exception:
-    attn_implementation = "sdpa"  # Use PyTorch's scaled dot product attention
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch_dtype,
     low_cpu_mem_usage=True,
     use_safetensors=True,
-    attn_implementation=attn_implementation
 )
 model.to(device)
@@ -46,6 +32,7 @@ pipe = pipeline(
     chunk_length_s=10,
     torch_dtype=torch_dtype,
     device=device,
 )
 @spaces.GPU
@@ -54,7 +41,6 @@ def stream_transcribe(stream, new_chunk):
     try:
         sr, y = new_chunk
-        # Convert to mono if stereo
         if y.ndim > 1:
             y = y.mean(axis=1)
@@ -75,7 +61,7 @@ def stream_transcribe(stream, new_chunk):
         return stream, transcription, f"{latency:.2f}"
     except Exception as e:
         print(f"Error during Transcription: {e}")
-        return stream, e, "Error"
 @spaces.GPU
 def transcribe(inputs, previous_transcription):
@@ -95,25 +81,6 @@ def transcribe(inputs, previous_transcription):
         print(f"Error during Transcription: {e}")
         return previous_transcription, "Error"
-@spaces.GPU
-def translate_and_transcribe(inputs, previous_transcription, target_language):
-    start_time = time.time()
-    try:
-        filename = f"{uuid.uuid4().hex}.wav"
-        sample_rate, audio_data = inputs
-        scipy.io.wavfile.write(filename, sample_rate, audio_data)
-        translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language, "condition_on_previous_text": False})["text"]
-        previous_transcription += translation
-        end_time = time.time()
-        latency = end_time - start_time
-        return previous_transcription, f"{latency:.2f}"
-    except Exception as e:
-        print(f"Error during Translation and Transcription: {e}")
-        return previous_transcription, "Error"
 def clear():
     return ""
@@ -122,7 +89,7 @@ def clear_state():
 with gr.Blocks() as microphone:
     with gr.Column():
-        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
             output = gr.Textbox(label="Transcription", value="")
@@ -130,12 +97,17 @@ with gr.Blocks() as microphone:
         with gr.Row():
             clear_button = gr.Button("Clear Output")
         state = gr.State()
-        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 with gr.Blocks() as file:
     with gr.Column():
-        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
             input_audio_microphone = gr.Audio(sources="upload", type="numpy")
             output = gr.Textbox(label="Transcription", value="")
@@ -144,34 +116,9 @@ with gr.Blocks() as file:
             submit_button = gr.Button("Submit")
             clear_button = gr.Button("Clear Output")
-        submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
         clear_button.click(clear, outputs=[output])
-# with gr.Blocks() as translate:
-#     with gr.Column():
-#         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-#         with gr.Row():
-#             input_audio_microphone = gr.Audio(streaming=True)
-#             output = gr.Textbox(label="Transcription and Translation", value="")
-#             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-#             target_language_dropdown = gr.Dropdown(
-#                 choices=["english", "french", "hindi", "spanish", "russian"],
-#                 label="Target Language",
-#                 value="<|es|>"
-#             )
-#         with gr.Row():
-#             clear_button = gr.Button("Clear Output")
-#         input_audio_microphone.stream(
-#             translate_and_transcribe,
-#             [input_audio_microphone, output, target_language_dropdown],
-#             [output, latency_textbox],
-#             time_limit=45,
-#             stream_every=2,
-#             concurrency_limit=None
-#         )
-#         clear_button.click(clear, outputs=[output])
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

 import spaces
 import torch
 import gradio as gr
 import os
 import uuid
 import scipy.io.wavfile
 torch_dtype = torch.float16
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch_dtype,
     low_cpu_mem_usage=True,
     use_safetensors=True,
+    attn_implementation="sdpa"
 )
 model.to(device)
     chunk_length_s=10,
     torch_dtype=torch_dtype,
     device=device,
+    ignore_warning=True,
 )
 @spaces.GPU
     try:
         sr, y = new_chunk
         if y.ndim > 1:
             y = y.mean(axis=1)
         return stream, transcription, f"{latency:.2f}"
     except Exception as e:
         print(f"Error during Transcription: {e}")
+        return stream, str(e), "Error"
 @spaces.GPU
 def transcribe(inputs, previous_transcription):
         print(f"Error during Transcription: {e}")
         return previous_transcription, "Error"
 def clear():
     return ""
 with gr.Blocks() as microphone:
     with gr.Column():
+        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
             output = gr.Textbox(label="Transcription", value="")
         with gr.Row():
             clear_button = gr.Button("Clear Output")
         state = gr.State()
+        input_audio_microphone.stream(
+            stream_transcribe,
+            inputs=[state, input_audio_microphone],
+            outputs=[state, output, latency_textbox],
+            stream_every=2
+        )
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 with gr.Blocks() as file:
     with gr.Column():
+        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
         with gr.Row():
             input_audio_microphone = gr.Audio(sources="upload", type="numpy")
             output = gr.Textbox(label="Transcription", value="")
             submit_button = gr.Button("Submit")
             clear_button = gr.Button("Clear Output")
+        submit_button.click(transcribe, inputs=[input_audio_microphone, output], outputs=[output, latency_textbox])
         clear_button.click(clear, outputs=[output])
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])