michaeltangz committed on
Commit · 8f2a46b · 1 Parent(s): f8af19e
refactor app.py to streamline flash attention installation and model loading; remove fallback mechanisms and enhance transcription parameters
app.py
CHANGED
@@ -9,44 +9,19 @@ import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
import subprocess
-[two removed lines lost in this render]
-try:
-    subprocess.run(
-        "pip install flash-attn --no-build-isolation",
-        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-        shell=True,
-        timeout=60,
-    )
-    print("✅ Flash Attention installed")
-except Exception as e:
-    print(f"⚠️ Flash Attention installation failed (will use default): {e}")
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)

device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16
+torch_dtype = torch.float16
MODEL_NAME = "openai/whisper-large-v3-turbo"

-[one removed line lost in this render]
-try:
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        MODEL_NAME,
-        dtype=torch_dtype,
-        low_cpu_mem_usage=True,
-        use_safetensors=True,
-        attn_implementation="flash_attention_2"
-    )
-    print("✅ Model loaded with Flash Attention 2")
-except Exception as e:
-    print(f"⚠️ Could not load with Flash Attention 2: {e}")
-    print("Loading with default attention implementation...")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        MODEL_NAME,
-        dtype=torch_dtype,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    )
-    print("✅ Model loaded with default attention")
-
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
+)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
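The rewritten load requests flash_attention_2 unconditionally, so it fails outright if the pip install above did not produce a working flash-attn. A minimal sketch of a cheaper guard (not the committed code; the probe and the sdpa fallback are assumptions) would check for the package before choosing the attention backend:

import importlib.util

import torch
from transformers import AutoModelForSpeechSeq2Seq

# Probe for flash-attn; fall back to PyTorch's built-in SDPA attention when absent.
attn = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation=attn,
)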
@@ -57,19 +32,11 @@ pipe = pipeline(
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
-    chunk_length_s=
+    chunk_length_s=10,
+    torch_dtype=torch_dtype,
    device=device,
-    ignore_warning=True,
)

-# Voice Activity Detection
-def detect_voice_activity(audio, threshold=0.01):
-    """Detect if audio contains speech based on energy."""
-    if len(audio) == 0:
-        return False
-    rms = np.sqrt(np.mean(audio**2))
-    return rms > threshold
-
@spaces.GPU
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
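With chunk_length_s=10, the pipeline splits audio longer than ten seconds into overlapping windows, transcribes each independently, and stitches the pieces into one transcript. A minimal usage sketch, assuming the pipe object built above (the silent stand-in array is only there to show the input shape):

import numpy as np

sr = 16_000
audio = np.zeros(sr * 25, dtype=np.float32)  # 25 s mono float32 stand-in input

# The dict form mirrors how stream_transcribe feeds raw arrays to the pipeline.
print(pipe({"sampling_rate": sr, "raw": audio})["text"])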
@@ -81,51 +48,23 @@ def stream_transcribe(stream, new_chunk):
            y = y.mean(axis=1)

        y = y.astype(np.float32)
-
-        # FIX: Prevent division by zero
        max_val = np.max(np.abs(y))
        if max_val > 0:
            y /= max_val
-        else:
-            # Silent audio, skip
-            return stream, "", "0.00"

        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y
-
-
-        MAX_BUFFER = sr * 30  # 30 seconds maximum
-        if len(stream) > MAX_BUFFER:
-            stream = stream[-MAX_BUFFER:]
-
-        # FIX: Check for voice activity before transcribing
-        if not detect_voice_activity(stream, threshold=0.01):
-            return stream, "", "0.00"
-
-        # FIX: Require minimum audio length
-        if len(stream) < sr * 1.0:  # At least 1 second
-            return stream, "", "0.00"
-
-        # FIX: Add anti-hallucination parameters
-        transcription = pipe(
-            {"sampling_rate": sr, "raw": stream},
-            generate_kwargs={
-                "language": "english",
-                "no_repeat_ngram_size": 3,  # Prevents repetitive outputs
-            }
-        )["text"]
-
+
+        transcription = pipe({"sampling_rate": sr, "raw": stream}, generate_kwargs={"condition_on_previous_text": False})["text"]
        end_time = time.time()
        latency = end_time - start_time

        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
-
-        traceback.print_exc()
-        return stream if stream is not None else np.array([]), "", "Error"
+        return stream, e, "Error"

@spaces.GPU
def transcribe(inputs, previous_transcription):
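The rewritten calls pass condition_on_previous_text=False through generate_kwargs so earlier output is not fed back as a prompt, a common trigger for repetition loops. One caveat, offered as an assumption to verify: that spelling is the openai-whisper parameter name, while recent transformers releases expose the equivalent long-form flag as condition_on_prev_tokens, and an unrecognized key forwarded to generate() can raise. A hedged sketch using the transformers spelling, assuming the pipe object above:

import numpy as np

sr = 16_000
stream = np.zeros(sr * 5, dtype=np.float32)  # stand-in audio buffer

text = pipe(
    {"sampling_rate": sr, "raw": stream},
    generate_kwargs={"condition_on_prev_tokens": False},  # transformers' name for the flag
)["text"]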
@@ -133,41 +72,16 @@ def transcribe(inputs, previous_transcription):
    try:
        filename = f"{uuid.uuid4().hex}.wav"
        sample_rate, audio_data = inputs
-
-        # Convert to float for VAD check
-        audio_float = audio_data.astype(np.float32)
-        if audio_data.dtype == np.int16:
-            audio_float /= 32768.0
-        elif audio_data.dtype == np.int32:
-            audio_float /= 2147483648.0
-
-        # FIX: Check for voice activity before transcribing
-        if not detect_voice_activity(audio_float, threshold=0.01):
-            return previous_transcription + "\n[No speech detected in audio]", "0.00"
-
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

-
-        transcription = pipe(
-            filename,
-            generate_kwargs={
-                "language": "english",
-            }
-        )["text"]
-
+        transcription = pipe(filename, generate_kwargs={"condition_on_previous_text": False})["text"]
        previous_transcription += transcription
-
-        # Clean up temp file
-        if os.path.exists(filename):
-            os.remove(filename)

        end_time = time.time()
        latency = end_time - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
-        import traceback
-        traceback.print_exc()
        return previous_transcription, "Error"

@spaces.GPU
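The rewritten transcribe still writes each upload to a uuid-named wav but, with the cleanup lines removed, the files now accumulate on disk for the life of the Space. A minimal sketch of the same pattern with cleanup restored (transcribe_once is a hypothetical helper, assuming the pipe object above):

import os
import uuid

import numpy as np
import scipy.io.wavfile

def transcribe_once(sample_rate: int, audio_data: np.ndarray) -> str:
    filename = f"{uuid.uuid4().hex}.wav"
    scipy.io.wavfile.write(filename, sample_rate, audio_data)
    try:
        return pipe(filename)["text"]
    finally:
        # Delete the temp file even when transcription raises.
        if os.path.exists(filename):
            os.remove(filename)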
@@ -178,27 +92,15 @@ def translate_and_transcribe(inputs, previous_transcription, target_language):
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

-        translation = pipe(
-            filename,
-            generate_kwargs={
-                "task": "translate",
-                "language": target_language,
-            }
-        )["text"]
+        translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language, "condition_on_previous_text": False})["text"]

        previous_transcription += translation
-
-        # Clean up temp file
-        if os.path.exists(filename):
-            os.remove(filename)

        end_time = time.time()
        latency = end_time - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Translation and Transcription: {e}")
-        import traceback
-        traceback.print_exc()
        return previous_transcription, "Error"

def clear():
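A note on these kwargs: Whisper's built-in task="translate" translates into English only, and the language argument declares the source audio language rather than a target, so wiring a target-language dropdown into it will not yield, say, Spanish output. A minimal sketch of what the two kwargs actually select, assuming the pipe object above (sample.wav is a hypothetical file):

translation = pipe(
    "sample.wav",  # hypothetical input audio
    generate_kwargs={
        "task": "translate",    # Whisper's translate task: any language -> English
        "language": "spanish",  # declares the audio's *source* language
    },
)["text"]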
@@ -209,21 +111,7 @@ def clear_state():

with gr.Blocks() as microphone:
    with gr.Column():
-        gr.Markdown(f"""
-        # 🎤 Realtime Whisper Large V3 Turbo
-
-        Transcribe Audio in Realtime with **Voice Activity Detection** to prevent hallucinations.
-
-        **Model:** [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})
-
-        **Features:**
-        - Flash Attention 2 for speed
-        - Voice Activity Detection (no "oh oh oh" hallucinations)
-        - 30-second context window
-        - Anti-repetition safeguards
-
-        **Note:** First transcription takes ~5 seconds. After that, it works flawlessly.
-        """)
+        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
    with gr.Row():
        input_audio_microphone = gr.Audio(streaming=True)
        output = gr.Textbox(label="Transcription", value="")
@@ -231,22 +119,12 @@ with gr.Blocks() as microphone:
    with gr.Row():
        clear_button = gr.Button("Clear Output")
    state = gr.State()
-    input_audio_microphone.stream(
-        stream_transcribe,
-        [state, input_audio_microphone],
-        [state, output, latency_textbox]
-    )
+    input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
    clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

with gr.Blocks() as file:
    with gr.Column():
-        gr.Markdown(f"""
-        # 🎤 Realtime Whisper Large V3 Turbo
-
-        Transcribe Audio Files.
-
-        **Model:** [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})
-        """)
+        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
    with gr.Row():
        input_audio_microphone = gr.Audio(sources="upload", type="numpy")
        output = gr.Textbox(label="Transcription", value="")
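The one-liner wiring above carries the whole streaming contract: Gradio calls the handler roughly every stream_every seconds with the microphone's newest chunk, threads state through so the audio buffer survives between calls, and stops the stream after time_limit seconds. A minimal self-contained sketch of that shape (the handler body is a stand-in, not the app's logic):

import gradio as gr

def handler(state, new_chunk):
    sr, y = new_chunk  # gr.Audio streams (sample_rate, numpy array) tuples
    ...                # accumulate y into state and run ASR on the buffer
    return state, "partial transcript", "0.00"

with gr.Blocks() as sketch:
    state = gr.State()
    mic = gr.Audio(streaming=True)
    text = gr.Textbox(label="Transcription")
    latency = gr.Textbox(label="Latency (seconds)")
    mic.stream(handler, [state, mic], [state, text, latency],
               time_limit=30, stream_every=2)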
@@ -258,38 +136,32 @@ with gr.Blocks() as file:
    submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
    clear_button.click(clear, outputs=[output])

-with gr.Blocks() as translate:
-[26 removed lines of the translate tab lost in this render]
-        [state, output, latency_textbox]
-    )
-    clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
-
-with gr.Blocks() as demo:
-    gr.TabbedInterface([microphone, file, translate], ["Microphone", "Upload File", "Translation"])
+# with gr.Blocks() as translate:
+#     with gr.Column():
+#         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+#         with gr.Row():
+#             input_audio_microphone = gr.Audio(streaming=True)
+#             output = gr.Textbox(label="Transcription and Translation", value="")
+#             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
+#             target_language_dropdown = gr.Dropdown(
+#                 choices=["english", "french", "hindi", "spanish", "russian"],
+#                 label="Target Language",
+#                 value="<|es|>"
+#             )
+#         with gr.Row():
+#             clear_button = gr.Button("Clear Output")
+
+#     input_audio_microphone.stream(
+#         translate_and_transcribe,
+#         [input_audio_microphone, output, target_language_dropdown],
+#         [output, latency_textbox],
+#         time_limit=45,
+#         stream_every=2,
+#         concurrency_limit=None
+#     )
+#     clear_button.click(clear, outputs=[output])
+
+with gr.Blocks(theme=gr.themes.Ocean()) as demo:
+    gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

demo.launch()