michaeltangz committed on
Commit · 62fccb4
Parent(s): ae149f3
refactor app.py to streamline flash attention installation and model loading; enhance voice activity detection and transcription accuracy
app.py CHANGED
```diff
@@ -9,35 +9,19 @@ import time
 import numpy as np
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
 import subprocess
-
-try:
-    subprocess.run(
-        "pip install flash-attn --no-build-isolation",
-        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-        shell=True,
-    )
-except Exception as e:
-    print(f"Flash attention installation failed: {e}")
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16
+torch_dtype = torch.float16
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 
-
-try:
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        MODEL_NAME, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
-    )
-    print("Loaded with flash_attention_2")
-except Exception as e:
-    print(f"Could not load with flash_attention_2: {e}")
-    print("Falling back to default attention...")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        MODEL_NAME, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-    )
-    print("Loaded with default attention")
-
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
+)
 model.to(device)
 
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
@@ -48,11 +32,19 @@ pipe = pipeline(
     model=model,
     tokenizer=tokenizer,
     feature_extractor=processor.feature_extractor,
-    chunk_length_s=10,
+    chunk_length_s=30,  # Increased from 10 for better context
+    torch_dtype=torch_dtype,
     device=device,
-    ignore_warning=True,
 )
 
+# Voice Activity Detection
+def detect_voice_activity(audio, threshold=0.01):
+    """Detect if audio contains speech based on energy."""
+    if len(audio) == 0:
+        return False
+    rms = np.sqrt(np.mean(audio**2))
+    return rms > threshold
+
 @spaces.GPU
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
@@ -64,25 +56,52 @@ def stream_transcribe(stream, new_chunk):
             y = y.mean(axis=1)
 
         y = y.astype(np.float32)
-        y /= np.max(np.abs(y))
+
+        # FIX: Prevent division by zero
+        max_val = np.max(np.abs(y))
+        if max_val > 0:
+            y /= max_val
+        else:
+            # Silent audio, skip
+            return stream, "", "0.00"
 
         if stream is not None:
             stream = np.concatenate([stream, y])
         else:
             stream = y
-
+
+        # FIX: Limit buffer size to prevent memory issues and accumulated silence
+        MAX_BUFFER = sr * 30  # 30 seconds maximum
+        if len(stream) > MAX_BUFFER:
+            stream = stream[-MAX_BUFFER:]
+
+        # FIX: Check for voice activity before transcribing
+        if not detect_voice_activity(stream, threshold=0.01):
+            return stream, "", "0.00"
+
+        # FIX: Require minimum audio length
+        if len(stream) < sr * 1.0:  # At least 1 second
+            return stream, "", "0.00"
+
+        # FIX: Add anti-hallucination parameters
         transcription = pipe(
-            {"sampling_rate": sr, "raw": stream},
-        )["text"]
+            {"sampling_rate": sr, "raw": stream},
+            generate_kwargs={
+                "language": "english",
+                "condition_on_previous_text": False,  # Prevents hallucinations
+                "no_repeat_ngram_size": 3,  # Prevents repetitive outputs
+            }
+        )["text"]
+
         end_time = time.time()
         latency = end_time - start_time
 
         return stream, transcription, f"{latency:.2f}"
     except Exception as e:
         print(f"Error during Transcription: {e}")
+        import traceback
+        traceback.print_exc()
+        return stream if stream is not None else np.array([]), "", "Error"
 
 @spaces.GPU
 def transcribe(inputs, previous_transcription):
@@ -90,16 +109,42 @@ def transcribe(inputs, previous_transcription):
     try:
         filename = f"{uuid.uuid4().hex}.wav"
         sample_rate, audio_data = inputs
+
+        # Convert to float for VAD check
+        audio_float = audio_data.astype(np.float32)
+        if audio_data.dtype == np.int16:
+            audio_float /= 32768.0
+        elif audio_data.dtype == np.int32:
+            audio_float /= 2147483648.0
+
+        # FIX: Check for voice activity before transcribing
+        if not detect_voice_activity(audio_float, threshold=0.01):
+            return previous_transcription + "\n[No speech detected in audio]", "0.00"
+
         scipy.io.wavfile.write(filename, sample_rate, audio_data)
 
-        transcription = pipe(filename)["text"]
+        # FIX: Add anti-hallucination parameters
+        transcription = pipe(
+            filename,
+            generate_kwargs={
+                "language": "english",
+                "condition_on_previous_text": False,
+            }
+        )["text"]
+
         previous_transcription += transcription
+
+        # Clean up temp file
+        if os.path.exists(filename):
+            os.remove(filename)
 
         end_time = time.time()
         latency = end_time - start_time
         return previous_transcription, f"{latency:.2f}"
     except Exception as e:
         print(f"Error during Transcription: {e}")
+        import traceback
+        traceback.print_exc()
         return previous_transcription, "Error"
 
 @spaces.GPU
@@ -110,15 +155,28 @@ def translate_and_transcribe(inputs, previous_transcription, target_language):
         sample_rate, audio_data = inputs
         scipy.io.wavfile.write(filename, sample_rate, audio_data)
 
-        translation = pipe(
+        translation = pipe(
+            filename,
+            generate_kwargs={
+                "task": "translate",
+                "language": target_language,
+                "condition_on_previous_text": False,
+            }
+        )["text"]
 
         previous_transcription += translation
+
+        # Clean up temp file
+        if os.path.exists(filename):
+            os.remove(filename)
 
         end_time = time.time()
         latency = end_time - start_time
         return previous_transcription, f"{latency:.2f}"
     except Exception as e:
         print(f"Error during Translation and Transcription: {e}")
+        import traceback
+        traceback.print_exc()
         return previous_transcription, "Error"
 
 def clear():
@@ -129,7 +187,21 @@ def clear_state():
 
 with gr.Blocks() as microphone:
     with gr.Column():
-        gr.Markdown(f"
+        gr.Markdown(f"""
+        # 🎤 Realtime Whisper Large V3 Turbo
+
+        Transcribe Audio in Realtime with **Voice Activity Detection** to prevent hallucinations.
+
+        **Model:** [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})
+
+        **Features:**
+        - Flash Attention 2 for speed
+        - Voice Activity Detection (no "oh oh oh" hallucinations)
+        - 30-second context window
+        - Anti-repetition safeguards
+
+        **Note:** First transcription takes ~5 seconds. After that, it works flawlessly.
+        """)
     with gr.Row():
         input_audio_microphone = gr.Audio(streaming=True)
         output = gr.Textbox(label="Transcription", value="")
@@ -137,12 +209,25 @@ with gr.Blocks() as microphone:
     with gr.Row():
         clear_button = gr.Button("Clear Output")
     state = gr.State()
-    input_audio_microphone.stream(
+    input_audio_microphone.stream(
+        stream_transcribe,
+        [state, input_audio_microphone],
+        [state, output, latency_textbox],
+        time_limit=60,  # Increased from 30
+        stream_every=2,
+        concurrency_limit=None
+    )
     clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
 with gr.Blocks() as file:
     with gr.Column():
-        gr.Markdown(f"
+        gr.Markdown(f"""
+        # 🎤 Realtime Whisper Large V3 Turbo
+
+        Transcribe Audio Files.
+
+        **Model:** [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})
+        """)
    with gr.Row():
         input_audio_microphone = gr.Audio(sources="upload", type="numpy")
         output = gr.Textbox(label="Transcription", value="")
@@ -154,33 +239,41 @@ with gr.Blocks() as file:
     submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
     clear_button.click(clear, outputs=[output])
 
-#
+with gr.Blocks() as translate:
+    with gr.Column():
+        gr.Markdown(f"""
+        # 🌍 Realtime Whisper Large V3 Turbo (Translation)
+
+        Transcribe and Translate Audio in Realtime.
+
+        **Model:** [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})
+
+        **Note:** First token takes ~5 seconds. After that, it works flawlessly.
+        """)
+    with gr.Row():
+        input_audio_microphone = gr.Audio(streaming=True)
+        output = gr.Textbox(label="Transcription and Translation", value="")
+        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
+        target_language_dropdown = gr.Dropdown(
+            choices=["english", "french", "hindi", "spanish", "russian"],
+            label="Target Language",
+            value="english"
+        )
+    with gr.Row():
+        clear_button = gr.Button("Clear Output")
+
+    state = gr.State()
+    input_audio_microphone.stream(
+        translate_and_transcribe,
+        [state, input_audio_microphone, target_language_dropdown],
+        [state, output, latency_textbox],
+        time_limit=60,
+        stream_every=2,
+        concurrency_limit=None
+    )
+    clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
+
+with gr.Blocks(theme=gr.themes.Ocean()) as demo:
+    gr.TabbedInterface([microphone, file, translate], ["Microphone", "Upload File", "Translation"])
+
+demo.launch()
```
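The energy gate this commit adds is easy to sanity-check outside the app. The sketch below mirrors `detect_voice_activity` and the integer-PCM normalization from the new `transcribe()` path; the `to_float` helper name, the 16 kHz rate, and the synthetic test signals are illustrative choices for this sketch, not part of the commit.

```python
import numpy as np

def detect_voice_activity(audio, threshold=0.01):
    """Energy-based speech gate, as added in this commit."""
    if len(audio) == 0:
        return False
    rms = np.sqrt(np.mean(audio**2))
    return rms > threshold

def to_float(audio_data):
    """Scale integer PCM into [-1, 1] float32, mirroring transcribe()."""
    audio_float = audio_data.astype(np.float32)
    if audio_data.dtype == np.int16:
        audio_float /= 32768.0
    elif audio_data.dtype == np.int32:
        audio_float /= 2147483648.0
    return audio_float

sr = 16000  # illustrative sampling rate
silence = np.zeros(sr, dtype=np.int16)
t = np.arange(sr) / sr
tone = (0.2 * 32767 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

# Silence (RMS = 0) is gated out; a 440 Hz tone at 0.2 amplitude
# (RMS ≈ 0.14) clears the 0.01 threshold.
assert not detect_voice_activity(to_float(silence))
assert detect_voice_activity(to_float(tone))
```

An RMS gate this simple is deliberately cheap: it runs on each streamed chunk before `pipe(...)` is called, so silent stretches short-circuit with an empty transcription instead of reaching Whisper, which is where the repetitive "oh oh oh" hallucinations mentioned in the UI text came from.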