michaeltangz committed on
Commit
a9a8aec
·
1 Parent(s): 03bd1f9

install Flash Attention 2 and optimize Whisper model loading; enhance streaming transcription with pipeline approach and latency tracking

Browse files
Files changed (1) hide show
  1. app.py +156 -279
app.py CHANGED
@@ -2,11 +2,22 @@ import os
2
  import numpy as np
3
  import gradio as gr
4
  import torch
5
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
6
  import spaces
7
  import traceback
8
  from pydub import AudioSegment
9
  import librosa
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  # -------------------------
12
  # Model Loading
@@ -14,7 +25,7 @@ import librosa
14
  print("πŸš€ Loading Whisper model...")
15
 
16
  model_id = "openai/whisper-large-v3-turbo"
17
- DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
18
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
19
 
20
  print(f"Using device={DEVICE}, dtype={TORCH_DTYPE}")
@@ -24,230 +35,94 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
24
  torch_dtype=TORCH_DTYPE,
25
  low_cpu_mem_usage=True,
26
  use_safetensors=True,
 
27
  )
28
  model.to(DEVICE)
29
- model.eval()
30
 
31
  processor = AutoProcessor.from_pretrained(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  print(f"βœ… Model loaded on {DEVICE}")
33
 
34
  # -------------------------
35
  # Constants
36
  # -------------------------
37
  SAMPLE_RATE = 16000
38
- BUFFER_SECONDS = 30 # Increased from 10 to keep more context
39
- MIN_AUDIO_LENGTH = 2.0 # Minimum 2 seconds before transcribing
40
- OVERLAP_SECONDS = 2 # Keep overlap for context
41
-
42
-
43
- def resample_audio(audio, orig_sr, target_sr=16000):
44
- """High-quality resampling using librosa."""
45
- if orig_sr == target_sr:
46
- return audio
47
- try:
48
- return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
49
- except Exception as e:
50
- print(f"Librosa resample failed: {e}, using linear interpolation")
51
- # Fallback to simple resampling
52
- duration = len(audio) / orig_sr
53
- target_length = int(duration * target_sr)
54
- if target_length == 0:
55
- return np.array([], dtype=np.float32)
56
- indices = np.linspace(0, len(audio) - 1, target_length)
57
- return np.interp(indices, np.arange(len(audio)), audio).astype(np.float32)
58
-
59
-
60
- def detect_voice_activity(audio, threshold=0.01):
61
- """Simple VAD: check if audio has sufficient energy."""
62
- if len(audio) == 0:
63
- return False
64
- rms = np.sqrt(np.mean(audio**2))
65
- return rms > threshold
66
 
67
 
68
  @spaces.GPU
69
- def transcribe_audio(audio_chunk, history, full_transcript, last_transcribed_length):
70
  """
71
- Improved streaming transcription with better accuracy.
72
- audio_chunk: (sample_rate, audio_data) from Gradio
73
- history: accumulated audio buffer as numpy array
74
- full_transcript: accumulated text transcript
75
- last_transcribed_length: length of audio already transcribed
76
  """
 
77
  try:
78
- if audio_chunk is None:
79
- return history, full_transcript, full_transcript, last_transcribed_length
80
-
81
- # Parse audio
82
- if isinstance(audio_chunk, tuple):
83
- sr, data = audio_chunk
84
- else:
85
- return history, full_transcript, full_transcript, last_transcribed_length
86
-
87
- if data is None or len(data) == 0:
88
- return history, full_transcript, full_transcript, last_transcribed_length
89
-
90
- # Convert to mono float32
91
- data = np.asarray(data, dtype=np.float32)
92
- if data.ndim == 2:
93
- data = np.mean(data, axis=1)
94
 
95
- # Normalize if needed
96
- if data.dtype == np.int16:
97
- data = data.astype(np.float32) / 32768.0
98
- elif data.dtype == np.int32:
99
- data = data.astype(np.float32) / 2147483648.0
100
 
101
- data = np.clip(data, -1.0, 1.0)
 
 
102
 
103
- # Resample if needed
104
- if sr != SAMPLE_RATE:
105
- data = resample_audio(data, sr, SAMPLE_RATE)
 
 
106
 
107
- # Initialize history if needed
108
- if history is None or len(history) == 0:
109
- history = data
110
  else:
111
- history = np.concatenate([history, data])
112
 
113
- # Keep buffer within limits
114
- max_samples = SAMPLE_RATE * BUFFER_SECONDS
115
- if len(history) > max_samples:
116
- # Keep some overlap for context
117
- overlap_samples = int(SAMPLE_RATE * OVERLAP_SECONDS)
118
- history = history[-(max_samples + overlap_samples):]
119
-
120
- # Need minimum audio to transcribe
121
- min_samples = int(SAMPLE_RATE * MIN_AUDIO_LENGTH)
122
- if len(history) < min_samples:
123
- return history, full_transcript, full_transcript, last_transcribed_length
124
-
125
- # Check for voice activity
126
- if not detect_voice_activity(history[-min_samples:]):
127
- return history, full_transcript, full_transcript, last_transcribed_length
128
-
129
- # Only transcribe new audio (not already transcribed)
130
- new_audio_length = len(history) - last_transcribed_length
131
- if new_audio_length < SAMPLE_RATE * 1.0: # Wait for at least 1 second of new audio
132
- return history, full_transcript, full_transcript, last_transcribed_length
133
-
134
- # Transcribe the buffer with better parameters
135
- inputs = processor(
136
- history,
137
- sampling_rate=SAMPLE_RATE,
138
- return_tensors="pt"
139
- )
140
-
141
- input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
142
-
143
- with torch.no_grad():
144
- predicted_ids = model.generate(
145
- input_features,
146
- max_new_tokens=440, # Leave room for special tokens (total must be < 448)
147
- num_beams=3, # Beam search for better quality (balanced)
148
- do_sample=False,
149
- language="en", # Specify language for better accuracy
150
- task="transcribe",
151
- )
152
-
153
- text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
154
-
155
- if not text:
156
- return history, full_transcript, full_transcript, last_transcribed_length
157
-
158
- # Update the full transcript
159
- # Check if new text is different from what we already have
160
- if full_transcript:
161
- # If the new transcription starts with the end of our current transcript,
162
- # only add the new part
163
- words_current = full_transcript.split()
164
- words_new = text.split()
165
-
166
- # Find overlap
167
- overlap_found = False
168
- for i in range(min(len(words_current), len(words_new))):
169
- if words_current[-(i+1):] == words_new[:i+1]:
170
- # Found overlap, add only new words
171
- new_words = words_new[i+1:]
172
- if new_words:
173
- full_transcript = full_transcript + " " + " ".join(new_words)
174
- overlap_found = True
175
- break
176
-
177
- if not overlap_found:
178
- # No overlap found, check if it's completely new
179
- if text not in full_transcript:
180
- full_transcript = full_transcript + " " + text
181
  else:
182
- full_transcript = text
183
 
184
- # Update last transcribed length
185
- last_transcribed_length = len(history)
 
 
186
 
187
- return history, full_transcript, full_transcript, last_transcribed_length
188
-
189
  except Exception as e:
190
- print(f"Error: {e}")
191
  traceback.print_exc()
192
- return (
193
- history if history is not None else np.array([]),
194
- full_transcript,
195
- full_transcript,
196
- last_transcribed_length
197
- )
198
 
199
 
 
200
  def transcribe_file(file):
201
- """Transcribe an uploaded audio file with high quality settings."""
202
  if file is None:
203
  return ""
204
 
 
205
  try:
206
- # Load audio file using librosa for better quality
207
- audio_data, sr = librosa.load(file.name, sr=SAMPLE_RATE, mono=True)
208
 
209
- # Normalize
210
- audio_data = np.clip(audio_data, -1.0, 1.0)
211
 
212
- # Process in chunks with overlap
213
- chunk_size = SAMPLE_RATE * 30 # 30 second chunks
214
- overlap_size = SAMPLE_RATE * 2 # 2 second overlap
215
- texts = []
216
-
217
- for start in range(0, len(audio_data), chunk_size - overlap_size):
218
- chunk = audio_data[start:start + chunk_size]
219
- if len(chunk) < SAMPLE_RATE * 1.0: # Skip chunks less than 1 second
220
- continue
221
-
222
- inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
223
- input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
224
-
225
- with torch.no_grad():
226
- predicted_ids = model.generate(
227
- input_features,
228
- max_new_tokens=440, # Leave room for special tokens (total must be < 448)
229
- num_beams=5, # Higher beam search for best quality
230
- language="en",
231
- task="transcribe",
232
- )
233
-
234
- text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
235
- if text:
236
- # Remove duplicate text from overlaps
237
- if texts and text.startswith(texts[-1].split()[-5:][0] if len(texts[-1].split()) >= 5 else ""):
238
- # Find overlap and merge
239
- words_prev = texts[-1].split()
240
- words_curr = text.split()
241
- for i in range(min(10, len(words_prev), len(words_curr))):
242
- if words_prev[-(i+1):] == words_curr[:i+1]:
243
- texts[-1] = texts[-1] + " " + " ".join(words_curr[i+1:])
244
- break
245
- else:
246
- texts.append(text)
247
- else:
248
- texts.append(text)
249
-
250
- return " ".join(texts)
251
 
252
  except Exception as e:
253
  print(f"File transcription error: {e}")
@@ -255,111 +130,113 @@ def transcribe_file(file):
255
  return f"Error: {str(e)}"
256
 
257
 
258
- def clear_history():
259
- """Reset everything."""
260
- return np.array([]), "", "", 0
 
 
 
 
 
261
 
262
 
263
  # -------------------------
264
  # Gradio UI
265
  # -------------------------
266
- with gr.Blocks(title="🎀 Whisper ASR") as demo:
267
  gr.Markdown(
268
  """
269
- # 🎀 Whisper Real-Time Transcription (Improved Accuracy)
270
 
271
- **How to use:**
272
- - **Microphone**: Click to record, speak, see live transcription
273
- - **File Upload**: Upload audio file and click "Transcribe"
274
- - **Clear**: Reset the transcription
275
 
276
- **Improvements:**
277
- - Higher quality beam search
278
- - Better context retention (30s buffer)
279
- - Proper audio resampling with librosa
280
- - Voice activity detection
281
- - Smarter overlap handling
282
 
283
- Using Whisper-large-v3-turbo
284
  """
285
  )
286
 
287
- with gr.Row():
288
- with gr.Column():
289
- source = gr.Radio(
290
- ["Microphone", "Upload File"],
291
- value="Microphone",
292
- label="Audio Source"
293
- )
294
-
295
- mic = gr.Audio(
296
- sources=["microphone"],
297
- type="numpy",
298
- streaming=True,
299
- label="πŸŽ™οΈ Microphone",
300
- visible=True
301
- )
302
-
303
- file_input = gr.File(
304
- label="πŸ“ Upload Audio",
305
- file_types=["audio"],
306
- visible=False
307
- )
308
-
309
- transcribe_btn = gr.Button(
310
- "Transcribe File",
311
- visible=False
312
- )
313
 
314
- clear_btn = gr.Button("πŸ—‘οΈ Clear")
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- with gr.Column():
317
- output = gr.Textbox(
318
- label="πŸ“„ Transcription",
319
- lines=12,
320
- interactive=False
321
- )
322
-
323
- # State: audio buffer, full transcript, and last transcribed length
324
- audio_history = gr.State(np.array([]))
325
- transcript_state = gr.State("")
326
- last_transcribed_state = gr.State(0)
327
-
328
- # Toggle UI based on source
329
- def update_ui(choice):
330
- is_mic = choice == "Microphone"
331
- return (
332
- gr.update(visible=is_mic),
333
- gr.update(visible=not is_mic),
334
- gr.update(visible=not is_mic)
 
335
  )
336
 
337
- source.change(
338
- update_ui,
339
- inputs=source,
340
- outputs=[mic, file_input, transcribe_btn]
341
- )
342
-
343
- # Streaming mic input
344
- mic.stream(
345
- transcribe_audio,
346
- inputs=[mic, audio_history, transcript_state, last_transcribed_state],
347
- outputs=[audio_history, transcript_state, output, last_transcribed_state]
348
- )
349
-
350
- # File transcription
351
- transcribe_btn.click(
352
- transcribe_file,
353
- inputs=file_input,
354
- outputs=output
355
- )
356
-
357
- # Clear button
358
- clear_btn.click(
359
- clear_history,
360
- outputs=[audio_history, transcript_state, output, last_transcribed_state]
361
- )
 
 
 
 
 
 
362
 
363
  if __name__ == "__main__":
364
- # share=False on Spaces (automatically public), True for local
365
- demo.launch(share=False)
 
2
  import numpy as np
3
  import gradio as gr
4
  import torch
5
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
6
  import spaces
7
  import traceback
8
  from pydub import AudioSegment
9
  import librosa
10
import subprocess
import time

# -------------------------
# Install Flash Attention 2
# -------------------------
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells flash-attn's setup to use a
# prebuilt wheel instead of compiling CUDA kernels (which fails / times out
# on Spaces build machines).  Inherit the current environment — passing only
# the single variable would wipe PATH/HOME and can make `pip` unresolvable.
_install = subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
if _install.returncode != 0:
    # Model loading below requests attn_implementation="flash_attention_2";
    # surface the failure instead of letting that load crash mysteriously.
    print("WARNING: flash-attn installation failed "
          f"(exit code {_install.returncode}); model load may error.")
 
22
  # -------------------------
23
  # Model Loading
 
25
print("πŸš€ Loading Whisper model...")

model_id = "openai/whisper-large-v3-turbo"
# Prefer CUDA when present; fp16 on GPU halves memory and bandwidth, while
# fp32 on CPU avoids unsupported/slow half-precision ops.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Using device={DEVICE}, dtype={TORCH_DTYPE}")
 
35
  torch_dtype=TORCH_DTYPE,
36
  low_cpu_mem_usage=True,
37
  use_safetensors=True,
38
+ attn_implementation="flash_attention_2"
39
  )
40
  model.to(DEVICE)
 
41
 
42
processor = AutoProcessor.from_pretrained(model_id)
# NOTE(review): `processor` already bundles this tokenizer as
# `processor.tokenizer`; the separate download is redundant but harmless.
tokenizer = WhisperTokenizer.from_pretrained(model_id)

# Create pipeline with proper configuration
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,  # Process 30-second chunks
    torch_dtype=TORCH_DTYPE,
    # Model was already moved via model.to(DEVICE); passing device here keeps
    # the pipeline's placement consistent with it.
    device=DEVICE,
)

print(f"βœ… Model loaded on {DEVICE}")
57
 
58
# -------------------------
# Constants
# -------------------------
SAMPLE_RATE = 16000  # Whisper consumes 16 kHz mono audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
@spaces.GPU
def stream_transcribe(stream, new_chunk):
    """
    Streaming transcription using the ASR pipeline.

    Args:
        stream: accumulated mono float32 audio buffer (np.ndarray) or None
            on the first chunk of a session.
        new_chunk: ``(sample_rate, audio_data)`` tuple from Gradio's
            streaming microphone component, or None.

    Returns:
        Tuple of (updated stream buffer, transcription text,
        latency string in seconds).
    """
    start_time = time.time()
    try:
        if new_chunk is None:
            return stream, "", "0.00"

        sr, y = new_chunk
        if y is None or len(y) == 0:
            # Nothing new to add — keep the buffer and the display untouched.
            return stream, "", "0.00"

        # Scale integer PCM to float32 in [-1, 1] BEFORE any averaging
        # (mean() on int samples would silently promote to float64).
        # Scaling by the dtype's full range — not the chunk's own peak —
        # keeps loudness consistent across chunks and avoids amplifying
        # background noise during quiet passages.
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        else:
            y = y.astype(np.float32)

        # Downmix stereo to mono.
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = np.clip(y, -1.0, 1.0)

        # Concatenate with the existing session buffer.
        if stream is not None and len(stream) > 0:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        # Transcribe once we have at least half a second of audio.  The
        # duration check must use the mic's actual rate `sr` (often 48 kHz,
        # not SAMPLE_RATE); the pipeline resamples internally based on the
        # `sampling_rate` we pass alongside the raw samples.
        if len(stream) > sr * 0.5:
            transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        else:
            transcription = ""

        latency = time.time() - start_time
        return stream, transcription, f"{latency:.2f}"

    except Exception as e:
        print(f"Error during streaming transcription: {e}")
        traceback.print_exc()
        # Preserve the buffer so one bad chunk doesn't reset the session.
        return stream if stream is not None else np.array([]), "", "Error"
 
 
 
 
 
109
 
110
 
111
@spaces.GPU
def transcribe_file(file):
    """
    Transcribe an uploaded audio file with the ASR pipeline.

    Args:
        file: path to the audio file (str, as delivered by
            ``gr.Audio(type="filepath")``) or an object exposing ``.name``
            (e.g. a ``gr.File`` / tempfile wrapper), or None.

    Returns:
        Transcription text annotated with elapsed wall-clock seconds,
        "" when no file was given, or an "Error: ..." message string.
    """
    if file is None:
        return ""

    start_time = time.time()
    try:
        # gr.Audio(type="filepath") hands us a plain string path; the older
        # gr.File component passed an object exposing .name.  Accept both —
        # unconditional `file.name` raised AttributeError on str paths and
        # turned every upload into an "Error: ..." result.
        path = file if isinstance(file, str) else file.name

        # Use pipeline directly on the file; it handles decoding/resampling.
        transcription = pipe(path)["text"]

        latency = time.time() - start_time
        return f"{transcription}\n\n(Transcribed in {latency:.2f}s)"

    except Exception as e:
        print(f"File transcription error: {e}")
        traceback.print_exc()
        return f"Error: {str(e)}"
131
 
132
 
133
def clear_output():
    """Reset a transcription textbox to an empty string."""
    return ""


def clear_state():
    """Drop the accumulated audio buffer, starting a fresh streaming session."""
    return None
141
 
142
 
143
  # -------------------------
144
  # Gradio UI
145
  # -------------------------
146
# -------------------------
# UI layout: two tabs (live microphone, file upload), each pairing an input
# column with an output column.  Event wiring happens inside each tab.
# -------------------------
with gr.Blocks(title="🎀 Whisper ASR", theme=gr.themes.Ocean()) as demo:
    gr.Markdown(
        """
        # 🎀 Whisper Large V3 Turbo - Real-Time Transcription

        **Transcribe audio in real-time with high accuracy!**

        This demo uses:
        - Model: `openai/whisper-large-v3-turbo`
        - Flash Attention 2 for speed
        - Optimized pipeline for best accuracy

        **Note:** First transcription may take ~5 seconds. After that, it runs smoothly.
        """
    )

    with gr.Tab("πŸŽ™οΈ Microphone"):
        with gr.Row():
            with gr.Column():
                # Streaming numpy chunks feed stream_transcribe below.
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    streaming=True,
                    label="Microphone Input"
                )
                with gr.Row():
                    clear_mic_btn = gr.Button("πŸ—‘οΈ Clear", size="sm")

            with gr.Column():
                mic_output = gr.Textbox(
                    label="πŸ“„ Real-Time Transcription",
                    lines=10,
                    interactive=False
                )
                latency_box = gr.Textbox(
                    label="⚑ Latency (seconds)",
                    value="0.00",
                    interactive=False,
                    scale=0
                )

        # State for streaming: holds the accumulated audio buffer between
        # chunk callbacks.
        stream_state = gr.State()

        # Streaming transcription: a chunk is emitted every 2 s, sessions
        # are capped at 60 s of streaming per time_limit.
        mic_input.stream(
            stream_transcribe,
            inputs=[stream_state, mic_input],
            outputs=[stream_state, mic_output, latency_box],
            time_limit=60,
            stream_every=2,
            concurrency_limit=None
        )

        # Clear button: first drop the audio buffer, then blank the textbox.
        clear_mic_btn.click(
            clear_state,
            outputs=[stream_state]
        ).then(
            clear_output,
            outputs=[mic_output]
        )

    with gr.Tab("πŸ“ Upload File"):
        with gr.Row():
            with gr.Column():
                # NOTE: type="filepath" delivers a str path to
                # transcribe_file (not a file object).
                file_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio File"
                )
                with gr.Row():
                    transcribe_file_btn = gr.Button("▢️ Transcribe", variant="primary")
                    clear_file_btn = gr.Button("πŸ—‘οΈ Clear", size="sm")

            with gr.Column():
                file_output = gr.Textbox(
                    label="πŸ“„ Transcription",
                    lines=10,
                    interactive=False
                )

        # File transcription
        transcribe_file_btn.click(
            transcribe_file,
            inputs=file_input,
            outputs=file_output
        )

        # Clear button
        clear_file_btn.click(
            clear_output,
            outputs=[file_output]
        )
+ )
240
 
241
if __name__ == "__main__":
    # share=True creates a public gradio.live tunnel for local runs; on
    # Hugging Face Spaces it is ignored (the app is already publicly served).
    demo.launch(share=True)