mic3333 committed on
Commit
55d67f9
·
1 Parent(s): 1f8fa97

simplify streaming transcription by removing VAD, diarization, and complex buffering logic

Browse files
Files changed (1) hide show
  1. app.py +165 -629
app.py CHANGED
@@ -1,33 +1,18 @@
1
  import os
2
- from contextlib import contextmanager, nullcontext
3
- from collections import deque
4
  import numpy as np
5
  import gradio as gr
6
  import torch
7
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
8
  import spaces
9
  import traceback
10
- import webrtcvad
11
- import re
12
- from difflib import SequenceMatcher
13
  from pydub import AudioSegment
14
 
15
- try:
16
- from pyannote.audio import Pipeline
17
- _HAVE_DIARIZATION = True
18
- except Exception:
19
- Pipeline = None
20
- _HAVE_DIARIZATION = False
21
-
22
  # -------------------------
23
- # Config / Model Loading
24
  # -------------------------
25
- print("πŸš€ Loading Whisper model at startup...")
26
- torch.set_float32_matmul_precision("high")
27
 
28
  model_id = "openai/whisper-large-v3-turbo"
29
-
30
- # Decide device and dtype once
31
  DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
32
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
33
 
@@ -42,213 +27,18 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
42
  model.to(DEVICE)
43
  model.eval()
44
 
45
- # Configure generation settings
46
- try:
47
- model.generation_config.cache_implementation = "static"
48
- model.generation_config.max_new_tokens = 256
49
- except Exception as e:
50
- print("⚠️ Could not configure static cache on generation_config:", e)
51
-
52
  processor = AutoProcessor.from_pretrained(model_id)
53
- print(f"βœ… Model and processor loaded on {DEVICE}")
54
 
55
  # -------------------------
56
- # Globals / constants
57
  # -------------------------
58
  SAMPLE_RATE = 16000
59
- BUFFER_DURATION = 8 # seconds
60
- MAX_BUFFER_SAMPLES = int(SAMPLE_RATE * BUFFER_DURATION)
61
-
62
- # VAD (webrtcvad)
63
- vad = webrtcvad.Vad(2) # aggressiveness 0-3
64
-
65
- # Sentence splitting regex
66
- sentence_split_re = re.compile(
67
- r"(?<!Mr\.)(?<!Ms\.)(?<!Mrs\.)(?<!Dr\.)(?<!St\.)(?<!Jr\.)(?<!Sr\.)"
68
- r"(?<!Prof\.)(?<!Inc\.)(?<!Ltd\.)(?<!U\.S\.)"
69
- r"(?<=[.!?])\s+"
70
- )
71
-
72
-
73
- def create_initial_state():
74
- """
75
- Create a fresh per-session state dictionary.
76
- Uses list instead of deque and set for Gradio serialization compatibility.
77
- We convert back to deque/set during processing for efficiency.
78
- """
79
- return {
80
- "buffer": [], # Will be converted to deque during processing
81
- "full_transcript": "",
82
- "last_transcription": "",
83
- "entries": [],
84
- "processed_samples": 0,
85
- "total_audio_samples": 0,
86
- "speaker_map": {},
87
- "next_speaker_idx": 1,
88
- "seen_texts": [], # Will be converted to set during processing
89
- "unprocessed_audio": np.array([], dtype=np.float32),
90
- }
91
-
92
-
93
- def is_near_duplicate(a: str, b: str, threshold: float = 0.6) -> bool:
94
- """
95
- Return True if sentences a and b are very similar.
96
- """
97
- if not a or not b:
98
- return False
99
- ratio = SequenceMatcher(None, a.lower(), b.lower()).ratio()
100
- return ratio >= threshold
101
-
102
-
103
- def format_timestamp(seconds: float) -> str:
104
- """
105
- Format seconds as mm:ss.mmm (or hh:mm:ss.mmm for long audio).
106
- """
107
- total_ms = int(seconds * 1000)
108
- hours, rem = divmod(total_ms, 3_600_000)
109
- minutes, rem = divmod(rem, 60_000)
110
- secs, ms = divmod(rem, 1_000)
111
- if hours:
112
- return f"{hours:02d}:{minutes:02d}:{secs:02d}.{ms:03d}"
113
- return f"{minutes:02d}:{secs:02d}.{ms:03d}"
114
-
115
-
116
- diarization_pipeline = None
117
- diarization_call_count = 0 # Throttle diarization calls
118
-
119
-
120
- @contextmanager
121
- def _unsafe_torch_load_context():
122
- """
123
- Temporarily force torch.load to use weights_only=False.
124
- """
125
- orig_load = torch.load
126
-
127
- def _patched_load(*args, **kwargs):
128
- kwargs.setdefault("weights_only", False)
129
- return orig_load(*args, **kwargs)
130
-
131
- torch.load = _patched_load
132
- try:
133
- yield
134
- finally:
135
- torch.load = orig_load
136
-
137
-
138
- def get_diarization_pipeline():
139
- """
140
- Lazily load the pyannote diarization pipeline if available and configured.
141
- """
142
- global diarization_pipeline
143
- if not _HAVE_DIARIZATION:
144
- return None
145
- if diarization_pipeline is not None:
146
- return diarization_pipeline
147
-
148
- token = (
149
- os.environ.get("PYANNOTE_TOKEN")
150
- or os.environ.get("HF_TOKEN")
151
- or os.environ.get("HF_API_TOKEN")
152
- )
153
- if not token:
154
- print(
155
- "Diarization disabled: no Hugging Face token found. "
156
- "Set PYANNOTE_TOKEN, HF_TOKEN, or HF_API_TOKEN in your Space settings."
157
- )
158
- return None
159
-
160
- try:
161
- import torch.serialization as ts
162
- safe = []
163
- try:
164
- from torch.torch_version import TorchVersion
165
- safe.append(TorchVersion)
166
- except Exception:
167
- pass
168
- try:
169
- from pyannote.audio.core.task import Specifications, Problem, Resolution
170
- safe.append(Specifications)
171
- safe.append(Problem)
172
- safe.append(Resolution)
173
- except Exception:
174
- pass
175
-
176
- ctx = ts.safe_globals(safe) if safe else nullcontext()
177
- with _unsafe_torch_load_context():
178
- with ctx:
179
- diarization_pipeline = Pipeline.from_pretrained(
180
- "pyannote/speaker-diarization-3.1",
181
- use_auth_token=token,
182
- )
183
- print("βœ… Loaded pyannote speaker diarization pipeline.")
184
- except Exception as e:
185
- print("❌ Failed to load diarization pipeline:", e)
186
- diarization_pipeline = None
187
- return diarization_pipeline
188
-
189
-
190
- # -------------------------
191
- # VAD helpers
192
- # -------------------------
193
- def frame_generator(frame_duration_ms, audio, sample_rate):
194
- """
195
- Yields contiguous frames (numpy float32 array chunks).
196
- """
197
- n = int(sample_rate * (frame_duration_ms / 1000.0))
198
- offset = 0
199
- while offset + n <= len(audio):
200
- yield audio[offset:offset + n]
201
- offset += n
202
-
203
-
204
- def vad_collector(audio, sample_rate, frame_ms=30):
205
- """
206
- Return list of (start_sample, end_sample) voiced segments in `audio`.
207
- """
208
- frames = list(frame_generator(frame_ms, audio, sample_rate))
209
- if not frames:
210
- return []
211
-
212
- # Convert each frame to 16-bit PCM bytes for webrtcvad
213
- voiced_flags = []
214
- for f in frames:
215
- pcm16 = np.clip(f, -1.0, 1.0) # Ensure range
216
- pcm16 = (pcm16 * 32767).astype(np.int16).tobytes()
217
- try:
218
- is_speech = vad.is_speech(pcm16, sample_rate)
219
- except Exception:
220
- is_speech = False
221
- voiced_flags.append(is_speech)
222
 
223
- # Group consecutive voiced frames
224
- segments_ms = []
225
- start_frame = None
226
- for i, flag in enumerate(voiced_flags):
227
- if flag and start_frame is None:
228
- start_frame = i
229
- elif (not flag) and (start_frame is not None):
230
- segments_ms.append((start_frame * frame_ms, i * frame_ms))
231
- start_frame = None
232
- if start_frame is not None:
233
- segments_ms.append((start_frame * frame_ms, len(frames) * frame_ms))
234
 
235
- # Convert ms to sample indices
236
- sample_segments = []
237
- for s_ms, e_ms in segments_ms:
238
- s = int((s_ms / 1000.0) * sample_rate)
239
- e = int((e_ms / 1000.0) * sample_rate)
240
- sample_segments.append((s, e))
241
- return sample_segments
242
-
243
-
244
- # -------------------------
245
- # Audio resampling helper
246
- # -------------------------
247
- def resample_audio(audio, orig_sr, target_sr=16000):
248
- """
249
- Simple linear interpolation resampling.
250
- For production, consider using librosa or torchaudio for better quality.
251
- """
252
  if orig_sr == target_sr:
253
  return audio
254
  duration = len(audio) / orig_sr
@@ -256,489 +46,235 @@ def resample_audio(audio, orig_sr, target_sr=16000):
256
  if target_length == 0:
257
  return np.array([], dtype=np.float32)
258
  indices = np.linspace(0, len(audio) - 1, target_length)
259
- resampled = np.interp(indices, np.arange(len(audio)), audio)
260
- return resampled.astype(np.float32)
261
 
262
 
263
- # -------------------------
264
- # Core streaming transcription
265
- # -------------------------
266
  @spaces.GPU
267
- def stream_transcribe(audio, state):
268
  """
269
- Receives streaming audio chunks from Gradio Audio component.
270
- Returns (full_transcript, state).
 
271
  """
272
- global diarization_call_count
273
-
274
- # Ensure we have per-session state
275
- if state is None:
276
- state = create_initial_state()
277
-
278
- # Make a working copy and convert types back
279
- # Convert buffer from list back to deque if needed
280
- buffer_data = state["buffer"]
281
- if isinstance(buffer_data, list):
282
- buffer = deque(buffer_data, maxlen=MAX_BUFFER_SAMPLES)
283
- else:
284
- buffer = buffer_data
285
-
286
- full_transcript = state["full_transcript"]
287
- last_transcription = state["last_transcription"]
288
- entries = state["entries"].copy() # Copy list to avoid mutations
289
- processed_samples = state["processed_samples"]
290
- total_audio_samples = state["total_audio_samples"]
291
- speaker_map = state["speaker_map"].copy()
292
- next_speaker_idx = state["next_speaker_idx"]
293
-
294
- # Convert seen_texts back to set if it's a list
295
- seen_texts_data = state["seen_texts"]
296
- if isinstance(seen_texts_data, list):
297
- seen_texts = set(seen_texts_data)
298
- else:
299
- seen_texts = seen_texts_data.copy() if isinstance(seen_texts_data, set) else set()
300
-
301
- unprocessed_audio = state["unprocessed_audio"]
302
-
303
  try:
304
- if audio is None:
305
- return full_transcript, state
306
-
307
- # Expect (sr, data)
308
- if not (isinstance(audio, (list, tuple)) and len(audio) == 2):
309
- return full_transcript, state
310
- sr, data = audio
311
-
312
- if data is None or (isinstance(data, np.ndarray) and data.size == 0):
313
- return full_transcript, state
314
-
315
- # Convert to numpy float32
 
316
  data = np.asarray(data, dtype=np.float32)
317
-
318
- # If stereo, convert to mono
319
  if data.ndim == 2:
320
  data = np.mean(data, axis=1)
321
-
322
- # If int PCM, normalize
323
  if data.dtype == np.int16:
324
  data = data.astype(np.float32) / 32768.0
325
  elif data.dtype == np.int32:
326
  data = data.astype(np.float32) / 2147483648.0
327
-
328
- # Resample if needed
329
- if sr != SAMPLE_RATE:
330
- data = resample_audio(data, sr, SAMPLE_RATE)
331
-
332
- # Validate data range
333
- data = np.clip(data, -1.0, 1.0)
334
-
335
- # Track total samples received
336
- num_new = len(data)
337
- total_audio_samples += num_new
338
-
339
- # Add to buffer (deque will auto-trim to maxlen)
340
- buffer.extend(data)
341
 
342
- # Accumulate unprocessed audio for VAD
343
- unprocessed_audio = np.concatenate([unprocessed_audio, data])
344
-
345
- # If buffer too short, wait
346
- if len(buffer) < int(0.5 * SAMPLE_RATE):
347
- state["buffer"] = buffer
348
- state["total_audio_samples"] = total_audio_samples
349
- state["unprocessed_audio"] = unprocessed_audio
350
- return full_transcript, state
351
-
352
- # Only run VAD on NEW audio (unprocessed)
353
- if len(unprocessed_audio) < int(0.3 * SAMPLE_RATE):
354
- # Not enough new audio to process
355
- state["buffer"] = buffer
356
- state["total_audio_samples"] = total_audio_samples
357
- state["unprocessed_audio"] = unprocessed_audio
358
- return full_transcript, state
359
-
360
- # Run VAD on unprocessed audio to find speech
361
- segments = vad_collector(unprocessed_audio, SAMPLE_RATE)
362
-
363
- if not segments:
364
- # No speech detected in new audio, clear unprocessed buffer
365
- state["buffer"] = buffer
366
- state["total_audio_samples"] = total_audio_samples
367
- state["unprocessed_audio"] = np.array([], dtype=np.float32)
368
- return full_transcript, state
369
-
370
- # Get the last voiced segment from unprocessed audio
371
- start_samp, end_samp = segments[-1]
372
 
373
- # Extend with context from the full buffer
374
- # Calculate where this segment is in the full buffer
375
- buffer_array = np.array(buffer)
376
- buffer_len = len(buffer_array)
377
- unprocessed_len = len(unprocessed_audio)
378
 
379
- # Offset of unprocessed audio within buffer
380
- unprocessed_offset = buffer_len - unprocessed_len
 
 
 
381
 
382
- # Absolute positions in buffer
383
- abs_start_in_buffer = unprocessed_offset + start_samp
384
- abs_end_in_buffer = unprocessed_offset + end_samp
 
385
 
386
- # Add context
387
- ctx = int(0.15 * SAMPLE_RATE)
388
- s = max(0, abs_start_in_buffer - ctx)
389
- e = min(buffer_len, abs_end_in_buffer + ctx)
390
 
391
- segment_audio = buffer_array[s:e]
392
-
393
- # Calculate absolute timestamps
394
- # Buffer represents the last BUFFER_DURATION seconds of audio
395
- # The START of the buffer corresponds to (total_audio_samples - buffer_len)
396
- buffer_start_sample = total_audio_samples - buffer_len
397
 
398
- abs_start = buffer_start_sample + s
399
- abs_end = buffer_start_sample + e
400
- start_time = abs_start / SAMPLE_RATE
401
- end_time = abs_end / SAMPLE_RATE
402
-
403
- # Clear unprocessed audio after processing
404
- unprocessed_audio = np.array([], dtype=np.float32)
405
-
406
- # Optional speaker diarization (throttled - only every 3rd call for performance)
407
- speaker_label = "Speaker 1"
408
- diarization_call_count += 1
409
- pipeline = get_diarization_pipeline()
410
- if pipeline is not None and (diarization_call_count % 3 == 0):
411
- try:
412
- wave = torch.from_numpy(segment_audio).float().unsqueeze(0)
413
- diarization = pipeline({"waveform": wave, "sample_rate": SAMPLE_RATE})
414
- speaker_durations = {}
415
- for segment, _, raw_speaker in diarization.itertracks(yield_label=True):
416
- dur = segment.end - segment.start
417
- speaker_durations[raw_speaker] = speaker_durations.get(raw_speaker, 0.0) + dur
418
- if speaker_durations:
419
- dominant_raw = max(speaker_durations, key=speaker_durations.get)
420
- if dominant_raw not in speaker_map:
421
- speaker_map[dominant_raw] = f"Speaker {next_speaker_idx}"
422
- next_speaker_idx += 1
423
- speaker_label = speaker_map[dominant_raw]
424
- except Exception as e:
425
- print("Diarization failed:", e)
426
-
427
- # Skip if segment too short
428
- if len(segment_audio) < int(0.25 * SAMPLE_RATE):
429
- state["buffer"] = buffer
430
- state["total_audio_samples"] = total_audio_samples
431
- state["unprocessed_audio"] = unprocessed_audio
432
- return full_transcript, state
433
-
434
- # Process segment with Whisper
435
- inputs = processor(segment_audio.copy(), sampling_rate=SAMPLE_RATE, return_tensors="pt")
436
  input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
437
-
438
- # Generate with optimized settings for streaming (reduced beam search)
439
  with torch.no_grad():
440
  predicted_ids = model.generate(
441
  input_features,
442
  max_new_tokens=128,
443
- num_beams=1, # Greedy decoding for speed in streaming
444
- no_repeat_ngram_size=4,
445
- repetition_penalty=1.3,
446
- length_penalty=0.7,
447
  temperature=0.0,
448
  do_sample=False,
449
- early_stopping=True,
450
- suppress_tokens=[1, 2, 7, 9],
451
- forced_decoder_ids=None,
452
  )
453
-
454
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
455
- text = transcription[0].strip()
456
-
457
- # Sentence-by-sentence commit logic
458
- if not text:
459
- state["buffer"] = buffer
460
- state["total_audio_samples"] = total_audio_samples
461
- state["processed_samples"] = processed_samples
462
- state["unprocessed_audio"] = unprocessed_audio
463
- return full_transcript, state
464
-
465
- # Split into sentences
466
- ends_with_punct = bool(re.search(r"[.!?]\s*$", text))
467
- parts = sentence_split_re.split(text)
468
-
469
- if ends_with_punct:
470
- finished = parts
471
- else:
472
- finished = parts[:-1]
473
-
474
- # Process finished sentences
475
- for snt in finished:
476
- snt = snt.strip()
477
- if not snt:
478
- continue
479
-
480
- # Skip if exact duplicate
481
- if snt in seen_texts:
482
- continue
483
-
484
- # Skip if near-duplicate of last transcription
485
- if last_transcription and is_near_duplicate(snt, last_transcription, threshold=0.75):
486
- continue
487
-
488
- # Check for similar existing entries (O(n) but necessary for quality)
489
- is_duplicate = False
490
- for idx, entry in enumerate(entries):
491
- if is_near_duplicate(snt, entry["text"], threshold=0.7):
492
- # If new sentence is longer, upgrade the old one
493
- if len(snt) > len(entry["text"]):
494
- entries[idx] = {
495
- "text": snt,
496
- "start": entry["start"],
497
- "end": end_time,
498
- "speaker": speaker_label,
499
- }
500
- seen_texts.discard(entry["text"])
501
- seen_texts.add(snt)
502
- is_duplicate = True
503
- break
504
-
505
- if is_duplicate:
506
- last_transcription = snt
507
- continue
508
-
509
- # Add new entry
510
- entry = {
511
- "text": snt,
512
- "start": start_time,
513
- "end": end_time,
514
- "speaker": speaker_label,
515
- }
516
- entries.append(entry)
517
- seen_texts.add(snt)
518
- last_transcription = snt
519
-
520
- # Build formatted transcript
521
- lines = []
522
- for entry in entries:
523
- ts = format_timestamp(entry["start"])
524
- speaker = entry["speaker"]
525
- text_out = entry["text"]
526
- if text_out:
527
- lines.append(f"[{ts}] {speaker}: {text_out}")
528
-
529
- full_transcript = "\n".join(lines)
530
-
531
- # Update state (create new dict to avoid mutation)
532
- # Convert deque to list and set to list for Gradio compatibility
533
- state = {
534
- "buffer": list(buffer), # Convert deque to list for Gradio
535
- "full_transcript": full_transcript,
536
- "last_transcription": last_transcription,
537
- "entries": entries,
538
- "processed_samples": processed_samples,
539
- "total_audio_samples": total_audio_samples,
540
- "speaker_map": speaker_map,
541
- "next_speaker_idx": next_speaker_idx,
542
- "seen_texts": list(seen_texts), # Convert set to list for Gradio
543
- "unprocessed_audio": unprocessed_audio,
544
- }
545
-
546
- return full_transcript, state
547
-
548
  except Exception as e:
549
- print("Error in stream_transcribe:")
550
- print(traceback.format_exc())
551
- return full_transcript, state
552
 
553
 
554
- # -------------------------
555
- # Reset helper
556
- # -------------------------
557
- def reset_transcript(state):
558
- state = create_initial_state()
559
- return "", state
560
-
561
-
562
- def transcribe_uploaded_file(file, state):
563
- """
564
- High-accuracy transcription for uploaded audio file.
565
- Uses larger beam search for better quality.
566
- """
567
  if file is None:
568
- return state.get("full_transcript", ""), state
569
-
570
- path = getattr(file, "name", None) or file
571
  try:
572
- audio = AudioSegment.from_file(path)
573
- audio = audio.set_channels(1)
574
- audio = audio.set_frame_rate(SAMPLE_RATE)
575
- samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
576
 
577
- # Normalize based on sample width
578
  if audio.sample_width == 2:
579
  samples /= 32768.0
580
  elif audio.sample_width == 4:
581
  samples /= 2147483648.0
582
 
583
- # Clip to valid range
584
  samples = np.clip(samples, -1.0, 1.0)
585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  except Exception as e:
587
- print("Error loading uploaded audio file:", e)
588
- return state.get("full_transcript", ""), state
589
-
590
- # Fresh state for file
591
- state = create_initial_state()
592
-
593
- # Process in 30-second chunks
594
- chunk_sec = 30.0
595
- chunk_size = int(SAMPLE_RATE * chunk_sec)
596
- texts = []
597
-
598
- for start in range(0, len(samples), chunk_size):
599
- chunk = samples[start:start + chunk_size]
600
- if len(chunk) < int(0.5 * SAMPLE_RATE): # Skip very short chunks
601
- continue
602
-
603
- inputs = processor(
604
- chunk,
605
- sampling_rate=SAMPLE_RATE,
606
- return_tensors="pt",
607
- )
608
- input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
609
-
610
- with torch.no_grad():
611
- predicted_ids = model.generate(
612
- input_features,
613
- max_new_tokens=256,
614
- num_beams=5, # Higher beam search for file upload quality
615
- no_repeat_ngram_size=4,
616
- repetition_penalty=1.3,
617
- length_penalty=0.7,
618
- temperature=0.0,
619
- do_sample=False,
620
- early_stopping=True,
621
- suppress_tokens=[1, 2, 7, 9],
622
- forced_decoder_ids=None,
623
- )
624
 
625
- text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
626
- if text:
627
- texts.append(text)
628
-
629
- full_text = " ".join(texts).strip()
630
- duration_sec = len(samples) / SAMPLE_RATE if len(samples) > 0 else 0.0
631
-
632
- entries = []
633
- if full_text:
634
- entries.append({
635
- "text": full_text,
636
- "start": 0.0,
637
- "end": duration_sec,
638
- "speaker": "Speaker 1",
639
- })
640
- formatted = f"[{format_timestamp(0.0)}] Speaker 1: {full_text}"
641
- else:
642
- formatted = ""
643
 
644
- state["entries"] = entries
645
- state["full_transcript"] = formatted
646
- state["last_transcription"] = full_text
647
- state["total_audio_samples"] = len(samples)
648
- state["seen_texts"] = [full_text] if full_text else [] # List instead of set
649
-
650
- return formatted, state
651
 
652
 
653
  # -------------------------
654
  # Gradio UI
655
  # -------------------------
656
- with gr.Blocks(title="🎀 Whisper ASR", theme=gr.themes.Soft()) as demo:
657
  gr.Markdown(
658
  """
659
- # 🎀 Whisper Real-Time ASR
660
 
661
- **πŸ’‘ How to use:**
662
- 1. Click the **microphone icon** to start recording
663
- 2. See real-time transcription below
664
- 3. Click **Clear** to reset the transcript
665
- 4. Click **Copy** to copy the transcript to clipboard
666
-
667
- Using OpenAI Whisper-large-v3-turbo with optimized streaming performance.
668
  """
669
  )
670
-
671
  with gr.Row():
672
- with gr.Column(scale=1):
673
- source_selector = gr.Radio(
674
- choices=["Microphone (live)", "Upload audio file"],
675
- value="Microphone (live)",
676
- label="Audio source",
677
  )
678
- mic_input = gr.Audio(
 
679
  sources=["microphone"],
680
  type="numpy",
681
  streaming=True,
682
- label="πŸŽ™οΈ Speak with your microphone",
683
- visible=True,
684
  )
 
685
  file_input = gr.File(
686
- label="πŸ“ Upload audio file",
687
  file_types=["audio"],
688
- file_count="single",
689
- visible=False,
690
  )
691
- transcribe_file_btn = gr.Button(
692
- "Transcribe Uploaded File", variant="secondary", visible=False
 
 
693
  )
694
- clear_btn = gr.Button("πŸ—‘οΈ Clear Transcript", variant="secondary")
695
- with gr.Column(scale=2):
696
- output_box = gr.Textbox(
697
- label="πŸ“„ Full Transcription",
698
- lines=10,
699
- interactive=False,
700
- show_copy_button=True
 
701
  )
702
-
703
- state = gr.State(create_initial_state())
704
-
705
- def _update_source_ui(source_choice):
706
- use_mic = source_choice.startswith("Microphone")
 
 
707
  return (
708
- gr.update(visible=use_mic),
709
- gr.update(visible=not use_mic),
710
- gr.update(visible=not use_mic),
711
  )
712
-
713
- source_selector.change(
714
- _update_source_ui,
715
- inputs=source_selector,
716
- outputs=[mic_input, file_input, transcribe_file_btn],
717
  )
718
-
719
- mic_input.stream(
720
- fn=stream_transcribe,
721
- inputs=[mic_input, state],
722
- outputs=[output_box, state],
 
723
  )
724
-
725
- transcribe_file_btn.click(
726
- fn=transcribe_uploaded_file,
727
- inputs=[file_input, state],
728
- outputs=[output_box, state],
 
729
  )
730
-
 
731
  clear_btn.click(
732
- fn=reset_transcript,
733
- inputs=state,
734
- outputs=[output_box, state],
735
  )
736
 
737
  if __name__ == "__main__":
738
- # Launch without show_api parameter to avoid schema generation bug
739
- # in some Gradio versions
740
- try:
741
- demo.launch(share=True, show_api=False)
742
- except TypeError:
743
- # Fallback if show_api causes issues
744
- demo.launch(share=True)
 
1
  import os
 
 
2
  import numpy as np
3
  import gradio as gr
4
  import torch
5
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
6
  import spaces
7
  import traceback
 
 
 
8
  from pydub import AudioSegment
9
 
 
 
 
 
 
 
 
10
  # -------------------------
11
+ # Model Loading
12
  # -------------------------
13
+ print("πŸš€ Loading Whisper model...")
 
14
 
15
  model_id = "openai/whisper-large-v3-turbo"
 
 
16
  DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
17
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
18
 
 
27
  model.to(DEVICE)
28
  model.eval()
29
 
 
 
 
 
 
 
 
30
  processor = AutoProcessor.from_pretrained(model_id)
31
+ print(f"βœ… Model loaded on {DEVICE}")
32
 
33
  # -------------------------
34
+ # Constants
35
  # -------------------------
36
  SAMPLE_RATE = 16000
37
+ BUFFER_SECONDS = 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
def simple_resample(audio, orig_sr, target_sr=16000):
    """Simple resampling using linear interpolation.

    For production-quality resampling, prefer librosa or torchaudio;
    this is intentionally dependency-free.

    Args:
        audio: 1-D numpy array of audio samples.
        orig_sr: sampling rate of ``audio`` in Hz.
        target_sr: desired sampling rate in Hz (default 16 kHz, Whisper's
            expected rate).

    Returns:
        float32 numpy array resampled to ``target_sr``. Returns the input
        unchanged when the rates already match, and an empty array when
        the resampled length would be zero.
    """
    if orig_sr == target_sr:
        return audio
    duration = len(audio) / orig_sr
    target_length = int(duration * target_sr)
    if target_length == 0:
        return np.array([], dtype=np.float32)
    # Map each output sample position back onto the source index space,
    # then linearly interpolate between neighbouring source samples.
    indices = np.linspace(0, len(audio) - 1, target_length)
    return np.interp(indices, np.arange(len(audio)), audio).astype(np.float32)
 
50
 
51
 
 
 
 
52
@spaces.GPU
def transcribe_audio(audio_chunk, history):
    """
    Simple streaming transcription.

    Args:
        audio_chunk: (sample_rate, audio_data) tuple from the Gradio
            streaming Audio component, or None.
        history: accumulated audio buffer (float32 numpy array) carried
            between calls via gr.State.

    Returns:
        (history, text): the updated rolling buffer and the transcription
        of the buffered audio ("" when there is nothing to transcribe yet).
    """
    try:
        if audio_chunk is None:
            return history, ""

        # Gradio streams audio as a (sample_rate, ndarray) tuple.
        if isinstance(audio_chunk, tuple):
            sr, data = audio_chunk
        else:
            return history, ""

        if data is None or len(data) == 0:
            return history, ""

        # BUG FIX: inspect the ORIGINAL dtype before any float conversion.
        # Previously the data was cast to float32 first, so the int16/int32
        # branches below could never match: integer PCM kept its raw
        # magnitude and the later clip to [-1, 1] destroyed the signal.
        data = np.asarray(data)
        if data.dtype == np.int16:
            data = data.astype(np.float32) / 32768.0
        elif data.dtype == np.int32:
            data = data.astype(np.float32) / 2147483648.0
        else:
            data = data.astype(np.float32)

        # Downmix stereo to mono.
        if data.ndim == 2:
            data = np.mean(data, axis=1)

        data = np.clip(data, -1.0, 1.0)

        # Resample if needed
        if sr != SAMPLE_RATE:
            data = simple_resample(data, sr, SAMPLE_RATE)

        # Append the new chunk to the rolling buffer.
        if history is None or len(history) == 0:
            history = data
        else:
            history = np.concatenate([history, data])

        # Keep only the last BUFFER_SECONDS of audio.
        max_samples = SAMPLE_RATE * BUFFER_SECONDS
        if len(history) > max_samples:
            history = history[-max_samples:]

        # Need a minimum amount of audio before transcribing.
        if len(history) < SAMPLE_RATE * 0.5:  # 0.5 seconds minimum
            return history, ""

        # Transcribe the whole buffer.
        inputs = processor(
            history,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt"
        )
        input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)

        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_new_tokens=128,
                num_beams=1,  # Greedy for speed
                temperature=0.0,
                do_sample=False,
            )

        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

        return history, text

    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()
        return history if history is not None else np.array([]), ""
130
 
131
 
132
def transcribe_file(file):
    """Transcribe an uploaded audio file.

    Args:
        file: a Gradio file object (exposing a ``.name`` path attribute)
            or a plain path string, or None.

    Returns:
        The transcription text, "" for no file, or an "Error: ..." message
        when loading/transcription fails.
    """
    if file is None:
        return ""

    # Accept both a file-like object from gr.File and a raw path string
    # (newer Gradio versions may pass the path directly).
    path = getattr(file, "name", None) or file

    try:
        # Load audio file; downmix to mono and resample via pydub/ffmpeg.
        audio = AudioSegment.from_file(path)
        audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)

        samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
        # Scale integer PCM into [-1, 1] based on sample width.
        if audio.sample_width == 2:
            samples /= 32768.0
        elif audio.sample_width == 4:
            samples /= 2147483648.0

        samples = np.clip(samples, -1.0, 1.0)

        # Process in chunks
        chunk_size = SAMPLE_RATE * 30  # 30 second chunks
        texts = []

        for start in range(0, len(samples), chunk_size):
            chunk = samples[start:start + chunk_size]
            if len(chunk) < SAMPLE_RATE * 0.5:
                continue  # skip very short trailing chunks

            inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
            input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)

            with torch.no_grad():
                predicted_ids = model.generate(
                    input_features,
                    max_new_tokens=256,
                    num_beams=5,  # Better quality for files
                    temperature=0.0,
                    do_sample=False,
                )

            text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
            if text:
                texts.append(text)

        return " ".join(texts)

    except Exception as e:
        print(f"File transcription error: {e}")
        traceback.print_exc()
        return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
def clear_history():
    """Reset the rolling audio buffer and the transcription textbox."""
    empty_buffer = np.array([])
    empty_text = ""
    return empty_buffer, empty_text
 
 
 
 
186
 
187
 
188
  # -------------------------
189
  # Gradio UI
190
  # -------------------------
191
# Build the Gradio interface: a source selector that toggles between a
# streaming microphone and a file-upload path, with a shared output box.
with gr.Blocks(title="🎤 Whisper ASR") as demo:
    gr.Markdown(
        """
        # 🎤 Whisper Real-Time Transcription

        **How to use:**
        - **Microphone**: Click to record, speak, see live transcription
        - **File Upload**: Upload audio file and click "Transcribe"
        - **Clear**: Reset the transcription

        Using Whisper-large-v3-turbo
        """
    )

    with gr.Row():
        with gr.Column():
            source = gr.Radio(
                ["Microphone", "Upload File"],
                value="Microphone",
                label="Audio Source",
            )

            mic = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="🎙️ Microphone",
                visible=True,
            )

            file_input = gr.File(
                label="📁 Upload Audio",
                file_types=["audio"],
                visible=False,
            )

            transcribe_btn = gr.Button("Transcribe File", visible=False)

            clear_btn = gr.Button("🗑️ Clear")

        with gr.Column():
            output = gr.Textbox(
                label="📄 Transcription",
                lines=12,
                interactive=False,
            )

    # Session state: just the rolling audio buffer.
    audio_history = gr.State(np.array([]))

    def update_ui(choice):
        """Show the mic widget or the file widgets depending on the source."""
        use_mic = choice == "Microphone"
        return (
            gr.update(visible=use_mic),
            gr.update(visible=not use_mic),
            gr.update(visible=not use_mic),
        )

    source.change(
        update_ui,
        inputs=source,
        outputs=[mic, file_input, transcribe_btn],
    )

    # Live microphone streaming -> rolling-buffer transcription.
    mic.stream(
        transcribe_audio,
        inputs=[mic, audio_history],
        outputs=[audio_history, output],
    )

    # One-shot transcription of an uploaded file.
    transcribe_btn.click(
        transcribe_file,
        inputs=file_input,
        outputs=output,
    )

    # Reset both the buffer and the displayed text.
    clear_btn.click(
        clear_history,
        outputs=[audio_history, output],
    )

if __name__ == "__main__":
    demo.launch(share=True)