Spaces:

thomaskywong0131
/

Whisper_Streaming

Sleeping

File size: 11,568 Bytes

43602d3

# app.py
import os, gc, warnings, logging
import torch, numpy as np, librosa, gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from huggingface_hub import login

# -------------------------------
# HF Token Login (for private repos)
# -------------------------------
# Authenticate against the Hugging Face Hub only when a token is supplied
# through the HF_TOKEN environment variable.
if os.environ.get("HF_TOKEN") is not None:
    login(token=os.environ["HF_TOKEN"])

# -------------------------------
# Config & Device
# -------------------------------
# Suppress every Python warning for the whole process.
warnings.filterwarnings("ignore")

# Named logger for this app, set to the most verbose level.
logger = logging.getLogger("whisper_streaming")
logger.setLevel(logging.DEBUG)

# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
print(f"Using device: {device}, dtype={torch_dtype}")

# -------------------------------
# Model Loading
# -------------------------------
# Human-readable model labels mapped to the model identifiers that
# load_model() passes to `from_pretrained`.
MODEL_OPTIONS = {
    "Fine-tuned Cantonese": "thomaskywong0131/whisper-large-v3-cantonese",
    "OpenAI Large-v3": "openai/whisper-large-v3",
    "OpenAI Large-v3-Turbo": "openai/whisper-large-v3-turbo",
}

def load_model(model_choice="Fine-tuned Cantonese"):
    """Build the Whisper speech-recognition pipeline for one ``MODEL_OPTIONS`` entry.

    Args:
        model_choice: Key of ``MODEL_OPTIONS`` selecting the checkpoint to load.

    Returns:
        A ``(pipe, processor)`` tuple: the ``automatic-speech-recognition``
        pipeline and the ``WhisperProcessor`` whose tokenizer and feature
        extractor it uses.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``MODEL_OPTIONS``.
    """
    model_name = MODEL_OPTIONS[model_choice]
    print(f"Loading model: {model_name}")

    # Free cached CUDA memory and run the garbage collector before the new
    # weights are loaded.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    processor = WhisperProcessor.from_pretrained(model_name)

    # Automatic device placement is requested only when running on CUDA.
    weight_options = {
        "dtype": torch_dtype,
        "device_map": "auto" if device == "cuda" else None,
        "use_safetensors": True,
    }
    model = WhisperForConditionalGeneration.from_pretrained(model_name, **weight_options)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        dtype=torch_dtype,
        generate_kwargs={"language": "yue"},  # force Cantonese as the decoding language
    )
    print(f"✅ Successfully loaded: {model_choice}")
    return pipe, processor

# Load the default checkpoint once at import time; `pipe` and `processor` are
# module-level globals used by the ASR processors created further down.
pipe, processor = load_model("Fine-tuned Cantonese")

# -------------------------------
# HypothesisBuffer
# -------------------------------
class HypothesisBuffer:
    """Append-only list of ``(start, end, text)`` transcript segments."""

    def __init__(self):
        self.entries = []

    def insert(self, new, offset=0):
        """Append segments, shifting their timestamps by ``offset``.

        Args:
            new: Iterable of ``(start, end, text)`` triples; ``start`` and
                ``end`` may be ``None``, in which case they stay ``None``.
            offset: Value added to every non-``None`` timestamp.
        """
        def shifted(stamp):
            return None if stamp is None else stamp + offset

        # Build the full batch first so a malformed triple leaves the buffer untouched.
        batch = [(shifted(beg), shifted(end), text) for beg, end, text in new]
        self.entries += batch

    def reset(self):
        """Drop all stored segments by starting a fresh list."""
        self.entries = list()

    def get_text(self):
        """Return the text of all segments concatenated without a separator."""
        return "".join(text for _, _, text in self.entries)

    def get_entries(self):
        """Return the underlying list of segments (not a copy)."""
        return self.entries

    def complete(self):
        """Return the underlying list of segments (not a copy)."""
        return self.entries

    def flush(self):
        """Return the underlying list of segments (not a copy); nothing is removed."""
        return self.entries

# -------------------------------
# OnlineASRProcessor
# -------------------------------
class OnlineASRProcessor:
    """Accumulate audio samples and transcribe them with an ASR pipeline.

    Results are reported as ``(beg, end, text)`` tuples; this class always
    reports ``beg`` and ``end`` as ``None``.
    """

    def __init__(self, pipe, processor, sample_rate=16000):
        self.pipe = pipe
        self.processor = processor
        self.sample_rate = sample_rate
        self.audio_accum = self._empty_audio()
        self.transcript_buffer = HypothesisBuffer()

    @staticmethod
    def _empty_audio():
        """Return a new zero-length float32 sample array."""
        return np.array([], dtype=np.float32)

    def init(self):
        """Discard pending audio and the transcript collected so far."""
        self.audio_accum = self._empty_audio()
        self.transcript_buffer.reset()

    def insert_audio_chunk(self, audio: np.ndarray):
        """Append ``audio`` to the pending sample buffer."""
        self.audio_accum = np.append(self.audio_accum, audio)

    def _transcribe_pending(self, chunk_length_s):
        """Run the pipeline on the pending audio and commit any resulting text.

        A pipeline failure is turned into an ``[ASR error: ...]`` text and
        committed like a normal result.  The pending audio is cleared only
        when a non-empty text was produced.
        """
        try:
            text = self.pipe(self.audio_accum, chunk_length_s=chunk_length_s)["text"].strip()
        except Exception as e:
            text = f"[ASR error: {e}]"

        if not text:
            return None, None, ""

        self.transcript_buffer.insert([(None, None, text)])
        self.audio_accum = self._empty_audio()
        return None, None, text

    def process_iter(self):
        """Transcribe the pending audio once at least ``sample_rate`` samples are buffered.

        Returns:
            ``(None, None, text)``; ``text`` is ``""`` when too little audio
            is buffered or the pipeline produced no text.
        """
        enough_audio = len(self.audio_accum) >= self.sample_rate
        if not enough_audio:
            return None, None, ""
        return self._transcribe_pending(chunk_length_s=10)

    def finish(self):
        """Transcribe whatever audio is still pending, however short.

        Returns:
            ``(None, None, text)``; ``text`` is ``""`` when nothing is
            buffered or the pipeline produced no text.
        """
        nothing_pending = len(self.audio_accum) == 0
        if nothing_pending:
            return None, None, ""
        return self._transcribe_pending(chunk_length_s=30)

# -------------------------------
# VACOnlineASRProcessor (Silero VAD)
# -------------------------------
class VACOnlineASRProcessor:
    """Voice-activity-controlled front end for ``OnlineASRProcessor``.

    Incoming audio is cut into frames of ``frame_size`` samples and each frame
    is scored by a Silero VAD model loaded through ``torch.hub``.  Frames whose
    speech probability exceeds ``speech_threshold`` are collected into the
    current utterance; other frames are discarded and counted as silence.
    After ``silence_sec`` seconds of consecutive non-speech frames the
    collected utterance is transcribed and the result queued for
    ``process_iter``.

    Args:
        pipe: ASR pipeline handed to the inner ``OnlineASRProcessor``.
        processor: Processor object handed to the inner ``OnlineASRProcessor``.
        silence_sec: Seconds of consecutive silence that end an utterance.
        speech_threshold: VAD probability above which a frame counts as speech.
    """

    def __init__(self, pipe, processor, silence_sec=0.8, speech_threshold=0.5):
        self.online = OnlineASRProcessor(pipe, processor)
        # The hub entry point returns a pair; only the first element (the VAD
        # model) is kept.
        self.model, _ = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=False
        )
        self.sample_rate = 16000
        # NOTE(review): 512 samples is presumably the frame length Silero VAD
        # expects at 16 kHz -- confirm against the Silero VAD documentation.
        self.frame_size = 512
        self.silence_sec = silence_sec
        self.speech_threshold = speech_threshold
        self.reset()

    def reset(self):
        """Clear all buffered audio, the silence counter and queued results."""
        self.online.init()
        self.buffer = np.array([], dtype=np.float32)       # samples not yet framed
        self.audio_accum = np.array([], dtype=np.float32)  # speech frames of the current utterance
        self.silence_samples = 0
        self.flush_queue = []

    def _flush_pending_speech(self):
        """Hand collected speech to the inner recogniser and transcribe it."""
        if len(self.audio_accum) > 0:
            self.online.insert_audio_chunk(self.audio_accum)
            self.audio_accum = np.array([], dtype=np.float32)
        return self.online.finish()

    def insert_audio_chunk(self, audio: np.ndarray):
        """Feed a chunk of audio through the VAD, transcribing finished utterances.

        Integer PCM input is always scaled by 1/32768; floating-point input is
        scaled only when it exceeds the [-1, 1] range.  The caller's array is
        never modified and an empty chunk is ignored.

        Args:
            audio: Audio samples; assumed to be sampled at ``self.sample_rate``.
        """
        audio = np.asarray(audio)
        if audio.size == 0:
            # max()/min() on an empty array would raise; nothing to process anyway.
            return

        integer_pcm = np.issubdtype(audio.dtype, np.integer)
        # astype() always copies, so the in-place scaling below cannot touch
        # the array owned by the caller.
        audio = audio.astype(np.float32)
        if integer_pcm or audio.max() > 1.0 or audio.min() < -1.0:
            audio /= 32768.0

        self.buffer = np.append(self.buffer, audio)

        while len(self.buffer) >= self.frame_size:
            frame = self.buffer[:self.frame_size]
            self.buffer = self.buffer[self.frame_size:]

            tensor = torch.from_numpy(frame).unsqueeze(0)
            with torch.no_grad():
                speech_prob = self.model(tensor, self.sample_rate).item()

            # log_debug is a module-level function defined elsewhere in this file.
            log_debug(f"[VAD] prob={speech_prob:.2f}, silence={self.silence_samples}, accum={len(self.audio_accum)}")

            if speech_prob > self.speech_threshold:
                self.audio_accum = np.append(self.audio_accum, frame)
                self.silence_samples = 0
            else:
                self.silence_samples += self.frame_size
                if self.silence_samples >= self.sample_rate * self.silence_sec:
                    if len(self.audio_accum) > 0:
                        beg, end, txt = self._flush_pending_speech()
                        if txt:
                            self.flush_queue.append((beg, end, txt))
                            log_debug(f"[FLUSH] Added to queue: {txt}")
                    self.silence_samples = 0

    def process_iter(self):
        """Return the oldest queued ``(beg, end, text)`` result, or ``(None, None, "")``."""
        if self.flush_queue:
            return self.flush_queue.pop(0)
        return None, None, ""

    def finish(self):
        """Transcribe speech that has not yet been closed by a silence period.

        Speech frames still held in ``audio_accum`` are passed to the inner
        recogniser before it is finished, so the last utterance is not lost.
        Results already waiting in ``flush_queue`` are left there for
        ``process_iter``.

        Returns:
            ``(beg, end, text)`` from the inner recogniser, or
            ``(None, None, "")`` when there was nothing to transcribe.
        """
        beg, end, txt = self._flush_pending_speech()
        self.silence_samples = 0
        if txt:
            return beg, end, txt
        return None, None, ""

# -------------------------------
# Gradio Callbacks
# -------------------------------
# Mutable module-level state shared by the Gradio callbacks below.
stream_text, debug_text = "", ""      # transcript text and debug log text
use_vac, vac_online = False, None     # VAC switch and the VAC processor created on Start
silence_sec_value, speech_threshold_value = 0.8, 0.5
online = OnlineASRProcessor(pipe, processor)  # processor used when VAC is off

def log_debug(msg, max_chars=200_000):
    """Append one line to the global debug log.

    The log is returned to the UI by the callbacks on every update, and the
    VAD path adds a line per audio frame, so it is capped: once it exceeds
    ``max_chars`` characters the oldest lines are dropped.

    Args:
        msg: Message to append; formatted with ``str()`` and followed by a newline.
        max_chars: Maximum number of characters kept; ``None`` disables trimming.
    """
    global debug_text
    debug_text += f"{msg}\n"

    if max_chars is not None and len(debug_text) > max_chars:
        # Slice from an explicit index: debug_text[-0:] would keep everything.
        tail = debug_text[max(len(debug_text) - max_chars, 0):]
        # Drop the leading partial line so the kept log starts on a line boundary.
        _, newline, rest = tail.partition("\n")
        debug_text = rest if newline and rest else tail

def start_transcription(vac_mode, silence_sec, speech_threshold):
    """Reset the session state and prepare the selected streaming mode.

    Args:
        vac_mode: Truthy to route audio through ``VACOnlineASRProcessor``.
        silence_sec: Silence duration passed to the VAC processor.
        speech_threshold: Speech probability threshold passed to the VAC processor.

    Returns:
        A 4-tuple: status text, an update enabling nothing on the Start button
        (``interactive=False``), an update enabling the Stop button, and the
        current debug log.
    """
    global stream_text, debug_text, use_vac, vac_online, online
    global silence_sec_value, speech_threshold_value

    # Start from an empty transcript and an empty debug log.
    stream_text = ""
    debug_text = ""
    use_vac, silence_sec_value, speech_threshold_value = vac_mode, silence_sec, speech_threshold

    if not use_vac:
        online.init()
        log_debug("[START] VAC mode disabled (basic streaming)")
    else:
        # A new VAC processor is built on every start so it picks up the
        # current slider values.
        vac_online = VACOnlineASRProcessor(
            pipe,
            processor,
            silence_sec=silence_sec_value,
            speech_threshold=speech_threshold_value,
        )
        vac_online.reset()
        log_debug("[START] VAC mode enabled")

    log_debug(f"[SETTINGS] silence_sec={silence_sec_value:.2f}, speech_threshold={speech_threshold_value:.2f}")

    status = "🔴 Streaming started"
    start_button = gr.update(interactive=False)
    stop_button = gr.update(interactive=True)
    return status, start_button, stop_button, debug_text

def stop_transcription():
    """Report the stopped state to the UI.

    The ASR processors are not touched here: no pending audio is flushed and
    no state is reset.

    Returns:
        A 5-tuple: status text, an update re-enabling the Start button, an
        update disabling the Stop button, the transcript text and the debug log.
    """
    status = "⏹️ Stopped"
    start_button = gr.update(interactive=True)
    stop_button = gr.update(interactive=False)
    return status, start_button, stop_button, stream_text, debug_text

def _prepare_stream_chunk(audio):
    """Convert one audio payload into mono float32 samples at 16 kHz.

    A ``(sample_rate, samples)`` tuple is mixed down to mono, scaled into the
    [-1, 1] range and resampled to 16 kHz; any other payload is only converted
    to a float32 array.
    """
    if not isinstance(audio, tuple):
        return np.array(audio, dtype=np.float32)

    sr, arr = audio
    arr = np.array(arr)  # private copy, safe to modify in place below
    # Remember the original dtype: integer PCM must always be scaled, even
    # when a quiet chunk happens to stay within [-1, 1].
    integer_pcm = np.issubdtype(arr.dtype, np.integer)
    if arr.dtype != np.float32:
        arr = arr.astype(np.float32)

    if arr.ndim > 1:
        # assumes multi-channel data is laid out as (samples, channels) --
        # TODO confirm against the Gradio Audio documentation.
        # Average the channels so resampling and buffering see one channel.
        arr = arr.reshape(arr.shape[0], -1).mean(axis=1)

    if arr.size == 0:
        # max()/min() on an empty array would raise.
        return arr

    if integer_pcm or arr.max() > 1.0 or arr.min() < -1.0:
        arr /= 32768.0
    if sr != 16000:
        arr = librosa.resample(arr, orig_sr=sr, target_sr=16000)
    return arr


def process_stream(audio):
    """Handle one streamed microphone chunk and return the updated texts.

    Args:
        audio: ``None``, a ``(sample_rate, samples)`` tuple, or a plain
            sequence of samples.

    Returns:
        ``(stream_text, debug_text)``: the transcript so far and the debug log.
    """
    global stream_text, debug_text, use_vac, vac_online, online

    if audio is None:
        return stream_text, debug_text

    arr = _prepare_stream_chunk(audio)

    if use_vac:
        # An empty chunk is not inserted, but queued results are still polled.
        if arr.size:
            vac_online.insert_audio_chunk(arr)
        beg, end, txt = vac_online.process_iter()
        log_debug(f"[VAC] Insert {len(arr)} samples | Output: {txt}")
    else:
        if arr.size:
            online.insert_audio_chunk(arr)
        beg, end, txt = online.process_iter()
        log_debug(f"[Online] Insert {len(arr)} samples | Output: {txt}")

    if txt:
        stream_text += txt + "\n"
        log_debug(f"[Flush] {beg}-{end} | '{txt}'")

    return stream_text, debug_text

def clear_text():
    """Empty the transcript and the debug log.

    Returns:
        ``(stream_text, debug_text)``, both empty strings after the reset.
    """
    global stream_text, debug_text
    stream_text = debug_text = ""
    return stream_text, debug_text

# -------------------------------
# Gradio UI
# -------------------------------
# Page layout: controls in the left column, microphone input plus the
# transcript and debug text boxes in the right column.
with gr.Blocks(title="Cantonese Streaming (VAC)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Cantonese Streaming Transcription with VAC + Debug Logs")
    # Subtitle (Chinese): VAC is supported; the silence and speech thresholds
    # can be adjusted below.
    gr.Markdown("✅ 支援 VAC，並可在下方調整靜音閾值與語音閾值")

    with gr.Row():
        with gr.Column(scale=1):
            # Labels (Chinese): "Enable VAC mode", "Silence threshold (seconds)",
            # "Speech threshold".
            vac_mode = gr.Checkbox(label="啟用 VAC 模式", value=False)
            silence_slider = gr.Slider(label="靜音閾值 (秒)", minimum=0.3, maximum=1.2, value=0.8, step=0.1)
            threshold_slider = gr.Slider(label="語音閾值", minimum=0.1, maximum=0.9, value=0.5, step=0.05)
            start_btn = gr.Button("🔴 Start")
            # Stop starts out disabled; the Start/Stop callbacks toggle both buttons.
            stop_btn = gr.Button("⏹️ Stop", interactive=False)
            clear_btn = gr.Button("🗑️ Clear")

        with gr.Column(scale=2):
            mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="🎙️ Live Input")
            output = gr.Textbox(label="📝 Transcript", lines=15, autoscroll=True)
            debug_output = gr.Textbox(label="🔎 Debug Window", lines=15, autoscroll=True)

    # start_transcription returns (status, start-button update, stop-button
    # update, debug log); the status string is written into the transcript box.
    start_btn.click(start_transcription, inputs=[vac_mode, silence_slider, threshold_slider],
                    outputs=[output, start_btn, stop_btn, debug_output])
    # NOTE(review): `output` is listed twice here, receiving first the status
    # string and then the transcript returned by stop_transcription -- confirm
    # how Gradio resolves duplicate output components.
    stop_btn.click(stop_transcription, outputs=[output, start_btn, stop_btn, output, debug_output])
    clear_btn.click(clear_text, outputs=[output, debug_output])
    # Each streamed microphone chunk goes through process_stream; stream_every=0.5
    # presumably sets the chunk interval in seconds -- confirm for the Gradio version in use.
    mic.stream(process_stream, inputs=[mic], outputs=[output, debug_output], stream_every=0.5)

if __name__ == "__main__":
    # Serve the app on 0.0.0.0:7860 without a public share link.
    demo.launch(
        share=False,
        ssr_mode=False,  # disable SSR
        server_name="0.0.0.0",
        server_port=7860,
    )