Update with temp work
Browse files- app.py +6 -1
- internal_demo_simuleval_transcoder.py +272 -0
- requirements.txt +16 -2
- simuleval_transcoder.py +178 -0
app.py
CHANGED
|
@@ -10,6 +10,8 @@ from seamless_communication.models.inference.translator import Translator
|
|
| 10 |
|
| 11 |
|
| 12 |
from m4t_app import *
|
|
|
|
|
|
|
| 13 |
|
| 14 |
from pydub import AudioSegment
|
| 15 |
import time
|
|
@@ -19,6 +21,7 @@ from time import sleep
|
|
| 19 |
|
| 20 |
USE_M4T = True
|
| 21 |
|
|
|
|
| 22 |
|
| 23 |
def translate_audio_file_segment(audio_file):
|
| 24 |
print("translate_m4t state")
|
|
@@ -90,7 +93,9 @@ def blocks():
|
|
| 90 |
)
|
| 91 |
|
| 92 |
most_recent_input_audio_segment = gr.Audio(
|
| 93 |
-
label="Recent Input Audio Segment segments",
|
|
|
|
|
|
|
| 94 |
)
|
| 95 |
# TODO: Should add combined input audio segments...
|
| 96 |
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
from m4t_app import *
|
| 13 |
+
from simuleval_transcoder import *
|
| 14 |
+
# from simuleval_transcoder import *
|
| 15 |
|
| 16 |
from pydub import AudioSegment
|
| 17 |
import time
|
|
|
|
| 21 |
|
| 22 |
USE_M4T = True
|
| 23 |
|
| 24 |
+
Transcoder = SimulevalTranscoder()
|
| 25 |
|
| 26 |
def translate_audio_file_segment(audio_file):
|
| 27 |
print("translate_m4t state")
|
|
|
|
| 93 |
)
|
| 94 |
|
| 95 |
most_recent_input_audio_segment = gr.Audio(
|
| 96 |
+
label="Recent Input Audio Segment segments",
|
| 97 |
+
format="bytes",
|
| 98 |
+
streaming=True
|
| 99 |
)
|
| 100 |
# TODO: Should add combined input audio segments...
|
| 101 |
|
internal_demo_simuleval_transcoder.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from simuleval.utils.agent import build_system_from_dir
|
| 2 |
+
from typing import Any, Tuple
|
| 3 |
+
import numpy as np
|
| 4 |
+
import soundfile
|
| 5 |
+
from fairseq.data.audio.audio_utils import convert_waveform
|
| 6 |
+
import io
|
| 7 |
+
import asyncio
|
| 8 |
+
from simuleval.data.segments import SpeechSegment, EmptySegment
|
| 9 |
+
import threading
|
| 10 |
+
import math
|
| 11 |
+
import logging
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import time
|
| 15 |
+
from g2p_en import G2p
|
| 16 |
+
import torch
|
| 17 |
+
import traceback
|
| 18 |
+
import time
|
| 19 |
+
import random
|
| 20 |
+
|
| 21 |
+
from .speech_and_text_output import SpeechAndTextOutput
|
| 22 |
+
|
| 23 |
+
MODEL_SAMPLE_RATE = 16_000
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger()
|
| 26 |
+
logger.addHandler(logging.StreamHandler(sys.stdout))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SimulevalTranscoder:
|
| 30 |
+
def __init__(self, agent, sample_rate, debug, buffer_limit):
|
| 31 |
+
self.agent = agent
|
| 32 |
+
self.input_queue = asyncio.Queue()
|
| 33 |
+
self.output_queue = asyncio.Queue()
|
| 34 |
+
self.states = self.agent.build_states()
|
| 35 |
+
if debug:
|
| 36 |
+
self.states[0].debug = True
|
| 37 |
+
self.incoming_sample_rate = sample_rate
|
| 38 |
+
self.close = False
|
| 39 |
+
self.g2p = G2p()
|
| 40 |
+
|
| 41 |
+
# buffer all outgoing translations within this amount of time
|
| 42 |
+
self.output_buffer_idle_ms = 5000
|
| 43 |
+
self.output_buffer_size_limit = (
|
| 44 |
+
buffer_limit # phonemes for text, seconds for speech
|
| 45 |
+
)
|
| 46 |
+
self.output_buffer_cur_size = 0
|
| 47 |
+
self.output_buffer = []
|
| 48 |
+
self.speech_output_sample_rate = None
|
| 49 |
+
|
| 50 |
+
self.last_output_ts = time.time() * 1000
|
| 51 |
+
self.timeout_ms = (
|
| 52 |
+
30000 # close the transcoder thread after this amount of silence
|
| 53 |
+
)
|
| 54 |
+
self.first_input_ts = None
|
| 55 |
+
self.first_output_ts = None
|
| 56 |
+
self.output_data_type = None # speech or text
|
| 57 |
+
self.debug = debug
|
| 58 |
+
self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
|
| 59 |
+
if self.debug:
|
| 60 |
+
debug_folder = Path(__file__).resolve().parent.parent / "debug"
|
| 61 |
+
self.test_incoming_wav = soundfile.SoundFile(
|
| 62 |
+
debug_folder / f"{self.debug_ts}_test_incoming.wav",
|
| 63 |
+
mode="w+",
|
| 64 |
+
format="WAV",
|
| 65 |
+
subtype="PCM_16",
|
| 66 |
+
samplerate=self.incoming_sample_rate,
|
| 67 |
+
channels=1,
|
| 68 |
+
)
|
| 69 |
+
self.states[0].test_input_segments_wav = soundfile.SoundFile(
|
| 70 |
+
debug_folder / f"{self.debug_ts}_test_input_segments.wav",
|
| 71 |
+
mode="w+",
|
| 72 |
+
format="WAV",
|
| 73 |
+
samplerate=MODEL_SAMPLE_RATE,
|
| 74 |
+
channels=1,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
def debug_log(self, *args):
|
| 78 |
+
if self.debug:
|
| 79 |
+
logger.info(*args)
|
| 80 |
+
|
| 81 |
+
@classmethod
|
| 82 |
+
def build_agent(cls, model_path):
|
| 83 |
+
logger.info(f"Building simuleval agent: {model_path}")
|
| 84 |
+
agent = build_system_from_dir(
|
| 85 |
+
Path(__file__).resolve().parent.parent / f"models/{model_path}",
|
| 86 |
+
config_name="vad_main.yaml",
|
| 87 |
+
)
|
| 88 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 89 |
+
agent.to(device, fp16=True)
|
| 90 |
+
logger.info(
|
| 91 |
+
f"Successfully built simuleval agent {model_path} on device {device}"
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
return agent
|
| 95 |
+
|
| 96 |
+
def process_incoming_bytes(self, incoming_bytes):
|
| 97 |
+
segment, _sr = self._preprocess_wav(incoming_bytes)
|
| 98 |
+
# # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
|
| 99 |
+
self.input_queue.put_nowait(segment)
|
| 100 |
+
|
| 101 |
+
def get_input_segment(self):
|
| 102 |
+
if self.input_queue.empty():
|
| 103 |
+
return None
|
| 104 |
+
chunk = self.input_queue.get_nowait()
|
| 105 |
+
self.input_queue.task_done()
|
| 106 |
+
return chunk
|
| 107 |
+
|
| 108 |
+
def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
|
| 109 |
+
segment, sample_rate = soundfile.read(
|
| 110 |
+
io.BytesIO(data),
|
| 111 |
+
dtype="float32",
|
| 112 |
+
always_2d=True,
|
| 113 |
+
frames=-1,
|
| 114 |
+
start=0,
|
| 115 |
+
format="RAW",
|
| 116 |
+
subtype="PCM_16",
|
| 117 |
+
samplerate=self.incoming_sample_rate,
|
| 118 |
+
channels=1,
|
| 119 |
+
)
|
| 120 |
+
if self.debug:
|
| 121 |
+
self.test_incoming_wav.seek(0, soundfile.SEEK_END)
|
| 122 |
+
self.test_incoming_wav.write(segment)
|
| 123 |
+
|
| 124 |
+
segment = segment.T
|
| 125 |
+
segment, new_sample_rate = convert_waveform(
|
| 126 |
+
segment,
|
| 127 |
+
sample_rate,
|
| 128 |
+
normalize_volume=False,
|
| 129 |
+
to_mono=True,
|
| 130 |
+
to_sample_rate=MODEL_SAMPLE_RATE,
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
assert MODEL_SAMPLE_RATE == new_sample_rate
|
| 134 |
+
segment = segment.squeeze(axis=0)
|
| 135 |
+
return segment, new_sample_rate
|
| 136 |
+
|
| 137 |
+
def process_pipeline_impl(self, input_segment):
|
| 138 |
+
try:
|
| 139 |
+
output_segment = self.agent.pushpop(input_segment, self.states)
|
| 140 |
+
if (
|
| 141 |
+
self.states[0].first_input_ts is not None
|
| 142 |
+
and self.first_input_ts is None
|
| 143 |
+
):
|
| 144 |
+
# TODO: this is hacky
|
| 145 |
+
self.first_input_ts = self.states[0].first_input_ts
|
| 146 |
+
|
| 147 |
+
if not output_segment.is_empty:
|
| 148 |
+
self.output_queue.put_nowait(output_segment)
|
| 149 |
+
|
| 150 |
+
if output_segment.finished:
|
| 151 |
+
self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
|
| 152 |
+
|
| 153 |
+
for state in self.states:
|
| 154 |
+
state.reset()
|
| 155 |
+
|
| 156 |
+
if self.debug:
|
| 157 |
+
# when we rebuild states, this value is reset to whatever
|
| 158 |
+
# is in the system dir config, which defaults debug=False.
|
| 159 |
+
self.states[0].debug = True
|
| 160 |
+
except Exception as e:
|
| 161 |
+
logger.error(f"Got exception while processing pipeline: {e}")
|
| 162 |
+
traceback.print_exc()
|
| 163 |
+
return input_segment
|
| 164 |
+
|
| 165 |
+
def process_pipeline_loop(self):
|
| 166 |
+
if self.close:
|
| 167 |
+
return # closes the thread
|
| 168 |
+
|
| 169 |
+
self.debug_log("processing_pipeline")
|
| 170 |
+
while not self.close:
|
| 171 |
+
input_segment = self.get_input_segment()
|
| 172 |
+
if input_segment is None:
|
| 173 |
+
if self.states[0].is_fresh_state: # TODO: this is hacky
|
| 174 |
+
time.sleep(0.3)
|
| 175 |
+
else:
|
| 176 |
+
time.sleep(0.03)
|
| 177 |
+
continue
|
| 178 |
+
self.process_pipeline_impl(input_segment)
|
| 179 |
+
self.debug_log("finished processing_pipeline")
|
| 180 |
+
|
| 181 |
+
def process_pipeline_once(self):
|
| 182 |
+
if self.close:
|
| 183 |
+
return
|
| 184 |
+
|
| 185 |
+
self.debug_log("processing pipeline once")
|
| 186 |
+
input_segment = self.get_input_segment()
|
| 187 |
+
if input_segment is None:
|
| 188 |
+
return
|
| 189 |
+
self.process_pipeline_impl(input_segment)
|
| 190 |
+
self.debug_log("finished processing_pipeline_once")
|
| 191 |
+
|
| 192 |
+
def get_output_segment(self):
|
| 193 |
+
if self.output_queue.empty():
|
| 194 |
+
return None
|
| 195 |
+
|
| 196 |
+
output_chunk = self.output_queue.get_nowait()
|
| 197 |
+
self.output_queue.task_done()
|
| 198 |
+
return output_chunk
|
| 199 |
+
|
| 200 |
+
def start(self):
|
| 201 |
+
self.debug_log("starting transcoder in a thread")
|
| 202 |
+
threading.Thread(target=self.process_pipeline_loop).start()
|
| 203 |
+
|
| 204 |
+
def first_translation_time(self):
|
| 205 |
+
return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
|
| 206 |
+
|
| 207 |
+
def get_buffered_output(self) -> SpeechAndTextOutput:
|
| 208 |
+
now = time.time() * 1000
|
| 209 |
+
self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
|
| 210 |
+
while not self.output_queue.empty():
|
| 211 |
+
tmp_out = self.get_output_segment()
|
| 212 |
+
if tmp_out and len(tmp_out.content) > 0:
|
| 213 |
+
if not self.output_data_type:
|
| 214 |
+
self.output_data_type = tmp_out.data_type
|
| 215 |
+
if len(self.output_buffer) == 0:
|
| 216 |
+
self.last_output_ts = now
|
| 217 |
+
self._populate_output_buffer(tmp_out)
|
| 218 |
+
self._increment_output_buffer_size(tmp_out)
|
| 219 |
+
|
| 220 |
+
if tmp_out.finished:
|
| 221 |
+
res = self._gather_output_buffer_data(final=True)
|
| 222 |
+
self.output_buffer = []
|
| 223 |
+
self.increment_output_buffer_size = 0
|
| 224 |
+
self.last_output_ts = now
|
| 225 |
+
self.first_output_ts = now
|
| 226 |
+
return res
|
| 227 |
+
|
| 228 |
+
if len(self.output_buffer) > 0 and (
|
| 229 |
+
now - self.last_output_ts >= self.output_buffer_idle_ms
|
| 230 |
+
or self.output_buffer_cur_size >= self.output_buffer_size_limit
|
| 231 |
+
):
|
| 232 |
+
self.last_output_ts = now
|
| 233 |
+
res = self._gather_output_buffer_data(final=False)
|
| 234 |
+
self.output_buffer = []
|
| 235 |
+
self.output_buffer_phoneme_count = 0
|
| 236 |
+
self.first_output_ts = now
|
| 237 |
+
return res
|
| 238 |
+
else:
|
| 239 |
+
return None
|
| 240 |
+
|
| 241 |
+
def _gather_output_buffer_data(self, final):
|
| 242 |
+
if self.output_data_type == "text":
|
| 243 |
+
return SpeechAndTextOutput(text=" ".join(self.output_buffer), final=final)
|
| 244 |
+
elif self.output_data_type == "speech":
|
| 245 |
+
return SpeechAndTextOutput(
|
| 246 |
+
speech_samples=self.output_buffer,
|
| 247 |
+
speech_sample_rate=MODEL_SAMPLE_RATE,
|
| 248 |
+
final=final,
|
| 249 |
+
)
|
| 250 |
+
else:
|
| 251 |
+
raise ValueError(
|
| 252 |
+
f"Invalid output buffer data type: {self.output_data_type}"
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
def _increment_output_buffer_size(self, segment):
|
| 256 |
+
if segment.data_type == "text":
|
| 257 |
+
self.output_buffer_cur_size += self._compute_phoneme_count(segment.content)
|
| 258 |
+
elif segment.data_type == "speech":
|
| 259 |
+
self.output_buffer_cur_size += (
|
| 260 |
+
len(segment.content) / MODEL_SAMPLE_RATE
|
| 261 |
+
) # seconds
|
| 262 |
+
|
| 263 |
+
def _populate_output_buffer(self, segment):
|
| 264 |
+
if segment.data_type == "text":
|
| 265 |
+
self.output_buffer.append(segment.content)
|
| 266 |
+
elif segment.data_type == "speech":
|
| 267 |
+
self.output_buffer += segment.content
|
| 268 |
+
else:
|
| 269 |
+
raise ValueError(f"Invalid segment data type: {segment.data_type}")
|
| 270 |
+
|
| 271 |
+
def _compute_phoneme_count(self, string: str) -> int:
|
| 272 |
+
return len([x for x in self.g2p(string) if x != " "])
|
requirements.txt
CHANGED
|
@@ -1,9 +1,23 @@
|
|
| 1 |
# fairseq2==0.1.0
|
|
|
|
|
|
|
| 2 |
git+https://github.com/mduppes/fairseq2.git@93420c86ba01349ee8f90d7adda439b666b50557
|
| 3 |
-
git+https://github.com/facebookresearch/seamless_communication
|
|
|
|
|
|
|
|
|
|
| 4 |
gradio==3.41.0
|
| 5 |
huggingface_hub==0.16.4
|
| 6 |
torch==2.0.1
|
| 7 |
torchaudio==2.0.2
|
| 8 |
transformers==4.32.1
|
| 9 |
-
pydub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# fairseq2==0.1.0
|
| 2 |
+
|
| 3 |
+
# Temp to skip
|
| 4 |
git+https://github.com/mduppes/fairseq2.git@93420c86ba01349ee8f90d7adda439b666b50557
|
| 5 |
+
# git+https://github.com/facebookresearch/seamless_communication
|
| 6 |
+
./seamless_communication
|
| 7 |
+
# comment this out to test fairseq1 first
|
| 8 |
+
# git+https://github.com/facebookresearch/SimulEval.git
|
| 9 |
gradio==3.41.0
|
| 10 |
huggingface_hub==0.16.4
|
| 11 |
torch==2.0.1
|
| 12 |
torchaudio==2.0.2
|
| 13 |
transformers==4.32.1
|
| 14 |
+
pydub
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Can't install the fairseq1 pipeline requirements alongside the above; it causes a pip dependency conflict:
|
| 18 |
+
#The conflict is caused by:
|
| 19 |
+
# The user requested simuleval 1.1.0 (from git+ssh://****@github.com/facebookresearch/SimulEval.git@tree_pipeline)
|
| 20 |
+
# seamless-communication 1.0.0 depends on simuleval 1.0.3.dev36+gd84fa60 (from git+https://github.com/mduppes/SimulEval.git@main)
|
| 21 |
+
# From fairseq1 pipeline
|
| 22 |
+
# git+ssh://git@github.com/fairinternal/fairseq-py.git@emma_incremental_decoder
|
| 23 |
+
# git+ssh://git@github.com/facebookresearch/SimulEval.git@tree_pipeline
|
simuleval_transcoder.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Callable, Dict, List, Optional, Tuple, Union
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from fairseq2.assets.card import AssetCard
|
| 8 |
+
from fairseq2.data import Collater
|
| 9 |
+
from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
|
| 10 |
+
from fairseq2.data.text.text_tokenizer import TextTokenizer
|
| 11 |
+
from fairseq2.data.typing import StringLike
|
| 12 |
+
from fairseq2.generation import SequenceToTextOutput, SequenceGeneratorOptions
|
| 13 |
+
from fairseq2.memory import MemoryBlock
|
| 14 |
+
from fairseq2.typing import DataType, Device
|
| 15 |
+
from torch import Tensor
|
| 16 |
+
from enum import Enum, auto
|
| 17 |
+
from seamless_communication.models.inference.ngram_repeat_block_processor import (
|
| 18 |
+
NGramRepeatBlockProcessor,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
from seamless_communication.models.unity import (
|
| 22 |
+
UnitTokenizer,
|
| 23 |
+
UnitYGenerator,
|
| 24 |
+
UnitYModel,
|
| 25 |
+
load_unity_model,
|
| 26 |
+
load_unity_text_tokenizer,
|
| 27 |
+
load_unity_unit_tokenizer,
|
| 28 |
+
)
|
| 29 |
+
from seamless_communication.models.unity.generator import SequenceToUnitOutput
|
| 30 |
+
from seamless_communication.models.vocoder import load_vocoder_model, Vocoder
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
from seamless_communication.models.streaming.agents import (
|
| 35 |
+
SileroVADAgent,
|
| 36 |
+
TestTimeWaitKS2TVAD,
|
| 37 |
+
TestTimeWaitKUnityV1M4T
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
### From test_pipeline
|
| 41 |
+
import math
|
| 42 |
+
import soundfile
|
| 43 |
+
from argparse import Namespace, ArgumentParser
|
| 44 |
+
from simuleval.data.segments import SpeechSegment, EmptySegment
|
| 45 |
+
from simuleval.utils import build_system_from_dir
|
| 46 |
+
from pathlib import Path
|
| 47 |
+
import numpy as np
|
| 48 |
+
|
| 49 |
+
class AudioFrontEnd:
    """Replays a wav file as a stream of fixed-duration segments,
    mimicking the front-end behavior of simuleval's instance.py."""

    def __init__(self, wav_file, segment_size) -> None:
        self.samples, self.sample_rate = soundfile.read(wav_file)
        self.samples = self.samples.tolist()
        self.segment_size = segment_size  # segment duration, milliseconds
        self.step = 0  # read cursor, in samples

    def send_segment(self):
        """
        This is the front-end logic in simuleval instance.py
        """
        num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)
        print("self.segment_size", self.segment_size)
        print("num_samples is", num_samples)
        print("self.sample_rate is", self.sample_rate)

        total = len(self.samples)
        if self.step >= total:
            # Audio exhausted: report a final, empty segment.
            return EmptySegment(
                index=self.step / self.sample_rate * 1000,
                finished=True,
            )

        end = self.step + num_samples
        is_finished = end >= total
        chunk = self.samples[self.step:] if is_finished else self.samples[self.step:end]
        self.step = min(end, total)
        # index is the cursor position after this segment, in milliseconds
        return SpeechSegment(
            index=self.step / self.sample_rate * 1000,
            content=chunk,
            sample_rate=self.sample_rate,
            finished=is_finished,
        )
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_model_for_inference(
    load_model_fn: Callable[..., nn.Module],
    model_name_or_card: Union[str, AssetCard],
    device: Device,
    dtype: DataType,
) -> nn.Module:
    """Load a model via *load_model_fn* and put it in inference mode.

    The loader is called with the card/name plus device and dtype as
    keywords; the returned module is switched to eval() (disabling
    dropout / training-only behavior) before being handed back.
    """
    loaded = load_model_fn(model_name_or_card, device=device, dtype=dtype)
    loaded.eval()
    return loaded
|
| 100 |
+
|
| 101 |
+
class SimulevalTranscoder:
    """Demo transcoder: loads a seamlessM4T model, streams a sample wav
    file through a simuleval pipeline, and writes the VAD-segmented input
    audio back out as wav files.

    NOTE(review): throwaway/demo code — input and output wav paths are
    hard-coded to /checkpoint/mduppes/samples, and the whole demo runs
    inside __init__.
    """

    # def __init__(self, agent, sample_rate, debug, buffer_limit):
    def __init__(self):
        print("MDUPPES in here", SileroVADAgent, TestTimeWaitKS2TVAD)
        # Demo runs on CPU; the CUDA path is intentionally disabled here.
        device = "cpu"
        print("DEVICE", device)
        model_name_or_card = "seamlessM4T_medium"
        vocoder_name_or_card = "vocoder_36langs"  # currently unused in this demo
        # For CPU mode we must use float32; float16 causes errors downstream.
        # (Fixed: original had the chained-assignment typo `dtype=dtype=torch.float32`.)
        dtype = torch.float32

        model: UnitYModel = load_model_for_inference(
            load_unity_model, model_name_or_card, device, dtype
        )

        print(model, type(model))
        source_segment_size = 320  # milliseconds
        audio_frontend = AudioFrontEnd(
            wav_file="/checkpoint/mduppes/samples/marta.wav",
            segment_size=source_segment_size,
        )

        # mostly taken from S2S first agent: OnlineFeatureExtractorAgent defaults
        SHIFT_SIZE = 10
        WINDOW_SIZE = 25
        SAMPLE_RATE = 16000
        FEATURE_DIM = 80  # NOTE(review): unused — feature_dim below is 160

        # args, converted to a Namespace so fields can be accessed via `.`
        args = {
            "shift_size": SHIFT_SIZE,
            "window_size": WINDOW_SIZE,
            "sample_rate": audio_frontend.sample_rate,
            "feature_dim": 160,  # from Wav2Vec2Frontend
            "denormalize": False,  # not sure..
            "global_stats": None,  # default file path containing cmvn stats..
        }
        print(args)
        args = Namespace(**args)

        pipeline = TestTimeWaitKUnityV1M4T(model, args)
        system_states = pipeline.build_states()
        print('system states')
        print(system_states)
        # Accumulates the raw input audio matching the current VAD segment.
        input_segment = np.empty(0, dtype=np.int16)
        segments = []
        while True:
            speech_segment = audio_frontend.send_segment()
            input_segment = np.concatenate((input_segment, np.array(speech_segment.content)))
            # Translation happens here
            output_segment = pipeline.pushpop(speech_segment, system_states)
            print('pushpop result')
            print(output_segment)
            if output_segment.finished:
                # One VAD segment fully translated: keep its input audio
                # and reset the pipeline for the next segment.
                segments.append(input_segment)
                input_segment = np.empty(0, dtype=np.int16)
                print("Resetting states")
                for state in system_states:
                    state.reset()
            if speech_segment.finished:
                break
        # The VAD-segmented samples from the full input audio
        for i, seg in enumerate(segments):
            with soundfile.SoundFile(
                Path("/checkpoint/mduppes/samples") / f"marta_{i}.wav",
                mode="w+",
                format="WAV",
                samplerate=16000,
                channels=1,
            ) as f:
                f.seek(0, soundfile.SEEK_END)
                f.write(seg)
|
| 178 |
+
|