Files changed: app.py (+288 -140), requirements.txt (+24 -4)

app.py CHANGED
@@ -1,154 +1,302 @@
import gradio as gr
import os
import torch
import torchaudio
import numpy as np
import onnxruntime
import whisper
import io
import librosa
import math
from huggingface_hub import snapshot_download
from funasr import AutoModel

# Utils
def resample_audio(wav, original_sample_rate, target_sample_rate):
    if original_sample_rate != target_sample_rate:
        wav = torchaudio.transforms.Resample(
            orig_freq=original_sample_rate, new_freq=target_sample_rate
        )(wav)
    return wav

def energy_norm_fn(wav):
    # Peak-normalize to 0.999 full scale; the 0.01 floor keeps near-silent
    # clips from being amplified into noise.
    if isinstance(wav, np.ndarray):
        max_data = np.max(np.abs(wav))
        wav = wav / max(max_data, 0.01) * 0.999
    else:
        max_data = torch.max(torch.abs(wav))
        wav = wav / max(max_data, 0.01) * 0.999
    return wav

def trim_silence(audio, sr, keep_left_time=0.05, keep_right_time=0.22, hop_size=240):
    # Trim leading/trailing silence (librosa, 20 dB threshold), keep a short pad
    # on each side, then round the result to a whole number of hop_size frames.
    _, index = librosa.effects.trim(audio, top_db=20, frame_length=512, hop_length=128)
    num_frames = int(math.ceil((index[1] - index[0]) / hop_size))

    left_sil_samples = int(keep_left_time * sr)
    right_sil_samples = int(keep_right_time * sr)

    wav_len = len(audio)
    start_idx = index[0] - left_sil_samples
    trim_wav = audio

    if start_idx > 0:
        trim_wav = trim_wav[start_idx:]
    else:
        trim_wav = np.pad(
            trim_wav, (abs(start_idx), 0), mode="constant", constant_values=0.0
        )
    wav_len = len(trim_wav)
    out_len = int(num_frames * hop_size + (keep_left_time + keep_right_time) * sr)

    if out_len < wav_len:
        trim_wav = trim_wav[:out_len]
    else:
        trim_wav = np.pad(
            trim_wav, (0, (out_len - wav_len)), mode="constant", constant_values=0.0
        )
    return trim_wav

class StepAudioTokenizer:
    def __init__(self):
        model_id = "dengcunqin/speech_paraformer-large_asr_nat-zh-cantonese-en-16k-vocab8501-online"
        print(f"Loading model from Hugging Face: {model_id}")
        self.model_dir = snapshot_download(model_id)

        # Load FunASR model
        print(f"Initializing AutoModel from {self.model_dir}")
        self.funasr_model = AutoModel(
            model=self.model_dir,
            model_revision="main",
            device="cpu",
            disable_update=True
        )

        kms_path = os.path.join(self.model_dir, "linguistic_tokenizer.npy")
        cosy_tokenizer_path = os.path.join(self.model_dir, "speech_tokenizer_v1.onnx")

        if not os.path.exists(kms_path):
            raise FileNotFoundError(f"KMS file not found: {kms_path}")
        if not os.path.exists(cosy_tokenizer_path):
            raise FileNotFoundError(f"Cosy tokenizer file not found: {cosy_tokenizer_path}")

        # K-means codebook applied to the Paraformer encoder output
        self.kms = torch.tensor(np.load(kms_path))

        providers = ["CPUExecutionProvider"]
        session_option = onnxruntime.SessionOptions()
        session_option.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        session_option.intra_op_num_threads = 1
        self.ort_session = onnxruntime.InferenceSession(
            cosy_tokenizer_path, sess_options=session_option, providers=providers
        )
        self.chunk_size = [0, 4, 5]
        self.encoder_chunk_look_back = 4
        self.decoder_chunk_look_back = 1

        # Identify the inference function
        if hasattr(self.funasr_model, "infer_encoder"):
            self.infer_func = self.funasr_model.infer_encoder
        elif hasattr(self.funasr_model, "model") and hasattr(self.funasr_model.model, "infer_encoder"):
            self.infer_func = self.funasr_model.model.infer_encoder
        else:
            # Try to find it in the model object if it's wrapped differently
            print("Warning: infer_encoder not found directly. Will check at runtime.")
            self.infer_func = None
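
    # Two tokenizer artifacts ship with the snapshot: `linguistic_tokenizer.npy`
    # (the k-means codebook above, producing the vq02 stream) and
    # `speech_tokenizer_v1.onnx` (a CosyVoice-style semantic tokenizer run via
    # onnxruntime, producing the vq06 stream). The CosyVoice attribution is an
    # inference from the `cosy_tokenizer_path` naming, not confirmed here.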

    def __call__(self, audio_path):
        # Load audio
        audio, sr = torchaudio.load(audio_path)
        # Mix to mono if stereo
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        _, vq02, vq06 = self.wav2token(audio, sr, False)
        text = self.merge_vq0206_to_token_str(vq02, vq06)
        return text

    def preprocess_wav(self, audio, sample_rate, enable_trim=True, energy_norm=True):
        audio = resample_audio(audio, sample_rate, 16000)
        if energy_norm:
            audio = energy_norm_fn(audio)

        if enable_trim:
            audio = audio.cpu().numpy().squeeze(0)
            audio = trim_silence(audio, 16000)
            audio = torch.from_numpy(audio)
            audio = audio.unsqueeze(0)
        return audio

    def wav2token(self, audio, sample_rate, enable_trim=True, energy_norm=True):
        audio = self.preprocess_wav(
            audio, sample_rate, enable_trim=enable_trim, energy_norm=energy_norm
        )

        vq02_ori = self.get_vq02_code(audio)
        vq02 = [int(x) + 65536 for x in vq02_ori]
        vq06_ori = self.get_vq06_code(audio)
        vq06 = [int(x) + 65536 + 1024 for x in vq06_ori]

        chunk = 1
        chunk_nums = min(len(vq06) // (3 * chunk), len(vq02) // (2 * chunk))
        speech_tokens = []
        for idx in range(chunk_nums):
            speech_tokens += vq02[idx * chunk * 2 : (idx + 1) * chunk * 2]
            speech_tokens += vq06[idx * chunk * 3 : (idx + 1) * chunk * 3]
        return speech_tokens, vq02_ori, vq06_ori
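
    # The two streams above are interleaved in a fixed 2:3 ratio (two vq02
    # codes, then three vq06 codes per chunk). The +65536 and +65536+1024
    # offsets appear to remap the raw codebook indices into a dedicated
    # audio-token id range, with vq02 occupying 1024 ids and vq06 starting
    # right after; this layout is inferred from the offsets, not documented.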

    def get_vq02_code(self, audio):
        _tmp_wav = io.BytesIO()
        torchaudio.save(_tmp_wav, audio, 16000, format="wav")
        _tmp_wav.seek(0)

        if self.infer_func is None:
            # Last-ditch effort to find it
            if hasattr(self.funasr_model, "model") and hasattr(self.funasr_model.model, "infer_encoder"):
                self.infer_func = self.funasr_model.model.infer_encoder
            elif hasattr(self.funasr_model, "infer_encoder"):
                self.infer_func = self.funasr_model.infer_encoder
            else:
                raise RuntimeError("infer_encoder method not found on FunASR model.")

        # Note: depending on the funasr version, the input may need to differ.
        # funasr usually accepts an audio path, bytes, or numpy; here we pass
        # the BytesIO object wrapped in a list, as in the original code.
        try:
            res = self.infer_func(
                input=[_tmp_wav],
                chunk_size=self.chunk_size,
                encoder_chunk_look_back=self.encoder_chunk_look_back,
                decoder_chunk_look_back=self.decoder_chunk_look_back,
                device="cpu",
                is_final=True,
                cache={}
            )
        except TypeError as e:
            print(f"Error calling infer_encoder: {e}. Trying different arguments.")
            # Maybe it doesn't accept some args
            res = self.infer_func(
                input=[_tmp_wav],
                is_final=True
            )

        if isinstance(res, tuple):
            res = res[0]

        c_list = []
        for res_ in res:
            feat = res_["enc_out"]
            if len(feat) > 0:
                c_list = self.dump_label([feat], self.kms)[0]
        return c_list

    def get_vq06_code(self, audio):
        def split_audio(audio, chunk_duration=480000):
            # Split into <=30 s chunks; drop fragments shorter than 480 samples (30 ms)
            start = 0
            chunks = []
            while start < len(audio):
                end = min(start + chunk_duration, len(audio))
                chunk = audio[start:end]
                if len(chunk) >= 480:
                    chunks.append(chunk)
                start = end
            return chunks

        audio = audio.squeeze(0)
        chunk_audios = split_audio(audio, chunk_duration=30 * 16000)
        speech_tokens = []
        for chunk in chunk_audios:
            feat = whisper.log_mel_spectrogram(chunk, n_mels=128)
            feat = feat.unsqueeze(0)
            feat_len = np.array([feat.shape[2]], dtype=np.int32)
            chunk_token = (
                self.ort_session.run(
                    None,
                    {
                        self.ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
                        self.ort_session.get_inputs()[1].name: feat_len,
                    },
                )[0]
                .flatten()
                .tolist()
            )
            speech_tokens += chunk_token

        return speech_tokens

    def kmean_cluster(self, samples, means):
        # Nearest-centroid assignment: index of the closest k-means center per frame
        dists = torch.cdist(samples, means)
        indices = dists.argmin(dim=1).cpu().numpy()
        return indices.tolist()

    def dump_label(self, samples, mean):
        # Concatenate variable-length encoder outputs, quantize every frame
        # against the k-means codebook, then split the indices back per sample.
        dims = samples[0].shape[-1]
        x_lens = [x.shape[1] for x in samples]
        total_len = sum(x_lens)
        x_sel = torch.FloatTensor(1, total_len, dims)
        start_len = 0
        for sample in samples:
            sample_len = sample.shape[1]
            end_len = start_len + sample_len
            x_sel[:, start_len:end_len] = sample
            start_len = end_len
        dense_x = x_sel.squeeze(0)
        indices = self.kmean_cluster(dense_x, mean)
        indices_list = []
        start_len = 0
        for x_len in x_lens:
            end_len = start_len + x_len
            indices_list.append(indices[start_len:end_len])
            start_len = end_len
        return indices_list

    def merge_vq0206_to_token_str(self, vq02, vq06):
        _vq06 = [1024 + x for x in vq06]
        result = []
        i = 0
        j = 0
        while i < len(vq02) - 1 and j < len(_vq06) - 2:
            sublist = vq02[i : i + 2] + _vq06[j : j + 3]
            result.extend(sublist)
            i += 2
            j += 3
        return "".join([f"<audio_{x}>" for x in result])
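
    # Worked example of the merge above (values illustrative):
    #   vq02 = [7, 8], vq06 = [1, 2, 3]
    #   _vq06 -> [1025, 1026, 1027]
    #   result -> "<audio_7><audio_8><audio_1025><audio_1026><audio_1027>"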


tokenizer = None

def process_audio(audio_path):
    # Lazily construct the tokenizer on first request so app startup stays fast
    global tokenizer
    if tokenizer is None:
        try:
            tokenizer = StepAudioTokenizer()
        except Exception as e:
            return f"Error loading model: {e}"

    try:
        if not audio_path:
            return "Please upload an audio file."
        tokens = tokenizer(audio_path)
        return tokens
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error processing audio: {e}"

if __name__ == "__main__":
    demo = gr.Interface(
        fn=process_audio,
        inputs=gr.Audio(type="filepath", label="Upload WAV"),
        outputs=gr.Textbox(label="Token String"),
        title="Step Audio Tokenizer",
        description="Upload a WAV file to convert it to a token string (<audio_XXX>)."
    )
    demo.launch()
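For reference, the tokenizer can also be exercised without the Gradio UI. A minimal sketch, assuming the file above is saved as app.py and sample.wav is a placeholder path:

    from app import StepAudioTokenizer

    tok = StepAudioTokenizer()   # downloads and initializes the models on construction
    print(tok("sample.wav"))     # prints the interleaved "<audio_N>..." string
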
requirements.txt CHANGED
@@ -1,6 +1,26 @@
accelerate
xformers
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
transformers==4.53.3
openai-whisper==20240930
onnxruntime
omegaconf==2.3.0
librosa==0.10.2.post1
sox==1.5.0
modelscope
numpy==2.2.6
six==1.16.0
hyperpyyaml
conformer==0.3.2
diffusers
pillow
sentencepiece
funasr>=1.1.3
protobuf==5.29.3
gradio==5.49.1
spaces==0.42.1
matplotlib==3.10.7
llmcompressor==0.8.1
datasets==4.0.0
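
To reproduce the environment locally, a fresh virtual environment is the safest route (a sketch; exact python/pip invocations depend on the platform):

    python -m venv .venv && . .venv/bin/activate
    pip install -r requirements.txt
    python app.py   # serves the Gradio UI, by default on http://127.0.0.1:7860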