wanglamao committed on
Commit 528efee · 1 Parent(s): e55409d
Files changed (42)
  1. app.py +206 -0
  2. data_utils/__init__.py +0 -0
  3. data_utils/audio_dataset_ark_audio.py +414 -0
  4. gpa_inference.py +293 -0
  5. models/__init__.py +0 -0
  6. models/bicodec_tokenizer/__init__.py +0 -0
  7. models/bicodec_tokenizer/base_model.py +87 -0
  8. models/bicodec_tokenizer/batch_processor.py +182 -0
  9. models/bicodec_tokenizer/models/__init__.py +0 -0
  10. models/bicodec_tokenizer/models/audio_tokenizer.py +164 -0
  11. models/bicodec_tokenizer/models/bicodec.py +248 -0
  12. models/bicodec_tokenizer/modules/blocks/layers.py +73 -0
  13. models/bicodec_tokenizer/modules/blocks/samper.py +115 -0
  14. models/bicodec_tokenizer/modules/blocks/vocos.py +373 -0
  15. models/bicodec_tokenizer/modules/encoder_decoder/feat_decoder.py +115 -0
  16. models/bicodec_tokenizer/modules/encoder_decoder/feat_encoder.py +107 -0
  17. models/bicodec_tokenizer/modules/encoder_decoder/wave_generator.py +88 -0
  18. models/bicodec_tokenizer/modules/fsq/finite_scalar_quantization.py +251 -0
  19. models/bicodec_tokenizer/modules/fsq/residual_fsq.py +355 -0
  20. models/bicodec_tokenizer/modules/speaker/__init__.py +0 -0
  21. models/bicodec_tokenizer/modules/speaker/ecapa_tdnn.py +267 -0
  22. models/bicodec_tokenizer/modules/speaker/perceiver_encoder.py +360 -0
  23. models/bicodec_tokenizer/modules/speaker/pooling_layers.py +298 -0
  24. models/bicodec_tokenizer/modules/speaker/speaker_encoder.py +136 -0
  25. models/bicodec_tokenizer/modules/vq/factorized_vector_quantize.py +187 -0
  26. models/bicodec_tokenizer/spark_detokenizer.py +106 -0
  27. models/bicodec_tokenizer/spark_tokenizer.py +244 -0
  28. models/bicodec_tokenizer/tokenizer_utils.py +44 -0
  29. models/bicodec_tokenizer/utils/__init__.py +0 -0
  30. models/bicodec_tokenizer/utils/audio.py +271 -0
  31. models/bicodec_tokenizer/utils/file.py +221 -0
  32. models/bicodec_tokenizer/utils/parse_options.sh +97 -0
  33. models/bicodec_tokenizer/utils/token_parser.py +187 -0
  34. models/glm_speech_tokenizer/__init__.py +0 -0
  35. models/glm_speech_tokenizer/batch_processor.py +182 -0
  36. models/glm_speech_tokenizer/configuration_whisper.py +37 -0
  37. models/glm_speech_tokenizer/generation_whisper.py +1828 -0
  38. models/glm_speech_tokenizer/modeling_whisper.py +0 -0
  39. models/glm_speech_tokenizer/speech_token_extractor.py +126 -0
  40. models/glm_speech_tokenizer/test_speech_token_extractor.py +136 -0
  41. models/glm_speech_tokenizer/utils.py +89 -0
  42. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,206 @@
1
+ # -*- coding: utf-8 -*-
2
+ import gradio as gr
3
+ import os
4
+ import torch
5
+ import argparse
6
+ import librosa
7
+ import soundfile as sf
8
+
9
+ from gpa_inference import GPAInference
10
+
11
+ # Global inference object placeholder
12
+ inference = None
13
+
14
+
15
+ def preprocess_audio(audio_path):
16
+ """Ensure audio is 16kHz mono"""
17
+ if not audio_path:
18
+ return None
19
+ try:
20
+ # Load audio with librosa: automatically resamples to sr=16000 and converts to mono
21
+ y, _ = librosa.load(audio_path, sr=16000, mono=True)
22
+
23
+ # Save processed audio to a new file to avoid conflicts
24
+ dir_name = os.path.dirname(audio_path)
25
+ base_name = os.path.basename(audio_path)
26
+ name, ext = os.path.splitext(base_name)
27
+ new_path = os.path.join(dir_name, f"{name}_16k.wav")
28
+
29
+ sf.write(new_path, y, 16000)
30
+ print(f"Preprocessed audio saved to: {new_path}")
31
+ return new_path
32
+ except Exception as e:
33
+ print(f"Error processing audio {audio_path}: {e}")
34
+ return audio_path
35
+
36
+
37
+ # ======================== Interface Call Logic ========================
38
+
39
+ def process_stt(audio_path):
40
+ global inference
41
+ if inference is None:
42
+ return "Model not initialized."
43
+
44
+ if not audio_path:
45
+ return "Please upload audio first."
46
+
47
+ # Preprocess audio
48
+ audio_path = preprocess_audio(audio_path)
49
+
50
+ # Direct inference call
51
+ return inference.run_stt(audio_path=audio_path, do_sample=False)
52
+
53
+ def process_tts_a(text, ref_audio):
54
+ global inference
55
+ if inference is None:
56
+ return None
57
+
58
+ if not text or not ref_audio:
59
+ return None
60
+
61
+ # Preprocess audio
62
+ ref_audio = preprocess_audio(ref_audio)
63
+
64
+ # Direct inference call
65
+ return inference.run_tts(
66
+ task="tts-a",
67
+ output_filename="tts_output.wav",
68
+ text=text,
69
+ ref_audio_path=ref_audio,
70
+ temperature=0.8,
71
+ do_sample=True,
72
+ )
73
+
74
+ def process_vc(src_audio, ref_audio):
75
+ global inference
76
+ if inference is None:
77
+ return None
78
+
79
+ if not src_audio or not ref_audio:
80
+ return None
81
+
82
+ # Preprocess audio
83
+ src_audio = preprocess_audio(src_audio)
84
+ ref_audio = preprocess_audio(ref_audio)
85
+
86
+ # Direct inference call
87
+ return inference.run_vc(
88
+ source_audio_path=src_audio,
89
+ ref_audio_path=ref_audio,
90
+ output_filename="vc_output.wav",
91
+ )
92
+
93
+ # ======================== Gradio UI Layout ========================
94
+
95
+ # Use a soft, premium theme with indigo/slate colors to replace the default orange
96
+ theme = gr.themes.Soft(
97
+ primary_hue="indigo",
98
+ secondary_hue="slate",
99
+ neutral_hue="slate",
100
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
101
+ )
102
+
103
+ with gr.Blocks(title="General Purpose Audio System", theme=theme) as demo:
104
+ gr.Markdown("# General Purpose Audio System")
105
+ gr.Markdown("A full-featured STT, TTS, and VC demo interface based on GPAInference.")
106
+
107
+ with gr.Tabs():
108
+ # --- STT Tab ---
109
+ with gr.TabItem("🎙️ Speech to Text (STT)"):
110
+ with gr.Row():
111
+ stt_input = gr.Audio(label="Input Audio", type="filepath")
112
+ stt_output = gr.Textbox(label="Recognition Result", placeholder="Recognition result will be displayed here in real-time...", lines=5)
113
+ stt_btn = gr.Button("Start Recognition", variant="primary")
114
+ stt_btn.click(process_stt, inputs=stt_input, outputs=stt_output)
115
+
116
+ # --- TTS-A Tab ---
117
+ with gr.TabItem("👤 Text to Speech (TTS)"):
118
+ with gr.Row():
119
+ with gr.Column():
120
+ ttsa_text = gr.Textbox(label="Synthesis Text", value="Hello, I am generated by voice cloning.")
121
+ ttsa_ref = gr.Audio(label="Reference Audio (Voice Source)", type="filepath")
122
+ ttsa_output = gr.Audio(label="Synthesis Result")
123
+ ttsa_btn = gr.Button("Synthesize Now", variant="primary")
124
+ ttsa_btn.click(process_tts_a, inputs=[ttsa_text, ttsa_ref], outputs=ttsa_output)
125
+
126
+ # --- VC Tab ---
127
+ with gr.TabItem("🎭 Voice Conversion (VC)"):
128
+ with gr.Row():
129
+ with gr.Column():
130
+ vc_src = gr.Audio(label="Source Audio (Content Source)", type="filepath")
131
+ vc_ref = gr.Audio(label="Reference Audio (Voice Source)", type="filepath")
132
+ vc_output = gr.Audio(label="Conversion Result")
133
+ vc_btn = gr.Button("Start Conversion", variant="primary")
134
+ vc_btn.click(process_vc, inputs=[vc_src, vc_ref], outputs=vc_output)
135
+
136
+
137
+ def parse_args():
138
+ parser = argparse.ArgumentParser(description="GPA Audio System GUI")
139
+
140
+ # Model Paths
141
+ parser.add_argument(
142
+ "--tokenizer_path",
143
+ type=str,
144
+ default="/data3/gpa_ckpt/gpa_final/glm-4-voice-tokenizer",
145
+ help="Path to GLM4 tokenizer",
146
+ )
147
+ parser.add_argument(
148
+ "--text_tokenizer_path",
149
+ type=str,
150
+ default="/data3/gpa_ckpt/gpa_final",
151
+ help="Path to text tokenizer",
152
+ )
153
+ parser.add_argument(
154
+ "--bicodec_tokenizer_path",
155
+ type=str,
156
+ default="/data3/gpa_ckpt/gpa_final/BiCodec/",
157
+ help="Path to BiCodec tokenizer",
158
+ )
159
+ parser.add_argument(
160
+ "--gpa_model_path",
161
+ type=str,
162
+ default="/data3/gpa_ckpt/gpa_final",
163
+ help="Path to GPA model",
164
+ )
165
+
166
+ # System Config
167
+ parser.add_argument(
168
+ "--output_dir",
169
+ type=str,
170
+ default="./output_gui",
171
+ help="Directory to save output files",
172
+ )
173
+ parser.add_argument(
174
+ "--device",
175
+ type=str,
176
+ default="cuda" if torch.cuda.is_available() else "cpu",
177
+ help="Device to use",
178
+ )
179
+
180
+ # Server Config
181
+ parser.add_argument(
182
+ "--server_name", type=str, default="0.0.0.0", help="Address for Gradio server"
183
+ )
184
+ parser.add_argument(
185
+ "--server_port", type=int, default=7868, help="Port for Gradio server"
186
+ )
187
+
188
+ return parser.parse_args()
189
+
190
+ args = parse_args()
191
+
192
+ # Instantiate Model
193
+ print(f"Initializing GPA Inference System on {args.device}...")
194
+ os.makedirs(args.output_dir, exist_ok=True)
195
+
196
+ inference = GPAInference(
197
+ tokenizer_path=args.tokenizer_path,
198
+ text_tokenizer_path=args.text_tokenizer_path,
199
+ bicodec_tokenizer_path=args.bicodec_tokenizer_path,
200
+ gpa_model_path=args.gpa_model_path,
201
+ output_dir=args.output_dir,
202
+ device=args.device,
203
+ )
204
+
205
+ # Launch Gradio Demo
206
+ demo.queue().launch(server_name=args.server_name, server_port=args.server_port)
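Note: the callbacks above are thin wrappers around GPAInference, so the same flow can be sanity-checked without the Gradio UI. A minimal sketch, assuming locally available checkpoints (the paths below are placeholders, not files shipped with this commit):

# Sketch: drive the STT path directly, bypassing Gradio (placeholder paths).
import torch
from gpa_inference import GPAInference

engine = GPAInference(
    tokenizer_path="/path/to/glm-4-voice-tokenizer",   # placeholder
    text_tokenizer_path="/path/to/gpa_final",          # placeholder
    bicodec_tokenizer_path="/path/to/BiCodec/",        # placeholder
    gpa_model_path="/path/to/gpa_final",               # placeholder
    output_dir="./output_gui",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
print(engine.run_stt(audio_path="example_16k_mono.wav", do_sample=False))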
data_utils/__init__.py ADDED
File without changes
data_utils/audio_dataset_ark_audio.py ADDED
@@ -0,0 +1,414 @@
1
+ import os
2
+ import re
3
+ from models.bicodec_tokenizer.spark_tokenizer import SparkTokenizer
4
+ from models.glm_speech_tokenizer.speech_token_extractor import SpeechTokenExtractor
5
+ from models.glm_speech_tokenizer.modeling_whisper import WhisperVQEncoder
6
+ from transformers import PreTrainedTokenizer,AutoTokenizer,WhisperFeatureExtractor
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import logging
10
+ from typing import List, Dict, Any, Literal, Optional, Union
11
+ from datasets import load_dataset
12
+ from torch.utils.data import DataLoader
13
+
14
+ def has_punctuation(text: str) -> bool:
15
+ # Matches common Chinese and English punctuation marks
16
+ pattern = r"[,。!?;:()“”‘’、,.!?;:()\[\]{}\"']"
17
+ return bool(re.search(pattern, text))
18
+
19
+ ALL_TASKS = ["stt", "tts-a", "vc"]
20
+
21
+
22
+ class ark_infer_processor:
23
+ def __init__(
24
+ self,
25
+ glm_tokenizer: SpeechTokenExtractor,
26
+ bicodec_tokenizer: SparkTokenizer,
27
+ text_tokenizer: PreTrainedTokenizer,
28
+ max_length: int = 512,
29
+ glm_semantic_token_offset: int = 151727,
30
+ semantic_token_offset: int = 172207,
31
+ global_token_offset: int = 168111,
32
+ audio_path_name: str = "audio",
33
+ device: str = "cpu",
34
+ ):
35
+ self.glm_tokenizer = glm_tokenizer
36
+ self.bicodec_tokenizer = bicodec_tokenizer
37
+ self.text_tokenizer = text_tokenizer
38
+ self.max_length = max_length
39
+ self.glm_semantic_token_offset = glm_semantic_token_offset
40
+ self.semantic_token_offset = semantic_token_offset
41
+ self.global_token_offset = global_token_offset
42
+ self.device = device
43
+ self.audio_path_name = audio_path_name
44
+
45
+ def _process_example_stt(self, audio_path: str):
46
+
47
+ ## target audio
48
+ with torch.no_grad():
49
+ glm_semantic_tokens = self.glm_tokenizer.extract([audio_path])
50
+ glm_semantic_tokens = torch.as_tensor(glm_semantic_tokens, device="cpu", dtype=torch.long)
51
+
52
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
53
+ glm_semantic_tokens_list = (
54
+ (glm_semantic_tokens + self.glm_semantic_token_offset).cpu().tolist()[0]
55
+ )
56
+ semantic_tokens_list = (
57
+ (semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
58
+ )
59
+ input_ids = (
60
+ self.text_tokenizer.encode("<|start_glm_token|>")
61
+ + glm_semantic_tokens_list
62
+ + self.text_tokenizer.encode("<|end_glm_token|>")
63
+ + self.text_tokenizer.encode("<|start_semantic_token|>")
64
+ + semantic_tokens_list
65
+ + self.text_tokenizer.encode("<|end_semantic_token|>")
66
+ + self.text_tokenizer.encode("<|start_content|>")
67
+ )
68
+ attention_mask = [1] * (len(input_ids))
69
+ return input_ids, attention_mask
70
+
71
+ def _process_example_tts_a(self, text: str, ref_audio_path: str):
72
+ with torch.no_grad():
73
+ global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
74
+ all_text = "<|start_content|>" + text + "<|end_content|>"
75
+ global_tokens_list = (
76
+ (global_tokens + self.global_token_offset).cpu().tolist()[0][0]
77
+ )
78
+ text_tokens = self.text_tokenizer(
79
+ all_text, truncation=True, max_length=self.max_length
80
+ )
81
+ input_ids = (
82
+ self.text_tokenizer.encode("<|start_global_token|>")
83
+ + global_tokens_list
84
+ + self.text_tokenizer.encode("<|end_global_token|>")
85
+ + text_tokens["input_ids"]
86
+ )
87
+ attention_mask = [1] * len(input_ids)
88
+ return input_ids, attention_mask
89
+
90
+ def _process_example_vc(self, audio_path: str, ref_audio_path: str):
91
+ with torch.no_grad():
92
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
93
+ new_global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
94
+ semantic_tokens_list = (
95
+ (semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
96
+ )
97
+ new_global_tokens_list = (
98
+ (new_global_tokens + self.global_token_offset).cpu().tolist()[0][0]
99
+ )
100
+ all_str = (
101
+ "<|start_global_token|>"
102
+ + self.text_tokenizer.decode(new_global_tokens_list)
103
+ + "<|end_global_token|>"
104
+ + "<|start_semantic_token|>"
105
+ + self.text_tokenizer.decode(semantic_tokens_list)
106
+ + "<|end_semantic_token|>"
107
+ + "<|end_content|>"
108
+ )
109
+
110
+ inputs = self.text_tokenizer(all_str)
111
+ input_ids = inputs["input_ids"]
112
+ attention_mask = inputs["attention_mask"]
113
+ return input_ids, attention_mask
114
+
115
+ def process_input(
116
+ self,
117
+ task: Literal["stt", "tts-a", "vc"],
118
+ audio_path: str | None = None,
119
+ ref_audio_path: str | None = None,
120
+ text: str | None = None,
121
+ ):
122
+ """Load the specified audio and features, and return tokenized inputs for the given task."""
123
+
124
+ if task == "stt":
125
+ assert audio_path is not None
126
+ input_ids, attention_mask = self._process_example_stt(audio_path)
127
+ elif task == "tts-a":
128
+ assert ref_audio_path is not None and text is not None
129
+ input_ids, attention_mask = self._process_example_tts_a(
130
+ text, ref_audio_path
131
+ )
132
+ elif task == "vc":
133
+ assert audio_path is not None and ref_audio_path is not None
134
+ input_ids, attention_mask = self._process_example_vc(
135
+ audio_path, ref_audio_path
136
+ )
137
+ else:
138
+ raise ValueError(
139
+ f"Unsupported task: {task}, all supported tasks: {ALL_TASKS}"
140
+ )
141
+ return {
142
+ "input_ids": input_ids,
143
+ "attention_mask": attention_mask,
144
+ }
145
+
146
+
147
+ class ark_processor:
148
+ def __init__(self,
149
+ glm_tokenizer: SpeechTokenExtractor,
150
+ bicodec_tokenizer: SparkTokenizer,
151
+ text_tokenizer:PreTrainedTokenizer,
152
+ max_length:int = 512,
153
+ glm_semantic_token_offset:int = 151727,
154
+ semantic_token_offset: int =172207,
155
+ global_token_offset: int =168111,
156
+ audio_path_name:str = "audio",
157
+ device:str ='cpu'):
158
+ self.glm_tokenizer = glm_tokenizer
159
+ self.bicodec_tokenizer = bicodec_tokenizer
160
+ self.text_tokenizer = text_tokenizer
161
+ self.max_length = max_length
162
+ self.glm_semantic_token_offset =glm_semantic_token_offset
163
+ self.semantic_token_offset=semantic_token_offset
164
+ self.global_token_offset=global_token_offset
165
+ self.device = device
166
+ self.audio_path_name =audio_path_name
167
+
168
+ def process_example(self, example: Dict[str, Any]):
169
+ """
170
+ This function is executed in parallel by multiple CPU processes.
171
+ It loads, resamples, and performs feature extraction / tokenization for a single sample.
172
+ """
173
+ task = example.get("task", "stt")
174
+ audio_path = example.get(self.audio_path_name, "")
175
+ ref_audio_path = example.get("ref_audio", "")
176
+ vc_audio = example.get("vc_audio", "")
177
+ text = example.get("text", "")
178
+
179
+ if task == "stt":
180
+ ## target audio
181
+ with torch.no_grad():
182
+ glm_semantic_tokens = self.glm_tokenizer.extract([audio_path])
183
+ glm_semantic_tokens = torch.as_tensor(glm_semantic_tokens, device="cpu", dtype=torch.long)
184
+
185
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
186
+ glm_semantic_tokens_list = (glm_semantic_tokens + self.glm_semantic_token_offset).cpu().tolist()[0]
187
+ semantic_tokens_list = (semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
188
+ # print(f"len of semantic is {len(semantic_tokens_list)}")
189
+ ## tokenize the text
190
+ text_tokens = self.text_tokenizer(text, truncation=True, max_length=self.max_length)
191
+
192
+ input_ids = self.text_tokenizer.encode("<|start_glm_token|>") + glm_semantic_tokens_list + self.text_tokenizer.encode("<|end_glm_token|>") \
193
+ + self.text_tokenizer.encode("<|start_semantic_token|>") + semantic_tokens_list + self.text_tokenizer.encode(
194
+ "<|end_semantic_token|>") \
195
+ + self.text_tokenizer.encode("<|start_content|>") + text_tokens["input_ids"] + self.text_tokenizer.encode("<|end_content|>") \
196
+ + self.text_tokenizer.encode("<|im_end|>")
197
+ attention_mask = [1] * (len(input_ids))
198
+ labels = [-100] * (len(semantic_tokens_list) + 5 + len(glm_semantic_tokens_list)) + text_tokens["input_ids"] + self.text_tokenizer.encode(
199
+ "<|end_content|>") + self.text_tokenizer.encode("<|im_end|>")
200
+
201
+ elif task == "tts-a":
202
+ with torch.no_grad():
203
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
204
+ global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
205
+ all_text = "<|start_content|>" + text + "<|end_content|>"
206
+ global_tokens_list = (global_tokens + self.global_token_offset).cpu().tolist()[0][0]
207
+ text_tokens = self.text_tokenizer(all_text, truncation=True, max_length=self.max_length)
208
+ semantic_tokens_list = (semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
209
+ input_ids = self.text_tokenizer.encode("<|start_global_token|>") + global_tokens_list + self.text_tokenizer.encode(
210
+ "<|end_global_token|>") + text_tokens["input_ids"] + semantic_tokens_list + self.text_tokenizer.encode("<|im_end|>")
211
+ attention_mask = [1] * len(input_ids)
212
+ labels = [-100] * (len(text_tokens["input_ids"]) + 2 + len(global_tokens_list)) + semantic_tokens_list + self.text_tokenizer.encode("<|im_end|>")
213
+
214
+ elif task == "vc":
215
+ with torch.no_grad():
216
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
217
+ global_tokens = self.bicodec_tokenizer.tokenize([audio_path])['global_tokens']
218
+ # global_tokens, semantic_tokens=self.bicodec_tokenizer.tokenize(audio_path=audio_path)
219
+ # new_global_tokens, new_semantic_tokens=self.bicodec_tokenizer.tokenize(vc_audio,ref_audio_path)
220
+ new_semantic_tokens = self.bicodec_tokenizer.tokenize([vc_audio])['semantic_tokens']
221
+ new_global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
222
+
223
+ global_tokens_list = (global_tokens + self.global_token_offset).cpu().tolist()[0][0]
224
+ semantic_tokens_list = (semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
225
+ new_global_tokens_list = (new_global_tokens + self.global_token_offset).cpu().tolist()[0][0]
226
+ new_semantic_tokens_list = (new_semantic_tokens + self.semantic_token_offset).cpu().tolist()[0]
227
+ all_str = "<|start_global_token|>" + self.text_tokenizer.decode(new_global_tokens_list) + "<|end_global_token|>" + "<|start_semantic_token|>" + self.text_tokenizer.decode(
228
+ semantic_tokens_list) + "<|end_semantic_token|>" + "<|end_content|>" + self.text_tokenizer.decode(new_semantic_tokens_list) + "<|im_end|>"
229
+
230
+ ##add token and mask
231
+ inputs = self.text_tokenizer(all_str)
232
+ input_ids = inputs['input_ids']
233
+ attention_mask = inputs['attention_mask']
234
+ labels = [-100] * (5 + len(new_global_tokens_list) + len(semantic_tokens_list)) + new_semantic_tokens_list + self.text_tokenizer.encode("<|im_end|>")
235
+ else:
236
+ ## fall back to stt by default
237
+ with torch.no_grad():
238
+ glm_semantic_tokens = self.glm_tokenizer.extract([audio_path])
239
+ glm_semantic_tokens = torch.as_tensor(glm_semantic_tokens, device="cpu", dtype=torch.long)
240
+
241
+ semantic_tokens = self.bicodec_tokenizer.tokenize([audio_path])['semantic_tokens']
242
+ glm_semantic_tokens_list = (glm_semantic_tokens+self.glm_semantic_token_offset).cpu().tolist()[0]
243
+ semantic_tokens_list = (semantic_tokens+self.semantic_token_offset).cpu().tolist()[0]
244
+ # print(f"len of semantic is {len(semantic_tokens_list)}")
245
+ ## tokenize the text
246
+ text_tokens = self.text_tokenizer(text, truncation=True, max_length=self.max_length)
247
+
248
+ input_ids = self.text_tokenizer.encode("<|start_glm_token|>")+ glm_semantic_tokens_list + self.text_tokenizer.encode("<|end_glm_token|>") \
249
+ + self.text_tokenizer.encode("<|start_semantic_token|>")+ semantic_tokens_list + self.text_tokenizer.encode("<|end_semantic_token|>") \
250
+ + text_tokens["input_ids"] \
251
+ + self.text_tokenizer.encode("<|im_end|>")
252
+ attention_mask = [1]*(len(semantic_tokens_list)+4+len(glm_semantic_tokens_list)) +text_tokens["attention_mask"] +[1]
253
+ labels = [-100]*(len(semantic_tokens_list)+4+len(glm_semantic_tokens_list))+ text_tokens["input_ids"]+ self.text_tokenizer.encode("<|im_end|>")
254
+ return {
255
+ "input_ids": input_ids,
256
+ "attention_mask": attention_mask,
257
+ "labels": labels,
258
+ }
259
+
260
+
261
+ def create_tts_collate_fn(
262
+ pad_token_id: int,
263
+ processor, # ark_processor
264
+ max_length: Optional[int] = None,  # optional truncation limit, e.g. 512
265
+ truncation_side: str = "right"  # "right" or "left"; truncate from the right by default
266
+ ):
267
+ """
268
+ Factory for a collate_fn that does manual padding plus optional truncation.
269
+
270
+ Args:
271
+ pad_token_id: pad value used for input_ids
272
+ processor: your ark_processor; must provide .process_example()
273
+ max_length: if given, truncate each sample to this length before batching
274
+ truncation_side: "right" | "left", which side to truncate from
275
+ """
276
+ label_pad_value = -100
277
+ attention_mask_pad_value = 0
278
+
279
+ def _truncate_1d(x: torch.Tensor, keep_len: int, side: str) -> torch.Tensor:
280
+ if x.numel() <= keep_len:
281
+ return x
282
+ if side == "right":
283
+ return x[:keep_len]
284
+ elif side == "left":
285
+ return x[-keep_len:]
286
+ else:
287
+ raise ValueError(f"Unsupported truncation_side: {side}")
288
+
289
+ def _to_long_tensor(x) -> torch.Tensor:
290
+ if isinstance(x, torch.Tensor):
291
+ return x.detach().clone().long()
292
+ return torch.tensor(x, dtype=torch.long)
293
+
294
+ def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
295
+ # 1) Preprocess (filter out empty samples)
296
+ proc = [processor.process_example(ex) for ex in examples if ex]
297
+ proc = [d for d in proc if d and ("input_ids" in d) and ("attention_mask" in d) and ("labels" in d)]
298
+
299
+ if len(proc) == 0:
300
+ # Return an empty batch to avoid crashing the DataLoader
301
+ return {
302
+ "input_ids": torch.empty(0, dtype=torch.long),
303
+ "attention_mask": torch.empty(0, dtype=torch.long),
304
+ "labels": torch.empty(0, dtype=torch.long),
305
+ }
306
+
307
+ # 2) Per-sample truncation (if max_length is set)
308
+ if max_length is not None:
309
+ trimmed = []
310
+ for ex in proc:
311
+ ids = _to_long_tensor(ex["input_ids"])
312
+ mask = _to_long_tensor(ex["attention_mask"])
313
+ labs = _to_long_tensor(ex["labels"])
314
+
315
+ keep_len = min(max_length, ids.numel())
316
+ ids = _truncate_1d(ids, keep_len, truncation_side)
317
+ mask = _truncate_1d(mask, keep_len, truncation_side)
318
+ labs = _truncate_1d(labs, keep_len, truncation_side)
319
+
320
+ trimmed.append({"input_ids": ids, "attention_mask": mask, "labels": labs})
321
+ proc = trimmed
322
+
323
+ # 3) Compute the maximum length in this batch (after truncation)
324
+ max_len_in_batch = max(int(len(ex["input_ids"])) for ex in proc)
325
+
326
+ # 4) Right-pad each sample to the batch maximum length
327
+ padded_input_ids_list = []
328
+ padded_attention_mask_list = []
329
+ padded_labels_list = []
330
+
331
+ for ex in proc:
332
+ ids = _to_long_tensor(ex["input_ids"])
333
+ mask = _to_long_tensor(ex["attention_mask"])
334
+ labs = _to_long_tensor(ex["labels"])
335
+
336
+ need = max_len_in_batch - ids.numel()
337
+ if need < 0:
338
+ # Edge case: an over-long sample can slip through when max_length is None
339
+ keep_len = max_len_in_batch
340
+ ids = _truncate_1d(ids, keep_len, "right")
341
+ mask = _truncate_1d(mask, keep_len, "right")
342
+ labs = _truncate_1d(labs, keep_len, "right")
343
+ need = 0
344
+
345
+ pad_dims = (0, need)
346
+ ids = F.pad(ids, pad_dims, mode="constant", value=pad_token_id)
347
+ mask = F.pad(mask, pad_dims, mode="constant", value=attention_mask_pad_value)
348
+ labs = F.pad(labs, pad_dims, mode="constant", value=label_pad_value)
349
+
350
+ padded_input_ids_list.append(ids)
351
+ padded_attention_mask_list.append(mask)
352
+ padded_labels_list.append(labs)
353
+
354
+ # 5) Stack into a batch
355
+ batch = {
356
+ "input_ids": torch.stack(padded_input_ids_list, dim=0),
357
+ "attention_mask": torch.stack(padded_attention_mask_list, dim=0),
358
+ "labels": torch.stack(padded_labels_list, dim=0),
359
+ }
360
+ return batch
361
+
362
+ return collate_fn
363
+
364
+ if __name__ == "__main__":
365
+ device = "cuda:0"
366
+ bicodec_audio_tokenizer_path = "/data/arki_production/model/SparkAudio/Spark-TTS-0___5B/"
367
+ glm_speech_tokenizer_path = "/data/yumu/model/glm-4-voice-tokenizer"
368
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(glm_speech_tokenizer_path)
369
+ audio_model = WhisperVQEncoder.from_pretrained(glm_speech_tokenizer_path).eval().to(device)
370
+ glm_tokenizer = SpeechTokenExtractor(model=audio_model, feature_extractor=feature_extractor, device=device)
371
+
372
+ text_tokenizer = AutoTokenizer.from_pretrained("/data/yumu/model/ark_audio_v1_0_3_b",trust_remote_code=True)
373
+ bicodec_tokenizer = SparkTokenizer(model_path=bicodec_audio_tokenizer_path, device=device)
374
+ # Configuration
375
+ DATASET_PATH = "/data/yumu/glm_asr_vllm/test/data/test_meeting.jsonl"
376
+ MAX_LENGTH = 4096
377
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
378
+
379
+ print(f"Using device: {DEVICE}")
380
+
381
+
382
+ # --- 2. Load the streaming dataset ---
383
+
384
+ print(f"Loading dataset '{DATASET_PATH}' in streaming mode...")
385
+
386
+
387
+ streaming_dataset = load_dataset("json", data_files=DATASET_PATH, streaming=True)['train']
388
+ # --- 4. Build the data processing pipeline ---
389
+
390
+ print("Shuffling the data stream, buffer_size=10000...")
391
+ shuffled_dataset = streaming_dataset.shuffle(buffer_size=10000, seed=42)
392
+ processor = ark_processor(
393
+ glm_tokenizer=glm_tokenizer,
394
+ bicodec_tokenizer=bicodec_tokenizer,
395
+ text_tokenizer=text_tokenizer,
396
+ device = DEVICE,
397
+ audio_path_name="audio")
398
+ collate_fn = create_tts_collate_fn(text_tokenizer.pad_token_id,processor,max_length=4096)
399
+ # Create the final DataLoader
400
+ data_loader = DataLoader(
401
+ shuffled_dataset,
402
+ batch_size=10,  # adjust for your GPU memory and model size
403
+ collate_fn=collate_fn,
404
+ num_workers=0  # DataLoader workers that pull data from the shuffled stream
405
+ )
406
+ print("\n--- High-performance streaming DataLoader demo ---")
407
+ print("Fetching and displaying the first batch from the DataLoader:\n")
408
+ first_batch = next(iter(data_loader))
409
+
410
+ print("Successfully fetched the first batch! The data was padded in collate_fn.")
411
+ for key, value in first_batch.items():
412
+ if value is not None:
413
+ # print(f" - {key}: shape={value.shape}, dtype={value.dtype}")
414
+ print(f" - {key}: shape={value.shape}, dtype={value.dtype}")
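To make the padding and truncation contract of create_tts_collate_fn concrete, here is a minimal sketch using a stub processor instead of the real tokenizers (the token values are arbitrary; only the shapes matter):

# Sketch: create_tts_collate_fn pads to the batch maximum and truncates to max_length.
import torch

class _StubProcessor:
    def process_example(self, ex):
        n = ex["n"]
        return {
            "input_ids": list(range(n)),
            "attention_mask": [1] * n,
            "labels": [-100] * n,
        }

collate = create_tts_collate_fn(pad_token_id=0, processor=_StubProcessor(), max_length=6)
batch = collate([{"n": 4}, {"n": 9}])
# The 9-token sample is truncated to 6, the 4-token sample is right-padded to 6,
# so every tensor comes out with shape (2, 6).
print(batch["input_ids"].shape)    # torch.Size([2, 6])
print(batch["attention_mask"][0])  # tensor([1, 1, 1, 1, 0, 0])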
gpa_inference.py ADDED
@@ -0,0 +1,293 @@
1
+ import os
2
+ import argparse
3
+ import torch
4
+ import soundfile as sf
5
+ import re
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, WhisperFeatureExtractor
7
+ import numpy as np
8
+
9
+ from models.bicodec_tokenizer.spark_tokenizer import SparkTokenizer
10
+ from models.bicodec_tokenizer.spark_detokenizer import SparkDeTokenizer
11
+
12
+ from models.glm_speech_tokenizer.speech_token_extractor import SpeechTokenExtractor
13
+ from models.glm_speech_tokenizer.modeling_whisper import WhisperVQEncoder
14
+
15
+ from data_utils.audio_dataset_ark_audio import ark_infer_processor
16
+
17
+ class GPAInference:
18
+ def __init__(self, tokenizer_path, text_tokenizer_path, bicodec_tokenizer_path, gpa_model_path, output_dir, device):
19
+ self.tokenizer_path = tokenizer_path
20
+ self.text_tokenizer_path = text_tokenizer_path
21
+ self.bicodec_tokenizer_path = bicodec_tokenizer_path
22
+ self.gpa_model_path = gpa_model_path
23
+ self.output_dir = output_dir
24
+ self.device = device
25
+
26
+ print(f"Using device: {self.device}")
27
+ self._load_models()
28
+
29
+ def _load_models(self):
30
+ print("Loading tokenizers...")
31
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(self.tokenizer_path)
32
+ audio_model = WhisperVQEncoder.from_pretrained(self.tokenizer_path).eval().to(self.device)
33
+ self.glm_tokenizer = SpeechTokenExtractor(model=audio_model, feature_extractor=feature_extractor, device=self.device)
34
+ self.text_tokenizer = AutoTokenizer.from_pretrained(
35
+ self.text_tokenizer_path,
36
+ trust_remote_code=True
37
+ )
38
+
39
+ self.bicodec_tokenizer = SparkTokenizer(model_path=self.bicodec_tokenizer_path, device=self.device)
40
+ self.bicodec_detokenizer = SparkDeTokenizer(model_path=self.bicodec_tokenizer_path, device=self.device)
41
+ self.processor = ark_infer_processor(
42
+ glm_tokenizer=self.glm_tokenizer,
43
+ bicodec_tokenizer=self.bicodec_tokenizer,
44
+ text_tokenizer=self.text_tokenizer,
45
+ device=self.device,
46
+ audio_path_name="audio",
47
+ )
48
+
49
+ print("Loading model...")
50
+ self.model = AutoModelForCausalLM.from_pretrained(
51
+ self.gpa_model_path,
52
+ trust_remote_code=True
53
+ ).to(self.device)
54
+
55
+ def generate(self, inputs, **kwargs):
56
+ """
57
+ Base generation method that accepts dynamic generation parameters.
58
+ """
59
+ for k in inputs:
60
+ if isinstance(inputs[k], (list, np.ndarray)):
61
+ inputs[k] = torch.tensor(inputs[k]).unsqueeze(0).to(self.device)
62
+ elif isinstance(inputs[k], torch.Tensor):
63
+ inputs[k] = inputs[k].unsqueeze(0).to(self.device)
64
+
65
+ # Default generation config
66
+ generation_config = {
67
+ "max_new_tokens": 1000,
68
+ "do_sample": False,
69
+ "eos_token_id": self.text_tokenizer.convert_tokens_to_ids("<|im_end|>"),
70
+ }
71
+
72
+ # Override defaults with any passed kwargs
73
+ generation_config.update(kwargs)
74
+
75
+ # Remove keys that might be None if passed from args mistakenly
76
+ generation_config = {k: v for k, v in generation_config.items() if v is not None}
77
+ print(f"Generation config: {generation_config}")
78
+
79
+ outputs = self.model.generate(
80
+ input_ids=inputs["input_ids"],
81
+ attention_mask=inputs["attention_mask"],
82
+ **generation_config
83
+ )
84
+ return outputs
85
+
86
+ def run_stt(self, audio_path, **kwargs):
87
+ if not audio_path:
88
+ raise ValueError("audio_path is required for STT")
89
+
90
+ print("\n--- Speech to Text (STT) ---")
91
+
92
+ inputs = self.processor.process_input(
93
+ task="stt",
94
+ audio_path=audio_path,
95
+ )
96
+
97
+ # Recommended hyperparameters for STT (these override any caller-supplied kwargs)
98
+ kwargs = {
99
+ "max_new_tokens": 512,
100
+ "do_sample": False,
101
+ }
102
+
103
+ # Pass generation arguments (temperature, etc.) to generate
104
+ outputs = self.generate(inputs, **kwargs)
105
+ text = self.text_tokenizer.decode(outputs[0].tolist())
106
+
107
+ if "<|start_content|>" in text:
108
+ return text.split("<|start_content|>")[1].replace("<|im_end|>","").replace("<|end_content|>","")
109
+ else:
110
+ return text.replace("<|im_end|>","")
111
+
112
+ def run_tts(self, task, output_filename, text, ref_audio_path, **kwargs):
113
+ """
114
+ kwargs: generation parameters forwarded to model.generate (temperature, top_p, etc.)
115
+ """
116
+ if not text:
117
+ raise ValueError("text is required for TTS")
118
+
119
+ # Check ref_audio_path requirement based on task
120
+ if task == "tts-a" and not ref_audio_path:
121
+ raise ValueError(f"ref_audio_path is required for {task}")
122
+
123
+ # Recommended hyperparameters for TTS (these override any caller-supplied kwargs)
124
+ kwargs = {
125
+ "max_new_tokens": 512,
126
+ "temperature": 0.2,
127
+ "repetition_penalty": 1.2,
128
+ "do_sample": True,
129
+ }
130
+
131
+ print(f"\n--- {task.upper()} ---")
132
+ output_path = os.path.join(self.output_dir, output_filename)
133
+
134
+ # Pass processor specific args (e.g. emotion, pitch) here
135
+ inputs = self.processor.process_input(
136
+ task=task,
137
+ ref_audio_path=ref_audio_path,
138
+ text=text,
139
+ )
140
+
141
+ # Pass generation specific args (e.g. temperature) here
142
+ # Note: the recommended kwargs above take precedence; generate() supplies defaults for anything not set
143
+ outputs = self.generate(inputs, **kwargs)
144
+
145
+ text_output = self.text_tokenizer.decode(outputs[0].tolist())
146
+
147
+ if "<|end_content|>" in text_output:
148
+ content = text_output.split("<|end_content|>")[1]
149
+ else:
150
+ print("Warning: <|end_content|> not found")
151
+ content = text_output
152
+
153
+ audio_ids = re.findall(r"<\|bicodec_semantic_(\d+)\|>", content)
154
+ audio_list = [int(x) for x in audio_ids]
155
+
156
+ if ref_audio_path:
157
+ global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
158
+ else:
159
+ global_tokens = torch.zeros((1, 32), dtype=torch.long).to(self.device)
160
+
161
+ req = {
162
+ "global_tokens": global_tokens,
163
+ "semantic_tokens": torch.tensor(audio_list).unsqueeze(0).to(self.device),
164
+ }
165
+ out = self.bicodec_detokenizer.detokenize(**req)
166
+ reconstructed_wav = out.detach().cpu().float().squeeze().numpy()
167
+ # Simple DC offset removal
168
+ if reconstructed_wav.size > 0:
169
+ reconstructed_wav -= reconstructed_wav.mean()
170
+
171
+ sf.write(output_path, reconstructed_wav, 16000)
172
+ print(f"Saved output to {output_path}")
173
+ return 16000, reconstructed_wav
174
+
175
+ def run_vc(
176
+ self,
177
+ source_audio_path,
178
+ ref_audio_path,
179
+ output_filename="output_gpa_vc.wav",
180
+ **kwargs,
181
+ ):
182
+ if not source_audio_path:
183
+ raise ValueError("source_audio_path is required for VC")
184
+ if not ref_audio_path:
185
+ raise ValueError("ref_audio_path is required for VC")
186
+
187
+ print("\n--- Voice Conversion (VC) ---")
188
+ output_path = os.path.join(self.output_dir, output_filename)
189
+
190
+ inputs = self.processor.process_input(
191
+ task="vc",
192
+ audio_path=source_audio_path,
193
+ ref_audio_path=ref_audio_path,
194
+ )
195
+
196
+ outputs = self.generate(inputs, **kwargs)
197
+ text_output = self.text_tokenizer.decode(outputs[0].tolist())
198
+
199
+ if "<|end_content|>" in text_output:
200
+ content = text_output.split("<|end_content|>")[1]
201
+ else:
202
+ content = text_output
203
+
204
+ audio_ids = re.findall(r"<\|bicodec_semantic_(\d+)\|>", content)
205
+ audio_list = [int(x) for x in audio_ids]
206
+
207
+ global_tokens = self.bicodec_tokenizer.tokenize([ref_audio_path])['global_tokens']
208
+
209
+ req = {
210
+ "global_tokens": global_tokens,
211
+ "semantic_tokens": torch.tensor(audio_list).unsqueeze(0).to(self.device),
212
+ }
213
+ out = self.bicodec_detokenizer.detokenize(**req)
214
+ reconstructed_wav = out.detach().cpu().float().squeeze().numpy()
215
+ if reconstructed_wav.size > 0:
216
+ reconstructed_wav -= reconstructed_wav.mean()
217
+
218
+ sf.write(output_path, reconstructed_wav, 16000)
219
+ print(f"Saved VC output to {output_path}")
220
+ return 16000, reconstructed_wav
221
+
222
+
223
+ def parse_args():
224
+ parser = argparse.ArgumentParser(description="GPA Inference Script")
225
+
226
+ # Paths
227
+ parser.add_argument("--tokenizer_path", type=str, default="/nasdata/model/gpa/glm-4-voice-tokenizer", help="Path to GLM4 tokenizer")
228
+ parser.add_argument("--text_tokenizer_path", type=str, default="/nasdata/model/gpa", help="Path to text tokenizer")
229
+ parser.add_argument("--bicodec_tokenizer_path", type=str, default="/nasdata/model/gpa/BiCodec/", help="Path to BiCodec tokenizer")
230
+ parser.add_argument("--gpa_model_path", type=str, default="/nasdata/model/gpa", help="Path to GPA model")
231
+
232
+ # Audio inputs
233
+ parser.add_argument(
234
+ "--ref_audio_path", type=str, default=None, help="Reference audio path"
235
+ )
236
+ parser.add_argument(
237
+ "--src_audio_path", type=str, default=None, help="Source audio path for VC/STT"
238
+ )
239
+
240
+ # Output
241
+ parser.add_argument("--output_dir", type=str, default=".", help="Directory to save output files")
242
+
243
+ # Device
244
+ default_device = "cuda" if torch.cuda.is_available() else "cpu"
245
+ parser.add_argument("--device", type=str, default=default_device, help="Device to use (e.g., cuda:0, cpu)")
246
+
247
+ # Task
248
+ parser.add_argument("--task", type=str, required=True, choices=["stt", "tts-a", "vc"], help="Task to run")
249
+
250
+ # TTS Inputs (Processor Arguments)
251
+ parser.add_argument("--text", type=str, default=None, help="Text for TTS")
252
+
253
+ return parser.parse_args()
254
+
255
+ def main():
256
+ args = parse_args()
257
+
258
+ # Ensure output directory exists
259
+ os.makedirs(args.output_dir, exist_ok=True)
260
+
261
+ inference = GPAInference(
262
+ tokenizer_path=args.tokenizer_path,
263
+ text_tokenizer_path=args.text_tokenizer_path,
264
+ bicodec_tokenizer_path=args.bicodec_tokenizer_path,
265
+ gpa_model_path=args.gpa_model_path,
266
+ output_dir=args.output_dir,
267
+ device=args.device,
268
+ )
269
+
270
+ if args.task == "stt":
271
+ if not args.src_audio_path:
272
+ raise ValueError("Error: --src_audio_path is required for STT task.")
273
+ # Pass gen_kwargs
274
+ result = inference.run_stt(audio_path=args.src_audio_path)
275
+ print("STT Result:", result)
276
+
277
+ elif args.task == "tts-a":
278
+ inference.run_tts(
279
+ task="tts-a",
280
+ output_filename="output_gpa_tts_a.wav",
281
+ text=args.text,
282
+ ref_audio_path=args.ref_audio_path,
283
+ )
284
+
285
+ elif args.task == "vc":
286
+ inference.run_vc(
287
+ source_audio_path=args.src_audio_path,
288
+ ref_audio_path=args.ref_audio_path,
289
+ output_filename="output_gpa_vc.wav",
290
+ )
291
+
292
+ if __name__ == "__main__":
293
+ main()
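run_tts and run_vc share the same post-processing step: the generated text is split on <|end_content|> and the BiCodec semantic token ids are pulled out with a regex before detokenization. A toy illustration of just that step (the decoded string below is made up; real output comes from model.generate):

# Sketch: recovering BiCodec semantic token ids from generated text (toy string).
import re
import torch

decoded = "<|start_content|>hello<|end_content|><|bicodec_semantic_12|><|bicodec_semantic_7|><|im_end|>"
content = decoded.split("<|end_content|>")[1]
audio_ids = [int(x) for x in re.findall(r"<\|bicodec_semantic_(\d+)\|>", content)]
semantic_tokens = torch.tensor(audio_ids).unsqueeze(0)  # shape (1, num_tokens), as the detokenizer expects
print(audio_ids)              # [12, 7]
print(semantic_tokens.shape)  # torch.Size([1, 2])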
models/__init__.py ADDED
File without changes
models/bicodec_tokenizer/__init__.py ADDED
File without changes
models/bicodec_tokenizer/base_model.py ADDED
@@ -0,0 +1,87 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2025/3/29 10:28
3
+ # Author :Hui Huang
4
+ import json
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import yaml
9
+
10
+ from .tokenizer_utils import load_config
11
+ import os
12
+ from safetensors.torch import load_file
13
+
14
+
15
+ class SparkBaseModel(nn.Module):
16
+ @classmethod
17
+ def from_pretrained(cls, model_path: str):
18
+ config = load_config(os.path.join(model_path, "config.yaml"))['audio_tokenizer']
19
+ model = cls(config)
20
+ state_dict = load_file(os.path.join(model_path, "model.safetensors"))
21
+ model.load_state_dict(state_dict, strict=False)
22
+ model.eval()
23
+ model.remove_weight_norm()
24
+ return model
25
+
26
+ def remove_weight_norm(self):
27
+ """Removes weight normalization from all layers."""
28
+
29
+ def _remove_weight_norm(m):
30
+ try:
31
+ torch.nn.utils.remove_weight_norm(m)
32
+ except ValueError:
33
+ pass # The module didn't have weight norm
34
+
35
+ self.apply(_remove_weight_norm)
36
+
37
+
38
+ class SnacBaseModel(nn.Module):
39
+ @classmethod
40
+ def from_config(cls, config_path):
41
+ with open(config_path, "r") as f:
42
+ config = json.load(f)
43
+ model = cls(**config)
44
+ return model
45
+
46
+ @classmethod
47
+ def from_pretrained(cls, model_path: str):
48
+ model = cls.from_config(os.path.join(model_path, "config.json"))
49
+ state_dict = torch.load(
50
+ os.path.join(model_path, "pytorch_model.bin"),
51
+ map_location="cpu", weights_only=True)
52
+ model.load_state_dict(state_dict, strict=False)
53
+ model.eval()
54
+ return model
55
+
56
+
57
+ class MegaBaseModel(nn.Module):
58
+ CKPT_NAME = "model"
59
+
60
+ @classmethod
61
+ def from_pretrained(cls, model_path: str):
62
+ config_file = None
63
+ ckpt_path = None
64
+ for file in os.listdir(model_path):
65
+ if file.endswith(".ckpt"):
66
+ ckpt_path = os.path.join(model_path, file)
67
+ if file.endswith(".yaml"):
68
+ config_file = os.path.join(model_path, file)
69
+ if ckpt_path is None:
70
+ raise FileNotFoundError(f"No checkpoint found at {model_path}")
71
+
72
+ checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
73
+ state_dict_all = {
74
+ k.replace('module.', '').replace('_orig_mod.', ''): v for k, v in checkpoint["state_dict"].items()
75
+ }
76
+ state_dict = state_dict_all[cls.CKPT_NAME]
77
+ state_dict = {k.replace('module.', '').replace('_orig_mod.', ''): v for k, v in state_dict.items()}
78
+
79
+ if config_file is not None:
80
+ with open(config_file) as f:
81
+ config = yaml.safe_load(f)
82
+ model = cls(config)
83
+ else:
84
+ model = cls()
85
+ model.load_state_dict(state_dict, strict=False)
86
+ model.eval()
87
+ return model
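For orientation: SparkBaseModel.from_pretrained expects a directory containing config.yaml (with an audio_tokenizer section) and model.safetensors, and passes that config dict to the subclass constructor. A purely illustrative subclass sketch (the layer sizes are invented, not taken from BiCodec):

# Sketch: the minimal shape of a SparkBaseModel subclass (illustrative only).
import torch.nn as nn

class TinySparkModel(SparkBaseModel):
    def __init__(self, config: dict):
        super().__init__()
        # config is the "audio_tokenizer" section of config.yaml
        self.proj = nn.Linear(config.get("in_dim", 80), config.get("out_dim", 128))

    def forward(self, x):
        return self.proj(x)

# model = TinySparkModel.from_pretrained("/path/to/model_dir")  # needs config.yaml + model.safetensors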
models/bicodec_tokenizer/batch_processor.py ADDED
@@ -0,0 +1,182 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2024/11/17 15:33
3
+ # Author :Hui Huang
4
+ import asyncio
5
+ import uuid
6
+ from typing import Callable, List, Any, Awaitable, Tuple
7
+ from asyncio import Queue
8
+
9
+
10
+ class BatchProcessor:
11
+ """Batch Processor for handling asynchronous requests in batches.
12
+
13
+ This class manages a queue of requests and processes them in batches
14
+ using multiple worker tasks.
15
+
16
+ Attributes:
17
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]):
18
+ The function used for processing requests in batches.
19
+ num_workers (int): The number of worker tasks to process requests.
20
+ batch_size (int): The maximum number of requests to process in a single batch.
21
+ request_queue (Queue): The queue holding incoming requests.
22
+ loop (asyncio.AbstractEventLoop): The event loop used to create worker tasks.
23
+ worker_tasks (List[asyncio.Task]): The list of worker tasks.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ processing_function: Callable[[List[Any]], Awaitable[List[Any]]],
29
+ num_workers: int,
30
+ batch_size: int,
31
+ wait_timeout: float = 0.05
32
+ ) -> None:
33
+ """Initialize the BatchProcessor with the given processing function, number of workers, and batch size.
34
+
35
+ Args:
36
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]):
37
+ The function used for processing requests in batches.
38
+ num_workers (int): The number of worker tasks to process requests.
39
+ batch_size (int): The maximum number of requests to process in a single batch.
40
+ """
41
+ self.processing_function = processing_function
42
+ self.num_workers = num_workers
43
+ self.batch_size = batch_size
44
+ self.wait_timeout = wait_timeout
45
+ self.request_queue: Queue = Queue()
46
+ self.loop = asyncio.get_running_loop()
47
+ self.worker_tasks = [
48
+ self.loop.create_task(self.batch_processor(i)) for i in range(num_workers)
49
+ ]
50
+ # Wait until all worker tasks are started
51
+ self.loop.create_task(self._log_workers_started())
52
+
53
+ async def _log_workers_started(self):
54
+ await asyncio.sleep(0) # Yield control to ensure workers have started
55
+
56
+ async def batch_processor(self, worker_id: int):
57
+ """Worker task that processes requests from the queue in batches.
58
+
59
+ Args:
60
+ worker_id (int): The identifier for the worker task.
61
+ """
62
+
63
+ while True:
64
+ requests: List[Tuple[Any, asyncio.Future]] = []
65
+ try:
66
+ while len(requests) < self.batch_size:
67
+ request = await asyncio.wait_for(
68
+ self.request_queue.get(), timeout=self.wait_timeout
69
+ )
70
+ requests.append(request)
71
+ except asyncio.TimeoutError:
72
+ pass
73
+
74
+ if requests:
75
+ all_requests = [
76
+ req[0] for req in requests
77
+ ] # Extract the actual input data from each request tuple
78
+ futures = [req[1] for req in requests] # Extract the futures to resolve
79
+ try:
80
+ results = await self.processing_function(all_requests)
81
+
82
+ for (future, result) in zip(futures, results):
83
+ future.set_result(result)
84
+ except Exception as e:
85
+ for future in futures:
86
+ future.set_exception(e)
87
+
88
+ async def add_request(self, single_input: Any):
89
+ """Add a new request to the queue.
90
+
91
+ Args:
92
+ single_input (Any): The input data for processing.
93
+ """
94
+ # loop = asyncio.get_running_loop()
95
+ future = self.loop.create_future()
96
+ self.request_queue.put_nowait((single_input, future))
97
+ return future
98
+
99
+ async def shutdown(self):
100
+ """Shutdown the batch processor by cancelling all worker tasks."""
101
+ for task in self.worker_tasks:
102
+ task.cancel()
103
+ try:
104
+ await task
105
+ except asyncio.CancelledError:
106
+ print("Worker task cancelled.")
107
+
108
+
109
+ class AsyncBatchEngine:
110
+
111
+ def __init__(
112
+ self,
113
+ processing_function: Callable[[List[Any]], Awaitable[List[Any]]],
114
+ batch_size: int = 32,
115
+ wait_timeout: float = 0.01,
116
+ ):
117
+ """
118
+ Initialize the AsyncBatchEngine with a processing function, number of workers, and batch size.
119
+
120
+ Args:
121
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]): The batch processing function.
122
+ batch_size (int): The maximum number of requests to process in a single batch.
123
+ """
124
+ self._processing_function = processing_function
125
+ self._batch_size = batch_size
126
+ self._is_running = False
127
+ self._batch_processor = None
128
+ self._wait_timeout = wait_timeout
129
+
130
+ async def start(self):
131
+ """Start the engine by initializing the batch processor and worker tasks."""
132
+ if self._is_running:
133
+ return
134
+
135
+ self._batch_processor = BatchProcessor(
136
+ processing_function=self._processing_function,
137
+ batch_size=self._batch_size,
138
+ wait_timeout=self._wait_timeout,
139
+ num_workers=1
140
+ )
141
+ self._is_running = True
142
+
143
+ async def stop(self):
144
+ """Stop the engine by shutting down the batch processor and worker tasks."""
145
+ self._check_running()
146
+ self._is_running = False
147
+ if self._batch_processor is not None:
148
+ await self._batch_processor.shutdown()
149
+
150
+ def _check_running(self):
151
+ """Check if the engine is running.
152
+
153
+ Raises:
154
+ ValueError: If the engine is not running.
155
+ """
156
+ if not self._is_running:
157
+ raise ValueError(
158
+ "The engine is not running. "
159
+ "You must start the engine before using it."
160
+ )
161
+
162
+ async def add_request(self, single_input: Any, request_id: str = None) -> dict:
163
+ """Asynchronously add a request to be processed.
164
+
165
+ Args:
166
+ single_input (Any): The input data for processing.
167
+ request_id (str): Optional request identifier to avoid data mix-up.
168
+
169
+ Raises:
170
+ ValueError: If the engine is not running when this method is called.
171
+ """
172
+ if not self._is_running:
173
+ await self.start()
174
+
175
+ if request_id is None:
176
+ request_id = str(uuid.uuid4()) # Assign a unique ID if not provided
177
+ future = await self._batch_processor.add_request(single_input=single_input) # type: ignore
178
+ result = await future
179
+ return dict(
180
+ request_id=request_id,
181
+ feature=result
182
+ )
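A minimal sketch of how AsyncBatchEngine is meant to be used, with a toy async batch function standing in for a real tokenizer:

# Sketch: AsyncBatchEngine usage with a toy batch function.
import asyncio

async def double_all(items):
    # Receives up to batch_size queued inputs at once; returns one result per input, in order.
    return [x * 2 for x in items]

async def main():
    engine = AsyncBatchEngine(processing_function=double_all, batch_size=8, wait_timeout=0.01)
    results = await asyncio.gather(*(engine.add_request(i) for i in range(5)))
    print([r["feature"] for r in results])  # [0, 2, 4, 6, 8]
    await engine.stop()

asyncio.run(main())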
models/bicodec_tokenizer/models/__init__.py ADDED
File without changes
models/bicodec_tokenizer/models/audio_tokenizer.py ADDED
@@ -0,0 +1,164 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import sys
17
+ sys.path.append("../..")
18
+ import torch
19
+ import numpy as np
20
+
21
+ from pathlib import Path
22
+ from typing import Any, Dict, Tuple
23
+ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
24
+
25
+ from arktts.models.sparktts.utils.file import load_config
26
+ from arktts.models.sparktts.utils.audio import load_audio
27
+ from arktts.models.sparktts.models.bicodec import BiCodec
28
+
29
+
30
+ class BiCodecTokenizer:
31
+ """BiCodec tokenizer for handling audio input and tokenization."""
32
+
33
+ def __init__(self, model_dir: Path, device: torch.device = None, **kwargs):
34
+ super().__init__()
35
+ """
36
+ Args:
37
+ model_dir: Path to the model directory.
38
+ device: Device to run the model on (default is GPU if available).
39
+ """
40
+ self.device = device
41
+ self.model_dir = model_dir
42
+ self.config = load_config(f"{model_dir}/config.yaml")
43
+ self._initialize_model()
44
+
45
+ def _initialize_model(self):
46
+ """Load and initialize the BiCodec model and Wav2Vec2 feature extractor."""
47
+ self.model = BiCodec.load_from_checkpoint(f"{self.model_dir}/BiCodec").to(
48
+ self.device
49
+ )
50
+ self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
51
+ f"{self.model_dir}/wav2vec2-large-xlsr-53"
52
+ )
53
+ self.feature_extractor = Wav2Vec2Model.from_pretrained(
54
+ f"{self.model_dir}/wav2vec2-large-xlsr-53"
55
+ ).to(self.device)
56
+ self.feature_extractor.config.output_hidden_states = True
57
+
58
+ def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
59
+ """Get reference audio clip for speaker embedding."""
60
+ ref_segment_length = (
61
+ int(self.config["sample_rate"] * self.config["ref_segment_duration"])
62
+ // self.config["latent_hop_length"]
63
+ * self.config["latent_hop_length"]
64
+ )
65
+ wav_length = len(wav)
66
+
67
+ if ref_segment_length > wav_length:
68
+ # Repeat and truncate to handle insufficient length
69
+ wav = np.tile(wav, ref_segment_length // wav_length + 1)
70
+
71
+ return wav[:ref_segment_length]
72
+
73
+ def process_audio(self, wav_path: Path) -> Tuple[np.ndarray, torch.Tensor]:
74
+ """Load the audio and extract the reference clip from a wav path."""
75
+ wav = load_audio(
76
+ wav_path,
77
+ sampling_rate=self.config["sample_rate"],
78
+ volume_normalize=self.config["volume_normalize"],
79
+ )
80
+
81
+ wav_ref = self.get_ref_clip(wav)
82
+
83
+ wav_ref = torch.from_numpy(wav_ref).unsqueeze(0).float()
84
+ return wav, wav_ref
85
+
86
+ def extract_wav2vec2_features(self, wavs: torch.Tensor) -> torch.Tensor:
87
+ """extract wav2vec2 features"""
88
+ inputs = self.processor(
89
+ wavs,
90
+ sampling_rate=16000,
91
+ return_tensors="pt",
92
+ padding=True,
93
+ output_hidden_states=True,
94
+ ).input_values
95
+ feat = self.feature_extractor(inputs.to(self.feature_extractor.device))
96
+ feats_mix = (
97
+ feat.hidden_states[11] + feat.hidden_states[14] + feat.hidden_states[16]
98
+ ) / 3
99
+
100
+ return feats_mix
101
+
102
+ def tokenize_batch(self, batch: Dict[str, Any]) -> torch.Tensor:
103
+ """tokenize the batch of audio
104
+
105
+ Args:
106
+ batch:
107
+ wavs (List[np.ndarray]): batch of audio
108
+ ref_wavs (torch.Tensor): reference audio. shape: (batch_size, seq_len)
109
+
110
+ Returns:
111
+ semantic_tokens: semantic tokens. shape: (batch_size, seq_len, latent_dim)
112
+ global_tokens: global tokens. shape: (batch_size, seq_len, global_dim)
113
+ """
114
+ feats = self.extract_wav2vec2_features(batch["wav"])
115
+ batch["feat"] = feats
116
+ semantic_tokens, global_tokens = self.model.tokenize(batch)
117
+
118
+ return global_tokens, semantic_tokens
119
+
120
+ def tokenize(self, audio_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
121
+ """tokenize the audio"""
122
+ wav, ref_wav = self.process_audio(audio_path)
123
+ feat = self.extract_wav2vec2_features(wav)
124
+ batch = {
125
+ "wav": torch.from_numpy(wav).unsqueeze(0).float().to(self.device),
126
+ "ref_wav": ref_wav.to(self.device),
127
+ "feat": feat.to(self.device),
128
+ }
129
+ semantic_tokens, global_tokens = self.model.tokenize(batch)
130
+
131
+ return global_tokens, semantic_tokens
132
+
133
+ def detokenize(
134
+ self, global_tokens: torch.Tensor, semantic_tokens: torch.Tensor
135
+ ) -> np.array:
136
+ """detokenize the tokens to waveform
137
+
138
+ Args:
139
+ global_tokens: global tokens. shape: (batch_size, global_dim)
140
+ semantic_tokens: semantic tokens. shape: (batch_size, latent_dim)
141
+
142
+ Returns:
143
+ wav_rec: waveform. shape: (batch_size, seq_len) for batch or (seq_len,) for single
144
+ """
145
+ global_tokens = global_tokens.unsqueeze(1)
146
+ wav_rec = self.model.detokenize(semantic_tokens, global_tokens)
147
+ return wav_rec.detach().squeeze().cpu().numpy()
148
+
149
+
150
+ # test
151
+ if __name__ == "__main__":
152
+ import soundfile as sf
153
+
154
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
155
+ tokenizer = BiCodecTokenizer(
156
+ model_dir="pretrained_models/Spark-TTS-0.5B",
157
+ device=device,
158
+ )
159
+ wav_path = "example/prompt_audio.wav"
160
+
161
+ global_tokens, semantic_tokens = tokenizer.tokenize(wav_path)
162
+
163
+ wav_rec = tokenizer.detokenize(global_tokens.squeeze(0), semantic_tokens)
164
+ sf.write("example/prompt_recon.wav", wav_rec, 16000)
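The get_ref_clip logic repeats a short input until it reaches a fixed reference length (a multiple of the latent hop length) and then truncates. A toy numeric sketch; the hop length and segment duration here are assumptions, the real values come from config.yaml:

# Sketch: reference-clip length handling (assumed hop/duration values).
import numpy as np

sample_rate, ref_seconds, hop = 16000, 6, 320           # assumed values for illustration
ref_len = int(sample_rate * ref_seconds) // hop * hop   # 96000, a multiple of the hop length
wav = np.random.randn(20000).astype(np.float32)         # shorter than the reference length
if ref_len > len(wav):
    wav = np.tile(wav, ref_len // len(wav) + 1)         # repeat until long enough
ref_clip = wav[:ref_len]
print(ref_clip.shape)  # (96000,)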
models/bicodec_tokenizer/models/bicodec.py ADDED
@@ -0,0 +1,248 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import sys
16
+ sys.path.append("../..")
17
+ import torch
18
+ import torch.nn as nn
19
+ from pathlib import Path
20
+ from typing import Dict, Any
21
+ from omegaconf import DictConfig
22
+ from safetensors.torch import load_file
23
+
24
+ from ..utils.file import load_config
25
+ from ..modules.speaker.speaker_encoder import SpeakerEncoder
26
+ from ..modules.encoder_decoder.feat_encoder import Encoder
27
+ from ..modules.encoder_decoder.feat_decoder import Decoder
28
+ from ..modules.encoder_decoder.wave_generator import WaveGenerator
29
+ from ..modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
30
+
31
+
32
+ class BiCodec(nn.Module):
33
+ """
34
+ BiCodec model for speech synthesis, incorporating a speaker encoder, feature encoder/decoder,
35
+ quantizer, and wave generator.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ mel_params: Dict[str, Any],
41
+ encoder: nn.Module,
42
+ decoder: nn.Module,
43
+ quantizer: nn.Module,
44
+ speaker_encoder: nn.Module,
45
+ prenet: nn.Module,
46
+ postnet: nn.Module,
47
+ **kwargs
48
+ ) -> None:
49
+ """
50
+ Initializes the BiCodec model with the required components.
51
+
52
+ Args:
53
+ mel_params (dict): Parameters for the mel-spectrogram transformer.
54
+ encoder (nn.Module): Encoder module.
55
+ decoder (nn.Module): Decoder module.
56
+ quantizer (nn.Module): Quantizer module.
57
+ speaker_encoder (nn.Module): Speaker encoder module.
58
+ prenet (nn.Module): Prenet network.
59
+ postnet (nn.Module): Postnet network.
60
+ """
61
+ super().__init__()
62
+ self.encoder = encoder
63
+ self.decoder = decoder
64
+ self.quantizer = quantizer
65
+ self.speaker_encoder = speaker_encoder
66
+ self.prenet = prenet
67
+ self.postnet = postnet
68
+ self.init_mel_transformer(mel_params)
69
+
70
+ @classmethod
71
+ def load_from_checkpoint(cls, model_dir: Path, **kwargs) -> "BiCodec":
72
+ """
73
+ Loads the model from a checkpoint.
74
+
75
+ Args:
76
+ model_dir (Path): Path to the model directory containing checkpoint and config.
77
+
78
+ Returns:
79
+ BiCodec: The initialized BiCodec model.
80
+ """
81
+ ckpt_path = f'{model_dir}/model.safetensors'
82
+ config = load_config(f'{model_dir}/config.yaml')['audio_tokenizer']
83
+ mel_params = config["mel_params"]
84
+ encoder = Encoder(**config["encoder"])
85
+ quantizer = FactorizedVectorQuantize(**config["quantizer"])
86
+ prenet = Decoder(**config["prenet"])
87
+ postnet = Decoder(**config["postnet"])
88
+ decoder = WaveGenerator(**config["decoder"])
89
+ speaker_encoder = SpeakerEncoder(**config["speaker_encoder"])
90
+
91
+ model = cls(
92
+ mel_params=mel_params,
93
+ encoder=encoder,
94
+ decoder=decoder,
95
+ quantizer=quantizer,
96
+ speaker_encoder=speaker_encoder,
97
+ prenet=prenet,
98
+ postnet=postnet,
99
+ )
100
+
101
+ state_dict = load_file(ckpt_path)
102
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
103
+
104
+ for key in missing_keys:
105
+ print(f"Missing tensor: {key}")
106
+ for key in unexpected_keys:
107
+ print(f"Unexpected tensor: {key}")
108
+
109
+ model.eval()
110
+ model.remove_weight_norm()
111
+
112
+ return model
113
+
114
+ def forward(self, batch: Dict[str, Any]) -> Dict[str, Any]:
115
+ """
116
+ Performs a forward pass through the model.
117
+
118
+ Args:
119
+ batch (dict): A dictionary containing features, reference waveform, and target waveform.
120
+
121
+ Returns:
122
+ dict: A dictionary containing the reconstruction, features, and other metrics.
123
+ """
124
+ feat = batch["feat"]
125
+ mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
126
+
127
+ z = self.encoder(feat.transpose(1, 2))
128
+ vq_outputs = self.quantizer(z)
129
+
130
+ x_vector, d_vector = self.speaker_encoder(mel.transpose(1, 2))
131
+
132
+ conditions = d_vector
133
+ with_speaker_loss = False
134
+
135
+ x = self.prenet(vq_outputs["z_q"], conditions)
136
+ pred_feat = self.postnet(x)
137
+ x = x + conditions.unsqueeze(-1)
138
+ wav_recon = self.decoder(x)
139
+
140
+ return {
141
+ "vq_loss": vq_outputs["vq_loss"],
142
+ "perplexity": vq_outputs["perplexity"],
143
+ "cluster_size": vq_outputs["active_num"],
144
+ "recons": wav_recon,
145
+ "pred_feat": pred_feat,
146
+ "x_vector": x_vector,
147
+ "d_vector": d_vector,
148
+ "audios": batch["wav"].unsqueeze(1),
149
+ "with_speaker_loss": with_speaker_loss,
150
+ }
151
+
152
+ @torch.no_grad()
153
+ def tokenize(self, batch: Dict[str, Any]):
154
+ """
155
+ Tokenizes the input audio into semantic and global tokens.
156
+
157
+ Args:
158
+ batch (dict): The input audio features and reference waveform.
159
+
160
+ Returns:
161
+ tuple: Semantic tokens and global tokens.
162
+ """
163
+ feat = batch["feat"]
164
+ mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
165
+
166
+ z = self.encoder(feat.transpose(1, 2))
167
+ semantic_tokens = self.quantizer.tokenize(z)
168
+ global_tokens = self.speaker_encoder.tokenize(mel.transpose(1, 2))
169
+
170
+ return semantic_tokens, global_tokens
171
+
172
+ @torch.no_grad()
173
+ def detokenize(self, semantic_tokens, global_tokens):
174
+ """
175
+ Detokenizes the semantic and global tokens into a waveform.
176
+
177
+ Args:
178
+ semantic_tokens (tensor): Semantic tokens.
179
+ global_tokens (tensor): Global tokens.
180
+
181
+ Returns:
182
+ tensor: Reconstructed waveform.
183
+ """
184
+ z_q = self.quantizer.detokenize(semantic_tokens)
185
+ d_vector = self.speaker_encoder.detokenize(global_tokens)
186
+ x = self.prenet(z_q, d_vector)
187
+ x = x + d_vector.unsqueeze(-1)
188
+ wav_recon = self.decoder(x)
189
+
190
+ return wav_recon
191
+
192
+ def init_mel_transformer(self, config: Dict[str, Any]):
193
+ """
194
+ Initializes the MelSpectrogram transformer based on the provided configuration.
195
+
196
+ Args:
197
+ config (dict): Configuration parameters for MelSpectrogram.
198
+ """
199
+ import torchaudio.transforms as TT
200
+
201
+ self.mel_transformer = TT.MelSpectrogram(
202
+ config["sample_rate"],
203
+ config["n_fft"],
204
+ config["win_length"],
205
+ config["hop_length"],
206
+ config["mel_fmin"],
207
+ config["mel_fmax"],
208
+ n_mels=config["num_mels"],
209
+ power=1,
210
+ norm="slaney",
211
+ mel_scale="slaney",
212
+ )
213
+
214
+ def remove_weight_norm(self):
215
+ """Removes weight normalization from all layers."""
216
+ def _remove_weight_norm(m):
217
+ try:
218
+ torch.nn.utils.remove_weight_norm(m)
219
+ except ValueError:
220
+ pass # The module didn't have weight norm
221
+
222
+ self.apply(_remove_weight_norm)
223
+
224
+
225
+ # Test the model
226
+ if __name__ == "__main__":
227
+
228
+ config = load_config("pretrained_models/SparkTTS-0.5B/BiCodec/config.yaml")
229
+ model = BiCodec.load_from_checkpoint(
230
+ model_dir="pretrained_models/SparkTTS-0.5B/BiCodec",
231
+ )
232
+
233
+ # Generate random inputs for testing
234
+ duration = 0.96
235
+ x = torch.randn(20, 1, int(duration * 16000))
236
+ feat = torch.randn(20, int(duration * 50), 1024)
237
+ inputs = {"feat": feat, "wav": x, "ref_wav": x}
238
+
239
+ # Forward pass
240
+ outputs = model(inputs)
241
+ semantic_tokens, global_tokens = model.tokenize(inputs)
242
+ wav_recon = model.detokenize(semantic_tokens, global_tokens)
243
+
244
+ # Verify if the reconstruction matches
245
+ if torch.allclose(outputs["recons"].detach(), wav_recon):
246
+ print("Test successful")
247
+ else:
248
+ print("Test failed")
models/bicodec_tokenizer/modules/blocks/layers.py ADDED
@@ -0,0 +1,73 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
17
+
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn.utils import weight_norm
22
+
23
+
24
+ def WNConv1d(*args, **kwargs):
25
+ return weight_norm(nn.Conv1d(*args, **kwargs))
26
+
27
+
28
+ def WNConvTranspose1d(*args, **kwargs):
29
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
30
+
31
+
32
+ # Scripting this brings model speed up 1.4x
33
+ @torch.jit.script
34
+ def snake(x, alpha):
35
+ shape = x.shape
36
+ x = x.reshape(shape[0], shape[1], -1)
37
+ x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
38
+ x = x.reshape(shape)
39
+ return x
40
+
41
+
42
+ class Snake1d(nn.Module):
43
+ def __init__(self, channels):
44
+ super().__init__()
45
+ self.alpha = nn.Parameter(torch.ones(1, channels, 1))
46
+
47
+ def forward(self, x):
48
+ return snake(x, self.alpha)
49
+
50
+
51
+ class ResidualUnit(nn.Module):
52
+ def __init__(self, dim: int = 16, dilation: int = 1):
53
+ super().__init__()
54
+ pad = ((7 - 1) * dilation) // 2
55
+ self.block = nn.Sequential(
56
+ Snake1d(dim),
57
+ WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
58
+ Snake1d(dim),
59
+ WNConv1d(dim, dim, kernel_size=1),
60
+ )
61
+
62
+ def forward(self, x):
63
+ y = self.block(x)
64
+ pad = (x.shape[-1] - y.shape[-1]) // 2
65
+ if pad > 0:
66
+ x = x[..., pad:-pad]
67
+ return x + y
68
+
69
+
70
+ def init_weights(m):
71
+ if isinstance(m, nn.Conv1d):
72
+ nn.init.trunc_normal_(m.weight, std=0.02)
73
+ nn.init.constant_(m.bias, 0)
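The helpers above (weight-normalized convolutions, the Snake activation, and ResidualUnit) all preserve sequence length. A minimal, hypothetical sketch of how they compose (import path assumed from this repo's layout, sizes illustrative):

import torch
import torch.nn as nn
from models.bicodec_tokenizer.modules.blocks.layers import Snake1d, WNConv1d, ResidualUnit

# stack a weight-normalized conv, a dilated residual unit, and a snake activation
block = nn.Sequential(
    WNConv1d(16, 16, kernel_size=7, padding=3),
    ResidualUnit(dim=16, dilation=3),
    Snake1d(16),
)
x = torch.randn(2, 16, 100)   # (batch, channels, time)
print(block(x).shape)         # expected: torch.Size([2, 16, 100]) -- length preserved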
models/bicodec_tokenizer/modules/blocks/samper.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+
21
+
22
+ class SamplingBlock(nn.Module):
23
+ """Sampling block for upsampling or downsampling"""
24
+
25
+ def __init__(
26
+ self,
27
+ dim: int,
28
+ groups: int = 1,
29
+ upsample_scale: int = 1,
30
+ downsample_scale: int = 1,
31
+ ) -> None:
32
+ """
33
+ Args:
34
+ dim: input dimension
35
+ groups: number of groups
36
+ upsample_scale: upsampling scale
37
+ downsample_scale: downsampling scale
38
+ """
39
+ super(SamplingBlock, self).__init__()
40
+
41
+ self.upsample_scale = upsample_scale
42
+ self.downsample_scale = downsample_scale
43
+
44
+ if self.upsample_scale > 1:
45
+ self.de_conv_upsampler = nn.Sequential(
46
+ nn.LeakyReLU(0.2),
47
+ nn.ConvTranspose1d(
48
+ dim,
49
+ dim,
50
+ kernel_size=upsample_scale * 2,
51
+ stride=upsample_scale,
52
+ padding=upsample_scale // 2 + upsample_scale % 2,
53
+ output_padding=upsample_scale % 2,
54
+ groups=groups,
55
+ ),
56
+ )
57
+
58
+ if self.downsample_scale > 1:
59
+ self.conv_downsampler = nn.Sequential(
60
+ nn.LeakyReLU(0.2),
61
+ nn.Conv1d(
62
+ dim,
63
+ dim,
64
+ kernel_size=2 * downsample_scale,
65
+ stride=downsample_scale,
66
+ padding=downsample_scale // 2 + downsample_scale % 2,
67
+ groups=groups,
68
+ ),
69
+ )
70
+
71
+ @staticmethod
72
+ def repeat_upsampler(x, upsample_scale):
73
+ return x.repeat_interleave(upsample_scale, dim=2)
74
+
75
+ @staticmethod
76
+ def skip_downsampler(x, downsample_scale):
77
+ return F.avg_pool1d(x, kernel_size=downsample_scale, stride=downsample_scale)
78
+
79
+ def forward(self, x):
80
+ x = x.transpose(1, 2)
81
+ if self.upsample_scale > 1:
82
+ repeat_res = self.repeat_upsampler(x, self.upsample_scale)
83
+ deconv_res = self.de_conv_upsampler(x)
84
+ upmerge_res = repeat_res + deconv_res
85
+ else:
86
+ upmerge_res = x
87
+ repeat_res = x
88
+
89
+ if self.downsample_scale > 1:
90
+ conv_res = self.conv_downsampler(upmerge_res)
91
+ skip2_res = self.skip_downsampler(upmerge_res, self.downsample_scale)
92
+ skip1_res = self.skip_downsampler(repeat_res, self.downsample_scale)
93
+ else:
94
+ conv_res = upmerge_res
95
+ skip2_res = upmerge_res
96
+ skip1_res = repeat_res
97
+
98
+ final_res = conv_res + skip1_res + skip2_res
99
+
100
+ return final_res
101
+
102
+
103
+ # test
104
+ if __name__ == "__main__":
105
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
106
+ model = SamplingBlock(1024, 1024, upsample_scale=2)
107
+ model_down = SamplingBlock(1024, 1024, downsample_scale=2)
108
+ output = model(test_input)
109
+ output_down = model_down(test_input)
110
+ print("shape after upsample * 2", output.shape) # torch.Size([8, 1024, 100])
111
+ print("shape after downsample * 2", output_down.shape) # torch.Size([8, 1024, 25])
112
+ if output.shape == torch.Size([8, 1024, 100]) and output_down.shape == torch.Size(
113
+ [8, 1024, 25]
114
+ ):
115
+ print("test successful")
models/bicodec_tokenizer/modules/blocks/vocos.py ADDED
@@ -0,0 +1,373 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import Tuple
21
+ from torch.nn.utils import weight_norm, remove_weight_norm
22
+
23
+ from typing import Optional
24
+
25
+
26
+ class ConvNeXtBlock(nn.Module):
27
+ """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
28
+
29
+ Args:
30
+ dim (int): Number of input channels.
31
+ intermediate_dim (int): Dimensionality of the intermediate layer.
32
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
33
+ Defaults to None.
34
+ condition_dim (int, optional): Dimension of the conditioning vector for AdaLayerNorm.
35
+ None means non-conditional LayerNorm. Defaults to None.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ dim: int,
41
+ intermediate_dim: int,
42
+ layer_scale_init_value: float,
43
+ condition_dim: Optional[int] = None,
44
+ ):
45
+ super().__init__()
46
+ self.dwconv = nn.Conv1d(
47
+ dim, dim, kernel_size=7, padding=3, groups=dim
48
+ ) # depthwise conv
49
+ self.adanorm = condition_dim is not None
50
+ if condition_dim:
51
+ self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
52
+ else:
53
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
54
+ self.pwconv1 = nn.Linear(
55
+ dim, intermediate_dim
56
+ ) # pointwise/1x1 convs, implemented with linear layers
57
+ self.act = nn.GELU()
58
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
59
+ self.gamma = (
60
+ nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
61
+ if layer_scale_init_value > 0
62
+ else None
63
+ )
64
+
65
+ def forward(
66
+ self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
67
+ ) -> torch.Tensor:
68
+ residual = x
69
+ x = self.dwconv(x)
70
+ x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
71
+ if self.adanorm:
72
+ assert cond_embedding_id is not None
73
+ x = self.norm(x, cond_embedding_id)
74
+ else:
75
+ x = self.norm(x)
76
+ x = self.pwconv1(x)
77
+ x = self.act(x)
78
+ x = self.pwconv2(x)
79
+ if self.gamma is not None:
80
+ x = self.gamma * x
81
+ x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
82
+
83
+ x = residual + x
84
+ return x
85
+
86
+
87
+ class AdaLayerNorm(nn.Module):
88
+ """
89
+ Adaptive Layer Normalization module conditioned on a continuous embedding via learned scale and shift projections
90
+
91
+ Args:
92
+ condition_dim (int): Dimension of the condition.
93
+ embedding_dim (int): Dimension of the embeddings.
94
+ """
95
+
96
+ def __init__(self, condition_dim: int, embedding_dim: int, eps: float = 1e-6):
97
+ super().__init__()
98
+ self.eps = eps
99
+ self.dim = embedding_dim
100
+ self.scale = nn.Linear(condition_dim, embedding_dim)
101
+ self.shift = nn.Linear(condition_dim, embedding_dim)
102
+ torch.nn.init.ones_(self.scale.weight)
103
+ torch.nn.init.zeros_(self.shift.weight)
104
+
105
+ def forward(self, x: torch.Tensor, cond_embedding: torch.Tensor) -> torch.Tensor:
106
+ scale = self.scale(cond_embedding)
107
+ shift = self.shift(cond_embedding)
108
+ x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
109
+ x = x * scale.unsqueeze(1) + shift.unsqueeze(1)
110
+ return x
111
+
112
+
113
+ class ResBlock1(nn.Module):
114
+ """
115
+ ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
116
+ but without upsampling layers.
117
+
118
+ Args:
119
+ dim (int): Number of input channels.
120
+ kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
121
+ dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
122
+ Defaults to (1, 3, 5).
123
+ lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
124
+ Defaults to 0.1.
125
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
126
+ Defaults to None.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ dim: int,
132
+ kernel_size: int = 3,
133
+ dilation: Tuple[int, int, int] = (1, 3, 5),
134
+ lrelu_slope: float = 0.1,
135
+ layer_scale_init_value: Optional[float] = None,
136
+ ):
137
+ super().__init__()
138
+ self.lrelu_slope = lrelu_slope
139
+ self.convs1 = nn.ModuleList(
140
+ [
141
+ weight_norm(
142
+ nn.Conv1d(
143
+ dim,
144
+ dim,
145
+ kernel_size,
146
+ 1,
147
+ dilation=dilation[0],
148
+ padding=self.get_padding(kernel_size, dilation[0]),
149
+ )
150
+ ),
151
+ weight_norm(
152
+ nn.Conv1d(
153
+ dim,
154
+ dim,
155
+ kernel_size,
156
+ 1,
157
+ dilation=dilation[1],
158
+ padding=self.get_padding(kernel_size, dilation[1]),
159
+ )
160
+ ),
161
+ weight_norm(
162
+ nn.Conv1d(
163
+ dim,
164
+ dim,
165
+ kernel_size,
166
+ 1,
167
+ dilation=dilation[2],
168
+ padding=self.get_padding(kernel_size, dilation[2]),
169
+ )
170
+ ),
171
+ ]
172
+ )
173
+
174
+ self.convs2 = nn.ModuleList(
175
+ [
176
+ weight_norm(
177
+ nn.Conv1d(
178
+ dim,
179
+ dim,
180
+ kernel_size,
181
+ 1,
182
+ dilation=1,
183
+ padding=self.get_padding(kernel_size, 1),
184
+ )
185
+ ),
186
+ weight_norm(
187
+ nn.Conv1d(
188
+ dim,
189
+ dim,
190
+ kernel_size,
191
+ 1,
192
+ dilation=1,
193
+ padding=self.get_padding(kernel_size, 1),
194
+ )
195
+ ),
196
+ weight_norm(
197
+ nn.Conv1d(
198
+ dim,
199
+ dim,
200
+ kernel_size,
201
+ 1,
202
+ dilation=1,
203
+ padding=self.get_padding(kernel_size, 1),
204
+ )
205
+ ),
206
+ ]
207
+ )
208
+
209
+ self.gamma = nn.ParameterList(
210
+ [
211
+ (
212
+ nn.Parameter(
213
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
214
+ )
215
+ if layer_scale_init_value is not None
216
+ else None
217
+ ),
218
+ (
219
+ nn.Parameter(
220
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
221
+ )
222
+ if layer_scale_init_value is not None
223
+ else None
224
+ ),
225
+ (
226
+ nn.Parameter(
227
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
228
+ )
229
+ if layer_scale_init_value is not None
230
+ else None
231
+ ),
232
+ ]
233
+ )
234
+
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma):
237
+ xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope)
238
+ xt = c1(xt)
239
+ xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope)
240
+ xt = c2(xt)
241
+ if gamma is not None:
242
+ xt = gamma * xt
243
+ x = xt + x
244
+ return x
245
+
246
+ def remove_weight_norm(self):
247
+ for l in self.convs1:
248
+ remove_weight_norm(l)
249
+ for l in self.convs2:
250
+ remove_weight_norm(l)
251
+
252
+ @staticmethod
253
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
254
+ return int((kernel_size * dilation - dilation) / 2)
255
+
256
+
257
+ class Backbone(nn.Module):
258
+ """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
259
+
260
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
261
+ """
262
+ Args:
263
+ x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
264
+ C denotes output features, and L is the sequence length.
265
+
266
+ Returns:
267
+ Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
268
+ and H denotes the model dimension.
269
+ """
270
+ raise NotImplementedError("Subclasses must implement the forward method.")
271
+
272
+
273
+ class VocosBackbone(Backbone):
274
+ """
275
+ Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
276
+
277
+ Args:
278
+ input_channels (int): Number of input features channels.
279
+ dim (int): Hidden dimension of the model.
280
+ intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
281
+ num_layers (int): Number of ConvNeXtBlock layers.
282
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
283
+ condition_dim (int, optional): Dimension of the conditioning vector for AdaLayerNorm.
284
+ None means non-conditional model. Defaults to None.
285
+ """
286
+
287
+ def __init__(
288
+ self,
289
+ input_channels: int,
290
+ dim: int,
291
+ intermediate_dim: int,
292
+ num_layers: int,
293
+ layer_scale_init_value: Optional[float] = None,
294
+ condition_dim: Optional[int] = None,
295
+ ):
296
+ super().__init__()
297
+ self.input_channels = input_channels
298
+ self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
299
+ self.adanorm = condition_dim is not None
300
+ if condition_dim:
301
+ self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
302
+ else:
303
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
304
+ layer_scale_init_value = layer_scale_init_value or 1 / num_layers
305
+ self.convnext = nn.ModuleList(
306
+ [
307
+ ConvNeXtBlock(
308
+ dim=dim,
309
+ intermediate_dim=intermediate_dim,
310
+ layer_scale_init_value=layer_scale_init_value,
311
+ condition_dim=condition_dim,
312
+ )
313
+ for _ in range(num_layers)
314
+ ]
315
+ )
316
+ self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
317
+ self.apply(self._init_weights)
318
+
319
+ def _init_weights(self, m):
320
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
321
+ nn.init.trunc_normal_(m.weight, std=0.02)
322
+ nn.init.constant_(m.bias, 0)
323
+
324
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
325
+ x = self.embed(x)
326
+ if self.adanorm:
327
+ assert condition is not None
328
+ x = self.norm(x.transpose(1, 2), condition)
329
+ else:
330
+ x = self.norm(x.transpose(1, 2))
331
+ x = x.transpose(1, 2)
332
+ for conv_block in self.convnext:
333
+ x = conv_block(x, condition)
334
+ x = self.final_layer_norm(x.transpose(1, 2))
335
+ return x
336
+
337
+
338
+ class VocosResNetBackbone(Backbone):
339
+ """
340
+ Vocos backbone module built with ResBlocks.
341
+
342
+ Args:
343
+ input_channels (int): Number of input features channels.
344
+ dim (int): Hidden dimension of the model.
345
+ num_blocks (int): Number of ResBlock1 blocks.
346
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None.
347
+ """
348
+
349
+ def __init__(
350
+ self,
351
+ input_channels,
352
+ dim,
353
+ num_blocks,
354
+ layer_scale_init_value=None,
355
+ ):
356
+ super().__init__()
357
+ self.input_channels = input_channels
358
+ self.embed = weight_norm(
359
+ nn.Conv1d(input_channels, dim, kernel_size=3, padding=1)
360
+ )
361
+ layer_scale_init_value = layer_scale_init_value or 1 / num_blocks / 3
362
+ self.resnet = nn.Sequential(
363
+ *[
364
+ ResBlock1(dim=dim, layer_scale_init_value=layer_scale_init_value)
365
+ for _ in range(num_blocks)
366
+ ]
367
+ )
368
+
369
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
370
+ x = self.embed(x)
371
+ x = self.resnet(x)
372
+ x = x.transpose(1, 2)
373
+ return x
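VocosBackbone keeps the temporal resolution and returns features in (batch, time, dim) order, as the Backbone docstring states. A rough usage sketch with illustrative sizes (not the shipped configuration; import path assumed):

import torch
from models.bicodec_tokenizer.modules.blocks.vocos import VocosBackbone

backbone = VocosBackbone(
    input_channels=80,       # e.g. mel bins
    dim=192,
    intermediate_dim=768,
    num_layers=4,
)
x = torch.randn(2, 80, 100)  # (batch, input_channels, time)
y = backbone(x)
print(y.shape)               # expected: torch.Size([2, 100, 192])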
models/bicodec_tokenizer/modules/encoder_decoder/feat_decoder.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import List
21
+
22
+ from ..blocks.vocos import VocosBackbone
23
+ from ..blocks.samper import SamplingBlock
24
+
25
+
26
+ class Decoder(nn.Module):
27
+ """Decoder module with convnext and upsampling blocks
28
+
29
+ Args:
30
+ sample_ratios (List[int]): sample ratios
31
+ example: [2, 2] means upsample by 2x twice (4x overall)
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ input_channels: int,
37
+ vocos_dim: int,
38
+ vocos_intermediate_dim: int,
39
+ vocos_num_layers: int,
40
+ out_channels: int,
41
+ condition_dim: int = None,
42
+ sample_ratios: List[int] = [1, 1],
43
+ use_tanh_at_final: bool = False,
44
+ ):
45
+ super().__init__()
46
+
47
+ self.linear_pre = nn.Linear(input_channels, vocos_dim)
48
+ modules = [
49
+ nn.Sequential(
50
+ SamplingBlock(
51
+ dim=vocos_dim,
52
+ groups=vocos_dim,
53
+ upsample_scale=ratio,
54
+ ),
55
+ VocosBackbone(
56
+ input_channels=vocos_dim,
57
+ dim=vocos_dim,
58
+ intermediate_dim=vocos_intermediate_dim,
59
+ num_layers=2,
60
+ condition_dim=None,
61
+ ),
62
+ )
63
+ for ratio in sample_ratios
64
+ ]
65
+
66
+ self.downsample = nn.Sequential(*modules)
67
+
68
+ self.vocos_backbone = VocosBackbone(
69
+ input_channels=vocos_dim,
70
+ dim=vocos_dim,
71
+ intermediate_dim=vocos_intermediate_dim,
72
+ num_layers=vocos_num_layers,
73
+ condition_dim=condition_dim,
74
+ )
75
+ self.linear = nn.Linear(vocos_dim, out_channels)
76
+ self.use_tanh_at_final = use_tanh_at_final
77
+
78
+ def forward(self, x: torch.Tensor, c: torch.Tensor = None):
79
+ """encoder forward.
80
+
81
+ Args:
82
+ x (torch.Tensor): (batch_size, input_channels, length)
83
+
84
+ Returns:
85
+ x (torch.Tensor): (batch_size, out_channels, length)
86
+ """
87
+ x = self.linear_pre(x.transpose(1, 2))
88
+ x = self.downsample(x).transpose(1, 2)
89
+ x = self.vocos_backbone(x, condition=c)
90
+ x = self.linear(x).transpose(1, 2)
91
+ if self.use_tanh_at_final:
92
+ x = torch.tanh(x)
93
+
94
+ return x
95
+
96
+
97
+ # test
98
+ if __name__ == "__main__":
99
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
100
+ condition = torch.randn(8, 256)
101
+ decoder = Decoder(
102
+ input_channels=1024,
103
+ vocos_dim=384,
104
+ vocos_intermediate_dim=2048,
105
+ vocos_num_layers=12,
106
+ out_channels=256,
107
+ condition_dim=256,
108
+ sample_ratios=[2, 2],
109
+ )
110
+ output = decoder(test_input, condition)
111
+ print(output.shape) # torch.Size([8, 256, 200])
112
+ if output.shape == torch.Size([8, 256, 200]):
113
+ print("Decoder test passed")
114
+ else:
115
+ print("Decoder test failed")
models/bicodec_tokenizer/modules/encoder_decoder/feat_encoder.py ADDED
@@ -0,0 +1,107 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import List
21
+ import sys
22
+ sys.path.append("../../../..")
23
+ sys.path.append("../../../../..")
24
+ from ..blocks.vocos import VocosBackbone
25
+ from ..blocks.samper import SamplingBlock
26
+
27
+
28
+ class Encoder(nn.Module):
29
+ """Encoder module with convnext and downsampling blocks"""
30
+
31
+ def __init__(
32
+ self,
33
+ input_channels: int,
34
+ vocos_dim: int,
35
+ vocos_intermediate_dim: int,
36
+ vocos_num_layers: int,
37
+ out_channels: int,
38
+ sample_ratios: List[int] = [1, 1],
39
+ ):
40
+ super().__init__()
41
+ """
42
+ Encoder module with VocosBackbone and sampling blocks.
43
+
44
+ Args:
45
+ sample_ratios (List[int]): sample ratios
46
+ example: [2, 2] means downsample by 2x twice (4x overall)
47
+ """
48
+ self.encoder = VocosBackbone(
49
+ input_channels=input_channels,
50
+ dim=vocos_dim,
51
+ intermediate_dim=vocos_intermediate_dim,
52
+ num_layers=vocos_num_layers,
53
+ condition_dim=None,
54
+ )
55
+
56
+ modules = [
57
+ nn.Sequential(
58
+ SamplingBlock(
59
+ dim=vocos_dim,
60
+ groups=vocos_dim,
61
+ downsample_scale=ratio,
62
+ ),
63
+ VocosBackbone(
64
+ input_channels=vocos_dim,
65
+ dim=vocos_dim,
66
+ intermediate_dim=vocos_intermediate_dim,
67
+ num_layers=2,
68
+ condition_dim=None,
69
+ ),
70
+ )
71
+ for ratio in sample_ratios
72
+ ]
73
+
74
+ self.downsample = nn.Sequential(*modules)
75
+
76
+ self.project = nn.Linear(vocos_dim, out_channels)
77
+
78
+ def forward(self, x: torch.Tensor, *args):
79
+ """
80
+ Args:
81
+ x (torch.Tensor): (batch_size, input_channels, length)
82
+
83
+ Returns:
84
+ x (torch.Tensor): (batch_size, encode_channels, length)
85
+ """
86
+ x = self.encoder(x)
87
+ x = self.downsample(x)
88
+ x = self.project(x)
89
+ return x.transpose(1, 2)
90
+
91
+
92
+ # test
93
+ if __name__ == "__main__":
94
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
95
+ encoder = Encoder(
96
+ input_channels=1024,
97
+ vocos_dim=384,
98
+ vocos_intermediate_dim=2048,
99
+ vocos_num_layers=12,
100
+ out_channels=256,
101
+ sample_ratios=[2, 2],
102
+ )
103
+
104
+ output = encoder(test_input)
105
+ print(output.shape) # torch.Size([8, 256, 12])
106
+ if output.shape == torch.Size([8, 256, 12]):
107
+ print("test successful")
models/bicodec_tokenizer/modules/encoder_decoder/wave_generator.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) 2024 Xinsheng Wang (w.xinshawn@gmail.com)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
16
+
17
+
18
+ import torch.nn as nn
19
+
20
+ from ..blocks.layers import (
21
+ Snake1d,
22
+ WNConv1d,
23
+ ResidualUnit,
24
+ WNConvTranspose1d,
25
+ init_weights,
26
+ )
27
+
28
+
29
+ class DecoderBlock(nn.Module):
30
+ def __init__(
31
+ self,
32
+ input_dim: int = 16,
33
+ output_dim: int = 8,
34
+ kernel_size: int = 2,
35
+ stride: int = 1,
36
+ ):
37
+ super().__init__()
38
+ self.block = nn.Sequential(
39
+ Snake1d(input_dim),
40
+ WNConvTranspose1d(
41
+ input_dim,
42
+ output_dim,
43
+ kernel_size=kernel_size,
44
+ stride=stride,
45
+ padding=(kernel_size - stride) // 2,
46
+ ),
47
+ ResidualUnit(output_dim, dilation=1),
48
+ ResidualUnit(output_dim, dilation=3),
49
+ ResidualUnit(output_dim, dilation=9),
50
+ )
51
+
52
+ def forward(self, x):
53
+ return self.block(x)
54
+
55
+
56
+ class WaveGenerator(nn.Module):
57
+ def __init__(
58
+ self,
59
+ input_channel,
60
+ channels,
61
+ rates,
62
+ kernel_sizes,
63
+ d_out: int = 1,
64
+ ):
65
+ super().__init__()
66
+
67
+ # Add first conv layer
68
+ layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
69
+
70
+ # Add upsampling + MRF blocks
71
+ for i, (kernel_size, stride) in enumerate(zip(kernel_sizes, rates)):
72
+ input_dim = channels // 2**i
73
+ output_dim = channels // 2 ** (i + 1)
74
+ layers += [DecoderBlock(input_dim, output_dim, kernel_size, stride)]
75
+
76
+ # Add final conv layer
77
+ layers += [
78
+ Snake1d(output_dim),
79
+ WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
80
+ nn.Tanh(),
81
+ ]
82
+
83
+ self.model = nn.Sequential(*layers)
84
+
85
+ self.apply(init_weights)
86
+
87
+ def forward(self, x):
88
+ return self.model(x)
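WaveGenerator upsamples a latent sequence into a waveform, with a total upsampling factor equal to the product of `rates`. A small, hypothetical configuration to illustrate the shapes (not taken from the shipped BiCodec config; import path assumed):

import torch
from models.bicodec_tokenizer.modules.encoder_decoder.wave_generator import WaveGenerator

gen = WaveGenerator(
    input_channel=1024,      # latent channels fed to the generator
    channels=512,            # width of the first stage; halved after each DecoderBlock
    rates=[4, 2],            # per-stage upsampling, 8x in total
    kernel_sizes=[8, 4],     # transposed-conv kernel sizes per stage
)
z = torch.randn(2, 1024, 50) # (batch, input_channel, frames)
wav = gen(z)
print(wav.shape)             # expected: torch.Size([2, 1, 400]) -- 50 frames * 8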
models/bicodec_tokenizer/modules/fsq/finite_scalar_quantization.py ADDED
@@ -0,0 +1,251 @@
1
+ """
2
+ Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
3
+ Code adapted from Jax version in Appendix A.1
4
+ """
5
+
6
+ from __future__ import annotations
7
+ from functools import wraps, partial
8
+ from contextlib import nullcontext
9
+ from typing import List, Tuple
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import Module
14
+ from torch import Tensor, int32
15
+ from torch.amp import autocast
16
+
17
+ from einops import rearrange, pack, unpack
18
+
19
+ # helper functions
20
+
21
+
22
+ def exists(v):
23
+ return v is not None
24
+
25
+
26
+ def default(*args):
27
+ for arg in args:
28
+ if exists(arg):
29
+ return arg
30
+ return None
31
+
32
+
33
+ def maybe(fn):
34
+ @wraps(fn)
35
+ def inner(x, *args, **kwargs):
36
+ if not exists(x):
37
+ return x
38
+ return fn(x, *args, **kwargs)
39
+
40
+ return inner
41
+
42
+
43
+ def pack_one(t, pattern):
44
+ return pack([t], pattern)
45
+
46
+
47
+ def unpack_one(t, ps, pattern):
48
+ return unpack(t, ps, pattern)[0]
49
+
50
+
51
+ # tensor helpers
52
+
53
+
54
+ def round_ste(z: Tensor) -> Tensor:
55
+ """Round with straight through gradients."""
56
+ zhat = z.round()
57
+ return z + (zhat - z).detach()
58
+
59
+
60
+ # main class
61
+
62
+
63
+ class FSQ(Module):
64
+ def __init__(
65
+ self,
66
+ levels: List[int],
67
+ dim: int | None = None,
68
+ num_codebooks=1,
69
+ keep_num_codebooks_dim: bool | None = None,
70
+ scale: float | None = None,
71
+ allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
72
+ channel_first: bool = False,
73
+ projection_has_bias: bool = True,
74
+ return_indices=True,
75
+ force_quantization_f32=True,
76
+ ):
77
+ super().__init__()
78
+ _levels = torch.tensor(levels, dtype=int32)
79
+ self.register_buffer("_levels", _levels, persistent=False)
80
+
81
+ _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
82
+ self.register_buffer("_basis", _basis, persistent=False)
83
+
84
+ self.scale = scale
85
+
86
+ codebook_dim = len(levels)
87
+ self.codebook_dim = codebook_dim
88
+
89
+ effective_codebook_dim = codebook_dim * num_codebooks
90
+ self.num_codebooks = num_codebooks
91
+ self.effective_codebook_dim = effective_codebook_dim
92
+
93
+ keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
94
+ assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
95
+ self.keep_num_codebooks_dim = keep_num_codebooks_dim
96
+
97
+ self.dim = default(dim, len(_levels) * num_codebooks)
98
+
99
+ self.channel_first = channel_first
100
+
101
+ has_projections = self.dim != effective_codebook_dim
102
+ self.project_in = (
103
+ nn.Linear(self.dim, effective_codebook_dim, bias=projection_has_bias)
104
+ if has_projections
105
+ else nn.Identity()
106
+ )
107
+ self.project_out = (
108
+ nn.Linear(effective_codebook_dim, self.dim, bias=projection_has_bias)
109
+ if has_projections
110
+ else nn.Identity()
111
+ )
112
+
113
+ self.has_projections = has_projections
114
+
115
+ self.return_indices = return_indices
116
+ if return_indices:
117
+ self.codebook_size = self._levels.prod().item()
118
+ implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
119
+ self.register_buffer(
120
+ "implicit_codebook", implicit_codebook, persistent=False
121
+ )
122
+
123
+ self.allowed_dtypes = allowed_dtypes
124
+ self.force_quantization_f32 = force_quantization_f32
125
+
126
+ def bound(self, z, eps: float = 1e-3):
127
+ """Bound `z`, an array of shape (..., d)."""
128
+ half_l = (self._levels - 1) * (1 + eps) / 2
129
+ offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
130
+ shift = (offset / half_l).atanh()
131
+ return (z + shift).tanh() * half_l - offset
132
+
133
+ def quantize(self, z):
134
+ """Quantizes z, returns quantized zhat, same shape as z."""
135
+ quantized = round_ste(self.bound(z))
136
+ half_width = self._levels // 2 # Renormalize to [-1, 1].
137
+ return quantized / half_width
138
+
139
+ def _scale_and_shift(self, zhat_normalized):
140
+ half_width = self._levels // 2
141
+ return (zhat_normalized * half_width) + half_width
142
+
143
+ def _scale_and_shift_inverse(self, zhat):
144
+ half_width = self._levels // 2
145
+ return (zhat - half_width) / half_width
146
+
147
+ def _indices_to_codes(self, indices):
148
+ level_indices = self.indices_to_level_indices(indices)
149
+ codes = self._scale_and_shift_inverse(level_indices)
150
+ return codes
151
+
152
+ def codes_to_indices(self, zhat):
153
+ """Converts a `code` to an index in the codebook."""
154
+ assert zhat.shape[-1] == self.codebook_dim
155
+ zhat = self._scale_and_shift(zhat)
156
+ return (zhat * self._basis).sum(dim=-1).to(int32)
157
+
158
+ def indices_to_level_indices(self, indices):
159
+ """Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings"""
160
+ indices = rearrange(indices, "... -> ... 1")
161
+ codes_non_centered = (indices // self._basis) % self._levels
162
+ return codes_non_centered
163
+
164
+ def indices_to_codes(self, indices):
165
+ """Inverse of `codes_to_indices`."""
166
+ assert exists(indices)
167
+
168
+ is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
169
+
170
+ codes = self._indices_to_codes(indices)
171
+
172
+ if self.keep_num_codebooks_dim:
173
+ codes = rearrange(codes, "... c d -> ... (c d)")
174
+
175
+ codes = self.project_out(codes)
176
+
177
+ if is_img_or_video or self.channel_first:
178
+ codes = rearrange(codes, "b ... d -> b d ...")
179
+
180
+ return codes
181
+
182
+ def forward(self, z):
183
+ """
184
+ einstein notation
185
+ b - batch
186
+ n - sequence (or flattened spatial dimensions)
187
+ d - feature dimension
188
+ c - number of codebook dim
189
+ """
190
+
191
+ is_img_or_video = z.ndim >= 4
192
+ need_move_channel_last = is_img_or_video or self.channel_first
193
+
194
+ # standardize image or video into (batch, seq, dimension)
195
+
196
+ if need_move_channel_last:
197
+ z = rearrange(z, "b d ... -> b ... d")
198
+ z, ps = pack_one(z, "b * d")
199
+
200
+ assert (
201
+ z.shape[-1] == self.dim
202
+ ), f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
203
+
204
+ z = self.project_in(z)
205
+
206
+ z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
207
+
208
+ # whether to force quantization step to be full precision or not
209
+
210
+ force_f32 = self.force_quantization_f32
211
+ quantization_context = (
212
+ partial(autocast, "cuda", enabled=False) if force_f32 else nullcontext
213
+ )
214
+
215
+ with quantization_context():
216
+ orig_dtype = z.dtype
217
+
218
+ if force_f32 and orig_dtype not in self.allowed_dtypes:
219
+ z = z.float()
220
+
221
+ codes = self.quantize(z)
222
+
223
+ # returning indices could be optional
224
+
225
+ indices = None
226
+
227
+ if self.return_indices:
228
+ indices = self.codes_to_indices(codes)
229
+
230
+ codes = rearrange(codes, "b n c d -> b n (c d)")
231
+
232
+ codes = codes.type(orig_dtype)
233
+
234
+ # project out
235
+
236
+ out = self.project_out(codes)
237
+
238
+ # reconstitute image or video dimensions
239
+
240
+ if need_move_channel_last:
241
+ out = unpack_one(out, ps, "b * d")
242
+ out = rearrange(out, "b ... d -> b d ...")
243
+
244
+ indices = maybe(unpack_one)(indices, ps, "b * c")
245
+
246
+ if not self.keep_num_codebooks_dim and self.return_indices:
247
+ indices = maybe(rearrange)(indices, "... 1 -> ...")
248
+
249
+ # return quantized output and indices
250
+
251
+ return out, indices
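With FSQ, each codebook dimension is rounded to a small fixed set of levels, so the implicit codebook size is the product of `levels`. A minimal round-trip sketch (import path assumed from this repo's layout):

import torch
from models.bicodec_tokenizer.modules.fsq.finite_scalar_quantization import FSQ

fsq = FSQ(levels=[8, 5, 5, 5])         # codebook size 8*5*5*5 = 1000, codebook_dim = 4
z = torch.randn(2, 16, 4)              # (batch, seq, dim)
quantized, indices = fsq(z)
print(quantized.shape, indices.shape)  # torch.Size([2, 16, 4]) torch.Size([2, 16])
codes = fsq.indices_to_codes(indices)  # maps indices back to the corresponding codes
print(codes.shape)                     # torch.Size([2, 16, 4])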
models/bicodec_tokenizer/modules/fsq/residual_fsq.py ADDED
@@ -0,0 +1,355 @@
1
+ import random
+ from math import ceil  # required by round_up_multiple below
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+
6
+ from typing import List
7
+ from torch import nn
8
+ from torch.nn import Module
9
+ from torch.amp import autocast
10
+ from einx import get_at
11
+ from einops import rearrange, reduce, pack, unpack
12
+
13
+ from .finite_scalar_quantization import FSQ
14
+
15
+
16
+ def exists(val):
17
+ return val is not None
18
+
19
+
20
+ def first(l):
21
+ return l[0]
22
+
23
+
24
+ def default(val, d):
25
+ return val if exists(val) else d
26
+
27
+
28
+ def round_up_multiple(num, mult):
29
+ return ceil(num / mult) * mult
30
+
31
+
32
+ # distributed helpers
33
+
34
+
35
+ def is_distributed():
36
+ return dist.is_initialized() and dist.get_world_size() > 1
37
+
38
+
39
+ def get_maybe_sync_seed(device, max_size=10_000):
40
+ rand_int = torch.randint(0, max_size, (), device=device)
41
+
42
+ if is_distributed():
43
+ dist.all_reduce(rand_int)
44
+
45
+ return rand_int.item()
46
+
47
+
48
+ class ResidualFSQ(Module):
49
+ """Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf"""
50
+
51
+ def __init__(
52
+ self,
53
+ *,
54
+ levels: List[int],
55
+ num_quantizers,
56
+ dim=None,
57
+ is_channel_first=False,
58
+ quantize_dropout=False,
59
+ quantize_dropout_cutoff_index=0,
60
+ quantize_dropout_multiple_of=1,
61
+ **kwargs,
62
+ ):
63
+ super().__init__()
64
+ codebook_dim = len(levels)
65
+ dim = default(dim, codebook_dim)
66
+
67
+ requires_projection = codebook_dim != dim
68
+ self.project_in = (
69
+ nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
70
+ )
71
+ self.project_out = (
72
+ nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
73
+ )
74
+ self.has_projections = requires_projection
75
+
76
+ self.is_channel_first = is_channel_first
77
+ self.num_quantizers = num_quantizers
78
+
79
+ self.levels = levels
80
+ self.layers = nn.ModuleList([])
81
+
82
+ levels_tensor = torch.Tensor(levels)
83
+
84
+ scales = []
85
+
86
+ for ind in range(num_quantizers):
87
+ scales.append((levels_tensor - 1) ** -ind)
88
+
89
+ fsq = FSQ(levels=levels, dim=codebook_dim, **kwargs)
90
+
91
+ self.layers.append(fsq)
92
+
93
+ assert all([not fsq.has_projections for fsq in self.layers])
94
+
95
+ self.codebook_size = self.layers[0].codebook_size
96
+
97
+ self.register_buffer("scales", torch.stack(scales), persistent=False)
98
+
99
+ self.quantize_dropout = quantize_dropout and num_quantizers > 1
100
+
101
+ assert quantize_dropout_cutoff_index >= 0
102
+
103
+ self.quantize_dropout_cutoff_index = quantize_dropout_cutoff_index
104
+ self.quantize_dropout_multiple_of = quantize_dropout_multiple_of # encodec paper proposes structured dropout, believe this was set to 4
105
+
106
+ @property
107
+ def codebooks(self):
108
+ codebooks = [layer.implicit_codebook for layer in self.layers]
109
+ codebooks = torch.stack(codebooks, dim=0)
110
+ return codebooks
111
+
112
+ def get_codes_from_indices(self, indices):
113
+
114
+ batch, quantize_dim = indices.shape[0], indices.shape[-1]
115
+
116
+ # may also receive indices in the shape of 'b h w q' (accept_image_fmap)
117
+
118
+ indices, ps = pack([indices], "b * q")
119
+
120
+ # because of quantize dropout, one can pass in indices that are coarse
121
+ # and the network should be able to reconstruct
122
+
123
+ if quantize_dim < self.num_quantizers:
124
+ assert (
125
+ self.quantize_dropout > 0.0
126
+ ), "quantize dropout must be greater than 0 if you wish to reconstruct from a signal with less fine quantizations"
127
+ indices = F.pad(indices, (0, self.num_quantizers - quantize_dim), value=-1)
128
+
129
+ # take care of quantizer dropout
130
+
131
+ mask = indices == -1
132
+ indices = indices.masked_fill(
133
+ mask, 0
134
+ ) # have it fetch a dummy code to be masked out later
135
+
136
+ all_codes = get_at("q [c] d, b n q -> q b n d", self.codebooks, indices)
137
+
138
+ # mask out any codes that were dropout-ed
139
+
140
+ all_codes = all_codes.masked_fill(rearrange(mask, "b n q -> q b n 1"), 0.0)
141
+
142
+ # scale the codes
143
+
144
+ scales = rearrange(self.scales, "q d -> q 1 1 d")
145
+ all_codes = all_codes * scales
146
+
147
+ # if (accept_image_fmap = True) then return shape (quantize, batch, height, width, dimension)
148
+
149
+ (all_codes,) = unpack(all_codes, ps, "q b * d")
150
+
151
+ return all_codes
152
+
153
+ def get_output_from_indices(self, indices):
154
+ codes = self.get_codes_from_indices(indices)
155
+ codes_summed = reduce(codes, "q ... -> ...", "sum")
156
+ return self.project_out(codes_summed)
157
+
158
+ def forward(self, x, return_all_codes=False, rand_quantize_dropout_fixed_seed=None):
159
+ num_quant, quant_dropout_multiple_of, device = (
160
+ self.num_quantizers,
161
+ self.quantize_dropout_multiple_of,
162
+ x.device,
163
+ )
164
+
165
+ # handle channel first
166
+
167
+ if self.is_channel_first:
168
+ x = rearrange(x, "b d ... -> b ... d")
169
+ x, ps = pack([x], "b * d")
170
+
171
+ # maybe project in
172
+
173
+ x = self.project_in(x)
174
+
175
+ quantized_out = 0.0
176
+ residual = x
177
+
178
+ all_indices = []
179
+
180
+ should_quantize_dropout = self.training and self.quantize_dropout
181
+
182
+ # sample a layer index at which to dropout further residual quantization
183
+ # also prepare null indices
184
+
185
+ if should_quantize_dropout:
186
+
187
+ # check if seed is manually passed in
188
+
189
+ if not exists(rand_quantize_dropout_fixed_seed):
190
+ rand_quantize_dropout_fixed_seed = get_maybe_sync_seed(device)
191
+
192
+ rand = random.Random(rand_quantize_dropout_fixed_seed)
193
+
194
+ rand_quantize_dropout_index = rand.randrange(
195
+ self.quantize_dropout_cutoff_index, num_quant
196
+ )
197
+
198
+ if quant_dropout_multiple_of != 1:
199
+ rand_quantize_dropout_index = (
200
+ round_up_multiple(
201
+ rand_quantize_dropout_index + 1, quant_dropout_multiple_of
202
+ )
203
+ - 1
204
+ )
205
+
206
+ null_indices = torch.full(
207
+ x.shape[:2], -1.0, device=device, dtype=torch.long
208
+ )
209
+
210
+ # go through the layers
211
+
212
+ with autocast("cuda", enabled=False):
213
+ for quantizer_index, (layer, scale) in enumerate(
214
+ zip(self.layers, self.scales)
215
+ ):
216
+
217
+ if (
218
+ should_quantize_dropout
219
+ and quantizer_index > rand_quantize_dropout_index
220
+ ):
221
+ all_indices.append(null_indices)
222
+ continue
223
+
224
+ quantized, indices = layer(residual / scale)
225
+
226
+ quantized = quantized * scale
227
+
228
+ residual = residual - quantized.detach()
229
+ quantized_out = quantized_out + quantized
230
+
231
+ all_indices.append(indices)
232
+
233
+ # project out, if needed
234
+
235
+ quantized_out = self.project_out(quantized_out)
236
+
237
+ # stack all indices
238
+
239
+ all_indices = torch.stack(all_indices, dim=-1)
240
+
241
+ # channel first out
242
+
243
+ if self.is_channel_first:
244
+ (quantized_out,) = unpack(quantized_out, ps, "b * d")
245
+ (all_indices,) = unpack(all_indices, ps, "b * d")
246
+
247
+ quantized_out = rearrange(quantized_out, "b ... d -> b d ...")
248
+ all_indices = rearrange(all_indices, "b ... d -> b d ...")
249
+
250
+ # return
251
+
252
+ ret = (quantized_out, all_indices)
253
+
254
+ if not return_all_codes:
255
+ return ret
256
+
257
+ # whether to return all codes from all codebooks across layers
258
+
259
+ all_codes = self.get_codes_from_indices(all_indices)
260
+
261
+ # will return all codes in shape (quantizer, batch, sequence length, codebook dimension)
262
+
263
+ return (*ret, all_codes)
264
+
265
+
266
+ # grouped residual fsq
267
+
268
+
269
+ class GroupedResidualFSQ(Module):
270
+ def __init__(self, *, dim, groups=1, accept_image_fmap=False, **kwargs):
271
+ super().__init__()
272
+ self.dim = dim
273
+ self.groups = groups
274
+ assert (dim % groups) == 0
275
+ dim_per_group = dim // groups
276
+
277
+ self.accept_image_fmap = accept_image_fmap
278
+
279
+ self.rvqs = nn.ModuleList([])
280
+
281
+ for _ in range(groups):
282
+ self.rvqs.append(ResidualFSQ(dim=dim_per_group, **kwargs))
283
+
284
+ self.codebook_size = self.rvqs[0].codebook_size
285
+
286
+ @property
287
+ def codebooks(self):
288
+ return torch.stack(tuple(rvq.codebooks for rvq in self.rvqs))
289
+
290
+ @property
291
+ def split_dim(self):
292
+ return 1 if self.accept_image_fmap else -1
293
+
294
+ def get_codes_from_indices(self, indices):
295
+ codes = tuple(
296
+ rvq.get_codes_from_indices(chunk_indices)
297
+ for rvq, chunk_indices in zip(self.rvqs, indices)
298
+ )
299
+ return torch.stack(codes)
300
+
301
+ def get_output_from_indices(self, indices):
302
+ outputs = tuple(
303
+ rvq.get_output_from_indices(chunk_indices)
304
+ for rvq, chunk_indices in zip(self.rvqs, indices)
305
+ )
306
+ return torch.cat(outputs, dim=self.split_dim)
307
+
308
+ def forward(self, x, return_all_codes=False):
309
+ shape, split_dim, device = x.shape, self.split_dim, x.device
310
+ assert shape[split_dim] == self.dim
311
+
312
+ # split the feature dimension into groups
313
+
314
+ x = x.chunk(self.groups, dim=split_dim)
315
+
316
+ forward_kwargs = dict(
317
+ return_all_codes=return_all_codes,
318
+ rand_quantize_dropout_fixed_seed=(
319
+ get_maybe_sync_seed(device) if self.training else None
320
+ ),
321
+ )
322
+
323
+ # invoke residual vq on each group
324
+
325
+ out = tuple(rvq(chunk, **forward_kwargs) for rvq, chunk in zip(self.rvqs, x))
326
+ out = tuple(zip(*out))
327
+
328
+ # otherwise, get all the zipped outputs and combine them
329
+
330
+ quantized, all_indices, *maybe_all_codes = out
331
+
332
+ quantized = torch.cat(quantized, dim=split_dim)
333
+ all_indices = torch.stack(all_indices)
334
+
335
+ ret = (quantized, all_indices, *maybe_all_codes)
336
+ return ret
337
+
338
+
339
+ if __name__ == "__main__":
340
+ model = ResidualFSQ(
341
+ levels=[4, 4, 4, 4, 4, 4],
342
+ num_quantizers=1,
343
+ dim=30,
344
+ is_channel_first=True,
345
+ quantize_dropout=False,
346
+ )
347
+ x = torch.randn(2, 30, 10)
348
+ quantize, embed_ind = model(x)
349
+
350
+ emb_from_ind = model.get_output_from_indices(embed_ind.transpose(1, 2))
351
+
352
+ print(quantize == emb_from_ind.transpose(1, 2))
353
+
354
+ print("quantize shape", quantize.shape)
355
+ print("embed_ind", embed_ind)
models/bicodec_tokenizer/modules/speaker/__init__.py ADDED
File without changes
models/bicodec_tokenizer/modules/speaker/ecapa_tdnn.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) 2021 Zhengyang Chen (chenzhengyang117@gmail.com)
2
+ # 2022 Hongji Wang (jijijiang77@gmail.com)
3
+ # 2023 Bing Han (hanbing97@sjtu.edu.cn)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """ This implementation is adapted from github repo:
18
+ https://github.com/lawlict/ECAPA-TDNN.
19
+ """
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
+ from . import pooling_layers
26
+
27
+
28
+ class Res2Conv1dReluBn(nn.Module):
29
+ """
30
+ in_channels == out_channels == channels
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ channels,
36
+ kernel_size=1,
37
+ stride=1,
38
+ padding=0,
39
+ dilation=1,
40
+ bias=True,
41
+ scale=4,
42
+ ):
43
+ super().__init__()
44
+ assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
45
+ self.scale = scale
46
+ self.width = channels // scale
47
+ self.nums = scale if scale == 1 else scale - 1
48
+
49
+ self.convs = []
50
+ self.bns = []
51
+ for i in range(self.nums):
52
+ self.convs.append(
53
+ nn.Conv1d(
54
+ self.width,
55
+ self.width,
56
+ kernel_size,
57
+ stride,
58
+ padding,
59
+ dilation,
60
+ bias=bias,
61
+ )
62
+ )
63
+ self.bns.append(nn.BatchNorm1d(self.width))
64
+ self.convs = nn.ModuleList(self.convs)
65
+ self.bns = nn.ModuleList(self.bns)
66
+
67
+ def forward(self, x):
68
+ out = []
69
+ spx = torch.split(x, self.width, 1)
70
+ sp = spx[0]
71
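+ # Res2Net-style hierarchy: from the second split on, each split is summed with the previous branch's output before its own conv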
+ for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
72
+ # Order: conv -> relu -> bn
73
+ if i >= 1:
74
+ sp = sp + spx[i]
75
+ sp = conv(sp)
76
+ sp = bn(F.relu(sp))
77
+ out.append(sp)
78
+ if self.scale != 1:
79
+ out.append(spx[self.nums])
80
+ out = torch.cat(out, dim=1)
81
+
82
+ return out
83
+
84
+
85
+ """ Conv1d + BatchNorm1d + ReLU
86
+ """
87
+
88
+
89
+ class Conv1dReluBn(nn.Module):
90
+
91
+ def __init__(
92
+ self,
93
+ in_channels,
94
+ out_channels,
95
+ kernel_size=1,
96
+ stride=1,
97
+ padding=0,
98
+ dilation=1,
99
+ bias=True,
100
+ ):
101
+ super().__init__()
102
+ self.conv = nn.Conv1d(
103
+ in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
104
+ )
105
+ self.bn = nn.BatchNorm1d(out_channels)
106
+
107
+ def forward(self, x):
108
+ return self.bn(F.relu(self.conv(x)))
109
+
110
+
111
+ """ The SE connection of 1D case.
112
+ """
113
+
114
+
115
+ class SE_Connect(nn.Module):
116
+
117
+ def __init__(self, channels, se_bottleneck_dim=128):
118
+ super().__init__()
119
+ self.linear1 = nn.Linear(channels, se_bottleneck_dim)
120
+ self.linear2 = nn.Linear(se_bottleneck_dim, channels)
121
+
122
+ def forward(self, x):
123
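+ # Squeeze-and-Excitation: mean-pool over time, bottleneck MLP, then sigmoid gates rescale each channel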
+ out = x.mean(dim=2)
124
+ out = F.relu(self.linear1(out))
125
+ out = torch.sigmoid(self.linear2(out))
126
+ out = x * out.unsqueeze(2)
127
+
128
+ return out
129
+
130
+
131
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
132
+ """
133
+
134
+
135
+ class SE_Res2Block(nn.Module):
136
+
137
+ def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
138
+ super().__init__()
139
+ self.se_res2block = nn.Sequential(
140
+ Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
141
+ Res2Conv1dReluBn(
142
+ channels, kernel_size, stride, padding, dilation, scale=scale
143
+ ),
144
+ Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
145
+ SE_Connect(channels),
146
+ )
147
+
148
+ def forward(self, x):
149
+ return x + self.se_res2block(x)
150
+
151
+
152
+ class ECAPA_TDNN(nn.Module):
153
+
154
+ def __init__(
155
+ self,
156
+ channels=512,
157
+ feat_dim=80,
158
+ embed_dim=192,
159
+ pooling_func="ASTP",
160
+ global_context_att=False,
161
+ emb_bn=False,
162
+ ):
163
+ super().__init__()
164
+
165
+ self.layer1 = Conv1dReluBn(feat_dim, channels, kernel_size=5, padding=2)
166
+ self.layer2 = SE_Res2Block(
167
+ channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8
168
+ )
169
+ self.layer3 = SE_Res2Block(
170
+ channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8
171
+ )
172
+ self.layer4 = SE_Res2Block(
173
+ channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8
174
+ )
175
+
176
+ cat_channels = channels * 3
177
+ out_channels = 512 * 3
178
+ self.conv = nn.Conv1d(cat_channels, out_channels, kernel_size=1)
179
+ self.pool = getattr(pooling_layers, pooling_func)(
180
+ in_dim=out_channels, global_context_att=global_context_att
181
+ )
182
+ self.pool_out_dim = self.pool.get_out_dim()
183
+ self.bn = nn.BatchNorm1d(self.pool_out_dim)
184
+ self.linear = nn.Linear(self.pool_out_dim, embed_dim)
185
+ self.emb_bn = emb_bn
186
+ if emb_bn: # better in SSL for SV
187
+ self.bn2 = nn.BatchNorm1d(embed_dim)
188
+ else:
189
+ self.bn2 = nn.Identity()
190
+
191
+ def forward(self, x, return_latent=False):
192
+ x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T)
193
+
194
+ out1 = self.layer1(x)
195
+ out2 = self.layer2(out1)
196
+ out3 = self.layer3(out2)
197
+ out4 = self.layer4(out3)
198
+
199
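+ # Multi-layer feature aggregation: concatenate the outputs of the three SE-Res2Blocks along the channel dimension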
+ out = torch.cat([out2, out3, out4], dim=1)
200
+ latent = F.relu(self.conv(out))
201
+ out = self.bn(self.pool(latent))
202
+ out = self.linear(out)
203
+ if self.emb_bn:
204
+ out = self.bn2(out)
205
+
206
+ if return_latent:
207
+ return out, latent
208
+ return out
209
+
210
+
211
+ def ECAPA_TDNN_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
212
+ return ECAPA_TDNN(
213
+ channels=1024,
214
+ feat_dim=feat_dim,
215
+ embed_dim=embed_dim,
216
+ pooling_func=pooling_func,
217
+ emb_bn=emb_bn,
218
+ )
219
+
220
+
221
+ def ECAPA_TDNN_GLOB_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
222
+ return ECAPA_TDNN(
223
+ channels=1024,
224
+ feat_dim=feat_dim,
225
+ embed_dim=embed_dim,
226
+ pooling_func=pooling_func,
227
+ global_context_att=True,
228
+ emb_bn=emb_bn,
229
+ )
230
+
231
+
232
+ def ECAPA_TDNN_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
233
+ return ECAPA_TDNN(
234
+ channels=512,
235
+ feat_dim=feat_dim,
236
+ embed_dim=embed_dim,
237
+ pooling_func=pooling_func,
238
+ emb_bn=emb_bn,
239
+ )
240
+
241
+
242
+ def ECAPA_TDNN_GLOB_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
243
+ return ECAPA_TDNN(
244
+ channels=512,
245
+ feat_dim=feat_dim,
246
+ embed_dim=embed_dim,
247
+ pooling_func=pooling_func,
248
+ global_context_att=True,
249
+ emb_bn=emb_bn,
250
+ )
251
+
252
+
253
+ if __name__ == "__main__":
254
+ x = torch.zeros(1, 200, 100)
255
+ model = ECAPA_TDNN_GLOB_c512(feat_dim=100, embed_dim=256, pooling_func="ASTP")
256
+ model.eval()
257
+ out, latent = model(x, True)
258
+ print(out.shape)
259
+ print(latent.shape)
260
+
261
+ num_params = sum(param.numel() for param in model.parameters())
262
+ print("{} M".format(num_params / 1e6))
263
+
264
+ # from thop import profile
265
+ # x_np = torch.randn(1, 200, 80)
266
+ # flops, params = profile(model, inputs=(x_np, ))
267
+ # print("FLOPs: {} G, Params: {} M".format(flops / 1e9, params / 1e6))
models/bicodec_tokenizer/modules/speaker/perceiver_encoder.py ADDED
@@ -0,0 +1,360 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
17
+
18
+ from collections import namedtuple
19
+ from functools import wraps
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from einops import rearrange, repeat
24
+ from einops.layers.torch import Rearrange
25
+ from packaging import version
26
+ from torch import einsum, nn
27
+
28
+
29
+ def exists(val):
30
+ return val is not None
31
+
32
+
33
+ def once(fn):
34
+ called = False
35
+
36
+ @wraps(fn)
37
+ def inner(x):
38
+ nonlocal called
39
+ if called:
40
+ return
41
+ called = True
42
+ return fn(x)
43
+
44
+ return inner
45
+
46
+
47
+ print_once = once(print)
48
+
49
+ # main class
50
+
51
+
52
+ class Attend(nn.Module):
53
+ def __init__(self, dropout=0.0, causal=False, use_flash=False):
54
+ super().__init__()
55
+ self.dropout = dropout
56
+ self.attn_dropout = nn.Dropout(dropout)
57
+
58
+ self.causal = causal
59
+ self.register_buffer("mask", None, persistent=False)
60
+
61
+ self.use_flash = use_flash
62
+ assert not (
63
+ use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
64
+ ), "in order to use flash attention, you must be using pytorch 2.0 or above"
65
+
66
+ # determine efficient attention configs for cuda and cpu
67
+ self.config = namedtuple(
68
+ "EfficientAttentionConfig",
69
+ ["enable_flash", "enable_math", "enable_mem_efficient"],
70
+ )
71
+ self.cpu_config = self.config(True, True, True)
72
+ self.cuda_config = None
73
+
74
+ if not torch.cuda.is_available() or not use_flash:
75
+ return
76
+
77
+ device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
78
+
79
+ if device_properties.major == 8 and device_properties.minor == 0:
80
+ print_once(
81
+ "A100 GPU detected, using flash attention if input tensor is on cuda"
82
+ )
83
+ self.cuda_config = self.config(True, False, False)
84
+ else:
85
+ print_once(
86
+ "Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda"
87
+ )
88
+ self.cuda_config = self.config(False, True, True)
89
+
90
+ def get_mask(self, n, device):
91
+ if exists(self.mask) and self.mask.shape[-1] >= n:
92
+ return self.mask[:n, :n]
93
+
94
+ mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1)
95
+ self.register_buffer("mask", mask, persistent=False)
96
+ return mask
97
+
98
+ def flash_attn(self, q, k, v, mask=None):
99
+ _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
100
+
101
+ # Recommended for multi-query single-key-value attention by Tri Dao
102
+ # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
103
+
104
+ if k.ndim == 3:
105
+ k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
106
+
107
+ if v.ndim == 3:
108
+ v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
109
+
110
+ # Check if mask exists and expand to compatible shape
111
+ # The mask is B L, so it would have to be expanded to B H N L
112
+
113
+ if exists(mask):
114
+ mask = rearrange(mask, "b j -> b 1 1 j")
115
+ mask = mask.expand(-1, heads, q_len, -1)
116
+
117
+ # Check if there is a compatible device for flash attention
118
+
119
+ config = self.cuda_config if is_cuda else self.cpu_config
120
+
121
+ # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
122
+
123
+ with torch.backends.cuda.sdp_kernel(**config._asdict()):
124
+ out = F.scaled_dot_product_attention(
125
+ q,
126
+ k,
127
+ v,
128
+ attn_mask=mask,
129
+ dropout_p=self.dropout if self.training else 0.0,
130
+ is_causal=self.causal,
131
+ )
132
+
133
+ return out
134
+
135
+ def forward(self, q, k, v, mask=None):
136
+ """
137
+ einstein notation
138
+ b - batch
139
+ h - heads
140
+ n, i, j - sequence length (base sequence length, source, target)
141
+ d - feature dimension
142
+ """
143
+
144
+ n, device = q.shape[-2], q.device
145
+
146
+ scale = q.shape[-1] ** -0.5
147
+
148
+ if self.use_flash:
149
+ return self.flash_attn(q, k, v, mask=mask)
150
+
151
+ kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
152
+
153
+ # similarity
154
+
155
+ sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale
156
+
157
+ # key padding mask
158
+
159
+ if exists(mask):
160
+ mask = rearrange(mask, "b j -> b 1 1 j")
161
+ sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
162
+
163
+ # causal mask
164
+
165
+ if self.causal:
166
+ causal_mask = self.get_mask(n, device)
167
+ sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
168
+
169
+ # attention
170
+
171
+ attn = sim.softmax(dim=-1)
172
+ attn = self.attn_dropout(attn)
173
+
174
+ # aggregate values
175
+
176
+ out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v)
177
+
178
+ return out
179
+
180
+
181
+ def Sequential(*mods):
182
+ return nn.Sequential(*filter(exists, mods))
183
+
184
+
185
+ def exists(x):
186
+ return x is not None
187
+
188
+
189
+ def default(val, d):
190
+ if exists(val):
191
+ return val
192
+ return d() if callable(d) else d
193
+
194
+
195
+ class RMSNorm(nn.Module):
196
+ def __init__(self, dim, scale=True, dim_cond=None):
197
+ super().__init__()
198
+ self.cond = exists(dim_cond)
199
+ self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
200
+
201
+ self.scale = dim**0.5
202
+ self.gamma = nn.Parameter(torch.ones(dim)) if scale else None
203
+
204
+ def forward(self, x, cond=None):
205
+ gamma = default(self.gamma, 1)
206
+ out = F.normalize(x, dim=-1) * self.scale * gamma
207
+
208
+ if not self.cond:
209
+ return out
210
+
211
+ assert exists(cond)
212
+ gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1)
213
+ gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta))
214
+ return out * gamma + beta
215
+
216
+
217
+ class CausalConv1d(nn.Conv1d):
218
+ def __init__(self, *args, **kwargs):
219
+ super().__init__(*args, **kwargs)
220
+ (kernel_size,) = self.kernel_size
221
+ (dilation,) = self.dilation
222
+ (stride,) = self.stride
223
+
224
+ assert stride == 1
225
+ self.causal_padding = dilation * (kernel_size - 1)
226
+
227
+ def forward(self, x):
228
+ causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0)
229
+ return super().forward(causal_padded_x)
230
+
231
+
232
+ class GEGLU(nn.Module):
233
+ def forward(self, x):
234
+ x, gate = x.chunk(2, dim=-1)
235
+ return F.gelu(gate) * x
236
+
237
+
238
+ def FeedForward(dim, mult=4, causal_conv=False):
239
+ dim_inner = int(dim * mult * 2 / 3)
240
+
241
+ conv = None
242
+ if causal_conv:
243
+ conv = nn.Sequential(
244
+ Rearrange("b n d -> b d n"),
245
+ CausalConv1d(dim_inner, dim_inner, 3),
246
+ Rearrange("b d n -> b n d"),
247
+ )
248
+
249
+ return Sequential(
250
+ nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim)
251
+ )
252
+
253
+
254
+ class Attention(nn.Module):
255
+ def __init__(
256
+ self,
257
+ dim,
258
+ *,
259
+ dim_context=None,
260
+ causal=False,
261
+ dim_head=64,
262
+ heads=8,
263
+ dropout=0.0,
264
+ use_flash=False,
265
+ cross_attn_include_queries=False,
266
+ ):
267
+ super().__init__()
268
+ self.scale = dim_head**-0.5
269
+ self.heads = heads
270
+ self.cross_attn_include_queries = cross_attn_include_queries
271
+
272
+ dim_inner = dim_head * heads
273
+ dim_context = default(dim_context, dim)
274
+
275
+ self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash)
276
+ self.to_q = nn.Linear(dim, dim_inner, bias=False)
277
+ self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
278
+ self.to_out = nn.Linear(dim_inner, dim, bias=False)
279
+
280
+ def forward(self, x, context=None, mask=None):
281
+ h, has_context = self.heads, exists(context)
282
+
283
+ context = default(context, x)
284
+
285
+ if has_context and self.cross_attn_include_queries:
286
+ context = torch.cat((x, context), dim=-2)
287
+
288
+ q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
289
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
290
+
291
+ out = self.attend(q, k, v, mask=mask)
292
+
293
+ out = rearrange(out, "b h n d -> b n (h d)")
294
+ return self.to_out(out)
295
+
296
+
297
+ class PerceiverResampler(nn.Module):
298
+ def __init__(
299
+ self,
300
+ *,
301
+ dim,
302
+ depth=2,
303
+ dim_context=None,
304
+ num_latents=32,
305
+ dim_head=64,
306
+ heads=8,
307
+ ff_mult=4,
308
+ use_flash_attn=False,
309
+ ):
310
+ super().__init__()
311
+ dim_context = default(dim_context, dim)
312
+
313
+ self.proj_context = (
314
+ nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity()
315
+ )
316
+
317
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
318
+ nn.init.normal_(self.latents, std=0.02)
319
+
320
+ self.layers = nn.ModuleList([])
321
+ for _ in range(depth):
322
+ self.layers.append(
323
+ nn.ModuleList(
324
+ [
325
+ Attention(
326
+ dim=dim,
327
+ dim_head=dim_head,
328
+ heads=heads,
329
+ use_flash=use_flash_attn,
330
+ cross_attn_include_queries=True,
331
+ ),
332
+ FeedForward(dim=dim, mult=ff_mult),
333
+ ]
334
+ )
335
+ )
336
+
337
+ self.norm = RMSNorm(dim)
338
+
339
+ def forward(self, x, mask=None):
340
+ batch = x.shape[0]
341
+
342
+ x = self.proj_context(x)
343
+
344
+ latents = repeat(self.latents, "n d -> b n d", b=batch)
345
+
346
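+ # Each layer: the learned latent queries cross-attend to the projected input (latents included as context), then a feed-forward refines them; both use residual connections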
+ for attn, ff in self.layers:
347
+ latents = attn(latents, x, mask=mask) + latents
348
+ latents = ff(latents) + latents
349
+
350
+ return self.norm(latents)
351
+
352
+
353
+ if __name__ == "__main__":
354
+ model = PerceiverResampler(dim=256, dim_context=80)
355
+ x = torch.randn(8, 200, 80)
356
+ out = model(x)
357
+ print(out.shape)  # [8, 32, 256]
358
+
359
+ num_params = sum(param.numel() for param in model.parameters())
360
+ print("{} M".format(num_params / 1e6))
models/bicodec_tokenizer/modules/speaker/pooling_layers.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Pooling functions to aggregate frame-level deep features
16
+ into segment-level speaker embeddings
17
+
18
+ High-order statistics are surprisingly effective, TSDP acts similarly as TSTP,
19
+ even though we remove the mean statistic, on Voxceleb.
20
+ """
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+
26
+
27
+ class TAP(nn.Module):
28
+ """
29
+ Temporal average pooling, only first-order mean is considered
30
+ """
31
+
32
+ def __init__(self, in_dim=0, **kwargs):
33
+ super(TAP, self).__init__()
34
+ self.in_dim = in_dim
35
+
36
+ def forward(self, x):
37
+ pooling_mean = x.mean(dim=-1)
38
+ # To be compatible with 2D input
39
+ pooling_mean = pooling_mean.flatten(start_dim=1)
40
+ return pooling_mean
41
+
42
+ def get_out_dim(self):
43
+ self.out_dim = self.in_dim
44
+ return self.out_dim
45
+
46
+
47
+ class TSDP(nn.Module):
48
+ """
49
+ Temporal standard deviation pooling, only second-order std is considered
50
+ """
51
+
52
+ def __init__(self, in_dim=0, **kwargs):
53
+ super(TSDP, self).__init__()
54
+ self.in_dim = in_dim
55
+
56
+ def forward(self, x):
57
+ # The last dimension is the temporal axis
58
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
59
+ pooling_std = pooling_std.flatten(start_dim=1)
60
+ return pooling_std
61
+
62
+ def get_out_dim(self):
63
+ self.out_dim = self.in_dim
64
+ return self.out_dim
65
+
66
+
67
+ class TSTP(nn.Module):
68
+ """
69
+ Temporal statistics pooling, concatenate mean and std, which is used in
70
+ x-vector
71
+ Comment: simple concatenation can not make full use of both statistics
72
+ """
73
+
74
+ def __init__(self, in_dim=0, **kwargs):
75
+ super(TSTP, self).__init__()
76
+ self.in_dim = in_dim
77
+
78
+ def forward(self, x):
79
+ # The last dimension is the temporal axis
80
+ pooling_mean = x.mean(dim=-1)
81
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
82
+ pooling_mean = pooling_mean.flatten(start_dim=1)
83
+ pooling_std = pooling_std.flatten(start_dim=1)
84
+ stats = torch.cat((pooling_mean, pooling_std), 1)
85
+ return stats
86
+
87
+ def get_out_dim(self):
88
+ self.out_dim = self.in_dim * 2
89
+ return self.out_dim
90
+
91
+
92
+ class ASTP(nn.Module):
93
+ """ Attentive statistics pooling: Channel- and context-dependent
94
+ statistics pooling, first used in ECAPA_TDNN.
95
+ """
96
+
97
+ def __init__(self,
98
+ in_dim,
99
+ bottleneck_dim=128,
100
+ global_context_att=False,
101
+ **kwargs):
102
+ super(ASTP, self).__init__()
103
+ self.in_dim = in_dim
104
+ self.global_context_att = global_context_att
105
+
106
+ # Use Conv1d with stride == 1 rather than Linear, then we don't
107
+ # need to transpose inputs.
108
+ if global_context_att:
109
+ self.linear1 = nn.Conv1d(
110
+ in_dim * 3, bottleneck_dim,
111
+ kernel_size=1) # equals W and b in the paper
112
+ else:
113
+ self.linear1 = nn.Conv1d(
114
+ in_dim, bottleneck_dim,
115
+ kernel_size=1) # equals W and b in the paper
116
+ self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
117
+ kernel_size=1) # equals V and k in the paper
118
+
119
+ def forward(self, x):
120
+ """
121
+ x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
122
+ or a 4-dimensional tensor in resnet architecture (B,C,F,T)
123
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
124
+ """
125
+ if len(x.shape) == 4:
126
+ x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
127
+ assert len(x.shape) == 3
128
+
129
+ if self.global_context_att:
130
+ context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
131
+ context_std = torch.sqrt(
132
+ torch.var(x, dim=-1, keepdim=True) + 1e-7).expand_as(x)
133
+ x_in = torch.cat((x, context_mean, context_std), dim=1)
134
+ else:
135
+ x_in = x
136
+
137
+ # DON'T use ReLU here! ReLU may be hard to converge.
138
+ alpha = torch.tanh(
139
+ self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
140
+ alpha = torch.softmax(self.linear2(alpha), dim=2)
141
+ mean = torch.sum(alpha * x, dim=2)
142
+ var = torch.sum(alpha * (x**2), dim=2) - mean**2
143
+ std = torch.sqrt(var.clamp(min=1e-7))
144
+ return torch.cat([mean, std], dim=1)
145
+
146
+ def get_out_dim(self):
147
+ self.out_dim = 2 * self.in_dim
148
+ return self.out_dim
149
+
150
+
151
+ class MHASTP(torch.nn.Module):
152
+ """ Multi head attentive statistics pooling
153
+ Reference:
154
+ Self Multi-Head Attention for Speaker Recognition
155
+ https://arxiv.org/pdf/1906.09890.pdf
156
+ """
157
+
158
+ def __init__(self,
159
+ in_dim,
160
+ layer_num=2,
161
+ head_num=2,
162
+ d_s=1,
163
+ bottleneck_dim=64,
164
+ **kwargs):
165
+ super(MHASTP, self).__init__()
166
+ assert (in_dim % head_num
167
+ ) == 0 # make sure that head num can be divided by input_dim
168
+ self.in_dim = in_dim
169
+ self.head_num = head_num
170
+ d_model = int(in_dim / head_num)
171
+ channel_dims = [bottleneck_dim for i in range(layer_num + 1)]
172
+ if d_s > 1:
173
+ d_s = d_model
174
+ else:
175
+ d_s = 1
176
+ self.d_s = d_s
177
+ channel_dims[0], channel_dims[-1] = d_model, d_s
178
+ heads_att_trans = []
179
+ for i in range(self.head_num):
180
+ att_trans = nn.Sequential()
181
+ for i in range(layer_num - 1):
182
+ att_trans.add_module(
183
+ 'att_' + str(i),
184
+ nn.Conv1d(channel_dims[i], channel_dims[i + 1], 1, 1))
185
+ att_trans.add_module('tanh' + str(i), nn.Tanh())
186
+ att_trans.add_module(
187
+ 'att_' + str(layer_num - 1),
188
+ nn.Conv1d(channel_dims[layer_num - 1], channel_dims[layer_num],
189
+ 1, 1))
190
+ heads_att_trans.append(att_trans)
191
+ self.heads_att_trans = nn.ModuleList(heads_att_trans)
192
+
193
+ def forward(self, input):
194
+ """
195
+ input: a 3-dimensional tensor in xvector architecture
196
+ or a 4-dimensional tensor in resnet architecture
197
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
198
+ """
199
+ if len(input.shape) == 4: # B x F x T
200
+ input = input.reshape(input.shape[0],
201
+ input.shape[1] * input.shape[2],
202
+ input.shape[3])
203
+ assert len(input.shape) == 3
204
+ bs, f_dim, t_dim = input.shape
205
+ chunks = torch.chunk(input, self.head_num, 1)
206
+ # split
207
+ chunks_out = []
208
+ # for i in range(self.head_num):
209
+ # att_score = self.heads_att_trans[i](chunks[i])
210
+ for i, layer in enumerate(self.heads_att_trans):
211
+ att_score = layer(chunks[i])
212
+ alpha = F.softmax(att_score, dim=-1)
213
+ mean = torch.sum(alpha * chunks[i], dim=2)
214
+ var = torch.sum(alpha * chunks[i]**2, dim=2) - mean**2
215
+ std = torch.sqrt(var.clamp(min=1e-7))
216
+ chunks_out.append(torch.cat((mean, std), dim=1))
217
+ out = torch.cat(chunks_out, dim=1)
218
+ return out
219
+
220
+ def get_out_dim(self):
221
+ self.out_dim = 2 * self.in_dim
222
+ return self.out_dim
223
+
224
+
225
+ class MQMHASTP(torch.nn.Module):
226
+ """ An attentive pooling
227
+ Reference:
228
+ multi query multi head attentive statistics pooling
229
+ https://arxiv.org/pdf/2110.05042.pdf
230
+ Args:
231
+ in_dim: the feature dimension of input
232
+ layer_num: the number of layer in the pooling layer
233
+ query_num: the number of querys
234
+ head_num: the number of heads
235
+ bottleneck_dim: the bottleneck dimension
236
+
237
+ SA (H = 1, Q = 1, n = 2, d_s = 1) ref:
238
+ https://www.danielpovey.com/files/2018_interspeech_xvector_attention.pdf
239
+ MHA (H > 1, Q = 1, n = 1, d_s = 1) ref:
240
+ https://arxiv.org/pdf/1906.09890.pdf
241
+ AS (H = 1, Q > 1, n = 2, d_s = 1) ref:
242
+ https://arxiv.org/pdf/1803.10963.pdf
243
+ VSA (H = 1, Q > 1, n = 2, d_s = d_h) ref:
244
+ http://www.interspeech2020.org/uploadfile/pdf/Mon-2-10-5.pdf
245
+ """
246
+
247
+ def __init__(self,
248
+ in_dim,
249
+ layer_num=2,
250
+ query_num=2,
251
+ head_num=8,
252
+ d_s=2,
253
+ bottleneck_dim=64,
254
+ **kwargs):
255
+ super(MQMHASTP, self).__init__()
256
+ self.n_query = nn.ModuleList([
257
+ MHASTP(in_dim,
258
+ layer_num=layer_num,
259
+ head_num=head_num,
260
+ d_s=d_s,
261
+ bottleneck_dim=bottleneck_dim) for i in range(query_num)
262
+ ])
263
+ self.query_num = query_num
264
+ self.in_dim = in_dim
265
+
266
+ def forward(self, input):
267
+ """
268
+ input: a 3-dimensional tensor in xvector architecture
269
+ or a 4-dimensional tensor in resnet architecture
270
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
271
+ """
272
+ if len(input.shape) == 4: # B x F x T
273
+ input = input.reshape(input.shape[0],
274
+ input.shape[1] * input.shape[2],
275
+ input.shape[3])
276
+ assert len(input.shape) == 3
277
+ res = []
278
+ for i, layer in enumerate(self.n_query):
279
+ res.append(layer(input))
280
+ out = torch.cat(res, dim=-1)
281
+ return out
282
+
283
+ def get_out_dim(self):
284
+ self.out_dim = self.in_dim * 2 * self.query_num
285
+ return self.out_dim
286
+
287
+
288
+ if __name__ == '__main__':
289
+ data = torch.randn(16, 512, 10, 35)
290
+ # model = StatisticsPooling()
291
+ model = MQMHASTP(512 * 10)
292
+ model = MHASTP(512 * 10)
293
+ model = MQMHASTP(512 * 10, context=False)
294
+ print(model)
295
+
296
+ out = model(data)
297
+ print(out.shape)
298
+ print(model.get_out_dim())
models/bicodec_tokenizer/modules/speaker/speaker_encoder.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ from typing import List, Tuple
20
+ from ..fsq.residual_fsq import ResidualFSQ
21
+ from .ecapa_tdnn import ECAPA_TDNN_GLOB_c512
22
+ from .perceiver_encoder import PerceiverResampler
23
+
24
+ """
25
+ x-vector + d-vector
26
+ """
27
+
28
+
29
+ class SpeakerEncoder(nn.Module):
30
+ """
31
+
32
+ Args:
33
+ input_dim (int): acoustic feature dimension
34
+ out_dim (int): output dimension of x-vector and d-vector
35
+ latent_dim (int): latent dimension before quantization
36
+ token_num (int): sequence length of speaker tokens
37
+ fsq_levels (List[int]): number of levels for each quantizer
38
+ fsq_num_quantizers (int): number of quantizers
39
+
40
+ Return:
41
+ speaker_embs: (B, T2, out_dim)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ input_dim: int = 100,
47
+ out_dim: int = 512,
48
+ latent_dim: int = 128,
49
+ token_num: int = 32,
50
+ fsq_levels: List[int] = [4, 4, 4, 4, 4, 4],
51
+ fsq_num_quantizers: int = 1,
52
+ ):
53
+ super(SpeakerEncoder, self).__init__()
54
+
55
+ self.speaker_encoder = ECAPA_TDNN_GLOB_c512(
56
+ feat_dim=input_dim, embed_dim=out_dim
57
+ )
58
+ self.perceiver_sampler = PerceiverResampler(
59
+ dim=latent_dim, dim_context=512 * 3, num_latents=token_num
60
+ )
61
+ self.quantizer = ResidualFSQ(
62
+ levels=fsq_levels,
63
+ num_quantizers=fsq_num_quantizers,
64
+ dim=latent_dim,
65
+ is_channel_first=True,
66
+ quantize_dropout=False,
67
+ )
68
+
69
+ self.project = nn.Linear(latent_dim * token_num, out_dim)
70
+
71
+ def get_codes_from_indices(self, indices: torch.Tensor) -> torch.Tensor:
72
+ zq = self.quantizer.get_codes_from_indices(indices.transpose(1, 2))
73
+ return zq.transpose(1, 2)
74
+
75
+ def get_indices(self, mels: torch.Tensor) -> torch.Tensor:
76
+ mels = mels.transpose(1, 2)
77
+ x = self.perceiver_sampler(mels).transpose(1, 2)
78
+ zq, indices = self.quantizer(x)
79
+ return indices
80
+
81
+ def forward(self, mels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
82
+ """
83
+ Args:
84
+ mels: (B, D_mel, T1)
85
+
86
+ Return:
87
+ x_vector: (B, out_dim)
88
+ d_vector: (B, out_dim)
89
+ """
90
+ # mels = mels.transpose(1,2)
91
+
92
+ x_vector, features = self.speaker_encoder(mels, True)
93
+ x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
94
+ zq, indices = self.quantizer(x)  # zq: (B, latent_dim, T2)
95
+ x = zq.reshape(zq.shape[0], -1)
96
+ d_vector = self.project(x)
97
+
98
+ return x_vector, d_vector
99
+
100
+ def tokenize(self, mels: torch.Tensor) -> torch.Tensor:
101
+ """tokenize the input mel spectrogram"""
102
+ _, features = self.speaker_encoder(mels, True)
103
+ x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
104
+ zq, indices = self.quantizer(x)
105
+ return indices
106
+
107
+ def detokenize(self, indices: torch.Tensor) -> torch.Tensor:
108
+ """detokenize the input indices to d-vector"""
109
+ zq = self.quantizer.get_output_from_indices(indices.transpose(1, 2)).transpose(1, 2)
110
+ x = zq.reshape(zq.shape[0], -1)
111
+ d_vector = self.project(x)
112
+ return d_vector
113
+
114
+ if __name__ == "__main__":
115
+ model = SpeakerEncoder(
116
+ input_dim=100,
117
+ latent_dim=128,
118
+ token_num=32,
119
+ fsq_levels=[4, 4, 4, 4, 4, 4],
120
+ fsq_num_quantizers=1,
121
+ )
122
+ mel = torch.randn(8, 200, 100)
123
+ x_vector, d_vector = model(mel)
124
+ print("x-vector shape", x_vector.shape)
125
+ print("d-vector shape", d_vector.shape)
126
+
127
+ indices = model.tokenize(mel)
128
+ print("indices shape", indices.shape)
129
+ d_vector_post = model.detokenize(indices)
130
+ print("d-vector shape", d_vector_post.shape)
131
+ if torch.allclose(d_vector_post, d_vector):
132
+ print("d-vector post and d-vector are the same")
133
+ else:
134
+ print("d-vector post and d-vector are different")
135
+ num_params = sum(param.numel() for param in model.parameters())
136
+ print("{} M".format(num_params / 1e6))
models/bicodec_tokenizer/modules/vq/factorized_vector_quantize.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Heavily based on https://github.com/lucidrains/vector-quantize-pytorch
17
+
18
+
19
+ from typing import Any, Dict
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from einops import rearrange
25
+ from torch.nn.utils import weight_norm
26
+
27
+
28
+ def WNConv1d(*args, **kwargs):
29
+ return weight_norm(nn.Conv1d(*args, **kwargs))
30
+
31
+
32
+ def ema_inplace(moving_avg, new, decay):
33
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
34
+
35
+
36
+ class FactorizedVectorQuantize(nn.Module):
37
+ def __init__(
38
+ self,
39
+ input_dim: int,
40
+ codebook_size: int,
41
+ codebook_dim: int,
42
+ commitment: float,
43
+ codebook_loss_weight: float = 1.0,
44
+ decay: float = 0.99,
45
+ threshold_ema_dead_code: float = 2,
46
+ momentum: float = 0.99,
47
+ **kwargs,
48
+ ):
49
+ super().__init__()
50
+ self.input_dim = input_dim
51
+ self.codebook_size = codebook_size
52
+ self.codebook_dim = codebook_dim
53
+ self.commitment = commitment
54
+ self.codebook_loss_weight = codebook_loss_weight
55
+ self.decay = decay
56
+ self.threshold_ema_dead_code = threshold_ema_dead_code
57
+ self.momentum = momentum
58
+
59
+ if input_dim != self.codebook_dim:
60
+ self.in_project = WNConv1d(input_dim, self.codebook_dim, kernel_size=1)
61
+ self.out_project = WNConv1d(self.codebook_dim, input_dim, kernel_size=1)
62
+
63
+ else:
64
+ self.in_project = nn.Identity()
65
+ self.out_project = nn.Identity()
66
+
67
+ self.codebook = nn.Embedding(self.codebook_size, self.codebook_dim)
68
+ self.register_buffer("cluster_size", torch.zeros(self.codebook_size))
69
+
70
+ def forward(self, z: torch.Tensor) -> Dict[str, Any]:
71
+ """Quantize the input tensor using a fixed codebook and return
72
+ the corresponding codebook vectors
73
+
74
+ Parameters
75
+ ----------
76
+ z : Tensor[B x D x T]
77
+
78
+ Returns
79
+ -------
80
+ Tensor[B x D x T]
81
+ Quantized continuous representation of input
82
+ Tensor[1]
83
+ Commitment loss to train encoder to predict vectors closer to codebook
84
+ entries
85
+ Tensor[1]
86
+ Codebook loss to update the codebook
87
+ Tensor[B x T]
88
+ Codebook indices (quantized discrete representation of input)
89
+ Tensor[B x D x T]
90
+ Projected latents (continuous representation of input before quantization)
91
+ """
92
+ # transpose since we use linear
93
+
94
+ # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
95
+ z_e = self.in_project(z)
96
+ z_q, indices, dists = self.decode_latents(z_e)
97
+
98
+ # track codebook usage statistics
99
+ embed_onehot = F.one_hot(indices, self.codebook_size).type(z_e.dtype)
100
+ avg_probs = torch.mean(embed_onehot.reshape(-1, self.codebook_size), dim=0)
101
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
102
+
103
+ active_num = (embed_onehot.sum(0).sum(0) > 0).sum()
104
+ if self.training:
105
+ # We do the expiry of code at that point as buffers are in sync
106
+ # and all the workers will take the same decision.
107
+ ema_inplace(self.cluster_size, embed_onehot.sum(0).sum(0), self.decay)
108
+ active_num = sum(self.cluster_size > self.threshold_ema_dead_code)
109
+
110
+ if self.training:
111
+ commit_loss = (
112
+ F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
113
+ * self.commitment
114
+ )
115
+
116
+ codebook_loss = (
117
+ F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
118
+ * self.codebook_loss_weight
119
+ )
120
+
121
+ else:
122
+ commit_loss = torch.zeros(0, device=z.device)
123
+ codebook_loss = torch.zeros(0, device=z.device)
124
+
125
+ z_q = (
126
+ z_e + (z_q - z_e).detach()
127
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
128
+
129
+ z_q = self.out_project(z_q)
130
+
131
+ vq_loss = (commit_loss + codebook_loss).mean()
132
+
133
+ return {
134
+ "z_q": z_q,
135
+ "indices": indices,
136
+ "dists": dists,
137
+ "vq_loss": vq_loss,
138
+ "perplexity": perplexity,
139
+ "active_num": active_num.float(),
140
+ }
141
+
142
+ def vq2emb(self, vq, out_proj=True):
143
+ emb = self.embed_code(vq)
144
+ if out_proj:
145
+ emb = self.out_project(emb)
146
+ return emb
147
+
148
+ def tokenize(self, z: torch.Tensor) -> torch.Tensor:
149
+ """tokenize the input tensor"""
150
+ z_e = self.in_project(z)
151
+ _, indices, _ = self.decode_latents(z_e)
152
+ return indices
153
+
154
+ def detokenize(self, indices):
155
+ """detokenize the input indices"""
156
+ z_q = self.decode_code(indices)
157
+ z_q = self.out_project(z_q)
158
+ return z_q
159
+
160
+ def get_emb(self):
161
+ return self.codebook.weight
162
+
163
+ def embed_code(self, embed_id):
164
+ return F.embedding(embed_id, self.codebook.weight)
165
+
166
+ def decode_code(self, embed_id):
167
+ return self.embed_code(embed_id).transpose(1, 2)
168
+
169
+ def decode_latents(self, latents):
170
+ encodings = rearrange(latents, "b d t -> (b t) d")
171
+ codebook = self.codebook.weight
172
+
173
+ # L2 normalize encodings and codebook
174
+ encodings = F.normalize(encodings)
175
+ codebook = F.normalize(codebook)
176
+
177
+ # Compute euclidean distance between encodings and codebook,
178
+ # with L2 normalization, the distance is equal to cosine distance
179
+ dist = (
180
+ encodings.pow(2).sum(1, keepdim=True)
181
+ - 2 * encodings @ codebook.t()
182
+ + codebook.pow(2).sum(1, keepdim=True).t()
183
+ )
184
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
185
+ z_q = self.decode_code(indices)
186
+
187
+ return z_q, indices, dist
models/bicodec_tokenizer/spark_detokenizer.py ADDED
@@ -0,0 +1,106 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2025/3/29 10:34
3
+ # Author :Hui Huang
4
+ import os
5
+ from typing import Literal
6
+
7
+ import torch
8
+ from .base_model import SparkBaseModel
9
+ from .batch_processor import AsyncBatchEngine
10
+ from .tokenizer_utils import get_dtype
11
+ from .modules.encoder_decoder.feat_decoder import Decoder
12
+ from .modules.encoder_decoder.wave_generator import WaveGenerator
13
+ from .modules.speaker.speaker_encoder import SpeakerEncoder
14
+ from .modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
15
+
16
+ __all__ = ["SparkDeTokenizer"]
17
+
18
+
19
+ class SparkDeTokenizerModel(SparkBaseModel):
20
+ def __init__(self, config):
21
+ super().__init__()
22
+
23
+ self.quantizer = FactorizedVectorQuantize(**config["quantizer"])
24
+ self.prenet = Decoder(**config["prenet"])
25
+ self.decoder = WaveGenerator(**config["decoder"])
26
+ self.speaker_encoder = SpeakerEncoder(**config["speaker_encoder"])
27
+
28
+ @torch.no_grad()
29
+ def forward(
30
+ self,
31
+ semantic_tokens: torch.Tensor,
32
+ global_tokens: torch.Tensor
33
+ ) -> torch.Tensor:
34
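+ # Semantic tokens -> quantized latents; global tokens -> speaker d-vector.
+ # The prenet conditions the latents on the d-vector, which is also added back before the wave generator reconstructs the waveform.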
+ z_q = self.quantizer.detokenize(semantic_tokens)
35
+ d_vector = self.speaker_encoder.detokenize(global_tokens)
36
+ x = self.prenet(z_q, d_vector)
37
+ x = x + d_vector.unsqueeze(-1)
38
+ wav_recon = self.decoder(x)
39
+ return wav_recon.detach()
40
+
41
+
42
+ class SparkDeTokenizer:
43
+ def __init__(
44
+ self,
45
+ model_path: str,
46
+ device: Literal["cpu", "cuda", "mps"] | str = "cpu",
47
+ batch_size: int = 32,
48
+ wait_timeout: float = 0.01):
49
+ self.device = torch.device(device)
50
+ self.model = SparkDeTokenizerModel.from_pretrained(model_path).to(self.device)
51
+ self.device_type = device
52
+ self.dtype = get_dtype(self.device_type)
53
+ self._batch_processor = AsyncBatchEngine(
54
+ processing_function=self.batch_detokenize_async,
55
+ batch_size=batch_size,
56
+ wait_timeout=wait_timeout
57
+ )
58
+
59
+ @torch.no_grad()
60
+ def detokenize(
61
+ self,
62
+ semantic_tokens: torch.Tensor,
63
+ global_tokens: torch.Tensor
64
+ ) -> torch.Tensor:
65
+ with torch.amp.autocast(self.device_type, dtype=self.dtype):
66
+ output = self.model(
67
+ semantic_tokens.to(self.device),
68
+ global_tokens.to(self.device)
69
+ )
70
+ return output
71
+
72
+ async def batch_detokenize_async(self, requests: list[dict[str, torch.Tensor]]) -> list[dict[str, torch.Tensor]]:
73
+ semantic_tokens, global_tokens = [], []
74
+ lengths = []
75
+ for request in requests:
76
+ semantic_tokens.append(request["semantic_tokens"])
77
+ global_tokens.append(request["global_tokens"])
78
+ lengths.append(len(request['semantic_tokens']))
79
+ # Concatenate tokens for batch processing
80
+ global_tokens = torch.stack(global_tokens, dim=0)
81
+ semantic_tokens = torch.nn.utils.rnn.pad_sequence(
82
+ semantic_tokens, batch_first=True, padding_value=0
83
+ )
84
+ # print(f"tokenizer global_tokens shape is {global_tokens.shape}")
85
+ # print(f"tokenizer semantic_tokens shape is {semantic_tokens.shape}")
86
+ audios = self.detokenize(
87
+ semantic_tokens=semantic_tokens,
88
+ global_tokens=global_tokens
89
+ ).detach().cpu()
90
+ # Prepare responses
91
+ responses = []
92
+ for i in range(len(requests)):
93
+ audio = audios[i, :, :(lengths[i] * 320)]  # roughly one semantic token corresponds to 320 audio samples
94
+ responses.append({
95
+ "audio": audio,
96
+ })
97
+
98
+ if self.device.type == "cuda":
99
+ torch.cuda.empty_cache()
100
+ return responses
101
+
102
+ async def detokenize_async(self, request: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
103
+ output = await self._batch_processor.add_request(
104
+ single_input=request
105
+ )
106
+ return output.get("feature")
models/bicodec_tokenizer/spark_tokenizer.py ADDED
@@ -0,0 +1,244 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2025/3/29 10:30
3
+ # Author :Hui Huang
4
+ import os
5
+ from typing import Literal, Optional, Tuple, Dict, Any, List, Union
6
+
7
+ import torch
8
+ import torchaudio
9
+ import torchaudio.transforms as TT
10
+ from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
11
+ import numpy as np
12
+ from loguru import logger
13
+ from pathlib import Path
14
+
15
+ # ----------------- These modules are assumed to live under your project path -----------------
16
+ from .utils.file import load_config
17
+ from .utils.audio import load_audio
18
+ from .models.bicodec import BiCodec
19
+ from .base_model import SparkBaseModel
20
+ from .batch_processor import AsyncBatchEngine
21
+ # ---------------------------------------------------------------
22
+
23
+ __all__ = ["SparkTokenizer"]
24
+
25
+
26
+ class SparkTokenizer:
27
+ def __init__(
28
+ self,
29
+ model_path: str,
30
+ device: Literal["cpu", "cuda", "mps"] | str = "cuda",
31
+ attn_implementation: Optional[Literal["sdpa", "flash_attention_2", "eager"]] = "eager",
32
+ batch_size: int = 32,
33
+ wait_timeout: float = 0.01,
34
+ ):
35
+ self.device = torch.device(device)
36
+ self.model_dir = Path(model_path)
37
+
38
+ # 1. Load the configuration
39
+ self.config = load_config(self.model_dir / "config.yaml")
40
+ self.device_type = "cuda" if "cuda" in str(device) else "cpu"
41
+ self.dtype = torch.float16 if self.device_type == "cuda" else torch.float32
42
+ self.target_sample_rate = self.config.get("sample_rate", 16000)
43
+
44
+ # 2. Load the models
45
+ wav2vec_path = self.model_dir / "wav2vec2-large-xlsr-53"
46
+ self.processor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_path)
47
+ self.feature_extractor = Wav2Vec2Model.from_pretrained(
48
+ wav2vec_path,
49
+ attn_implementation=attn_implementation,
50
+ torch_dtype=self.dtype
51
+ )
52
+ self.feature_extractor.config.output_hidden_states = True
53
+ self.feature_extractor.to(self.device)
54
+ self.feature_extractor.eval()
55
+
56
+ # BiCodec model
57
+ self.model = (
58
+ BiCodec.load_from_checkpoint(str(self.model_dir)).to(self.device).half()
59
+ )
60
+ self.model.eval()
61
+
62
+ # Asynchronous batch-processing engine
63
+ self._batch_processor = AsyncBatchEngine(
64
+ processing_function=self.batch_tokenize_async,
65
+ batch_size=batch_size,
66
+ wait_timeout=wait_timeout
67
+ )
68
+
69
+ def _to_ndarray(self, audio_input: Union[str, Path, torch.Tensor]) -> np.ndarray:
70
+ """
71
+ Convert the input (path or Tensor) into a numpy array at the target sampling rate.
72
+ """
73
+ if isinstance(audio_input, (str, Path)):
74
+ # If it is a path, use the existing load_audio helper
75
+ wav = load_audio(
76
+ str(audio_input),
77
+ sampling_rate=self.target_sample_rate,
78
+ volume_normalize=self.config.get("volume_normalize", True),
79
+ )
80
+ elif isinstance(audio_input, torch.Tensor):
81
+ # If it is a Tensor
82
+ wav = audio_input.detach().cpu().float()
83
+
84
+ # Collapse channels: [C, T] -> [T]
85
+ if wav.ndim > 1:
86
+ wav = torch.mean(wav, dim=0)
87
+
88
+ # The input Tensor is assumed to already be sampled at self.target_sample_rate;
89
+ # resampling here would require an extra argument for the input sampling rate.
90
+ wav = wav.numpy()
91
+
92
+ # Optional: volume normalization (in case the Tensor is not normalized)
93
+ if self.config.get("volume_normalize", True):
94
+ max_val = np.abs(wav).max()
95
+ if max_val > 0:
96
+ wav = wav / max_val * 0.9
97
+ else:
98
+ raise ValueError(f"Unsupported audio type: {type(audio_input)}")
99
+
100
+ return wav
101
+
102
+ def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
103
+ """Get the reference audio clip."""
104
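+ # Reference clip length: ref_segment_duration seconds, rounded down to a multiple of latent_hop_length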
+ ref_segment_length = (
105
+ int(self.target_sample_rate * self.config["ref_segment_duration"])
106
+ // self.config["latent_hop_length"]
107
+ * self.config["latent_hop_length"]
108
+ )
109
+ wav_length = len(wav)
110
+
111
+ if ref_segment_length > wav_length:
112
+ wav = np.tile(wav, ref_segment_length // wav_length + 1)
113
+
114
+ return wav[:ref_segment_length]
115
+
116
+ def process_audio(self, audio_input: Union[str, torch.Tensor], ref_audio_input: Union[str, torch.Tensor] = None) -> Tuple[np.ndarray, torch.Tensor]:
117
+ """
118
+ Process the main audio and the reference audio.
119
+ """
120
+ wav = self._to_ndarray(audio_input)
121
+
122
+ if ref_audio_input is None:
123
+ wav_ref_np = self.get_ref_clip(wav)
124
+ else:
125
+ ref_wav = self._to_ndarray(ref_audio_input)
126
+ wav_ref_np = self.get_ref_clip(ref_wav)
127
+
128
+ wav_ref = torch.from_numpy(wav_ref_np).unsqueeze(0).float()
129
+ return wav, wav_ref
130
+
131
+ def extract_wav2vec2_features(self, wavs: torch.Tensor) -> torch.Tensor:
132
+ """Extract wav2vec2 features."""
133
+ # the processor expects a list of numpy arrays
134
+ inputs = self.processor(
135
+ [w.cpu().numpy() for w in wavs],
136
+ sampling_rate=16000,
137
+ return_tensors="pt",
138
+ padding=True,
139
+ ).input_values
140
+
141
+ with torch.no_grad():
142
+ with torch.amp.autocast(self.device_type, dtype=self.dtype):
143
+ feat = self.feature_extractor(inputs.to(self.feature_extractor.device))
144
+
145
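+ # Average three intermediate wav2vec2 hidden layers (11, 14, 16) to form the semantic feature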
+ feats_mix = (
146
+ feat.hidden_states[11] + feat.hidden_states[14] + feat.hidden_states[16]
147
+ ) / 3
148
+
149
+ return feats_mix
150
+
151
+ @torch.no_grad()
152
+ def tokenize(self, audios: List[Union[str, torch.Tensor]]):
153
+ """
154
+ Accepts a list of audio paths or a list of Tensors.
155
+ """
156
+ batch_wavs = []
157
+ batch_ref_wavs = []
158
+
159
+ for audio_item in audios:
160
+ wav, wav_ref = self.process_audio(audio_input=audio_item, ref_audio_input=audio_item)
161
+ batch_wavs.append(torch.from_numpy(wav).float())
162
+ batch_ref_wavs.append(wav_ref.squeeze(0))
163
+
164
+ # Padding wavs
165
+ wav_lengths = [len(w) for w in batch_wavs]
166
+ max_wav_len = max(wav_lengths)
167
+ padded_wavs = torch.zeros(len(batch_wavs), max_wav_len, dtype=self.dtype).to(self.device)
168
+ for i, w in enumerate(batch_wavs):
169
+ padded_wavs[i, :len(w)] = w.to(self.dtype)
170
+
171
+ # Padding ref_wavs
172
+ ref_lengths = [len(w) for w in batch_ref_wavs]
173
+ max_ref_len = max(ref_lengths)
174
+ padded_ref_wavs = torch.zeros(len(batch_ref_wavs), max_ref_len, dtype=self.dtype).to(self.device)
175
+ for i, w in enumerate(batch_ref_wavs):
176
+ padded_ref_wavs[i, :len(w)] = w.to(self.dtype)
177
+
178
+ # Extract features
179
+ feats = self.extract_wav2vec2_features(padded_wavs)
180
+
181
+ batch = {
182
+ "wav": padded_wavs,
183
+ "ref_wav": padded_ref_wavs,
184
+ "feat": feats,
185
+ }
186
+
187
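+ # BiCodec returns frame-level semantic tokens plus a fixed-length set of global speaker tokens per utterance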
+ semantic_tokens, global_tokens = self.model.tokenize(batch)
188
+
189
+ if self.device.type == "cuda":
190
+ torch.cuda.empty_cache()
191
+
192
+ return {"semantic_tokens": semantic_tokens, "global_tokens": global_tokens}
193
+
194
+ async def batch_tokenize_async(self, audios: list) -> list[dict[str, torch.Tensor]]:
195
+ tokenized = self.tokenize(audios)
196
+ responses = []
197
+ for i in range(len(audios)):
198
+ responses.append({
199
+ "global_tokens": tokenized["global_tokens"][i],
200
+ "semantic_tokens": tokenized["semantic_tokens"][i]
201
+ })
202
+ return responses
203
+
204
+ async def tokenize_async(self, audio: Union[str, torch.Tensor]) -> dict[str, torch.Tensor]:
205
+ output = await self._batch_processor.add_request(
206
+ single_input=audio
207
+ )
208
+ return output
209
+
210
+ # ------------------------------------------------------------------
211
+ # Test cases
212
+ # ------------------------------------------------------------------
213
+ if __name__ == "__main__":
214
+ # Configure your model path here
215
+ MODEL_DIR = "/data/yumu/model/ark_tts_v1"
216
+
217
+ # Initialization
218
+ # Note: without a real model directory this line will fail with a file-not-found error; run it where the model files exist
219
+ tokenizer = SparkTokenizer(model_path=MODEL_DIR, device="cuda" if torch.cuda.is_available() else "cpu")
220
+
221
+ # Prepare data: one existing local wav path and one constructed Tensor
222
+ dummy_wav_path = "/data/yumu/arktts/dufu.wav"
223
+ # Build a 2-second 16 kHz audio Tensor (assuming the model expects 16 kHz)
224
+ import torchaudio
225
+ dummy_tensor, sr = torchaudio.load(dummy_wav_path)
226
+
227
+ # 1. Test path input
228
+ print("Testing path input...")
229
+ if os.path.exists(dummy_wav_path):
230
+ res1 = tokenizer.tokenize([dummy_wav_path])
231
+ print(f"Path results: {res1['semantic_tokens'].shape}")
232
+
233
+ # 2. Test Tensor input
234
+ print("Testing tensor input...")
235
+ res2 = tokenizer.tokenize([dummy_tensor])
236
+ print(f"Tensor results: {res2['semantic_tokens'].shape}")
237
+
238
+ # 3. Test mixed input (a list containing both str and Tensor)
239
+ print("Testing mixed input...")
240
+ # For the demo, pass the same tensor twice
241
+ res3 = tokenizer.tokenize([dummy_tensor, dummy_tensor])
242
+ print(f"Mixed results: {res3['semantic_tokens'].shape}")
243
+
244
+ print("All tests passed!")
models/bicodec_tokenizer/tokenizer_utils.py ADDED
@@ -0,0 +1,44 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2025/3/29 10:27
3
+ # Author :Hui Huang
4
+ from omegaconf import OmegaConf, DictConfig
5
+ import torch
6
+
7
+
8
+ def load_config(config_path: str) -> DictConfig:
9
+ """Loads a configuration file and optionally merges it with a base configuration.
10
+
11
+ Args:
12
+ config_path (Path): Path to the configuration file.
13
+ """
14
+ # Load the initial configuration from the given path
15
+ config = OmegaConf.load(config_path)
16
+
17
+ # Check if there is a base configuration specified and merge if necessary
18
+ if config.get("base_config", None) is not None:
19
+ base_config = OmegaConf.load(config["base_config"])
20
+ config = OmegaConf.merge(base_config, config)
21
+
22
+ return config
23
+
24
+
25
+ def gpu_supports_fp16() -> bool:
26
+ # 1. Make sure CUDA is available
27
+ if not torch.cuda.is_available():
28
+ return False
29
+
30
+ # 2. Get the device's compute capability
31
+ major, minor = torch.cuda.get_device_capability()
32
+
33
+ # 3. Check whether it is >= 5.3 (fp16 support)
34
+ if major > 5 or (major == 5 and minor >= 3):
35
+ return True
36
+ else:
37
+ return False
38
+
39
+
40
+ def get_dtype(device: str):
41
+ if device.startswith('cuda') and gpu_supports_fp16():
42
+ return torch.float16
43
+ else:
44
+ return torch.float32
models/bicodec_tokenizer/utils/__init__.py ADDED
File without changes
models/bicodec_tokenizer/utils/audio.py ADDED
@@ -0,0 +1,271 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Description:
17
+ This script contains a collection of functions designed to handle various
18
+ audio processing tasks.
19
+ """
20
+
21
+ import random
22
+ import soxr
23
+ import soundfile
24
+ import torch
25
+ import torchaudio
26
+ import numpy as np
27
+
28
+ from pathlib import Path
29
+ from typing import Tuple
30
+ from numpy.lib.stride_tricks import sliding_window_view
31
+
32
+
33
+ def audio_volume_normalize(audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
34
+ """
35
+ Normalize the volume of an audio signal.
36
+
37
+ Parameters:
38
+ audio (numpy array): Input audio signal array.
39
+ coeff (float): Target coefficient for normalization, default is 0.2.
40
+
41
+ Returns:
42
+ numpy array: The volume-normalized audio signal.
43
+ """
44
+ # Sort the absolute values of the audio signal
45
+ temp = np.sort(np.abs(audio))
46
+
47
+ # If the maximum value is less than 0.1, scale the array to have a maximum of 0.1
48
+ if temp[-1] < 0.1:
49
+ scaling_factor = max(
50
+ temp[-1], 1e-3
51
+ ) # Prevent division by zero with a small constant
52
+ audio = audio / scaling_factor * 0.1
53
+
54
+ # Filter out values less than 0.01 from temp
55
+ temp = temp[temp > 0.01]
56
+ L = temp.shape[0] # Length of the filtered array
57
+
58
+ # If there are fewer than or equal to 10 significant values, return the audio without further processing
59
+ if L <= 10:
60
+ return audio
61
+
62
+ # Compute the average of the top 10% to 1% of values in temp
63
+ volume = np.mean(temp[int(0.9 * L) : int(0.99 * L)])
64
+
65
+ # Normalize the audio to the target coefficient level, clamping the scale factor between 0.1 and 10
66
+ audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)
67
+
68
+ # Ensure the maximum absolute value in the audio does not exceed 1
69
+ max_value = np.max(np.abs(audio))
70
+ if max_value > 1:
71
+ audio = audio / max_value
72
+
73
+ return audio
74
+
75
+
76
+ def load_audio(
77
+ adfile: Path,
78
+ sampling_rate: int = None,
79
+ length: int = None,
80
+ volume_normalize: bool = False,
81
+ segment_duration: int = None,
82
+ ) -> np.ndarray:
83
+ r"""Load audio file with target sampling rate and lsength
84
+
85
+ Args:
86
+ adfile (Path): path to audio file.
87
+ sampling_rate (int, optional): target sampling rate. Defaults to None.
88
+ length (int, optional): target audio length. Defaults to None.
89
+ volume_normalize (bool, optional): whether perform volume normalization. Defaults to False.
90
+ segment_duration (int, optional): randomly select a segment of {segment_duration} seconds.
91
+ Defaults to None, which means the whole audio is used.
92
+
93
+ Returns:
94
+ audio (np.ndarray): audio
95
+ """
96
+
97
+ audio, sr = soundfile.read(adfile)
98
+ if len(audio.shape) > 1:
99
+ audio = audio[:, 0]
100
+
101
+ if sampling_rate is not None and sr != sampling_rate:
102
+ audio = soxr.resample(audio, sr, sampling_rate, quality="VHQ")
103
+ sr = sampling_rate
104
+
105
+ if segment_duration is not None:
106
+ seg_length = int(sr * segment_duration)
107
+ audio = random_select_audio_segment(audio, seg_length)
108
+
109
+ # Audio volume normalize
110
+ if volume_normalize:
111
+ audio = audio_volume_normalize(audio)
112
+ # check the audio length
113
+ if length is not None:
114
+ assert abs(audio.shape[0] - length) < 1000
115
+ if audio.shape[0] > length:
116
+ audio = audio[:length]
117
+ else:
118
+ audio = np.pad(audio, (0, int(length - audio.shape[0])))
119
+ return audio
120
+
121
+
122
+ def random_select_audio_segment(audio: np.ndarray, length: int) -> np.ndarray:
123
+ """get an audio segment given the length
124
+
125
+ Args:
126
+ audio (np.ndarray):
127
+ length (int): audio length = sampling_rate * duration
128
+ """
129
+ if audio.shape[0] < length:
130
+ audio = np.pad(audio, (0, int(length - audio.shape[0])))
131
+ start_index = random.randint(0, audio.shape[0] - length)
132
+ end_index = int(start_index + length)
133
+
134
+ return audio[start_index:end_index]
135
+
136
+
137
+ def audio_highpass_filter(audio, sample_rate, highpass_cutoff_freq):
138
+ """apply highpass fileter to audio
139
+
140
+ Args:
141
+ audio (np.ndarray):
142
+ sample_rate (int):
143
+ highpass_cutoff_freq (int):
144
+ """
145
+
146
+ audio = torchaudio.functional.highpass_biquad(
147
+ torch.from_numpy(audio), sample_rate, cutoff_freq=highpass_cutoff_freq
148
+ )
149
+ return audio.numpy()
150
+
151
+
152
+ def stft(
153
+ x: torch.Tensor,
154
+ fft_size: int,
155
+ hop_size: int,
156
+ win_length: int,
157
+ window: torch.Tensor,
158
+ use_complex: bool = False,
159
+ ) -> torch.Tensor:
160
+ """Perform STFT and convert to magnitude spectrogram.
161
+ Args:
162
+ x (Tensor): Input signal tensor (B, T).
163
+ fft_size (int): FFT size.
164
+ hop_size (int): Hop size.
165
+ win_length (int): Window length.
166
+ window (Tensor): Window tensor (moved to the input's device before the STFT).
167
+ Returns:
168
+ Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
169
+ """
170
+
171
+ x_stft = torch.stft(
172
+ x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True
173
+ )
174
+
175
+ # clamp is needed to avoid nan or inf
176
+ if not use_complex:
177
+ return torch.sqrt(
178
+ torch.clamp(x_stft.real**2 + x_stft.imag**2, min=1e-7, max=1e3)
179
+ ).transpose(2, 1)
180
+ else:
181
+ res = torch.cat([x_stft.real.unsqueeze(1), x_stft.imag.unsqueeze(1)], dim=1)
182
+ res = res.transpose(2, 3) # [B, 2, T, F]
183
+ return res
184
+
185
+
186
+ def detect_speech_boundaries(
187
+ wav: np.ndarray,
188
+ sample_rate: int,
189
+ window_duration: float = 0.1,
190
+ energy_threshold: float = 0.01,
191
+ margin_factor: int = 2
192
+ ) -> Tuple[int, int]:
193
+ """Detect the start and end points of speech in an audio signal using RMS energy.
194
+
195
+ Args:
196
+ wav: Input audio signal array with values in [-1, 1]
197
+ sample_rate: Audio sample rate in Hz
198
+ window_duration: Duration of detection window in seconds
199
+ energy_threshold: RMS energy threshold for speech detection
200
+ margin_factor: Factor to determine extra margin around detected boundaries
201
+
202
+ Returns:
203
+ tuple: (start_index, end_index) of speech segment
204
+
205
+ Raises:
206
+ ValueError: If the audio contains only silence
207
+ """
208
+ window_size = int(window_duration * sample_rate)
209
+ margin = margin_factor * window_size
210
+ step_size = window_size // 10
211
+
212
+ # Create sliding windows using stride tricks to avoid loops
213
+ windows = sliding_window_view(wav, window_size)[::step_size]
214
+
215
+ # Calculate RMS energy for each window
216
+ energy = np.sqrt(np.mean(windows ** 2, axis=1))
217
+ speech_mask = energy >= energy_threshold
218
+
219
+ if not np.any(speech_mask):
220
+ raise ValueError("No speech detected in audio (only silence)")
221
+
222
+ start = max(0, np.argmax(speech_mask) * step_size - margin)
223
+ end = min(len(wav), (len(speech_mask) - 1 - np.argmax(speech_mask[::-1])) * step_size + margin)
224
+
225
+ return start, end
226
+
227
+
228
+ def remove_silence_on_both_ends(
229
+ wav: np.ndarray,
230
+ sample_rate: int,
231
+ window_duration: float = 0.1,
232
+ volume_threshold: float = 0.01
233
+ ) -> np.ndarray:
234
+ """Remove silence from both ends of an audio signal.
235
+
236
+ Args:
237
+ wav: Input audio signal array
238
+ sample_rate: Audio sample rate in Hz
239
+ window_duration: Duration of detection window in seconds
240
+ volume_threshold: Amplitude threshold for silence detection
241
+
242
+ Returns:
243
+ np.ndarray: Audio signal with silence removed from both ends
244
+
245
+ Raises:
246
+ ValueError: If the audio contains only silence
247
+ """
248
+ start, end = detect_speech_boundaries(
249
+ wav,
250
+ sample_rate,
251
+ window_duration,
252
+ volume_threshold
253
+ )
254
+ return wav[start:end]
255
+
256
+
257
+
258
+ def hertz_to_mel(pitch: float) -> float:
259
+ """
260
+ Converts a frequency from the Hertz scale to the Mel scale.
261
+
262
+ Parameters:
263
+ - pitch: float or ndarray
264
+ Frequency in Hertz.
265
+
266
+ Returns:
267
+ - mel: float or ndarray
268
+ Frequency in Mel scale.
269
+ """
270
+ mel = 2595 * np.log10(1 + pitch / 700)
271
+ return mel
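A minimal usage sketch for the helpers above (not part of the committed file; the file path is a placeholder and the functions are assumed importable from this module):

    wav = load_audio("example.wav", sampling_rate=16000, volume_normalize=True)
    trimmed = remove_silence_on_both_ends(wav, sample_rate=16000)  # raises ValueError on pure silence
    print(len(wav), len(trimmed), hertz_to_mel(440.0))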
models/bicodec_tokenizer/utils/file.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang (w.xinshawn@gmail.com)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Description:
17
+ This script contains a collection of functions designed to handle various
18
+ file reading and writing operations. It provides utilities to read from files,
19
+ write data to files, and perform file manipulation tasks.
20
+ """
21
+
22
+
23
+ import os
24
+ import json
26
+ import csv
27
+
28
+ from tqdm import tqdm
29
+ from typing import List, Dict, Any, Set, Union
30
+ from pathlib import Path
31
+ from omegaconf import OmegaConf, DictConfig
32
+
33
+
34
+ def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
35
+ """
36
+ Resolves the absolute path of a symbolic link.
37
+
38
+ Args:
39
+ symbolic_link_path (Path): The path to the symbolic link.
40
+
41
+ Returns:
42
+ Path: The absolute path that the symbolic link points to.
43
+ """
44
+
45
+ link_directory = os.path.dirname(symbolic_link_path)
46
+ target_path_relative = os.readlink(symbolic_link_path)
47
+ return os.path.join(link_directory, target_path_relative)
48
+
49
+
50
+ def write_jsonl(metadata: List[dict], file_path: Path) -> None:
51
+ """Writes a list of dictionaries to a JSONL file.
52
+
53
+ Args:
54
+ metadata : List[dict]
55
+ A list of dictionaries, each representing a piece of meta.
56
+ file_path : Path
57
+ The file path to save the JSONL file
58
+
59
+ This function writes each dictionary in the list to a new line in the specified file.
60
+ """
61
+ with open(file_path, "w", encoding="utf-8") as f:
62
+ for meta in tqdm(metadata, desc="writing jsonl"):
63
+ # Convert dictionary to JSON string and write it to the file with a newline
64
+ json_str = json.dumps(meta, ensure_ascii=False) + "\n"
65
+ f.write(json_str)
66
+ print(f"jsonl saved to {file_path}")
67
+
68
+
69
+ def read_jsonl(file_path: Path) -> List[dict]:
70
+ """
71
+ Reads a JSONL file and returns a list of dictionaries.
72
+
73
+ Args:
74
+ file_path : Path
75
+ The path to the JSONL file to be read.
76
+
77
+ Returns:
78
+ List[dict]
79
+ A list of dictionaries parsed from each line of the JSONL file.
80
+ """
81
+ metadata = []
82
+ # Open the file for reading
83
+ with open(file_path, "r", encoding="utf-8") as f:
84
+ # Split the file into lines
85
+ lines = f.read().splitlines()
86
+ # Process each line
87
+ for line in lines:
88
+ # Convert JSON string back to dictionary and append to list
89
+ meta = json.loads(line)
90
+ metadata.append(meta)
91
+ # Return the list of metadata
92
+ return metadata
93
+
94
+ def read_json_as_jsonl(file_path: Path) -> List[dict]:
95
+ metadata = []
96
+ with open(file_path, 'r', encoding='utf-8') as infile:
97
+ data = json.load(infile)
98
+ for k in sorted(data.keys()):
99
+ meta = {'index': k}
100
+ meta.update(data[k])
101
+ metadata.append(meta)
102
+ return metadata
103
+
104
+
105
+
106
+ def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
107
+ processed_meta = {}
108
+ for k, v in meta.items():
109
+ if isinstance(v, str):
110
+ processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
111
+ else:
112
+ processed_meta[k] = v
113
+ return processed_meta
114
+
115
+
116
+ def load_config(config_path: Path) -> DictConfig:
117
+ """Loads a configuration file and optionally merges it with a base configuration.
118
+
119
+ Args:
120
+ config_path (Path): Path to the configuration file.
121
+ """
122
+ # Load the initial configuration from the given path
123
+ config = OmegaConf.load(config_path)
124
+
125
+ # Check if there is a base configuration specified and merge if necessary
126
+ if config.get("base_config", None) is not None:
127
+ base_config = OmegaConf.load(config["base_config"])
128
+ config = OmegaConf.merge(base_config, config)
129
+
130
+ return config
131
+
132
+
133
+
134
+ def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
135
+ """
136
+ Converts a JSONL file to a CSV file.
137
+
138
+ This function reads a JSONL file, determines all unique keys present in the file,
139
+ and writes the data to a CSV file with columns for all these keys.
140
+ """
141
+
142
+ all_keys = set()
143
+ data_rows = []
144
+
145
+ # Read the JSONL file once to extract keys and collect data
146
+ with open(jsonl_file_path, 'r') as file:
147
+ for line in file:
148
+ data = json.loads(line.strip())
149
+ data_rows.append(data)
150
+ all_keys.update(data.keys())
151
+
152
+ # Convert the set of keys to a sorted list for consistent column order
153
+ sorted_keys = sorted(all_keys)
154
+
155
+ # Write the data to a CSV file
156
+ with open(csv_file_path, 'w', newline='') as csvfile:
157
+ writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
158
+
159
+ # Write the header row
160
+ writer.writeheader()
161
+
162
+ # Write each row of data
163
+ for data in data_rows:
164
+ writer.writerow(data)
165
+
166
+ print(f"CSV file has been created at {csv_file_path}")
167
+
168
+
169
+ def save_metadata(data, filename, headers=None):
170
+ """
171
+ Save metadata to a file.
172
+
173
+ Args:
174
+ data (list of dict): Metadata to be saved.
175
+ filename (str): Name of the file to save the metadata.
176
+ headers (list of str): The order of column names to be saved; defaults to the keys from the first dictionary in data if not provided.
177
+ """
178
+ # Set headers to keys from the first dictionary in data if not explicitly provided
179
+ if headers is None:
180
+ headers = list(data[0].keys())
181
+
182
+ with open(filename, "w", encoding="utf-8") as file:
183
+ # Write the headers to the file
184
+ file.write("|".join(headers) + "\n")
185
+ for entry in data:
186
+ # Retrieve values in the order of headers, replacing any '|' characters with a space to prevent formatting errors
187
+ formatted_values = [str(entry.get(key, "")).replace("|", " ") for key in headers]
188
+ # Write the formatted values to the file
189
+ file.write("|".join(formatted_values) + "\n")
190
+
191
+
192
+ def read_metadata(filename, headers=None):
193
+ """
194
+ Read metadata from a file.
195
+
196
+ Args:
197
+ filename (str): The file from which to read the metadata.
198
+
199
+ Returns:
200
+ list of dict: The metadata read from the file.
201
+ list of str: The headers used in the file.
202
+ """
203
+ with open(filename, "r", encoding="utf-8") as file:
204
+ lines = file.readlines()
205
+
206
+ data = []
207
+ # Set headers from the first line of the file if not provided
208
+ if headers is None:
209
+ headers = lines[0].strip().split("|")
210
+ lines = lines[1:]
211
+
212
+ for line in lines:
213
+ line = line.strip()
214
+ # Skip empty lines
215
+ if not line:
216
+ continue
217
+ # Split the line by '|' and pair with headers to form a dictionary
218
+ entry_data = dict(zip(headers, line.split("|")))
219
+ data.append(entry_data)
220
+
221
+ return data, headers
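A minimal usage sketch for the JSONL and pipe-separated metadata helpers above (not part of the committed file; paths are placeholders):

    records = [{"id": "utt_0001", "text": "hello"}, {"id": "utt_0002", "text": "world"}]
    write_jsonl(records, "meta.jsonl")
    assert read_jsonl("meta.jsonl") == records

    save_metadata(records, "meta.psv")            # '|'-separated rows, header on the first line
    rows, headers = read_metadata("meta.psv")
    print(headers, rows[0])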
models/bicodec_tokenizer/utils/parse_options.sh ADDED
@@ -0,0 +1,97 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
4
+ # Arnab Ghoshal, Karel Vesely
5
+
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13
+ # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14
+ # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15
+ # MERCHANTABILITY OR NON-INFRINGEMENT.
16
+ # See the Apache 2 License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+
20
+ # Parse command-line options.
21
+ # To be sourced by another script (as in ". parse_options.sh").
22
+ # Option format is: --option-name arg
23
+ # and shell variable "option_name" gets set to value "arg."
24
+ # The exception is --help, which takes no arguments, but prints the
25
+ # $help_message variable (if defined).
26
+
27
+
28
+ ###
29
+ ### The --config file options have lower priority to command line
30
+ ### options, so we need to import them first...
31
+ ###
32
+
33
+ # Now import all the configs specified by command-line, in left-to-right order
34
+ # for ((argpos=1; argpos<$#; argpos++)); do
35
+ # if [ "${!argpos}" == "--config" ]; then
36
+ # argpos_plus1=$((argpos+1))
37
+ # config=${!argpos_plus1}
38
+ # [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39
+ # . $config # source the config file.
40
+ # fi
41
+ # done
42
+
43
+
44
+ ###
45
+ ### Now we process the command line options
46
+ ###
47
+ while true; do
48
+ [ -z "${1:-}" ] && break; # break if there are no arguments
49
+ case "$1" in
50
+ # If the enclosing script is called with --help option, print the help
51
+ # message and exit. Scripts should put help messages in $help_message
52
+ --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53
+ else printf "$help_message\n" 1>&2 ; fi;
54
+ exit 0 ;;
55
+ --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56
+ exit 1 ;;
57
+ # If the first command-line argument begins with "--" (e.g. --foo-bar),
58
+ # then work out the variable name as $name, which will equal "foo_bar".
59
+ --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
60
+ # Next we test whether the variable in question is undefined -- if so it's
61
+ # an invalid option and we die. Note: $0 evaluates to the name of the
62
+ # enclosing script.
63
+ # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64
+ # is undefined. We then have to wrap this test inside "eval" because
65
+ # foo_bar is itself inside a variable ($name).
66
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67
+
68
+ oldval="`eval echo \\$$name`";
69
+ # Work out whether we seem to be expecting a Boolean argument.
70
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71
+ was_bool=true;
72
+ else
73
+ was_bool=false;
74
+ fi
75
+
76
+ # Set the variable to the right value-- the escaped quotes make it work if
77
+ # the option had spaces, like --cmd "queue.pl -sync y"
78
+ eval $name=\"$2\";
79
+
80
+ # Check that Boolean-valued arguments are really Boolean.
81
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
82
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
83
+ exit 1;
84
+ fi
85
+ shift 2;
86
+ ;;
87
+ *) break;
88
+ esac
89
+ done
90
+
91
+
92
+ # Check for an empty argument to the --cmd option, which can easily occur as a
93
+ # result of scripting errors.
94
+ [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
95
+
96
+
97
+ true; # so this script returns exit code 0.
models/bicodec_tokenizer/utils/token_parser.py ADDED
@@ -0,0 +1,187 @@
1
+ TASK_TOKEN_MAP = {
2
+ "vc": "<|task_vc|>",
3
+ "tts": "<|task_tts|>",
4
+ "asr": "<|task_asr|>",
5
+ "s2s": "<|task_s2s|>",
6
+ "t2s": "<|task_t2s|>",
7
+ "understand": "<|task_understand|>",
8
+ "caption": "<|task_cap|>",
9
+ "controllable_tts": "<|task_controllable_tts|>",
10
+ "prompt_tts": "<|task_prompt_tts|>",
11
+ "speech_edit": "<|task_edit|>",
12
+ }
13
+
14
+ LEVELS_MAP = {
15
+ "very_low": 0,
16
+ "low": 1,
17
+ "moderate": 2,
18
+ "high": 3,
19
+ "very_high": 4,
20
+ }
21
+
22
+ LEVELS_MAP_UI = {
23
+ 1: 'very_low',
24
+ 2: 'low',
25
+ 3: 'moderate',
26
+ 4: 'high',
27
+ 5: 'very_high'
28
+ }
29
+
30
+ GENDER_MAP = {
31
+ "female": 0,
32
+ "male": 1,
33
+ }
34
+
35
+ AGE_MAP = {"Child": 0, "Teenager": 1, "Youth-Adult": 2, "Middle-aged": 3, "Elderly": 4}
36
+
37
+ EMO_MAP = {
38
+ "UNKNOWN": 0,
39
+ "NEUTRAL": 1,
40
+ "ANGRY": 2,
41
+ "HAPPY": 3,
42
+ "SAD": 4,
43
+ "FEARFUL": 5,
44
+ "DISGUSTED": 6,
45
+ "SURPRISED": 7,
46
+ "SARCASTIC": 8,
47
+ "EXCITED": 9,
48
+ "SLEEPY": 10,
49
+ "CONFUSED": 11,
50
+ "EMPHASIS": 12,
51
+ "LAUGHING": 13,
52
+ "SINGING": 14,
53
+ "WORRIED": 15,
54
+ "WHISPER": 16,
55
+ "ANXIOUS": 17,
56
+ "NO-AGREEMENT": 18,
57
+ "APOLOGETIC": 19,
58
+ "CONCERNED": 20,
59
+ "ENUNCIATED": 21,
60
+ "ASSERTIVE": 22,
61
+ "ENCOURAGING": 23,
62
+ "CONTEMPT": 24,
63
+ }
64
+
65
+
66
+ class TokenParser:
67
+ """Turn label to special token"""
68
+
69
+ def __init__(self):
70
+ pass
71
+
72
+ """Parse the attributes of a person."""
73
+
74
+ def __init__(self):
75
+ pass
76
+
77
+ @staticmethod
78
+ def age(age: str) -> str:
79
+ """Turn age token."""
80
+ age_id = AGE_MAP[age]
81
+ return f"<|age_{age_id}|>"
82
+
83
+ @staticmethod
84
+ def gender(gender: str) -> str:
85
+ """Turn gender token."""
86
+ gender_id = GENDER_MAP[gender]
87
+ return f"<|gender_{gender_id}|>"
88
+
89
+ @staticmethod
90
+ def mel_value(mel: int):
91
+ """Turn special token of mel scale pitch."""
92
+ mel = max(0, int(mel))
93
+ mel = min(1000, int(mel))
94
+ return f"<|pitch_value_{mel}|>"
95
+
96
+ @staticmethod
97
+ def mel_level(level: str):
98
+ """Turn special token of mel level."""
99
+ level_tag = LEVELS_MAP[level]
100
+ return f"<|pitch_label_{level_tag}|>"
101
+
102
+ @staticmethod
103
+ def pitch_var_value(pitch_std: int):
104
+ """Turn special token of pitch_std value."""
105
+ assert isinstance(pitch_std, int)
106
+ pitch_std = max(0, int(pitch_std))
107
+ pitch_std = min(10, int(pitch_std))
108
+ return f"<|pitch_var_value_{pitch_std}|>"
109
+
110
+ @staticmethod
111
+ def pitch_var_level(level: str):
112
+ """Turn special token of pitch std level."""
113
+ level_tag = LEVELS_MAP[level]
114
+ return f"<|pitch_var_label_{level_tag}|>"
115
+
116
+ @staticmethod
117
+ def loudness_value(loudness: int):
118
+ """Turn special toak of loudness value [0, 30]"""
119
+ assert loudness >= 0
120
+ loudness = max(0, int(loudness))
121
+ loudness = min(30, int(loudness))
122
+ return f"<|loudness_value_{loudness}|>"
123
+
124
+ @staticmethod
125
+ def loudness_level(level: str):
126
+ """Turn special token of loudness level."""
127
+ level_tag = LEVELS_MAP[level]
128
+ return f"<|loudness_label_{level_tag}|>"
129
+
130
+ @staticmethod
131
+ def speed_value(speed: int):
132
+ """Turn special token of speed value."""
133
+ speed = max(0, int(speed))
134
+ speed = min(10, int(speed))
135
+ return f"<|speed_value_{speed}|>"
136
+
137
+ @staticmethod
138
+ def speed_level(level: str):
139
+ """Turn special token of speed level."""
140
+ level_tag = LEVELS_MAP[level]
141
+ return f"<|speed_label_{level_tag}|>"
142
+
143
+ @staticmethod
144
+ def task(task: str) -> str:
145
+ """Turn special token of task."""
146
+ assert task in TASK_TOKEN_MAP.keys()
147
+
148
+ return TASK_TOKEN_MAP[task]
149
+
150
+ @staticmethod
151
+ def emotion(emotion: str):
152
+ emo_id = EMO_MAP[emotion]
153
+
154
+ return f"<|emotion_{emo_id}|>"
155
+
156
+
157
+ # test
158
+ if __name__ == "__main__":
159
+ from transformers import AutoTokenizer
160
+
161
+ tokenizer = AutoTokenizer.from_pretrained(
162
+ "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
163
+ )
164
+
165
+ tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
166
+ ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
167
+ genders = ["female", "female", "female", "male", "male"]
168
+ mels = [100, 200, 300, 400, 500]
169
+ mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
170
+ loudnesses = [1, 10, 23, 19, 30]
171
+ loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
172
+ emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]
173
+
174
+ for i in range(5):
175
+ task = TokenParser.task(tasks[i])
176
+ age = TokenParser.age(ages[i])
177
+ gender = TokenParser.gender(genders[i])
178
+ mel = TokenParser.mel_value(mels[i])
179
+ mel_level = TokenParser.mel_level(mel_levels[i])
180
+ loudness = TokenParser.loudness_value(loudnesses[i])
181
+ loudness_level = TokenParser.loudness_level(loudness_levels[i])
182
+ emotion = TokenParser.emotion(emotions[i])
183
+ inputs = [task, age, gender, mel, mel_level, loudness, loudness_level, emotion]
184
+ inputs = "".join(inputs)
185
+ ids = tokenizer.encode(inputs, add_special_tokens=False)
186
+ print(ids)
187
+ print("decode", tokenizer.decode(ids))
models/glm_speech_tokenizer/__init__.py ADDED
File without changes
models/glm_speech_tokenizer/batch_processor.py ADDED
@@ -0,0 +1,182 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Time :2024/11/17 15:33
3
+ # Author :Hui Huang
4
+ import asyncio
5
+ import uuid
6
+ from typing import Callable, List, Any, Awaitable, Tuple
7
+ from asyncio import Queue
8
+
9
+
10
+ class BatchProcessor:
11
+ """Batch Processor for handling asynchronous requests in batches.
12
+
13
+ This class manages a queue of requests and processes them in batches
14
+ using multiple worker tasks.
15
+
16
+ Attributes:
17
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]):
18
+ The function used for processing requests in batches.
19
+ num_workers (int): The number of worker tasks to process requests.
20
+ batch_size (int): The maximum number of requests to process in a single batch.
21
+ request_queue (Queue): The queue holding incoming requests.
22
+ loop (asyncio.AbstractEventLoop): The event loop used to create worker tasks.
23
+ worker_tasks (List[asyncio.Task]): The list of worker tasks.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ processing_function: Callable[[List[Any]], Awaitable[List[Any]]],
29
+ num_workers: int,
30
+ batch_size: int,
31
+ wait_timeout: float = 0.05
32
+ ) -> None:
33
+ """Initialize the BatchProcessor with the given processing function, number of workers, and batch size.
34
+
35
+ Args:
36
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]):
37
+ The function used for processing requests in batches.
38
+ num_workers (int): The number of worker tasks to process requests.
39
+ batch_size (int): The maximum number of requests to process in a single batch.
40
+ """
41
+ self.processing_function = processing_function
42
+ self.num_workers = num_workers
43
+ self.batch_size = batch_size
44
+ self.wait_timeout = wait_timeout
45
+ self.request_queue: Queue = Queue()
46
+ self.loop = asyncio.get_running_loop()
47
+ self.worker_tasks = [
48
+ self.loop.create_task(self.batch_processor(i)) for i in range(num_workers)
49
+ ]
50
+ # Wait until all worker tasks are started
51
+ self.loop.create_task(self._log_workers_started())
52
+
53
+ async def _log_workers_started(self):
54
+ await asyncio.sleep(0) # Yield control to ensure workers have started
55
+
56
+ async def batch_processor(self, worker_id: int):
57
+ """Worker task that processes requests from the queue in batches.
58
+
59
+ Args:
60
+ worker_id (int): The identifier for the worker task.
61
+ """
62
+
63
+ while True:
64
+ requests: List[Tuple[Any, asyncio.Future]] = []
65
+ try:
66
+ while len(requests) < self.batch_size:
67
+ request = await asyncio.wait_for(
68
+ self.request_queue.get(), timeout=self.wait_timeout
69
+ )
70
+ requests.append(request)
71
+ except asyncio.TimeoutError:
72
+ pass
73
+
74
+ if requests:
75
+ all_requests = [
76
+ req[0] for req in requests
77
+ ] # Extract the actual input data from each request tuple
78
+ futures = [req[1] for req in requests] # Extract the futures to resolve
79
+ try:
80
+ results = await self.processing_function(all_requests)
81
+
82
+ for (future, result) in zip(futures, results):
83
+ future.set_result(result)
84
+ except Exception as e:
85
+ for future in futures:
86
+ future.set_exception(e)
87
+
88
+ async def add_request(self, single_input: Any):
89
+ """Add a new request to the queue.
90
+
91
+ Args:
92
+ single_input (Any): The input data for processing.
93
+ """
94
+ # loop = asyncio.get_running_loop()
95
+ future = self.loop.create_future()
96
+ self.request_queue.put_nowait((single_input, future))
97
+ return future
98
+
99
+ async def shutdown(self):
100
+ """Shutdown the batch processor by cancelling all worker tasks."""
101
+ for task in self.worker_tasks:
102
+ task.cancel()
103
+ try:
104
+ await task
105
+ except asyncio.CancelledError:
106
+ print("Worker task cancelled.")
107
+
108
+
109
+ class AsyncBatchEngine:
110
+
111
+ def __init__(
112
+ self,
113
+ processing_function: Callable[[List[Any]], Awaitable[List[Any]]],
114
+ batch_size: int = 32,
115
+ wait_timeout: float = 0.01,
116
+ ):
117
+ """
118
+ Initialize the AsyncBatchEngine with a processing function, number of workers, and batch size.
119
+
120
+ Args:
121
+ processing_function (Callable[[List[Any]], Awaitable[List[Any]]]): The batch processing function.
122
+ batch_size (int): The maximum number of requests to process in a single batch.
123
+ """
124
+ self._processing_function = processing_function
125
+ self._batch_size = batch_size
126
+ self._is_running = False
127
+ self._batch_processor = None
128
+ self._wait_timeout = wait_timeout
129
+
130
+ async def start(self):
131
+ """Start the engine by initializing the batch processor and worker tasks."""
132
+ if self._is_running:
133
+ return
134
+
135
+ self._batch_processor = BatchProcessor(
136
+ processing_function=self._processing_function,
137
+ batch_size=self._batch_size,
138
+ wait_timeout=self._wait_timeout,
139
+ num_workers=1
140
+ )
141
+ self._is_running = True
142
+
143
+ async def stop(self):
144
+ """Stop the engine by shutting down the batch processor and worker tasks."""
145
+ self._check_running()
146
+ self._is_running = False
147
+ if self._batch_processor is not None:
148
+ await self._batch_processor.shutdown()
149
+
150
+ def _check_running(self):
151
+ """Check if the engine is running.
152
+
153
+ Raises:
154
+ ValueError: If the engine is not running.
155
+ """
156
+ if not self._is_running:
157
+ raise ValueError(
158
+ "The engine is not running. "
159
+ "You must start the engine before using it."
160
+ )
161
+
162
+ async def add_request(self, single_input: Any, request_id: str = None) -> dict:
163
+ """Asynchronously add a request to be processed.
164
+
165
+ Args:
166
+ single_input (Any): The input data for processing.
167
+ request_id (str): Optional request identifier to avoid data mix-up.
168
+
169
+ Raises:
170
+ ValueError: If the engine is not running when this method is called.
171
+ """
172
+ if not self._is_running:
173
+ await self.start()
174
+
175
+ if request_id is None:
176
+ request_id = str(uuid.uuid4()) # Assign a unique ID if not provided
177
+ future = await self._batch_processor.add_request(single_input=single_input) # type: ignore
178
+ result = await future
179
+ return dict(
180
+ request_id=request_id,
181
+ feature=result
182
+ )
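A minimal usage sketch for AsyncBatchEngine (not part of the committed file; the upper-casing function stands in for a real batched inference call):

    import asyncio

    async def process_batch(batch):
        # A real processing_function would run one batched model call here.
        return [item.upper() for item in batch]

    async def main():
        engine = AsyncBatchEngine(processing_function=process_batch, batch_size=8, wait_timeout=0.01)
        outputs = await asyncio.gather(*[engine.add_request(t) for t in ["a", "b", "c"]])
        print(outputs)  # each item is {"request_id": ..., "feature": ...}
        await engine.stop()

    asyncio.run(main())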
models/glm_speech_tokenizer/configuration_whisper.py ADDED
@@ -0,0 +1,37 @@
1
+ from transformers import WhisperConfig
2
+
3
+
4
+ class WhisperVQConfig(WhisperConfig):
5
+ def __init__(self,
6
+ pooling_kernel_size=None,
7
+ pooling_type="max",
8
+ pooling_position=0,
9
+ quantize_vocab_size=None,
10
+ quantize_position=16,
11
+ quantize_commit_coefficient=0.25,
12
+ quantize_loss_scale=1.0,
13
+ quantize_ema_decay=None,
14
+ quantize_restart_interval=None,
15
+ quantize_encoder_only=False,
16
+ quantize_causal_encoder=False,
17
+ quantize_causal_block_size=None,
18
+ skip_language_detection=False,
19
+ encoder_causal_attention=False,
20
+ encoder_causal_convolution=False,
21
+ **kwargs):
22
+ self.pooling_kernel_size = pooling_kernel_size
23
+ self.pooling_type = pooling_type
24
+ self.pooling_position = pooling_position
25
+ self.quantize_vocab_size = quantize_vocab_size
26
+ self.quantize_position = quantize_position
27
+ self.quantize_commit_coefficient = quantize_commit_coefficient
28
+ self.quantize_loss_scale = quantize_loss_scale
29
+ self.quantize_ema_decay = quantize_ema_decay
30
+ self.quantize_restart_interval = quantize_restart_interval
31
+ self.quantize_encoder_only = quantize_encoder_only
32
+ self.quantize_causal_encoder = quantize_causal_encoder
33
+ self.quantize_causal_block_size = quantize_causal_block_size
34
+ self.skip_language_detection = skip_language_detection
35
+ self.encoder_causal_attention = encoder_causal_attention
36
+ self.encoder_causal_convolution = encoder_causal_convolution
37
+ super().__init__(**kwargs)
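A small instantiation sketch (not part of the committed file; the values are illustrative, not the settings of any shipped checkpoint):

    config = WhisperVQConfig(
        pooling_kernel_size=4,
        pooling_type="avg",
        quantize_vocab_size=16384,
        quantize_position=16,
        quantize_encoder_only=True,
        encoder_causal_convolution=True,
    )
    print(config.quantize_vocab_size, config.pooling_kernel_size)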
models/glm_speech_tokenizer/generation_whisper.py ADDED
@@ -0,0 +1,1828 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import copy
16
+ import math
17
+ import warnings
18
+ import zlib
19
+ from typing import Callable, Iterator, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn.functional as F
24
+ from torch import nn
25
+
26
+ from transformers.cache_utils import EncoderDecoderCache
27
+
28
+ from transformers.generation.configuration_utils import GenerationConfig
29
+ from transformers.generation.logits_process import (
30
+ LogitsProcessorList,
31
+ SuppressTokensAtBeginLogitsProcessor,
32
+ SuppressTokensLogitsProcessor,
33
+ WhisperNoSpeechDetection,
34
+ WhisperTimeStampLogitsProcessor,
35
+ )
36
+ from transformers.generation.stopping_criteria import StoppingCriteriaList
37
+ from transformers.modeling_outputs import BaseModelOutput
38
+ from transformers.utils import logging
39
+ from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE
40
+
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+
45
+ def _median_filter(inputs: torch.Tensor, filter_width: int) -> torch.Tensor:
46
+ """
47
+ Applies a median filter of width `filter_width` along the last dimension of the input.
48
+
49
+ The `inputs` tensor is assumed to be 3- or 4-dimensional.
50
+ """
51
+ if filter_width <= 0 or filter_width % 2 != 1:
52
+ raise ValueError("`filter_width` should be an odd number")
53
+
54
+ pad_width = filter_width // 2
55
+ if inputs.shape[-1] <= pad_width:
56
+ return inputs
57
+
58
+ # Pad the left and right edges.
59
+ inputs = nn.functional.pad(inputs, (pad_width, pad_width, 0, 0), mode="reflect")
60
+
61
+ # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
62
+ result = inputs.unfold(-1, filter_width, 1).sort()[0][..., pad_width]
63
+ return result
64
+
65
+
66
+ def _dynamic_time_warping(matrix: np.ndarray):
67
+ """
68
+ Measures similarity between two temporal sequences: the input audio and the output tokens. Used to generate
69
+ token-level timestamps.
70
+ """
71
+ output_length, input_length = matrix.shape
72
+ cost = np.ones((output_length + 1, input_length + 1), dtype=np.float32) * np.inf
73
+ trace = -np.ones((output_length + 1, input_length + 1), dtype=np.float32)
74
+
75
+ cost[0, 0] = 0
76
+ for j in range(1, input_length + 1):
77
+ for i in range(1, output_length + 1):
78
+ c0 = cost[i - 1, j - 1]
79
+ c1 = cost[i - 1, j]
80
+ c2 = cost[i, j - 1]
81
+
82
+ if c0 < c1 and c0 < c2:
83
+ c, t = c0, 0
84
+ elif c1 < c0 and c1 < c2:
85
+ c, t = c1, 1
86
+ else:
87
+ c, t = c2, 2
88
+
89
+ cost[i, j] = matrix[i - 1, j - 1] + c
90
+ trace[i, j] = t
91
+
92
+ # backtrace
93
+ i = trace.shape[0] - 1
94
+ j = trace.shape[1] - 1
95
+ trace[0, :] = 2
96
+ trace[:, 0] = 1
97
+
98
+ text_indices = []
99
+ time_indices = []
100
+ while i > 0 or j > 0:
101
+ text_indices.append(i - 1)
102
+ time_indices.append(j - 1)
103
+ if trace[i, j] == 0:
104
+ i -= 1
105
+ j -= 1
106
+ elif trace[i, j] == 1:
107
+ i -= 1
108
+ elif trace[i, j] == 2:
109
+ j -= 1
110
+ else:
111
+ raise RuntimeError(
112
+ f"Internal error in dynamic time warping. Unexpected trace[{i}, {j}]. Please file a bug report."
113
+ )
114
+
115
+ text_indices = np.array(text_indices)[::-1]
116
+ time_indices = np.array(time_indices)[::-1]
117
+ return text_indices, time_indices
118
+
119
+
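As an aside (not part of the committed file), a toy alignment with the DTW helper above, mapping 3 output tokens onto 4 audio frames; the cost values are made up:

    import numpy as np

    cost = np.array([
        [0.1, 0.8, 0.9, 0.9],
        [0.9, 0.2, 0.3, 0.9],
        [0.9, 0.9, 0.4, 0.1],
    ], dtype=np.float32)
    text_idx, time_idx = _dynamic_time_warping(cost)
    print(list(zip(text_idx, time_idx)))  # a monotonic path from (0, 0) to (2, 3)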
120
+ def _get_attr_from_logit_processors(logits_processor, logit_processor_class, attribute_name):
121
+ if logits_processor is not None:
122
+ logit_processor = next((cls for cls in logits_processor if isinstance(cls, logit_processor_class)), None)
123
+ if logit_processor:
124
+ return getattr(logit_processor, attribute_name, None)
125
+ return None
126
+
127
+
128
+ def _pad_to_max_length(
129
+ current_segments,
130
+ pad_token_id,
131
+ device,
132
+ padding_side="right",
133
+ padding="longest",
134
+ bos_token_tensor=None,
135
+ cut_off_length=None,
136
+ ):
137
+ max_total_length = 0
138
+ sequences = []
139
+
140
+ if padding_side not in ["right", "left"]:
141
+ raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}")
142
+
143
+ if padding not in ["longest", "max_length"]:
144
+ raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}")
145
+ elif padding == "max_length" and cut_off_length is None:
146
+ raise ValueError("`cut_off_length` must be specified when `padding='max_length'`")
147
+
148
+ for current_segment_list in current_segments:
149
+ if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0:
150
+ sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1)
151
+
152
+ if cut_off_length is not None:
153
+ sequence = sequence[-cut_off_length:]
154
+
155
+ if bos_token_tensor is not None:
156
+ sequence = torch.cat([bos_token_tensor, sequence])
157
+
158
+ sequences.append(sequence)
159
+ max_total_length = max(max_total_length, len(sequences[-1]))
160
+ elif bos_token_tensor is not None:
161
+ sequences.append(bos_token_tensor)
162
+ else:
163
+ sequences.append(torch.tensor([], device=device))
164
+
165
+ max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length
166
+ for i in range(len(current_segments)):
167
+ pad_length = max_total_length - len(sequences[i])
168
+ pad = (0, pad_length) if padding_side == "right" else (pad_length, 0)
169
+ sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id)
170
+
171
+ sequences = torch.stack(sequences, dim=0)
172
+ return sequences
173
+
174
+
175
+ class WhisperGenerationMixin:
176
+ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None):
177
+ """
178
+ Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
179
+ map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
180
+ cross-attentions will be cropped before applying DTW.
181
+
182
+ Returns:
183
+ tensor containing the timestamps in seconds for each predicted token
184
+ """
185
+ # Create a list with `decoder_layers` elements, each a tensor of shape
186
+ # (batch size, attention_heads, output length, input length).
187
+ cross_attentions = []
188
+ for i in range(self.config.decoder_layers):
189
+ cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
190
+
191
+ # Select specific cross-attention layers and heads. This is a tensor
192
+ # of shape (batch size, num selected, output length, input length).
193
+ weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
194
+ weights = weights.permute([1, 0, 2, 3])
195
+
196
+ weight_length = None
197
+
198
+ if "beam_indices" in generate_outputs:
199
+ # If beam search has been used, the output sequences may have been generated for more timesteps than their sequence_lengths
200
+ # since the beam search strategy chooses the most probable sequences at the end of the search.
201
+ # In that case, the cross_attentions weights are too long and we have to make sure that they have the right output_length
202
+ weight_length = (generate_outputs.beam_indices != -1).sum(-1).max()
203
+ weights = weights[:, :, :weight_length]
204
+
205
+ # If beam index is still -1, it means that the associated token id is EOS
206
+ # We need to replace the index with 0 since index_select gives an error if any of the indexes is -1.
207
+ beam_indices = generate_outputs.beam_indices[:, :weight_length]
208
+ beam_indices = beam_indices.masked_fill(beam_indices == -1, 0)
209
+
210
+ # Select the cross attention from the right beam for each output sequences
211
+ weights = torch.stack(
212
+ [
213
+ torch.index_select(weights[:, :, i, :], dim=0, index=beam_indices[:, i])
214
+ for i in range(beam_indices.shape[1])
215
+ ],
216
+ dim=2,
217
+ )
218
+
219
+ # make sure timestamps are as long as weights
220
+ input_length = weight_length or cross_attentions[0].shape[2]
221
+ timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, : input_length + 1]
222
+ batch_size = timestamps.shape[0]
223
+
224
+ if num_frames is not None:
225
+ # two cases:
226
+ # 1. num_frames is the same for each sample -> compute the DTW matrix for each sample in parallel
227
+ # 2. num_frames is different, compute the DTW matrix for each sample sequentially
228
+
229
+ # we're using np.unique because num_frames can be int/list/tuple
230
+ if isinstance(num_frames, int):
231
+ weights = weights[..., : num_frames // 2]
232
+
233
+ elif isinstance(num_frames, (list, tuple, np.ndarray)) and len(np.unique(num_frames)) == 1:
234
+ weights = weights[..., : num_frames[0] // 2]
235
+
236
+ elif isinstance(num_frames, (torch.Tensor)) and len(torch.unique(num_frames)) == 1:
237
+ weights = weights[..., : num_frames[0] // 2]
238
+
239
+ else:
240
+ # num_frames is of shape (batch_size,) whereas batch_size is really batch_size*num_return_sequences
241
+ repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames)
242
+ num_frames = np.repeat(num_frames, repeat_time)
243
+
244
+ if num_frames is None or isinstance(num_frames, int):
245
+ # Normalize and smoothen the weights.
246
+ std = torch.std(weights, dim=-2, keepdim=True, unbiased=False)
247
+ mean = torch.mean(weights, dim=-2, keepdim=True)
248
+ weights = (weights - mean) / std
249
+ weights = _median_filter(weights, self.config.median_filter_width)
250
+
251
+ # Average the different cross-attention heads.
252
+ weights = weights.mean(dim=1)
253
+
254
+ # Perform dynamic time warping on each element of the batch.
255
+ for batch_idx in range(batch_size):
256
+ if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray, torch.Tensor)):
257
+ matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2]
258
+
259
+ # Normalize and smoothen the weights.
260
+ std = torch.std(matrix, dim=-2, keepdim=True, unbiased=False)
261
+ mean = torch.mean(matrix, dim=-2, keepdim=True)
262
+ matrix = (matrix - mean) / std
263
+ matrix = _median_filter(matrix, self.config.median_filter_width)
264
+
265
+ # Average the different cross-attention heads.
266
+ matrix = matrix.mean(dim=0)
267
+ else:
268
+ matrix = weights[batch_idx]
269
+
270
+ text_indices, time_indices = _dynamic_time_warping(-matrix.cpu().double().numpy())
271
+ jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
272
+ jump_times = time_indices[jumps] * time_precision
273
+ timestamps[batch_idx, 1:] = torch.tensor(jump_times)
274
+
275
+ return timestamps
276
+
277
+ def generate(
278
+ self,
279
+ input_features: Optional[torch.Tensor] = None,
280
+ generation_config: Optional[GenerationConfig] = None,
281
+ logits_processor: Optional[LogitsProcessorList] = None,
282
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
283
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
284
+ synced_gpus: bool = False,
285
+ return_timestamps: Optional[bool] = None,
286
+ task: Optional[str] = None,
287
+ language: Optional[Union[str, List[str]]] = None,
288
+ is_multilingual: Optional[bool] = None,
289
+ prompt_ids: Optional[torch.Tensor] = None,
290
+ prompt_condition_type: Optional[str] = None, # first-segment, all-segments
291
+ condition_on_prev_tokens: Optional[bool] = None,
292
+ temperature: Optional[Union[float, Tuple[float, ...]]] = None,
293
+ compression_ratio_threshold: Optional[float] = None,
294
+ logprob_threshold: Optional[float] = None,
295
+ no_speech_threshold: Optional[float] = None,
296
+ num_segment_frames: Optional[int] = None,
297
+ attention_mask: Optional[torch.Tensor] = None,
298
+ time_precision: float = 0.02,
299
+ return_token_timestamps: Optional[bool] = None,
300
+ return_segments: bool = False,
301
+ return_dict_in_generate: Optional[bool] = None,
302
+ **kwargs,
303
+ ):
304
+ """
305
+ Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids.
306
+
307
+ <Tip warning={true}>
308
+
309
+ Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
310
+ model's default generation configuration. You can override any `generation_config` by passing the corresponding
311
+ parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
312
+
313
+ For an overview of generation strategies and code examples, check out the [following
314
+ guide](./generation_strategies).
315
+
316
+ </Tip>
317
+
318
+ Parameters:
319
+ input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
320
+ Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
321
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
322
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
323
+ [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
324
+ tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
325
+ generation_config (`~generation.GenerationConfig`, *optional*):
326
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
327
+ passed to generate matching the attributes of `generation_config` will override them. If
328
+ `generation_config` is not provided, the default will be used, which had the following loading
329
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
330
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
331
+ default values, whose documentation should be checked to parameterize generation.
332
+ logits_processor (`LogitsProcessorList`, *optional*):
333
+ Custom logits processors that complement the default logits processors built from arguments and
334
+ generation config. If a logit processor is passed that is already created with the arguments or a
335
+ generation config an error is thrown. This feature is intended for advanced users.
336
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
337
+ Custom stopping criteria that complement the default stopping criteria built from arguments and a
338
+ generation config. If a stopping criteria is passed that is already created with the arguments or a
339
+ generation config an error is thrown. This feature is intended for advanced users.
340
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
341
+ If provided, this function constraints the beam search to allowed tokens only at each step. If not
342
+ provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
343
+ `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
344
+ on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
345
+ for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
346
+ Retrieval](https://arxiv.org/abs/2010.00904).
347
+ synced_gpus (`bool`, *optional*, defaults to `False`):
348
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
349
+ return_timestamps (`bool`, *optional*):
350
+ Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`.
351
+ task (`str`, *optional*):
352
+ Task to use for generation, either "translate" or "transcribe". The `model.config.forced_decoder_ids`
353
+ will be updated accordingly.
354
+ language (`str` or list of `str`, *optional*):
355
+ Language token to use for generation, can be either in the form of `<|en|>`, `en` or `english`. For
356
+ batched generation, a list of language tokens can be passed. You can find all the possible language
357
+ tokens in the `model.generation_config.lang_to_id` dictionary.
358
+ is_multilingual (`bool`, *optional*):
359
+ Whether or not the model is multilingual.
360
+ prompt_ids (`torch.Tensor`, *optional*):
361
+ Rank-1 tensor of token IDs created by passing text to [`~WhisperProcessor.get_prompt_ids`] that is
362
+ provided as a prompt to each chunk. This can be used to provide or "prompt-engineer" a context for
363
+ transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those words
364
+ correctly. It cannot be used in conjunction with `decoder_start_token_id` as it overwrites this value.
365
+ prompt_condition_type (`str`, *optional*):
366
+ Only relevant for long-form transcription. Condition type of `prompt_ids`. 'first-segment' means only the first segment is conditioned on `prompt_ids`. 'all-segments' means each segment is conditioned on `prompt_ids`. Make sure to enable `condition_on_prev_tokens` for 'all-segments'.
367
+ Defaults to 'first-segment'. For short-term transcription only 'first-segment' is possible.
368
+ condition_on_prev_tokens (`bool`, *optional*):
369
+ Only relevant for long-form transcription. Whether to condition each segment on the previous segment.
370
+ As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
371
+ performance.
372
+ temperature (`float` or list of `float`, *optional*):
373
+ The temperature to be used for generation. Passing a single `float` value and `do_sample=True` activates
374
+ generation using sampling. For long-form transcription, temperature fallback can be activated by passing
375
+ a list of float values such as (0.0, 0.2, 0.4, 0.6, 0.8, 1.0). As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
376
+ performance.
377
+ compression_ratio_threshold (`float`, *optional*):
378
+ Only relevant for long-form transcription. If defined, the zlib compression rate of each segment will be computed. If the compression rate of
379
+ a segment is higher than `compression_ratio_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is
380
+ repeated using a higher temperature. The intuition behind this feature is that segments with very high compression rates
381
+ suffer from a lot of repetition. The unwanted repetition can be reduced by injecting more randomness by increasing the temperature. If `compression_ratio_threshold` is defined
382
+ make sure that `temperature` is a list of values. A common value for `compression_ratio_threshold` is 1.35.
383
+ As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
384
+ performance.
385
+ logprob_threshold (`float`, *optional*):
386
+ Only relevant for long-form transcription. If defined, the average log-probability of each segment will be computed. If the log-probability of
387
+ a given segment is lower than `logprob_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is
388
+ repeated using a higher temperature. The intuition behind this feature is that segments of low log-probability
389
+ can be improved by injecting more randomness by increasing the temperature. If `logprob_threshold` is defined
390
+ make sure that `temperature` is a list of values. A common value for `logprob_threshold` is -1.0.
391
+ As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
392
+ performance.
393
+ no_speech_threshold (`float`, *optional*):
394
+ Only relevant for long-form transcription. If defined, the "no-speech" token combined with the `logprob_threshold`
395
+ is used to determine whether a segment contains only silence. In this case, the transcription for this segment
396
+ is skipped.
397
+ As shown in [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
398
+ performance.
399
+ num_segment_frames (`int`, *optional*):
400
+ The number of frames a single segment is made of. If not defined, `num_segment_frames` defaults to the model's stride
401
+ times the maximum input length.
402
+ attention_mask (`torch.Tensor`, *optional*):
403
+ `attention_mask` needs to be passed when doing long-form transcription using a batch size > 1.
404
+ time_precision (`float`, *optional*, defaults to 0.02):
405
+ The duration of output token in seconds. *E.g.* 0.02 means that a generated token on average accounts
406
+ for 20 ms.
407
+ return_token_timestamps (`bool`, *optional*):
408
+ Whether to return token-level timestamps with the text. This can be used with or without the
409
+ `return_timestamps` option. To get word-level timestamps, use the tokenizer to group the tokens into
410
+ words.
411
+ return_segments (`bool`, *optional*, defaults to `False`):
412
+ Whether to additionally return a list of all segments. Note that this option can only be enabled
413
+ when doing long-form transcription.
414
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
415
+ Whether or not to return a [`~utils.ModelOutput`] instead of just returning the generated tokens.
416
+ Note that when doing long-form transcription, `return_dict_in_generate` can only be enabled when
417
+ `return_segments` is set to `True`. In this case the generation outputs of each segment are added to each
418
+ segment.
419
+ kwargs (`Dict[str, Any]`, *optional*):
420
+ Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
421
+ forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
422
+ specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
423
+
424
+ Return:
425
+ [`~utils.ModelOutput`] or `torch.LongTensor` or `Dict[str, Any]`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
426
+ or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor` or a dict of segments when `return_segments=True`.
427
+
428
+ If the passed input is > 30 seconds / > 3000 mel input features and `return_segments=True`, a dictionary is returned that contains the generated sequence ids under `sequences` and a list of each generated segment.
429
+
430
+ else if the passed input is <= 30 seconds / <= 3000 mel input features, the possible [`~utils.ModelOutput`] types are:
431
+
432
+ - [`~generation.GenerateEncoderDecoderOutput`],
433
+ - [`~generation.GenerateBeamEncoderDecoderOutput`]
434
+
435
+ else only the generated output sequence ids are returned.
436
+
437
+ Example:
438
+
439
+ - *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate.
440
+
441
+ ```python
442
+ >>> import torch
443
+ >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
444
+ >>> from datasets import load_dataset, Audio
445
+
446
+ >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
447
+ >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
448
+ >>> model.cuda() # doctest: +IGNORE_RESULT
449
+
450
+ >>> # load audios > 30 seconds
451
+ >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
452
+ >>> # resample to 16kHz
453
+ >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
454
+ >>> # take first 8 audios and retrieve array
455
+ >>> audio = ds[:8]["audio"]
456
+ >>> audio = [x["array"] for x in audio]
457
+
458
+ >>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
459
+ >>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000)
460
+ >>> inputs = inputs.to("cuda", torch.float32)
461
+
462
+ >>> # transcribe audio to ids
463
+ >>> generated_ids = model.generate(**inputs)
464
+
465
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
466
+ >>> transcription[0]
467
+ " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile."
468
+ ```
469
+
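+ - *Longform transcription with temperature fallback* (a minimal sketch reusing `inputs` from the long-form example above; the 1.35 and -1.0 thresholds follow the argument descriptions, while `no_speech_threshold=0.6` is an assumed choice): passing a tuple of temperatures retries low-quality segments at progressively higher temperatures.
+
+ ```python
+ >>> # retry a segment with a higher temperature whenever the compression-ratio or logprob check fails
+ >>> generated_ids = model.generate(
+ ...     **inputs,
+ ...     return_timestamps=True,
+ ...     temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+ ...     compression_ratio_threshold=1.35,
+ ...     logprob_threshold=-1.0,
+ ...     no_speech_threshold=0.6,
+ ...     condition_on_prev_tokens=True,
+ ... )
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
+ ```
+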
470
+ - *Shortform transcription*: If the passed mel input features cover at most 30 seconds of audio, the whole audio will be transcribed with a single call to generate.
471
+
472
+ ```python
473
+ >>> import torch
474
+ >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
475
+ >>> from datasets import load_dataset
476
+
477
+ >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
478
+ >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
479
+
480
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
481
+
482
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
483
+ >>> input_features = inputs.input_features
484
+
485
+ >>> generated_ids = model.generate(inputs=input_features)
486
+
487
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
488
+ >>> transcription
489
+ ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
490
+ ```
491
+
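+ - *Prompting* (a minimal sketch reusing `input_features` from the short-form example above; the prompt string is only illustrative): `prompt_ids` built with [`~WhisperProcessor.get_prompt_ids`] can bias the transcription towards custom vocabulary or proper nouns.
+
+ ```python
+ >>> # hypothetical prompt text; spelling hints or proper nouns can be passed here
+ >>> prompt_ids = processor.get_prompt_ids("Mr. Quilter", return_tensors="pt")
+ >>> generated_ids = model.generate(input_features, prompt_ids=prompt_ids)
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+ ```
+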
492
+ """
493
+ # 0. deprecate old inputs
494
+ if "inputs" in kwargs:
495
+ input_features = kwargs.pop("inputs")
496
+ warnings.warn(
497
+ "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
498
+ FutureWarning,
499
+ )
500
+
501
+ # 1. prepare generation config
502
+ generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
503
+
504
+ # 2. set global generate variables
505
+ input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
506
+ num_segment_frames = input_stride * self.config.max_source_positions
507
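+ # note: for the standard Whisper encoder configuration this equals 2 * 1500 = 3000 mel frames, i.e. one 30 s segment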
+ batch_size, total_input_frames = self._retrieve_total_input_frames(
508
+ input_features=input_features, input_stride=input_stride, kwargs=kwargs
509
+ )
510
+ is_shortform = total_input_frames <= num_segment_frames
511
+
512
+ # 3. Make sure generation config is correctly set
513
+ # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
514
+ return_dict_in_generate = self._set_return_outputs(
515
+ return_dict_in_generate=return_dict_in_generate,
516
+ return_token_timestamps=return_token_timestamps,
517
+ logprob_threshold=logprob_threshold,
518
+ generation_config=generation_config,
519
+ )
520
+ timestamp_begin = self._set_return_timestamps(
521
+ return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
522
+ )
523
+ self._set_language_and_task(
524
+ language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
525
+ )
526
+ self._set_num_frames(
527
+ return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
528
+ )
529
+ self._set_thresholds_and_condition(
530
+ generation_config=generation_config,
531
+ logprob_threshold=logprob_threshold,
532
+ compression_ratio_threshold=compression_ratio_threshold,
533
+ no_speech_threshold=no_speech_threshold,
534
+ condition_on_prev_tokens=condition_on_prev_tokens,
535
+ )
536
+ self._set_prompt_condition_type(
537
+ generation_config=generation_config,
538
+ prompt_condition_type=prompt_condition_type,
539
+ )
540
+
541
+ kwargs["attention_mask"] = attention_mask
542
+ # pass self.config for backward compatibility
543
+ init_tokens = self._retrieve_init_tokens(
544
+ input_features,
545
+ batch_size=batch_size,
546
+ generation_config=generation_config,
547
+ config=self.config,
548
+ num_segment_frames=num_segment_frames,
549
+ kwargs=kwargs,
550
+ )
551
+ # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
552
+ # where the input ids are handled explicitly by the generate method
553
+ self._check_decoder_input_ids(kwargs=kwargs)
554
+
555
+ # 3. Retrieve logits processors
556
+ device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
557
+ begin_index = init_tokens.shape[1]
558
+ logits_processor = self._retrieve_logit_processors(
559
+ generation_config=generation_config,
560
+ logits_processor=logits_processor,
561
+ begin_index=begin_index, # begin index is index of first generated decoder token
562
+ num_beams=kwargs.get("num_beams", 1),
563
+ device=device,
564
+ )
565
+
566
+ # 4 Set and retrieve global generation variables
567
+ self._set_condition_on_prev_tokens(
568
+ condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
569
+ )
570
+
571
+ temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
572
+ temperature = temperatures[0]
573
+
574
+ max_frames, seek = self._retrieve_max_frames_and_seek(
575
+ batch_size=batch_size,
576
+ attention_mask=attention_mask,
577
+ total_input_frames=total_input_frames,
578
+ is_shortform=is_shortform,
579
+ )
580
+
581
+ # 5 Prepare running variables, list for generation
582
+ num_return_sequences = generation_config.num_return_sequences
583
+ (
584
+ batch_idx_map,
585
+ cur_bsz,
586
+ input_features,
587
+ seek,
588
+ max_frames,
589
+ init_tokens,
590
+ do_condition_on_prev_tokens,
591
+ ) = self._expand_variables_for_generation(
592
+ input_features=input_features,
593
+ seek=seek,
594
+ max_frames=max_frames,
595
+ init_tokens=init_tokens,
596
+ batch_size=batch_size,
597
+ condition_on_prev_tokens=condition_on_prev_tokens,
598
+ generation_config=generation_config,
599
+ )
600
+
601
+ current_segments = self._prepare_segments(
602
+ prompt_ids=prompt_ids,
603
+ batch_size=cur_bsz,
604
+ generation_config=generation_config,
605
+ )
606
+
607
+ # 6 Transcribe audio until we reach the end of all input audios
608
+ while (seek < max_frames).any():
609
+ # 6.1 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
610
+ # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
611
+ # to know which original audio is being decoded
612
+ # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
613
+ input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch(
614
+ input_features=input_features,
615
+ seek=seek,
616
+ max_frames=max_frames,
617
+ cur_bsz=cur_bsz,
618
+ batch_idx_map=batch_idx_map,
619
+ )
620
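+ # note: `seek` counts mel frames (100 per second); dividing by the input stride and multiplying by the 0.02 s token precision converts the current position into seconds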
+ time_offset = seek * time_precision / input_stride
621
+ seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
622
+
623
+ # 6.2 cut out next 30s segment from input features
624
+ segment_input = self._get_input_segment(
625
+ input_features=input_features,
626
+ seek=seek,
627
+ seek_num_frames=seek_num_frames,
628
+ num_segment_frames=num_segment_frames,
629
+ cur_bsz=cur_bsz,
630
+ batch_idx_map=batch_idx_map,
631
+ )
632
+
633
+ # 6.3 prepare decoder input ids
634
+ suppress_tokens = _get_attr_from_logit_processors(
635
+ logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
636
+ )
637
+
638
+ decoder_input_ids, kwargs = self._prepare_decoder_input_ids(
639
+ cur_bsz=cur_bsz,
640
+ init_tokens=init_tokens,
641
+ current_segments=current_segments,
642
+ batch_idx_map=batch_idx_map,
643
+ do_condition_on_prev_tokens=do_condition_on_prev_tokens,
644
+ prompt_ids=prompt_ids,
645
+ generation_config=generation_config,
646
+ config=self.config,
647
+ device=init_tokens.device,
648
+ suppress_tokens=suppress_tokens,
649
+ kwargs=kwargs,
650
+ )
651
+
652
+ # 6.4 set max new tokens or max length
653
+ self._set_max_new_tokens_and_length(
654
+ config=self.config,
655
+ decoder_input_ids=decoder_input_ids,
656
+ generation_config=generation_config,
657
+ )
658
+
659
+ # 6.5 Set current `begin_index` for all logit processors
660
+ if logits_processor is not None:
661
+ for proc in logits_processor:
662
+ if hasattr(proc, "set_begin_index"):
663
+ proc.set_begin_index(decoder_input_ids.shape[-1])
664
+
665
+ # 6.6 Run generate with fallback
666
+ (
667
+ seek_sequences,
668
+ seek_outputs,
669
+ should_skip,
670
+ do_condition_on_prev_tokens,
671
+ model_output_type,
672
+ ) = self.generate_with_fallback(
673
+ segment_input=segment_input,
674
+ decoder_input_ids=decoder_input_ids,
675
+ cur_bsz=cur_bsz,
676
+ batch_idx_map=batch_idx_map,
677
+ seek=seek,
678
+ num_segment_frames=num_segment_frames,
679
+ max_frames=max_frames,
680
+ temperatures=temperatures,
681
+ generation_config=generation_config,
682
+ logits_processor=logits_processor,
683
+ stopping_criteria=stopping_criteria,
684
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
685
+ synced_gpus=synced_gpus,
686
+ return_token_timestamps=return_token_timestamps,
687
+ do_condition_on_prev_tokens=do_condition_on_prev_tokens,
688
+ is_shortform=is_shortform,
689
+ batch_size=batch_size,
690
+ kwargs=kwargs,
691
+ )
692
+
693
+ # 6.7 In every generated sequence, split by timestamp tokens and extract segments
694
+ for i, seek_sequence in enumerate(seek_sequences):
695
+ prev_i = batch_idx_map[i]
696
+
697
+ if should_skip[i]:
698
+ seek[prev_i] += seek_num_frames[prev_i]
699
+ continue
700
+
701
+ segments, segment_offset = self._retrieve_segment(
702
+ seek_sequence=seek_sequence,
703
+ seek_outputs=seek_outputs,
704
+ time_offset=time_offset,
705
+ timestamp_begin=timestamp_begin,
706
+ seek_num_frames=seek_num_frames,
707
+ time_precision=time_precision,
708
+ input_stride=input_stride,
709
+ prev_idx=prev_i,
710
+ idx=i,
711
+ return_token_timestamps=return_token_timestamps,
712
+ )
713
+
714
+ current_segments[prev_i] += segments
715
+
716
+ if is_shortform:
717
+ seek[prev_i] += max_frames[i]
718
+ else:
719
+ seek[prev_i] += segment_offset
720
+
721
+ # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
722
+ # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
723
+ final_segments = (
724
+ [x[1:] for x in current_segments]
725
+ if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment")
726
+ else current_segments
727
+ )
728
+
729
+ sequences = _pad_to_max_length(
730
+ final_segments, generation_config.pad_token_id, device=self.device, padding_side="right"
731
+ )
732
+
733
+ # 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
734
+ if return_segments:
735
+ return {"sequences": sequences, "segments": final_segments}
736
+
737
+ if is_shortform:
738
+ # add eos token:
739
+ if generation_config.max_new_tokens is None and generation_config.max_length is None:
740
+ eos_tokens = torch.full((sequences.shape[0], 1), generation_config.eos_token_id)
741
+ sequences = torch.cat([sequences, eos_tokens], dim=-1)
742
+
743
+ if return_token_timestamps:
744
+ outputs = {}
745
+ outputs["sequences"] = sequences
746
+ outputs["token_timestamps"] = torch.stack([d["token_timestamps"] for d in seek_outputs], dim=0)
747
+ else:
748
+ outputs = sequences
749
+
750
+ if return_dict_in_generate and generation_config.return_dict_in_generate:
751
+ dict_outputs = self._stack_split_outputs(seek_outputs, model_output_type, sequences.device, kwargs)
752
+
753
+ if num_return_sequences > 1:
754
+ if hasattr(dict_outputs, "encoder_attentions") and dict_outputs.encoder_attentions is not None:
755
+ dict_outputs.encoder_attentions = tuple(
756
+ dict_outputs.encoder_attentions[i][::num_return_sequences]
757
+ for i in range(len(dict_outputs.encoder_attentions))
758
+ )
759
+ if (
760
+ hasattr(dict_outputs, "encoder_hidden_states")
761
+ and dict_outputs.encoder_hidden_states is not None
762
+ ):
763
+ dict_outputs.encoder_hidden_states = tuple(
764
+ dict_outputs.encoder_hidden_states[i][::num_return_sequences]
765
+ for i in range(len(dict_outputs.encoder_hidden_states))
766
+ )
767
+ if return_token_timestamps:
768
+ dict_outputs["token_timestamps"] = outputs["token_timestamps"]
769
+ return dict_outputs
770
+
771
+ return outputs
772
+
773
+ return sequences
774
+
775
+ def generate_with_fallback(
776
+ self,
777
+ segment_input,
778
+ decoder_input_ids,
779
+ cur_bsz,
780
+ batch_idx_map,
781
+ seek,
782
+ num_segment_frames,
783
+ max_frames,
784
+ temperatures,
785
+ generation_config,
786
+ logits_processor,
787
+ stopping_criteria,
788
+ prefix_allowed_tokens_fn,
789
+ synced_gpus,
790
+ return_token_timestamps,
791
+ do_condition_on_prev_tokens,
792
+ is_shortform,
793
+ batch_size,
794
+ kwargs,
795
+ ):
796
+ kwargs = copy.copy(kwargs)
797
+
798
+ # 6.6 Batch generate current chunk
799
+ seek_sequence_list = [None for _ in range(cur_bsz)]
800
+ seek_outputs_list = [None for _ in range(cur_bsz)]
801
+ needs_fallback = [False for _ in range(cur_bsz)]
802
+ should_skip = [False for _ in range(cur_bsz)]
803
+ fallback_index_map = list(range(cur_bsz))
804
+ if generation_config.no_speech_threshold is not None:
805
+ self._setup_no_speech_detection(logits_processor, segment_input, decoder_input_ids, kwargs)
806
+
807
+ for fallback_idx, temperature in enumerate(temperatures):
808
+ generation_config.do_sample = temperature is not None and temperature > 0.0
809
+ generation_config.temperature = temperature if generation_config.do_sample else 1.0
810
+ if generation_config.do_sample:
811
+ generation_config.num_beams = 1
812
+
813
+ generate_kwargs = copy.copy(kwargs)
814
+ for key in ["do_sample", "temperature", "num_beams"]:
815
+ if key in generate_kwargs:
816
+ del generate_kwargs[key]
817
+
818
+ cur_bsz = decoder_input_ids.shape[0]
819
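+ # note: a static KV cache keeps a fixed batch dimension, so a shrunken batch is padded back up to the original batch size before generating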
+ if generation_config.cache_implementation == "static" and cur_bsz < batch_size:
820
+ segment_input = F.pad(segment_input, (0, 0, 0, 0, 0, batch_size - cur_bsz), value=0)
821
+ decoder_input_ids = F.pad(
822
+ decoder_input_ids, (0, 0, 0, batch_size - cur_bsz), value=generation_config.pad_token_id
823
+ )
824
+ if generate_kwargs.get("decoder_attention_mask") is not None:
825
+ generate_kwargs["decoder_attention_mask"] = F.pad(
826
+ generate_kwargs["decoder_attention_mask"], (0, 0, 0, batch_size - cur_bsz), value=True
827
+ )
828
+ if generate_kwargs.get("encoder_outputs") is not None:
829
+ generate_kwargs["encoder_outputs"] = F.pad(
830
+ generate_kwargs["encoder_outputs"], (0, 0, 0, 0, 0, batch_size - cur_bsz), value=0
831
+ )
832
+
833
+ seek_outputs = super().generate(
834
+ segment_input,
835
+ generation_config=generation_config,
836
+ logits_processor=logits_processor,
837
+ stopping_criteria=stopping_criteria,
838
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
839
+ synced_gpus=synced_gpus,
840
+ decoder_input_ids=decoder_input_ids,
841
+ **generate_kwargs,
842
+ )
843
+
844
+ model_output_type = type(seek_outputs)
845
+
846
+ # post-process sequence tokens and outputs to be in list form
847
+ seek_sequences, seek_outputs = self._postprocess_outputs(
848
+ seek_outputs=seek_outputs,
849
+ decoder_input_ids=decoder_input_ids,
850
+ return_token_timestamps=return_token_timestamps,
851
+ generation_config=generation_config,
852
+ is_shortform=is_shortform,
853
+ )
854
+
855
+ if cur_bsz < batch_size:
856
+ seek_sequences = seek_sequences[:cur_bsz]
857
+ seek_outputs = seek_outputs[:cur_bsz]
858
+
859
+ # 6.7 Extract cut sequences from every sequence and check if fallback should be applied
860
+ # Loop over each decoded audio individually as each decoding can be of a different length
861
+ new_fallback_index_map = []
862
+ new_segment_input = []
863
+ new_decoder_input_ids = []
864
+ new_decoder_attention_mask = []
865
+
866
+ for i, seek_sequence in enumerate(seek_sequences):
867
+ # make sure we cut a predicted EOS token if we are not finished with the generation yet
868
+ prev_i = batch_idx_map[fallback_index_map[i]]
869
+ is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i]
870
+
871
+ # remove eos token id
872
+ if is_not_final and seek_sequence[-1] == generation_config.eos_token_id:
873
+ seek_sequence = seek_sequence[:-1]
874
+ if return_token_timestamps and not is_shortform:
875
+ seek_outputs[i]["token_timestamps"] = seek_outputs[i]["token_timestamps"][:-1]
876
+
877
+ # remove all padding tokens
878
+ if seek_sequence[-1] == generation_config.pad_token_id:
879
+ num_paddings = (seek_sequence == generation_config.pad_token_id).sum()
880
+ seek_sequence = seek_sequence[:-num_paddings]
881
+ if return_token_timestamps and not is_shortform:
882
+ seek_outputs[i]["token_timestamps"] = seek_outputs[i]["token_timestamps"][:-num_paddings]
883
+
884
+ # check which sequences in batch need fallback & which should be skipped
885
+ needs_fallback[i], should_skip[i] = self._need_fallback(
886
+ seek_sequence,
887
+ seek_outputs,
888
+ i,
889
+ logits_processor,
890
+ generation_config,
891
+ self.config.vocab_size,
892
+ temperature,
893
+ )
894
+
895
+ seek_sequence_list[fallback_index_map[i]] = seek_sequence
896
+ seek_outputs_list[fallback_index_map[i]] = seek_outputs[i]
897
+ is_low_temperature = temperature is None or temperature < 0.5
898
+ do_condition_on_prev_tokens[fallback_index_map[i]] = (
899
+ generation_config.condition_on_prev_tokens and is_low_temperature
900
+ )
901
+
902
+ if needs_fallback[i]:
903
+ new_fallback_index_map.append(fallback_index_map[i])
904
+ new_segment_input.append(segment_input[i])
905
+ new_decoder_input_ids.append(decoder_input_ids[i])
906
+ if "decoder_attention_mask" in kwargs:
907
+ new_decoder_attention_mask.append(kwargs["decoder_attention_mask"][i])
908
+
909
+ fallback_index_map = new_fallback_index_map
910
+
911
+ # if no sequence needs to be run with temperature fallback, we're finished
912
+ if len(fallback_index_map) == 0 or fallback_idx == len(temperatures) - 1:
913
+ seek_sequences = seek_sequence_list
914
+ seek_outputs = seek_outputs_list
915
+ break
916
+
917
+ # if we're still in the loop, make sure that decoder_input_ids and segment inputs are tensors
918
+ decoder_input_ids = torch.stack(new_decoder_input_ids)
919
+ segment_input = torch.stack(new_segment_input)
920
+ if "decoder_attention_mask" in kwargs:
921
+ kwargs["decoder_attention_mask"] = torch.stack(new_decoder_attention_mask)
922
+
923
+ return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens, model_output_type
924
+
925
+ @staticmethod
926
+ def _prepare_segments(prompt_ids, batch_size, generation_config):
927
+ if prompt_ids is not None and generation_config.prompt_condition_type == "first-segment":
928
+ prev_sot_token_id = getattr(generation_config, "prev_sot_token_id", None)
929
+ prompt_ids = prompt_ids[1:] if prompt_ids[0] == prev_sot_token_id else prompt_ids
930
+ current_segments = [[{"tokens": prompt_ids}] for _ in range(batch_size)]
931
+ else:
932
+ current_segments = [[] for _ in range(batch_size)]
933
+
934
+ return current_segments
935
+
936
+ def _postprocess_outputs(
937
+ self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config, is_shortform
938
+ ):
939
+ # remove all previously passed decoder input ids
940
+ start_idx = decoder_input_ids.shape[-1] if not is_shortform else torch.tensor(0)
941
+
942
+ if isinstance(seek_outputs, torch.Tensor):
943
+ seek_outputs = seek_outputs[:, start_idx:]
944
+ return seek_outputs, seek_outputs
945
+
946
+ if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
947
+ num_frames = getattr(generation_config, "num_frames", None)
948
+ seek_outputs["token_timestamps"] = self._extract_token_timestamps(
949
+ seek_outputs, generation_config.alignment_heads, num_frames=num_frames
950
+ )
951
+ seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, start_idx:]
952
+
953
+ seek_outputs["sequences"] = seek_outputs["sequences"][:, start_idx:]
954
+
955
+ def split_by_batch_index(values, key, batch_idx, is_shortform):
956
+ if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
957
+ return [v[batch_idx].cpu() for v in values]
958
+ if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]:
959
+ return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values)
960
+ elif key == "past_key_values":
961
+ if not is_shortform:
962
+ # we don't save `past_key_values` as this is too costly for longform
963
+ return None
964
+ elif isinstance(values, EncoderDecoderCache):
965
+ all_past_key_values = []
966
+ for layer_idx in range(self.config.decoder_layers):
967
+ layer_past_key_values = []
968
+ for cache_cls in [values.self_attention_cache, values.cross_attention_cache]:
969
+ for v in [cache_cls.key_cache, cache_cls.value_cache]:
970
+ layer_past_key_values.append(v[layer_idx][batch_idx][None].cpu())
971
+ all_past_key_values.append(tuple(layer_past_key_values))
972
+ return tuple(all_past_key_values)
973
+ else:
974
+ all_past_key_values = []
975
+ for v in range(len(values)):
976
+ layer_past_key_values = []
977
+ for w in values[v]:
978
+ layer_past_key_values.append(w[batch_idx][None].cpu())
979
+ all_past_key_values.append(tuple(layer_past_key_values))
980
+ return tuple(all_past_key_values)
981
+
982
+ return values[batch_idx].cpu()
983
+
984
+ sequence_tokens = seek_outputs["sequences"]
985
+ seek_outputs = [
986
+ {k: split_by_batch_index(v, k, i, is_shortform) for k, v in seek_outputs.items()}
987
+ for i in range(sequence_tokens.shape[0])
988
+ ]
989
+
990
+ return sequence_tokens, seek_outputs
991
+
992
+ def _stack_split_outputs(self, seek_outputs, model_output_type, device, kwargs):
993
+ # Stack back seek_outputs tensors after splitting them with the split_by_batch_index method
994
+ outputs = {}
995
+ for key in seek_outputs[0].keys():
996
+ if key == "sequences":
997
+ outputs[key] = torch.stack([v[key] for v in seek_outputs], dim=0).to(device)
998
+ if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
999
+ outputs[key] = tuple(
1000
+ torch.stack([v[key][i] for v in seek_outputs]).to(device) for i in range(len(seek_outputs[0][key]))
1001
+ )
1002
+ if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]:
1003
+ outputs[key] = tuple(
1004
+ tuple(
1005
+ torch.stack([v[key][i][j] for v in seek_outputs]).squeeze(1).to(device)
1006
+ for j in range(len(seek_outputs[0][key][0]))
1007
+ )
1008
+ for i in range(len(seek_outputs[0][key]))
1009
+ )
1010
+ if key == "past_key_values":
1011
+ past_key_value_type = kwargs.get("past_key_values")
1012
+ if seek_outputs[0][key] is not None:
1013
+ outputs[key] = tuple(
1014
+ tuple(
1015
+ torch.stack([v[key][i][j] for v in seek_outputs]).squeeze(1).to(device)
1016
+ for j in range(len(seek_outputs[0][key][0]))
1017
+ )
1018
+ for i in range(len(seek_outputs[0][key]))
1019
+ )
1020
+ if past_key_value_type is not None and isinstance(past_key_value_type, EncoderDecoderCache):
1021
+ outputs[key] = past_key_value_type.from_legacy_cache(outputs[key])
1022
+ else:
1023
+ outputs[key] = None
1024
+
1025
+ return model_output_type(**outputs)
1026
+
1027
+ def _need_fallback(
1028
+ self,
1029
+ seek_sequence,
1030
+ seek_outputs,
1031
+ index,
1032
+ logits_processor,
1033
+ generation_config,
1034
+ vocab_size,
1035
+ temperature,
1036
+ ):
1037
+ needs_fallback = False
1038
+ should_skip = False
1039
+ if generation_config.compression_ratio_threshold is not None:
1040
+ compression_ratio = self._retrieve_compression_ratio(seek_sequence, vocab_size)
1041
+
1042
+ if compression_ratio > generation_config.compression_ratio_threshold:
1043
+ needs_fallback = True
1044
+
1045
+ if generation_config.logprob_threshold is not None:
1046
+ if hasattr(seek_outputs[0], "sequences_scores"):
1047
+ logprobs = [s["sequences_scores"] for s in seek_outputs][index]
1048
+ else:
1049
+ scores = seek_outputs[index]["scores"]
1050
+ logprobs = self._retrieve_avg_logprobs(
1051
+ scores, seek_sequence, generation_config.eos_token_id, temperature
1052
+ )
1053
+
1054
+ if logprobs < generation_config.logprob_threshold:
1055
+ needs_fallback = True
1056
+
1057
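+ # note: a segment that looks like silence (no-speech probability above the threshold while the average logprob is below the logprob threshold) is skipped instead of retried with a higher temperature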
+ if generation_config.no_speech_threshold is not None:
1058
+ no_speech_prob = _get_attr_from_logit_processors(
1059
+ logits_processor, WhisperNoSpeechDetection, "no_speech_prob"
1060
+ )
1061
+
1062
+ if (
1063
+ logprobs < generation_config.logprob_threshold
1064
+ and no_speech_prob[index] > generation_config.no_speech_threshold
1065
+ ):
1066
+ needs_fallback = False
1067
+ should_skip = True
1068
+
1069
+ return needs_fallback, should_skip
1070
+
1071
+ def _expand_variables_for_generation(
1072
+ self, input_features, seek, max_frames, init_tokens, batch_size, condition_on_prev_tokens, generation_config
1073
+ ):
1074
+ if generation_config.num_return_sequences is not None and generation_config.num_return_sequences > 1:
1075
+ batch_idx_map = list(range(batch_size * generation_config.num_return_sequences))
1076
+ cur_bsz = len(batch_idx_map)
1077
+ do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(len(batch_idx_map))]
1078
+ input_features = input_features.repeat_interleave(generation_config.num_return_sequences, dim=0)
1079
+ seek = seek.repeat_interleave(generation_config.num_return_sequences, dim=0)
1080
+ max_frames = max_frames.repeat_interleave(generation_config.num_return_sequences, dim=0)
1081
+ init_tokens = init_tokens.repeat_interleave(generation_config.num_return_sequences, dim=0)
1082
+ generation_config.num_return_sequences = 1
1083
+ else:
1084
+ cur_bsz = batch_size
1085
+ batch_idx_map = list(range(cur_bsz))
1086
+ do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(cur_bsz)]
1087
+
1088
+ return (
1089
+ batch_idx_map,
1090
+ cur_bsz,
1091
+ input_features,
1092
+ seek,
1093
+ max_frames,
1094
+ init_tokens,
1095
+ do_condition_on_prev_tokens,
1096
+ )
1097
+
1098
+ @staticmethod
1099
+ def _setup_no_speech_detection(logits_processor, segment_input, decoder_input_ids, kwargs):
1100
+ set_inputs = _get_attr_from_logit_processors(logits_processor, WhisperNoSpeechDetection, "set_inputs")
1101
+ extra_kwargs = {k: v for k, v in kwargs.items() if torch.is_tensor(v)}
1102
+ set_inputs({"inputs": segment_input, "decoder_input_ids": decoder_input_ids, **extra_kwargs})
1103
+
1104
+ @staticmethod
1105
+ def _retrieve_total_input_frames(input_features, input_stride, kwargs):
1106
+ if input_features is not None:
1107
+ return input_features.shape[0], input_features.shape[-1]
1108
+
1109
+ if "encoder_outputs" in kwargs:
1110
+ encoder_outputs_shape = (
1111
+ kwargs["encoder_outputs"][0].shape
1112
+ if isinstance(kwargs["encoder_outputs"], BaseModelOutput)
1113
+ else kwargs["encoder_outputs"].shape
1114
+ )
1115
+ return encoder_outputs_shape[0], encoder_outputs_shape[1] * input_stride
1116
+
1117
+ raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.")
1118
+
1119
+ @staticmethod
1120
+ def _maybe_warn_unused_inputs(
1121
+ condition_on_prev_tokens,
1122
+ temperature,
1123
+ compression_ratio_threshold,
1124
+ logprob_threshold,
1125
+ no_speech_threshold,
1126
+ total_input_frames,
1127
+ ):
1128
+ warning_prefix = (
1129
+ f"Audio input consists of only {total_input_frames}. "
1130
+ "Short-form transcription is activated."
1131
+ "{}, but will be ignored."
1132
+ )
1133
+ if condition_on_prev_tokens is not None:
1134
+ logger.warning(warning_prefix.format(f"condition_on_prev_tokens is set to {condition_on_prev_tokens}"))
1135
+
1136
+ if compression_ratio_threshold is not None:
1137
+ logger.warning(
1138
+ warning_prefix.format(f"compression_ratio_threshold is set to {compression_ratio_threshold}")
1139
+ )
1140
+
1141
+ if logprob_threshold is not None:
1142
+ logger.warning(warning_prefix.format(f"logprob_threshold is set to {logprob_threshold}"))
1143
+
1144
+ if no_speech_threshold is not None:
1145
+ logger.warning(warning_prefix.format(f"no_speech_threshold is set to {no_speech_threshold}"))
1146
+
1147
+ # when passing temperature as a list it cannot just be ignored => throw error in this case
1148
+ if isinstance(temperature, (list, tuple)):
1149
+ raise ValueError(
1150
+ f"Audio input consists of only {total_input_frames}. Short-form transcription is activated."
1151
+ f"temperature cannot be set to {temperature} which can only be used for temperature fallback for long-form generation. Make sure to set `temperature` to a float value or `None` for short-form generation."
1152
+ )
1153
+
1154
+ @staticmethod
1155
+ def _set_return_outputs(return_dict_in_generate, return_token_timestamps, logprob_threshold, generation_config):
1156
+ if return_dict_in_generate is None:
1157
+ return_dict_in_generate = generation_config.return_dict_in_generate
1158
+ else:
1159
+ generation_config.return_dict_in_generate = return_dict_in_generate
1160
+
1161
+ generation_config.return_token_timestamps = return_token_timestamps
1162
+ if return_token_timestamps:
1163
+ generation_config.return_dict_in_generate = True
1164
+ generation_config.output_attentions = True
1165
+ generation_config.output_scores = True
1166
+
1167
+ if logprob_threshold is not None:
1168
+ generation_config.return_dict_in_generate = True
1169
+ generation_config.output_scores = True
1170
+
1171
+ return return_dict_in_generate
1172
+
1173
+ def _set_return_timestamps(self, return_timestamps, is_shortform, generation_config):
1174
+ if return_timestamps is None and hasattr(generation_config, "return_timestamps"):
1175
+ return_timestamps = generation_config.return_timestamps
1176
+
1177
+ if not is_shortform:
1178
+ if return_timestamps is False:
1179
+ raise ValueError(
1180
+ "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which "
1181
+ "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features."
1182
+ )
1183
+
1184
+ logger.info("Setting `return_timestamps=True` for long-form generation.")
1185
+ return_timestamps = True
1186
+
1187
+ if return_timestamps and not hasattr(generation_config, "no_timestamps_token_id"):
1188
+ raise ValueError(
1189
+ "You are trying to return timestamps, but the generation config is not properly set. "
1190
+ "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. "
1191
+ "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363"
1192
+ )
1193
+
1194
+ generation_config.return_timestamps = return_timestamps
1195
+
1196
+ if hasattr(generation_config, "no_timestamps_token_id"):
1197
+ timestamp_begin = generation_config.no_timestamps_token_id + 1
1198
+ else:
1199
+ # BC for models missing the `no_timestamps_token_id` in the generation config when generating short-form with no timestamps
1200
+ # We set the timestamp begin token larger than the vocab size, such that the timestamp condition is never met in the decoding loop
1201
+ timestamp_begin = self.config.vocab_size + 1
1202
+
1203
+ return timestamp_begin
1204
+
1205
+ @staticmethod
1206
+ def _set_language_and_task(language, task, is_multilingual, generation_config):
1207
+ if is_multilingual is not None:
1208
+ if not hasattr(generation_config, "is_multilingual"):
1209
+ raise ValueError(
1210
+ "The generation config is outdated and is thus not compatible with the `is_multilingual` argument "
1211
+ "to `generate`. Please update the generation config as per the instructions "
1212
+ "https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224"
1213
+ )
1214
+ generation_config.is_multilingual = is_multilingual
1215
+
1216
+ if hasattr(generation_config, "is_multilingual") and not generation_config.is_multilingual:
1217
+ if task is not None or language is not None:
1218
+ raise ValueError(
1219
+ "Cannot specify `task` or `language` for an English-only model. If the model is intended to be "
1220
+ "multilingual, pass `is_multilingual=True` to generate, or update the generation config."
1221
+ )
1222
+
1223
+ if language is not None:
1224
+ if not hasattr(generation_config, "lang_to_id"):
1225
+ raise ValueError(
1226
+ "The generation config is outdated and is thus not compatible with the `language` argument "
1227
+ "to `generate`. Either set the language using the `forced_decoder_ids` in the model config, "
1228
+ "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224"
1229
+ )
1230
+ generation_config.language = language
1231
+
1232
+ if task is not None:
1233
+ if not hasattr(generation_config, "task_to_id"):
1234
+ raise ValueError(
1235
+ "The generation config is outdated and is thus not compatible with the `task` argument "
1236
+ "to `generate`. Either set the task using the `forced_decoder_ids` in the model config, "
1237
+ "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224"
1238
+ )
1239
+ generation_config.task = task
1240
+
1241
+ def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs):
1242
+ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]):
1243
+ """short function to replace num with a itr in lst"""
1244
+ found = any(i in lst for i in itr)
1245
+ if found:
1246
+ lst = [num if i in itr else i for i in lst]
1247
+ else:
1248
+ lst.append(num)
1249
+ return lst
1250
+
1251
+ def language_to_id(language: str) -> int:
1252
+ language = language.lower()
1253
+ if language in generation_config.lang_to_id.keys():
1254
+ language_token = language
1255
+ elif language in TO_LANGUAGE_CODE.keys():
1256
+ language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
1257
+ elif language in TO_LANGUAGE_CODE.values():
1258
+ language_token = f"<|{language}|>"
1259
+ else:
1260
+ is_language_code = len(language) == 2
1261
+ raise ValueError(
1262
+ f"Unsupported language: {language}. Language should be one of:"
1263
+ f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
1264
+ )
1265
+ if language_token not in generation_config.lang_to_id:
1266
+ raise ValueError(
1267
+ f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`."
1268
+ "(You should just add it to the generation config)"
1269
+ )
1270
+
1271
+ return generation_config.lang_to_id[language_token]
1272
+
1273
+ task = getattr(generation_config, "task", None)
1274
+ language = getattr(generation_config, "language", None)
1275
+
1276
+ forced_decoder_ids = generation_config.forced_decoder_ids
1277
+ if forced_decoder_ids is not None:
1278
+ if language is None and task is None and forced_decoder_ids[0][1] is None:
1279
+ logger.warning_once(
1280
+ "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English."
1281
+ "This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`."
1282
+ )
1283
+ elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None:
1284
+ forced_decoder_ids = config.forced_decoder_ids
1285
+
1286
+ if forced_decoder_ids is not None and task is not None:
1287
+ logger.warning_once(
1288
+ f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}."
1289
+ )
1290
+ forced_decoder_ids = None
1291
+ elif forced_decoder_ids is not None and language is not None:
1292
+ logger.warning_once(
1293
+ f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}."
1294
+ )
1295
+ forced_decoder_ids = None
1296
+
1297
+ init_tokens = [generation_config.decoder_start_token_id]
1298
+ if forced_decoder_ids is not None and forced_decoder_ids[0][0] == 1:
1299
+ i = 1
1300
+ while len(forced_decoder_ids) > 0 and forced_decoder_ids[0][0] == i:
1301
+ init_tokens += [forced_decoder_ids[0][1]]
1302
+ forced_decoder_ids = forced_decoder_ids[1:]
1303
+ i += 1
1304
+
1305
+ if len(forced_decoder_ids) > 0:
1306
+ raise ValueError(
1307
+ f"You are using token ids in `forced_decoder_ids` that do not seem to correctly follow the prompt pattern of Whisper. Make sure that {forced_decoder_ids} has an entry for all indices >= 1 and < {forced_decoder_ids[0][0]}.",
1308
+ )
1309
+
1310
+ # from v4.39 the forced decoder ids are always None in favour of decoder input ids
1311
+ generation_config.forced_decoder_ids = None
1312
+
1313
+ is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None)
1314
+
1315
+ # Make sure language is a list of strings of the correct length
1316
+ if isinstance(language, (list, tuple)):
1317
+ if any(l is None for l in language):
1318
+ raise TypeError(
1319
+ "Expected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`."
1320
+ )
1321
+ if len(language) != batch_size:
1322
+ raise ValueError(
1323
+ "When passing a list of languages, the length of the list must match the batch size. "
1324
+ f"Expected length of {batch_size}, but got {len(language)} languages."
1325
+ )
1326
+ languages = language
1327
+ elif language is None:
1328
+ # Language will be detected for each item in batch
1329
+ languages = [None] * batch_size
1330
+ else:
1331
+ languages = [language] # Use a length-1 list now, broadcast later
1332
+
1333
+ # Separate init_tokens for each language
1334
+ init_tokens = [copy.copy(init_tokens) for _ in languages]
1335
+
1336
+ # Update init_tokens with languages
1337
+ lang_ids = None
1338
+ if language is not None:
1339
+ lang_ids = [language_to_id(l) for l in languages]
1340
+ elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined:
1341
+ # language is not defined or intentionally set to `None` to trigger language detection
1342
+ lang_ids = self.detect_language(
1343
+ input_features=input_features,
1344
+ encoder_outputs=kwargs.get("encoder_outputs", None),
1345
+ attention_mask=kwargs.get("attention_mask", None),
1346
+ generation_config=generation_config,
1347
+ num_segment_frames=num_segment_frames,
1348
+ ).tolist()
1349
+ if lang_ids is not None:
1350
+ # append or replace lang_ids to init_tokens
1351
+ for i in range(len(init_tokens)):
1352
+ if len(init_tokens[i]) > 1:
1353
+ init_tokens[i][1] = lang_ids[i]
1354
+ else:
1355
+ init_tokens[i].append(lang_ids[i])
1356
+ del languages
1357
+
1358
+ # Update init_tokens with task
1359
+ for i in range(len(init_tokens)):
1360
+ if task is not None:
1361
+ if task in TASK_IDS:
1362
+ init_tokens[i].append(generation_config.task_to_id[generation_config.task])
1363
+ task_id = generation_config.task_to_id[generation_config.task]
1364
+
1365
+ # if task is defined it'll overwrite task ids that might have already been defined via the generation_config
1366
+ replace_or_add(init_tokens[i], task_id, generation_config.task_to_id.values())
1367
+ else:
1368
+ raise ValueError(f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`")
1369
+ elif language is not None and hasattr(generation_config, "task_to_id"):
1370
+ # if language is defined, but no task id is in `init_tokens`, default to transcribe
1371
+ if not any(ti in init_tokens[i] for ti in generation_config.task_to_id.values()):
1372
+ init_tokens[i].append(generation_config.task_to_id["transcribe"])
1373
+
1374
+ if (
1375
+ not generation_config.return_timestamps
1376
+ and hasattr(generation_config, "no_timestamps_token_id")
1377
+ and init_tokens[i][-1] != generation_config.no_timestamps_token_id
1378
+ ):
1379
+ init_tokens[i].append(generation_config.no_timestamps_token_id)
1380
+ elif (
1381
+ generation_config.return_timestamps and init_tokens[i][-1] == generation_config.no_timestamps_token_id
1382
+ ):
1383
+ logger.info(
1384
+ "<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `'True'`."
1385
+ )
1386
+ init_tokens[i] = init_tokens[i][:-1]
1387
+
1388
+ # let's make sure we don't pass `None` tokens as prompt tokens
1389
+ init_tokens[i] = [t for t in init_tokens[i] if t is not None]
1390
+
1391
+ return torch.as_tensor(init_tokens, dtype=torch.long, device=self.device).expand(batch_size, -1)
1392
+
1393
+ def detect_language(
1394
+ self,
1395
+ input_features: Optional[torch.FloatTensor] = None,
1396
+ attention_mask: Optional[torch.LongTensor] = None,
1397
+ encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None,
1398
+ generation_config: Optional[GenerationConfig] = None,
1399
+ num_segment_frames: int = 3000,
1400
+ ) -> torch.Tensor:
1401
+ """
1402
+ Detects language from log-mel input features or encoder_outputs
1403
+
1404
+ Parameters:
1405
+ input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
1406
+ Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
1407
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
1408
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
1409
+ [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
1410
+ tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
1411
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
1412
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
1413
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
1414
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
1415
+ generation_config (`~generation.GenerationConfig`, *optional*):
1416
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
1417
+ passed to generate matching the attributes of `generation_config` will override them. If
1418
+ `generation_config` is not provided, the default will be used, which has the following loading
1419
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
1420
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
1421
+ default values, whose documentation should be checked to parameterize generation.
1422
+ num_segment_frames (`int`, *optional*, defaults to 3000):
1423
+ The number of log-mel frames the model expects
1424
+
1425
+ Return:
1426
+ A `torch.LongTensor` representing the detected language ids.
1427
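+
+ Example (a minimal sketch; assumes a multilingual checkpoint and a raw 16 kHz waveform `audio_array`):
+
+ ```python
+ >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+
+ >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
+ >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+
+ >>> input_features = processor(audio_array, sampling_rate=16_000, return_tensors="pt").input_features
+ >>> lang_token_ids = model.detect_language(input_features=input_features)
+ >>> # map the predicted ids back to language tokens such as "<|en|>"
+ >>> lang_tokens = processor.tokenizer.convert_ids_to_tokens(lang_token_ids.tolist())
+ ```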
+ """
1428
+ if input_features is None and encoder_outputs is None:
1429
+ raise ValueError("You have to specify either `input_features` or `encoder_outputs`")
1430
+ elif input_features is not None and encoder_outputs is not None:
1431
+ raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!")
1432
+ elif input_features is not None:
1433
+ inputs = {"input_features": input_features[:, :, :num_segment_frames]}
1434
+ batch_size = input_features.shape[0]
1435
+ elif encoder_outputs is not None:
1436
+ inputs = {"encoder_outputs": encoder_outputs}
1437
+ batch_size = (
1438
+ encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0]
1439
+ )
1440
+ if attention_mask is not None:
1441
+ inputs["attention_mask"] = attention_mask
1442
+
1443
+ generation_config = generation_config or self.generation_config
1444
+ decoder_input_ids = (
1445
+ torch.ones((batch_size, 1), device=self.device, dtype=torch.long)
1446
+ * generation_config.decoder_start_token_id
1447
+ )
1448
+
1449
+ with torch.no_grad():
1450
+ logits = self(**inputs, decoder_input_ids=decoder_input_ids).logits[:, -1]
1451
+
1452
+ non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool)
1453
+ non_lang_mask[list(generation_config.lang_to_id.values())] = False
1454
+
1455
+ logits[:, non_lang_mask] = -np.inf
1456
+
1457
+ lang_ids = logits.argmax(-1)
1458
+
1459
+ return lang_ids
1460
+
1461
+ @staticmethod
1462
+ def _check_decoder_input_ids(kwargs):
1463
+ decoder_input_ids = kwargs.get("decoder_input_ids", None)
1464
+ assistant_model = kwargs.get("assistant_model", None)
1465
+ if decoder_input_ids is not None and assistant_model is not None:
1466
+ raise ValueError(
1467
+ "Passing `decoder_input_ids` is deprecated. Consider passing `prompt_ids` instead.",
1468
+ )
1469
+
1470
+ @staticmethod
1471
+ def _set_num_frames(return_token_timestamps, generation_config, kwargs):
1472
+ if return_token_timestamps:
1473
+ if getattr(generation_config, "task", None) == "translate":
1474
+ logger.warning("Token-level timestamps may not be reliable for task 'translate'.")
1475
+ if not hasattr(generation_config, "alignment_heads"):
1476
+ raise ValueError(
1477
+ "Model generation config has no `alignment_heads`, token-level timestamps not available. "
1478
+ "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config."
1479
+ )
1480
+ generation_config.num_frames = kwargs.pop("num_frames", None)
1481
+
1482
+ @staticmethod
1483
+ def _set_thresholds_and_condition(
1484
+ generation_config,
1485
+ logprob_threshold,
1486
+ compression_ratio_threshold,
1487
+ no_speech_threshold,
1488
+ condition_on_prev_tokens,
1489
+ ):
1490
+ generation_config.logprob_threshold = (
1491
+ logprob_threshold
1492
+ if logprob_threshold is not None
1493
+ else getattr(generation_config, "logprob_threshold", None)
1494
+ )
1495
+ generation_config.compression_ratio_threshold = (
1496
+ compression_ratio_threshold
1497
+ if compression_ratio_threshold is not None
1498
+ else getattr(generation_config, "compression_ratio_threshold", None)
1499
+ )
1500
+ generation_config.no_speech_threshold = (
1501
+ no_speech_threshold
1502
+ if no_speech_threshold is not None
1503
+ else getattr(generation_config, "no_speech_threshold", None)
1504
+ )
1505
+ generation_config.condition_on_prev_tokens = (
1506
+ condition_on_prev_tokens
1507
+ if condition_on_prev_tokens is not None
1508
+ else getattr(generation_config, "condition_on_prev_tokens", None)
1509
+ )
1510
+
1511
+ @staticmethod
1512
+ def _set_prompt_condition_type(generation_config, prompt_condition_type):
1513
+ allowed_cond_types = ["first-segment", "all-segments"]
1514
+
1515
+ # default to "first-segment"
1516
+ prompt_condition_type = prompt_condition_type or allowed_cond_types[0]
1517
+
1518
+ if prompt_condition_type not in allowed_cond_types:
1519
+ raise ValueError(
1520
+ f"`prompt_condition_type={prompt_condition_type} does not exist. Make sure to set `prompt_condition_type` to one of {', '.join(allowed_cond_types)}"
1521
+ )
1522
+
1523
+ if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments":
1524
+ raise ValueError(
1525
+ "Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`."
1526
+ )
1527
+
1528
+ generation_config.prompt_condition_type = prompt_condition_type
1529
+
1530
+ @staticmethod
1531
+ def _set_condition_on_prev_tokens(condition_on_prev_tokens, generation_config):
1532
+ condition_on_prev_tokens = (
1533
+ condition_on_prev_tokens
1534
+ if condition_on_prev_tokens is not None
1535
+ else getattr(generation_config, "condition_on_prev_tokens", False)
1536
+ )
1537
+ generation_config.condition_on_prev_tokens = condition_on_prev_tokens
1538
+
1539
+ @staticmethod
1540
+ def _retrieve_max_frames_and_seek(batch_size, attention_mask, total_input_frames, is_shortform):
1541
+ if batch_size > 1 and not is_shortform and attention_mask is None:
1542
+ raise ValueError(
1543
+ "When doing batched long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` "
1544
+ )
1545
+ elif batch_size > 1 and not is_shortform:
1546
+ max_frames = attention_mask.sum(-1).cpu().to(torch.long)
1547
+ seek = torch.zeros((batch_size,), dtype=torch.long)
1548
+ else:
1549
+ max_frames = torch.ones((batch_size,), dtype=torch.long) * total_input_frames
1550
+ seek = torch.zeros((batch_size,), dtype=torch.long)
1551
+
1552
+ return max_frames, seek
1553
+
1554
+ def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, num_beams, device):
1555
+ if generation_config.return_timestamps is True:
1556
+ timestamp_processor = WhisperTimeStampLogitsProcessor(generation_config, begin_index=begin_index)
1557
+ logits_processor = (
1558
+ [timestamp_processor] if logits_processor is None else [timestamp_processor] + logits_processor
1559
+ )
1560
+
1561
+ if generation_config.suppress_tokens is not None:
1562
+ suppress_tokens_processor = SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device)
1563
+ logits_processor = (
1564
+ [suppress_tokens_processor]
1565
+ if logits_processor is None
1566
+ else [suppress_tokens_processor] + logits_processor
1567
+ )
1568
+ generation_config.suppress_tokens = None
1569
+
1570
+ if generation_config.begin_suppress_tokens is not None:
1571
+ begin_suppress_processor = SuppressTokensAtBeginLogitsProcessor(
1572
+ generation_config.begin_suppress_tokens, begin_index=begin_index, device=device
1573
+ )
1574
+ logits_processor = (
1575
+ [begin_suppress_processor]
1576
+ if logits_processor is None
1577
+ else [begin_suppress_processor] + logits_processor
1578
+ )
1579
+ generation_config.begin_suppress_tokens = None
1580
+
1581
+ if generation_config.no_speech_threshold is not None:
1582
+ no_speech_detector = WhisperNoSpeechDetection(
1583
+ no_speech_token=generation_config.no_timestamps_token_id - 1,
1584
+ begin_index=begin_index,
1585
+ scores_is_logprobs=num_beams > 1,
1586
+ )
1587
+ logits_processor = (
1588
+ [no_speech_detector] if logits_processor is None else [no_speech_detector] + logits_processor
1589
+ )
1590
+ no_speech_detector.set_model(self)
1591
+
1592
+ return logits_processor
1593
+
1594
+ @staticmethod
1595
+ def _maybe_reduce_batch(input_features, seek, max_frames, cur_bsz, batch_idx_map):
1596
+ prev_bsz = cur_bsz
1597
+ new_batch_idx_map = []
1598
+ for i in range(prev_bsz):
1599
+ prev_i = batch_idx_map[i]
1600
+ if seek[prev_i] >= max_frames[prev_i]:
1601
+ cut_index = i + (cur_bsz - prev_bsz)
1602
+ cur_bsz -= 1
1603
+ input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0)
1604
+ else:
1605
+ # keep indices of audios that are not yet fully decoded
1606
+ new_batch_idx_map.append(prev_i)
1607
+
1608
+ return input_features, cur_bsz, new_batch_idx_map
1609
+
1610
+ @staticmethod
1611
+ def _get_input_segment(input_features, seek, seek_num_frames, num_segment_frames, cur_bsz, batch_idx_map):
1612
+ if input_features is None:
1613
+ return None
1614
+
1615
+ segment_input = []
1616
+ for i in range(cur_bsz):
1617
+ prev_i = batch_idx_map[i]
1618
+ segment_input_slice = input_features[i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i]]
1619
+
1620
+ if segment_input_slice.shape[-1] < num_segment_frames:
1621
+ # pad to num_segment_frames (3000 mel frames, i.e. 30 s) if necessary
1622
+ segment_input_slice = F.pad(
1623
+ segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1])
1624
+ )
1625
+
1626
+ segment_input.append(segment_input_slice)
1627
+
1628
+ segment_input = torch.cat(segment_input, dim=0)
1629
+
1630
+ return segment_input
1631
+
1632
+ @staticmethod
1633
+ def _prepare_decoder_input_ids(
1634
+ cur_bsz,
1635
+ init_tokens,
1636
+ current_segments,
1637
+ batch_idx_map,
1638
+ do_condition_on_prev_tokens,
1639
+ prompt_ids,
1640
+ generation_config,
1641
+ config,
1642
+ device,
1643
+ suppress_tokens,
1644
+ kwargs,
1645
+ ):
1646
+ if "decoder_input_ids" in kwargs:
1647
+ decoder_input_ids = kwargs.pop("decoder_input_ids")
1648
+
1649
+ return decoder_input_ids, kwargs
1650
+
1651
+ cut_off_length = config.max_target_positions // 2 - 1
1652
+
1653
+ decoder_input_ids = init_tokens[batch_idx_map]
1654
+
1655
+ prev_start_of_text = getattr(generation_config, "prev_sot_token_id", None)
1656
+ if prev_start_of_text is None:
1657
+ prev_start_of_text = suppress_tokens[-2] if suppress_tokens is not None else None
1658
+
1659
+ if any(do_condition_on_prev_tokens) and len(current_segments[0]) > 0:
1660
+ # according to https://github.com/openai/whisper/blob/e58f28804528831904c3b6f2c0e473f346223433/whisper/decoding.py#L609
1661
+ active_segments = [current_segments[i] if do_condition_on_prev_tokens[i] else None for i in batch_idx_map]
1662
+
1663
+ if prompt_ids is not None and generation_config.prompt_condition_type == "all-segments":
1664
+ prev_ids = prompt_ids
1665
+ else:
1666
+ one_tensor = torch.ones((cur_bsz, 1), device=device, dtype=torch.long)
1667
+ prev_ids = prev_start_of_text * one_tensor[0] if prev_start_of_text is not None else None
1668
+
1669
+ padding = "max_length" if generation_config.cache_implementation == "static" else "longest"
1670
+
1671
+ prev_tokens = _pad_to_max_length(
1672
+ active_segments,
1673
+ generation_config.pad_token_id,
1674
+ device=device,
1675
+ padding_side="left",
1676
+ padding=padding,
1677
+ bos_token_tensor=prev_ids,
1678
+ cut_off_length=cut_off_length,
1679
+ )
1680
+ decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1)
1681
+
1682
+ kwargs["decoder_attention_mask"] = decoder_input_ids != generation_config.pad_token_id
1683
+ elif prompt_ids is not None:
1684
+ prev_tokens = prompt_ids[None].repeat(decoder_input_ids.shape[0], 1)
1685
+ decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1)
1686
+ # make sure `"decoder_attention_mask"` is not passed to forward
1687
+ kwargs.pop("decoder_attention_mask", None)
1688
+ else:
1689
+ # make sure `"decoder_attention_mask"` is not passed to forward
1690
+ kwargs.pop("decoder_attention_mask", None)
1691
+
1692
+ return decoder_input_ids, kwargs
1693
+
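To visualize the conditioned decoder prompt produced above, here is a rough sketch; the token IDs are invented for illustration, and 448 is used as a stand-in for `config.max_target_positions`.

# hypothetical token IDs, purely illustrative
prev_sot, sot = 50361, 50258
prev_segment_tokens = [105, 106, 107]     # tokens decoded for the previous window
init_tokens = [sot, 50259, 50359]         # start-of-transcript + language/task tokens

cut_off_length = 448 // 2 - 1             # config.max_target_positions // 2 - 1
decoder_input_ids = [prev_sot] + prev_segment_tokens[-cut_off_length:] + init_tokens
# everything left of `sot` is context only; the cut-off keeps the prompt short enough
# that the newly generated segment still fits within max_target_positions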
1694
+ def _set_max_new_tokens_and_length(self, config, decoder_input_ids, generation_config):
1695
+ max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0
1696
+ if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions:
1697
+ raise ValueError(
1698
+ f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
1699
+ f"is {max_new_tokens}. Thus, the combined length of "
1700
+ f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
1701
+ f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. "
1702
+ "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
1703
+ f"so that their combined length is less than {self.config.max_target_positions}."
1704
+ )
1705
+
1706
+ num_initial_tokens = min(config.max_target_positions // 2 - 1, decoder_input_ids.shape[-1] - 1)
1707
+
1708
+ # Make sure we don't get larger than `max_length`
1709
+ if generation_config.max_length is not None and generation_config.max_new_tokens is None:
1710
+ max_length = min(generation_config.max_length + num_initial_tokens, config.max_target_positions)
1711
+ logger.info(
1712
+ f"Increase max_length from {generation_config.max_length} to {max_length} since input is conditioned on previous segment."
1713
+ )
1714
+ elif (
1715
+ generation_config.max_new_tokens is not None
1716
+ and generation_config.max_new_tokens + decoder_input_ids.shape[-1] > config.max_target_positions
1717
+ ):
1718
+ max_new_tokens = config.max_target_positions - decoder_input_ids.shape[-1]
1719
+ generation_config.max_new_tokens = max_new_tokens
1720
+
1721
+ @staticmethod
1722
+ def _retrieve_compression_ratio(tokens, vocab_size):
1723
+ """Compute byte length of zlib compressed token bytes vs. byte length of raw token bytes"""
1724
+ length = int(math.log2(vocab_size) / 8) + 1
1725
+ token_bytes = b"".join([t.to_bytes(length, "little") for t in tokens.tolist()])
1726
+ compression_ratio = len(token_bytes) / len(zlib.compress(token_bytes))
1727
+
1728
+ return compression_ratio
1729
+
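A rough standalone sketch of how this compression-ratio heuristic flags degenerate, repetitive output (the vocabulary size below is arbitrary; the real value comes from the model config):

import math, random, zlib

def compression_ratio(token_ids, vocab_size=51866):
    length = int(math.log2(vocab_size) / 8) + 1                      # bytes per token id
    raw = b"".join(t.to_bytes(length, "little") for t in token_ids)
    return len(raw) / len(zlib.compress(raw))

random.seed(0)
varied = [random.randrange(51866) for _ in range(100)]
print(compression_ratio(varied))          # varied tokens barely compress -> ratio stays around 1
print(compression_ratio([17, 42] * 100))  # looping output compresses well -> ratio far above the usual 2.4 threshold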
1730
+ @staticmethod
1731
+ def _retrieve_avg_logprobs(scores, tokens, eos_token_id, temperature):
1732
+ rescale_temperature = temperature if temperature > 0.0 else 1
1733
+ scores = torch.stack(scores).to(tokens.device)
1734
+
1735
+ if scores.shape[0] > tokens.shape[0]:
1736
+ scores = scores[: tokens.shape[0]]
1737
+ else:
1738
+ tokens = tokens[-scores.shape[0] :]
1739
+
1740
+ logprobs = F.log_softmax((scores * rescale_temperature).float(), dim=-1).to(scores.dtype)
1741
+
1742
+ # retrieve logprob of selected tokens and sum
1743
+ sum_logprobs = sum((logprobs[i][tokens[i]] * (tokens[i] != eos_token_id)) for i in range(logprobs.shape[0]))
1744
+ length = (tokens != eos_token_id).sum(-1) if eos_token_id is not None else tokens.shape[0]
1745
+
1746
+ avg_logprobs = sum_logprobs / (length + 1)
1747
+ return avg_logprobs
1748
+
1749
+ @staticmethod
1750
+ def _retrieve_segment(
1751
+ seek_sequence,
1752
+ seek_outputs,
1753
+ time_offset,
1754
+ timestamp_begin,
1755
+ seek_num_frames,
1756
+ time_precision,
1757
+ input_stride,
1758
+ prev_idx,
1759
+ idx,
1760
+ return_token_timestamps,
1761
+ ):
1762
+ # find Whisper's predicted "end of segment" positions
1763
+ # "end of segment" predictions occur whenever Whisper predicts a timestamp token
1764
+ timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
1765
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
1766
+ timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
1767
+ timestamp_segment_indices.add_(1)
1768
+ token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []
1769
+
1770
+ # If Whisper predicted an "end of segment" via a timestamp token, go over each
1771
+ # "end of segment" prediction and slice the decoding into segments accordingly
1772
+ if len(timestamp_segment_indices) > 0:
1773
+ # if the output contains two consecutive timestamp tokens
1774
+ slices = timestamp_segment_indices.tolist()
1775
+ segments = []
1776
+ if single_timestamp_ending:
1777
+ slices.append(len(seek_sequence))
1778
+
1779
+ last_slice = 0
1780
+ # Add each segment to list of all segments
1781
+ for current_slice in slices:
1782
+ sliced_tokens = seek_sequence[last_slice:current_slice]
1783
+ start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
1784
+ end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
1785
+ segments.append(
1786
+ {
1787
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
1788
+ "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
1789
+ "tokens": sliced_tokens,
1790
+ "result": seek_outputs[idx],
1791
+ }
1792
+ )
1793
+ if return_token_timestamps:
1794
+ segments[-1]["token_timestamps"] = (
1795
+ token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
1796
+ )
1797
+ last_slice = current_slice
1798
+
1799
+ if single_timestamp_ending:
1800
+ # single timestamp at the end means no speech after the last timestamp.
1801
+ segment_offset = seek_num_frames[prev_idx]
1802
+ else:
1803
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
1804
+ # here we throw away all predictions after the last predicted "end of segment"
1805
+ # since we are cutting right in the middle of the audio
1806
+ last_timestamp_pos = seek_sequence[last_slice - 1].item() - timestamp_begin
1807
+ segment_offset = last_timestamp_pos * input_stride
1808
+ else:
1809
+ # If whisper does not predict any "end of segment" token, then
1810
+ # the whole decoding is considered a segment and we add it to the list of segments
1811
+ timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
1812
+ last_timestamp_pos = seek_num_frames[prev_idx]
1813
+ if timestamps.numel() > 0 and timestamps[-1].item() != timestamp_begin:
1814
+ # no consecutive timestamps but it has a timestamp; use the last one.
1815
+ last_timestamp_pos = timestamps[-1].item() - timestamp_begin
1816
+ segments = [
1817
+ {
1818
+ "start": time_offset[prev_idx],
1819
+ "end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
1820
+ "tokens": seek_sequence,
1821
+ "result": seek_outputs[idx],
1822
+ }
1823
+ ]
1824
+ if return_token_timestamps:
1825
+ segments[-1]["token_timestamps"] = token_timestamps + time_offset[prev_idx]
1826
+ segment_offset = seek_num_frames[prev_idx]
1827
+
1828
+ return segments, segment_offset
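To make the timestamp arithmetic in `_retrieve_segment` concrete, a small standalone sketch; the token IDs, `timestamp_begin`, and window offset below are illustrative, while 0.02 s per timestamp step follows Whisper's convention.

timestamp_begin, time_precision, window_offset = 50364, 0.02, 30.0   # illustrative values

# decoded window: <|0.00|> hello world <|1.50|> <|1.50|> again <|3.00|>
seek_sequence = [50364, 101, 102, 50439, 50439, 103, 50514]

# two consecutive timestamp tokens (indices 3 and 4) mark an "end of segment"
start = window_offset + (seek_sequence[0] - timestamp_begin) * time_precision   # 30.0 s
end = window_offset + (seek_sequence[3] - timestamp_begin) * time_precision     # 31.5 s
print(f"segment 1: {start:.2f}s -> {end:.2f}s")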
models/glm_speech_tokenizer/modeling_whisper.py ADDED
The diff for this file is too large to render. See raw diff
 
models/glm_speech_tokenizer/speech_token_extractor.py ADDED
@@ -0,0 +1,126 @@
1
+ import os
2
+ import sys
3
+ sys.path.append("../../..")
4
+ import io
5
+ import glob
6
+ import math
7
+ import tarfile
8
+ import torch
9
+ import torchaudio
10
+ import safetensors
11
+ from .configuration_whisper import WhisperVQConfig
12
+ from .modeling_whisper import WhisperVQEncoder, WhisperVQForConditionalGeneration
13
+ from transformers import WhisperFeatureExtractor, WhisperTokenizerFast
14
+ import asyncio
15
+ from .batch_processor import AsyncBatchEngine # adjust this import path to your layout
16
+ from typing import List, Union, Tuple, Literal, Optional
17
+
18
+
19
+ class SpeechTokenExtractor:
20
+ def __init__(
21
+ self,
22
+ model: WhisperVQEncoder,
23
+ feature_extractor: WhisperFeatureExtractor,
24
+ device: Literal["cpu", "cuda", "mps"] | str = "cuda",
25
+ batch_size: int = 32,
26
+ wait_timeout: float = 0.01,
27
+ ):
28
+ self.model = model.eval().to(device)
29
+ self.feature_extractor = feature_extractor
30
+ self.device = device
31
+ self.wait_timeout = wait_timeout
32
+ self.dtype = next(model.parameters()).dtype
33
+
34
+ # frame/sample stride (used for pad alignment & attention-mask downsampling)
35
+ self.pooling_kernel_size = getattr(model.config, "pooling_kernel_size", 1)
36
+ self.frame_stride = (
37
+ model.conv1.stride[0] *
38
+ model.conv2.stride[0] *
39
+ self.pooling_kernel_size
40
+ )
41
+ self.sample_stride = self.frame_stride * feature_extractor.hop_length
42
+
43
+ # resampler cache (kept on the device)
44
+ self._resamplers: dict[int, torchaudio.transforms.Resample] = {}
45
+
46
+ self._batch_processor = AsyncBatchEngine(
47
+ processing_function=self._batch_extract_async,
48
+ batch_size=batch_size,
49
+ wait_timeout=wait_timeout,
50
+ )
51
+
52
+ # -------- I/O & resampling: keep everything on the device --------
53
+ def _load_audio(self, utt: Union[str, torch.Tensor]) -> torch.Tensor:
54
+ """读取单条音频 -> 1D float32 waveform(在 self.device 上,采样率16k)。"""
55
+ # print(f"audio type is {type(utt)}")
56
+ if isinstance(utt, torch.Tensor):
57
+ # audio, sr = utt
58
+ audio = utt.to(self.device, non_blocking=True)
59
+ else:
60
+ audio, sr = torchaudio.load(utt) # CPU
61
+ if audio.ndim > 1 and audio.size(0) > 1: # downmix to mono
62
+ audio = audio.mean(dim=0, keepdim=True)
63
+ audio = audio.squeeze(0).to(torch.float32).to(self.device, non_blocking=True)
64
+
65
+ return audio # [T] on device
66
+
67
+ # -------- run the feature_extractor on the GPU --------
68
+ def _extract_features_gpu(self, audios: List[torch.Tensor]) -> dict:
69
+ """
70
+ 1) Convert the inputs to CPU numpy float32 (required by the feature extractor)
71
+ 2) Call the feature extractor with device=self.device so the output tensors land directly on the GPU
72
+ 3) If the model is fp16, cast only input_features to half (leave the mask untouched)
73
+ """
74
+ # 1) CUDA/CPU Tensor -> CPU numpy
75
+ np_audios = [a.detach().cpu().numpy().astype("float32") for a in audios]
76
+
77
+
78
+ feats = self.feature_extractor(
79
+ np_audios,
80
+ sampling_rate=16000,
81
+ return_attention_mask=True,
82
+ return_tensors="pt",
83
+ device=self.device, # keep the extracted features on the target device
84
+ padding="longest",
85
+ pad_to_multiple_of=self.sample_stride,
86
+ )
87
+
88
+ feats = {k: (v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v)
89
+ for k, v in feats.items()}
90
+
91
+ # 3) match the model's half precision (input_features only)
92
+ if self.dtype == torch.float16 and "input_features" in feats:
93
+ feats["input_features"] = feats["input_features"].half()
94
+
95
+ return feats
96
+
97
+
98
+ def _forward(self, feats: dict) -> List[List[int]]:
99
+ outputs = self.model(**feats)
100
+ tokens = outputs.quantized_token_ids
101
+ # downsample the attention mask to match: conv downsampling × pooling
102
+ attn = feats["attention_mask"][
103
+ :, :: self.model.conv1.stride[0] * self.model.conv2.stride[0]
104
+ ][:, :: self.pooling_kernel_size]
105
+ return [t[m.bool()].tolist() for t, m in zip(tokens, attn)]
106
+
107
+ # -------- synchronous batch interface --------
108
+ def extract(self, utts: List[Union[str, torch.Tensor]]) -> List[List[int]]:
109
+ """
110
+ No 30 s chunking and no micro-batching.
111
+ Straight pipeline: load/resample -> GPU feature extraction -> forward -> mask-aligned output.
112
+ """
113
+ audios = [self._load_audio(u) for u in utts] # list[Tensor(T)] on device
114
+ with torch.inference_mode():
115
+ feats = self._extract_features_gpu(audios) # on device
116
+ return self._forward(feats)
117
+
118
+ # -------- asynchronous batch interface (keeps the original return protocol) --------
119
+ async def _batch_extract_async(self, utts: List[Union[str, torch.Tensor]]):
120
+ tokens_list = await asyncio.to_thread(self.extract, utts)
121
+ return [{"tokens": t} for t in tokens_list]
122
+
123
+ async def extract_async(self, utt: Union[str, torch.Tensor]):
124
+ result = await self._batch_processor.add_request(single_input=utt)
125
+ feature = result.get("feature")
126
+ return feature.get("tokens")
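A minimal usage sketch for this class (the model path is a placeholder and the imports assume this repo's package layout; note that `_load_audio` does not resample, so waveforms passed in as tensors should already be 16 kHz):

import asyncio
import torch
from transformers import WhisperFeatureExtractor
from models.glm_speech_tokenizer.modeling_whisper import WhisperVQEncoder
from models.glm_speech_tokenizer.speech_token_extractor import SpeechTokenExtractor

MODEL_PATH = "/path/to/glm-4-voice-tokenizer"   # placeholder
model = WhisperVQEncoder.from_pretrained(MODEL_PATH).half()
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_PATH)
extractor = SpeechTokenExtractor(model, feature_extractor, device="cuda")

# synchronous: a list of file paths and/or 16 kHz waveforms
tokens = extractor.extract(["sample.wav", torch.randn(16000)])

# asynchronous: single requests are grouped into batches by AsyncBatchEngine
async def demo():
    return await extractor.extract_async("sample.wav")

tokens_async = asyncio.run(demo())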
models/glm_speech_tokenizer/test_speech_token_extractor.py ADDED
@@ -0,0 +1,136 @@
1
+
2
+
3
+ #!/usr/bin/env python3
4
+ # -*- coding: utf-8 -*-
5
+
6
+ import os
7
+ import sys
8
+ sys.path.append("../../..")
9
+ import asyncio
10
+ import time
11
+ from datetime import datetime
12
+
13
+ import torch
14
+ import torchaudio
15
+ from transformers import WhisperFeatureExtractor
16
+ from arktts.models.glm_speech_tokenizer.modeling_whisper import WhisperVQEncoder
17
+ from speech_token_extractor import SpeechTokenExtractor # the class implemented in this repo
18
+ _RESAMPLE_CACHE: dict[int, torchaudio.transforms.Resample] = {}
19
+
20
+ def ts() -> str:
21
+ return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
22
+
23
+ def sync_cuda(device: str):
24
+ if isinstance(device, str) and device.startswith("cuda") and torch.cuda.is_available():
25
+ torch.cuda.synchronize(device=device)
26
+
27
+ def load_wav_as_tuple(path: str, target_sr: int = 16000):
28
+ """读取 wav -> (mono_waveform_1d, sample_rate);保持在CPU上交给 extractor 处理。"""
29
+ wav, sr = torchaudio.load(path) # [C, T]
30
+
31
+ if wav.ndim == 2 and wav.size(0) > 1:
32
+ wav = wav.mean(dim=0) # -> [T], downmix to mono
33
+ else:
34
+ wav = wav.squeeze(0) # [1, T] -> [T]
35
+ # ensure contiguous float32 (the feature extractor is faster with numpy.float32)
36
+ wav = wav.contiguous().to(torch.float32).cpu()
37
+ if sr != target_sr:
38
+ if sr not in _RESAMPLE_CACHE:
39
+ _RESAMPLE_CACHE[sr] = torchaudio.transforms.Resample(
40
+ orig_freq=sr, new_freq=target_sr
41
+ )
42
+ wav = _RESAMPLE_CACHE[sr](wav.unsqueeze(0)).squeeze(0)
43
+ sr = target_sr
44
+
45
+ # print(f"type wave is {type(wav)}")
46
+ return wav
47
+
48
+ async def main():
49
+ # --- 1️⃣ Path configuration ---
50
+ MODEL_PATH = "/data/yumu/model/glm-4-voice-tokenizer"
51
+ AUDIO_PATH1 = "/data/yumu/data/audio_data/qiduoduo_tts_out/00000013.wav"
52
+ AUDIO_PATH2 = "/data/yumu/data/audio_data/qiduoduo_tts_out/00000012.wav"
53
+ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
54
+
55
+ assert os.path.exists(AUDIO_PATH1), f"Audio file does not exist: {AUDIO_PATH1}"
56
+ assert os.path.exists(MODEL_PATH), f"Model path does not exist: {MODEL_PATH}"
57
+
58
+ print(f"[{ts()}] 启动测试")
59
+ print(f" - DEVICE : {DEVICE}")
60
+ print(f" - MODEL_PATH : {MODEL_PATH}")
61
+ print(f" - AUDIO1 : {AUDIO_PATH1}")
62
+ print(f" - AUDIO2 : {AUDIO_PATH2 if os.path.exists(AUDIO_PATH2) else '(不存在,将重复 AUDIO1)'}")
63
+
64
+ # --- 2️⃣ Read the audio into memory first ---
65
+ audio1 = load_wav_as_tuple(AUDIO_PATH1)
66
+ audio2 = load_wav_as_tuple(AUDIO_PATH2) if os.path.exists(AUDIO_PATH2) else audio1
67
+
68
+ # --- 3️⃣ Load the model and feature extractor ---
69
+ print(f"\n[{ts()}] 加载 WhisperVQ 模型与特征提取器中...")
70
+ t0 = time.perf_counter()
71
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_PATH)
72
+
73
+ model = WhisperVQEncoder.from_pretrained(MODEL_PATH).eval().to(DEVICE)
74
+ if DEVICE.startswith("cuda"):
75
+ model = model.half() # cast to half precision once
76
+ sync_cuda(DEVICE)
77
+ t1 = time.perf_counter()
78
+ print(f"[{ts()}] 模型加载完成,用时 {(t1 - t0)*1000:.1f} ms")
79
+
80
+ # --- 4️⃣ Initialize the extractor ---
81
+ t0 = time.perf_counter()
82
+ extractor = SpeechTokenExtractor(
83
+ model=model,
84
+ feature_extractor=feature_extractor,
85
+ device=DEVICE,
86
+ batch_size=400,
87
+ wait_timeout=0.01,
88
+ )
89
+ sync_cuda(DEVICE)
90
+ t1 = time.perf_counter()
91
+ print(f"[{ts()}] ✅ SpeechTokenExtractor 初始化完成,用时 {(t1 - t0)*1000:.1f} ms")
92
+
93
+ # --- 5️⃣ Synchronous test (pass the preloaded waveform instead of a path) ---
94
+ print(f"\n[{ts()}] [同步模式] extract() 开始")
95
+ t0 = time.perf_counter()
96
+ sync_tokens_list = extractor.extract([audio1]) # ★ changed: pass the in-memory waveform, not a path
97
+ sync_cuda(DEVICE)
98
+ t1 = time.perf_counter()
99
+ sync_tokens = sync_tokens_list[0]
100
+ print(f"[{ts()}] [同步模式] 完成:{len(sync_tokens)} tokens")
101
+ print(f" - 预览:{sync_tokens[:20]} ...")
102
+ print(f" - 耗时:{(t1 - t0)*1000:.1f} ms (单样本)")
103
+
104
+ # --- 6️⃣ Asynchronous test (also passes in-memory audio) ---
105
+ print(f"\n[{ts()}] [异步模式] extract_async() 并发开始")
106
+
107
+ async def async_worker(audio_utt):
108
+ t_a0 = time.perf_counter()
109
+ print(f"type audio_utt is {type(audio_utt)}")
110
+ tokens = await extractor.extract_async(audio_utt) # ★ changed: pass the in-memory waveform, not a path
111
+ sync_cuda(DEVICE)
112
+ t_a1 = time.perf_counter()
113
+ print(f" · → {len(tokens)} tokens, {(t_a1 - t_a0)*1000:.1f} ms")
114
+ return tokens, (t_a1 - t_a0)
115
+
116
+ # originally a 20+20 concurrency test; reduced here to 2+2 requests, with in-memory audio instead of paths
117
+ test_inputs = [audio1] * 2 + [audio2] * 2
118
+
119
+ t0 = time.perf_counter()
120
+ results = await asyncio.gather(*(async_worker(aud) for aud in test_inputs))
121
+ sync_cuda(DEVICE)
122
+ t1 = time.perf_counter()
123
+
124
+ per_req_ms = [dt * 1000 for _, dt in results]
125
+ all_tokens = [tokens for tokens, _ in results]
126
+
127
+ print(f"[{ts()}] [异步模式] 完成")
128
+ print(f" - 总请求数:{len(results)}")
129
+ print(f" - 总耗时 :{(t1 - t0)*1000:.1f} ms")
130
+ print(f" - 单请求耗时(ms):{[round(x,1) for x in per_req_ms]}")
131
+ print(f" - 平均单请求耗时:{(sum(per_req_ms)/len(per_req_ms)):.1f} ms")
132
+ print(f" - 任一结果预览 :{all_tokens[0][:10]}")
133
+ print(f"\n[{ts()}] ✅ 所有测试完成。")
134
+
135
+ if __name__ == "__main__":
136
+ asyncio.run(main())
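The test script hard-codes machine-specific paths. One possible tweak, offered only as a suggestion (the environment variable names below are invented, not part of the commit), is to read them from the environment so the test can run on other machines:

import os

MODEL_PATH = os.environ.get("GLM_TOKENIZER_PATH", "/data/yumu/model/glm-4-voice-tokenizer")
AUDIO_PATH1 = os.environ.get("TEST_AUDIO_1", "/data/yumu/data/audio_data/qiduoduo_tts_out/00000013.wav")
AUDIO_PATH2 = os.environ.get("TEST_AUDIO_2", AUDIO_PATH1)  # fall back to the first clip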
models/glm_speech_tokenizer/utils.py ADDED
@@ -0,0 +1,89 @@
1
+ import os
2
+ import io
3
+ import glob
4
+ import math
5
+ import tarfile
6
+ import torch
7
+ import torchaudio
8
+ import safetensors
9
+ from .configuration_whisper import WhisperVQConfig
10
+ from .modeling_whisper import WhisperVQEncoder, WhisperVQForConditionalGeneration
11
+ from transformers import WhisperFeatureExtractor, WhisperTokenizerFast
12
+ # import asyncio
13
+ # from ..batch_processor import AsyncBatchEngine # adjust to your own path
14
+ # from typing import List, Union, Tuple, Literal, Optional
15
+
16
+ def load_quantize_encoder(model_path):
17
+ config = WhisperVQConfig.from_pretrained(model_path)
18
+ config.quantize_encoder_only = True
19
+ model = WhisperVQEncoder(config)
20
+ state_dict = {}
21
+ for path in glob.glob(os.path.join(model_path, "model*.safetensors")):
22
+ with safetensors.safe_open(path, framework="pt", device="cpu") as f:
23
+ for key in f.keys():
24
+ if key.startswith("model.encoder."):
25
+ new_key = key[len("model.encoder."):]
26
+ if new_key.startswith("layer_norm"):
27
+ continue
28
+ if new_key.startswith("layers"):
29
+ layer_id = int(new_key.split(".")[1])
30
+ if layer_id >= config.quantize_position:
31
+ continue
32
+ state_dict[new_key] = f.get_tensor(key)
33
+ model.load_state_dict(state_dict)
34
+ model.eval()
35
+ model.cuda()
36
+ return model
37
+
38
+
39
+ _resample_buffer: dict[int, torchaudio.transforms.Resample] = {}
40
+
41
+
42
+ def extract_speech_token(model: WhisperVQEncoder, feature_extractor: WhisperFeatureExtractor, utts,device="cuda"):
43
+ with torch.no_grad():
44
+ audios, indices = [], []
45
+ for idx, utt in enumerate(utts):
46
+ if isinstance(utt, tuple):
47
+ audio, sample_rate = utt
48
+ else:
49
+ audio, sample_rate = torchaudio.load(utt)
50
+ audio = audio.to(device)
51
+ if sample_rate != 16000:
52
+ if sample_rate not in _resample_buffer:
53
+ _resample_buffer[sample_rate] = torchaudio.transforms.Resample(
54
+ orig_freq=sample_rate,
55
+ new_freq=16000
56
+ ).to(device)
57
+ audio = _resample_buffer[sample_rate](audio)
58
+ # if audio.shape[0] > 1:
59
+ # audio = audio[:1]
60
+ audio = audio[0]
61
+ audio = audio.cpu().numpy()
62
+ time_step = 0
63
+ while time_step * 16000 < audio.shape[0]:
64
+ audio_segment = audio[time_step * 16000: (time_step + 30) * 16000]
65
+ audios.append(audio_segment)
66
+ indices.append(idx)
67
+ time_step += 30
68
+ pooling_kernel_size = model.config.pooling_kernel_size or 1
69
+ stride = model.conv1.stride[0] * model.conv2.stride[0] * pooling_kernel_size * feature_extractor.hop_length
70
+ all_speech_tokens = [[] for _ in range(len(utts))]
71
+ batch_size = 128
72
+ for start in range(0, len(audios), batch_size):
73
+ features = feature_extractor(audios[start: start + batch_size], sampling_rate=16000,
74
+ return_attention_mask=True, return_tensors="pt", device=device,
75
+ padding="longest", pad_to_multiple_of=stride)
76
+ features = features.to(device=device)
77
+ # ✅ key fix: if the model is FP16, cast the inputs to FP16 as well
78
+ if next(model.parameters()).dtype == torch.float16:
79
+ features = {k: v.half() for k, v in features.items()}
80
+ outputs = model(**features)
81
+ speech_tokens = outputs.quantized_token_ids
82
+ attention_mask = features["attention_mask"][:, ::model.conv1.stride[0] * model.conv2.stride[0]]
83
+ attention_mask = attention_mask[:, ::model.config.pooling_kernel_size]
84
+ assert attention_mask.shape == speech_tokens.shape
85
+ for i in range(len(speech_tokens)):
86
+ idx = indices[start + i]
87
+ speech_token = speech_tokens[i][attention_mask[i].bool()].tolist()
88
+ all_speech_tokens[idx].extend(speech_token)
89
+ return all_speech_tokens
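For comparison with the class-based extractor above, a minimal sketch of calling this batch helper directly (the model path is a placeholder; imports assume this repo's package layout):

import torchaudio
from transformers import WhisperFeatureExtractor
from models.glm_speech_tokenizer.modeling_whisper import WhisperVQEncoder
from models.glm_speech_tokenizer.utils import extract_speech_token

MODEL_PATH = "/path/to/glm-4-voice-tokenizer"   # placeholder
model = WhisperVQEncoder.from_pretrained(MODEL_PATH).eval().cuda()
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_PATH)

# inputs may be file paths or (waveform, sample_rate) tuples; audio is resampled
# to 16 kHz and split into 30 s windows internally
utts = ["a.wav", torchaudio.load("b.wav")]
all_tokens = extract_speech_token(model, feature_extractor, utts, device="cuda")
print(len(all_tokens), len(all_tokens[0]))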
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ transformers==4.57.3
2
+ torch==2.8.0
3
+ librosa
4
+ soundfile
5
+ numpy