yueyulin committed on Aug 27, 2025

Commit

acb8f2a

verified ·

1 Parent(s): 66d387e

Upload folder using huggingface_hub

Browse files

Files changed (24) hide show

rwkv7-0.1B-g1-respark-voice-tunable-ipa/BiCodecDetokenize.onnx +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/BiCodecTokenize.onnx +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/__pycache__/properties_util.cpython-311.pyc +0 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/__pycache__/ref_audio_utilities.cpython-311.pyc +0 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/config.json +55 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/generation_config.json +6 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/hf_rwkv_tokenizer.py +280 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/model.safetensors +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/model_converted.pth +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/model_padded.pth +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/modeling_rwkvspeech.py +6 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/properties_util.py +221 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/ref_audio_utilities.py +306 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/rwkv_vocab_v20230424.txt +0 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/spark_llm.py +202 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/special_tokens_map.json +24 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/texts_utilities.py +0 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/tokenizer_config.json +836 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/translation_data.py +55 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/tts_cli.py +992 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/utilities.py +209 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/vocab.txt +0 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/wav2vec2-large-xlsr-53.onnx +3 -0
rwkv7-0.1B-g1-respark-voice-tunable-ipa/webrwkv.safetensors +3 -0

rwkv7-0.1B-g1-respark-voice-tunable-ipa/BiCodecDetokenize.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:055f86df2809ca8b9210154e8ddc85aa7458909d4b30aa7f996e3fe053a71e3d
+size 385412236

rwkv7-0.1B-g1-respark-voice-tunable-ipa/BiCodecTokenize.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7080b9790ee020977105d78754628c2b5e03841c0bbfc0294072ec40278222ce
+size 146225395

rwkv7-0.1B-g1-respark-voice-tunable-ipa/__pycache__/properties_util.cpython-311.pyc ADDED Viewed

Binary file (5.93 kB). View file

rwkv7-0.1B-g1-respark-voice-tunable-ipa/__pycache__/ref_audio_utilities.cpython-311.pyc ADDED Viewed

Binary file (13.3 kB). View file

rwkv7-0.1B-g1-respark-voice-tunable-ipa/config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "a_low_rank_dim": 64,
+  "architectures": [
+    "RWKV7ForSpeech"
+  ],
+  "attn": null,
+  "attn_mode": "chunk",
+  "audio_global_vocab_size": 4096,
+  "auto_map": {
+    "AutoConfig": "spark_llm.RWKV7SpeechConfig",
+    "AutoModel": "modeling_rwkvspeech.RWKV7Model",
+    "AutoModelForCausalLM": "modeling_rwkvspeech.RWKV7ForSpeech"
+  },
+  "bos_token_id": 0,
+  "decay_low_rank_dim": 64,
+  "eos_token_id": 0,
+  "fuse_cross_entropy": true,
+  "fuse_norm": false,
+  "gate_low_rank_dim": 128,
+  "head_dim": 64,
+  "hidden_act": "sqrelu",
+  "hidden_ratio": 4.0,
+  "hidden_size": 768,
+  "initializer_range": 0.006,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 2048,
+  "model_type": "rwkv7",
+  "norm_bias": true,
+  "norm_eps": 1e-05,
+  "norm_first": true,
+  "num_heads": 32,
+  "num_hidden_layers": 12,
+  "text_vocab_size": 65631,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "use_l2warp": true,
+  "v_low_rank_dim": 32,
+  "value_dim": [
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768,
+    768
+  ],
+  "vocab_size": 8193
+}

rwkv7-0.1B-g1-respark-voice-tunable-ipa/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.52.4"
+}

rwkv7-0.1B-g1-respark-voice-tunable-ipa/hf_rwkv_tokenizer.py ADDED Viewed

	@@ -0,0 +1,280 @@

+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RWKV."""
+import os
+import re
+from typing import TYPE_CHECKING, List, Optional, Tuple
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
# The former `if TYPE_CHECKING: pass` guard was a no-op and has been dropped.

# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# Maps the `vocab_file` constructor argument to the file name of the
# vocabulary that ships next to this module.
VOCAB_FILES_NAMES = {
    "vocab_file": "rwkv_vocab_v20230424.txt",
}
class TRIE:
    """Byte-level prefix tree used for longest-match lookups.

    Every node owns 256 child slots (one per possible byte value), the set
    of values registered for the key that ends at the node, and a link to
    its parent so the byte path from the root can be reconstructed.
    """

    __slots__ = ("ch", "to", "values", "front")
    to: list
    values: set

    def __init__(self, front=None, ch=None):
        """Create a node.

        Args:
            front: Parent node, or ``None`` for the root.
            ch: Byte value on the edge leading from the parent to this
                node, or ``None`` for the root.
        """
        self.ch = ch
        self.to = [None] * 256
        self.values = set()
        self.front = front

    def __repr__(self):
        # Walk up to the root collecting edge bytes, then reverse them so
        # the path reads root -> this node.
        path = []
        node = self
        while node is not None:
            if node.ch is not None:
                path.append(node.ch)
            node = node.front
        return "<TRIE %s %s>" % (path[::-1], self.values)

    def add(self, key: bytes, idx: int = 0, val=None):
        """Insert ``key[idx:]`` below this node and return the final node.

        Args:
            key: Byte string to insert.
            idx: Offset in ``key`` at which insertion starts.
            val: Value stored on the final node; defaults to ``key``.

        Returns:
            The node at which the key ends.
        """
        # Iterative descent (instead of recursion) so that key length is
        # not bounded by the interpreter's recursion limit.
        node = self
        for pos in range(idx, len(key)):
            ch = key[pos]
            child = node.to[ch]
            if child is None:
                child = TRIE(front=node, ch=ch)
                node.to[ch] = child
            node = child
        node.values.add(key if val is None else val)
        return node

    def find_longest(self, key: bytes, idx: int = 0):
        """Find the longest stored key that is a prefix of ``key[idx:]``.

        Args:
            key: Byte string to scan.
            idx: Offset in ``key`` at which matching starts.

        Returns:
            A tuple ``(end, node, values)`` where ``end`` is the offset just
            past the match, ``node`` is the trie node where the match ends
            and ``values`` is that node's value set.

        Raises:
            ValueError: If no stored key matches at ``idx`` (this includes
                ``idx`` pointing at or past the end of ``key``). Previously
                this surfaced as an UnboundLocalError or IndexError.
        """
        start = idx
        node = self
        best = None
        end = len(key)
        while idx < end:
            node = node.to[key[idx]]
            if node is None:
                break
            idx += 1
            # Remember the deepest node seen so far that terminates a key.
            if node.values:
                best = (idx, node, node.values)
        if best is None:
            raise ValueError(
                f"no trie entry matches the input at byte offset {start}"
            )
        return best
class RWKV_TOKENIZER:
    """Greedy longest-match byte tokenizer driven by a vocabulary file.

    Each vocabulary line has the form ``<index> <literal> <length>`` where
    ``<literal>`` is a Python ``str`` or ``bytes`` literal and ``<length>``
    is the token's length in bytes (after UTF-8 encoding for ``str``).
    """

    def __init__(self, file_name):
        """Load the vocabulary and build the lookup trie.

        Args:
            file_name: Path of the vocabulary file.

        Raises:
            ValueError: If a line is malformed, its literal is neither
                ``str`` nor ``bytes``, or the declared byte length does not
                match the token.
        """
        # Local import keeps this class self-contained; the file's import
        # header does not bring in ``ast``.
        import ast

        self.idx2token = {}
        with open(file_name, "r", encoding="utf-8") as f:
            lines = f.readlines()
        for line in lines:
            first_space = line.index(" ")
            last_space = line.rindex(" ")
            idx = int(line[:first_space])
            # literal_eval only accepts literals, so a tampered vocabulary
            # file cannot execute code the way ``eval`` would allow.
            token = ast.literal_eval(line[first_space:last_space].strip())
            if isinstance(token, str):
                token = token.encode("utf-8")
            if not isinstance(token, bytes):
                raise ValueError(
                    f"vocabulary entry {idx} is neither str nor bytes"
                )
            if len(token) != int(line[last_space:]):
                raise ValueError(
                    f"vocabulary entry {idx} has a mismatching byte length"
                )
            self.idx2token[idx] = token

        self.token2idx = {tok: int(i) for i, tok in self.idx2token.items()}

        self.root = TRIE()
        for tok, i in self.token2idx.items():
            self.root.add(tok, val=(tok, i))

    def encodeBytes(self, src: bytes):
        """Tokenize ``src`` by repeatedly taking the longest matching token.

        Returns:
            List of integer token ids.
        """
        pos: int = 0
        tokens = []
        while pos < len(src):
            previous: int = pos
            pos, _, values = self.root.find_longest(src, pos)
            # Every step must consume at least one byte.
            assert pos != previous
            _, token = next(iter(values))
            tokens.append(token)
        return tokens

    def decodeBytes(self, tokens):
        """Concatenate the byte strings of the given token ids."""
        return b"".join(self.idx2token[i] for i in tokens)

    def encode(self, src):
        """Encode one string or a list of strings.

        Returns:
            A list with one id list per input string (a single string gives
            a one-element list). Any other input type returns ``None``.
        """
        if isinstance(src, str):
            return [self.encodeBytes(src.encode("utf-8"))]
        if isinstance(src, list):
            return [self.encodeBytes(s.encode("utf-8")) for s in src]
        return None

    def decode(self, tokens):
        """Decode a batch (list of id lists) into a list of strings.

        Raises:
            UnicodeDecodeError: If a sequence does not form valid UTF-8.
        """
        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]

    def printTokens(self, tokens):
        """Print each token's repr followed by its id, on a single line."""
        for i in tokens:
            s = self.idx2token[i]
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                # Not valid UTF-8 on its own: show the raw bytes instead.
                pass
            print(f"{repr(s)}{i}", end=" ")
        print()
class RwkvTokenizer(PreTrainedTokenizer):
    """`PreTrainedTokenizer` wrapper around the trie-based `RWKV_TOKENIZER`.

    `_tokenize` already yields integer ids, so `_convert_token_to_id` is the
    identity and the "tokens" handled internally are ids rather than strings.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self, vocab_file, bos_token="<|rwkv_tokenizer_end_of_text|>", eos_token="<|rwkv_tokenizer_end_of_text|>", unk_token="<|rwkv_tokenizer_end_of_text|>", **kwargs
    ):
        """Load the vocabulary and initialise the base tokenizer.

        Args:
            vocab_file: Path of the vocabulary file.
            bos_token: Beginning-of-sequence token string.
            eos_token: End-of-sequence token string.
            unk_token: Unknown-token string.
            **kwargs: Forwarded to `PreTrainedTokenizer.__init__`. The
                optional `add_bos_token` entry (default `False`) controls
                whether a BOS id is prepended to each sequence.

        Raises:
            ValueError: If `vocab_file` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'."
            )
        # The vocabulary is read exactly once, by RWKV_TOKENIZER below; the
        # former extra read into an unused local has been removed.
        self.add_bos_token = kwargs.get("add_bos_token", False)
        self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
        vocab = self.trie_tokenizer.token2idx
        self.encoder = vocab
        self.decoder = {v: k for k, v in vocab.items()}
        # Id 0 is registered as an added token carrying the BOS string.
        self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
        super().__init__(
            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
        )

    @property
    def vocab_size(self):
        """Number of entries currently held in `self.encoder`."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the token -> id mapping including added tokens, sorted by id."""
        # NOTE(review): `vocab` aliases `self.encoder`, so the update below
        # also inserts the added tokens into `self.encoder` and therefore
        # changes `vocab_size`. Kept as-is because other code may depend on
        # the resulting size; confirm before switching to a copy.
        vocab = self.encoder
        vocab.update(self.added_tokens_encoder)
        vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))
        return vocab

    def _tokenize(self, text, split_special_tokens=False):
        """Tokenize `text` into a list of integer ids via the trie tokenizer."""
        return self.trie_tokenizer.encode(text)[0]

    def _convert_token_to_id(self, token):
        """Identity: `_tokenize` already returns ids."""
        return token

    def _convert_id_to_token(self, index):
        """Convert an id into a string token using the vocabulary.

        Unknown ids map to `self.unk_token`; byte tokens are decoded as
        UTF-8 with undecodable bytes replaced.
        """
        token = self.decoder.get(index, self.unk_token)
        if isinstance(token, (bytes)):
            token = token.decode("utf-8", errors="replace")
        return token

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of tokens (str or bytes) into a single string."""
        out_string = b"".join(
            [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
        ).decode("utf-8")
        return out_string

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write one `str(token)` per line, ordered by id.

        Args:
            save_directory: Target directory; if it is not an existing
                directory the value is used as the output file path itself.
            filename_prefix: Optional prefix joined with "-" to the name.

        Returns:
            A one-element tuple holding the path that was written.
        """
        # NOTE(review): the written format (`str(token)` only) differs from
        # the `<index> <literal> <length>` lines that RWKV_TOKENIZER parses,
        # so the saved file cannot be loaded back by this class.
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
            )
        else:
            vocab_file = (
                filename_prefix + "-" if filename_prefix else ""
            ) + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(
                self.encoder.items(), key=lambda kv: kv[1]
            ):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(str(token) + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Prepend the BOS id to each sequence when `add_bos_token` is set."""
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []
        output = bos_token_ids + token_ids_0
        if token_ids_1 is None:
            return output
        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=False,
            )
        # With a BOS id in front of each sequence, only those positions are
        # flagged as special.
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

rwkv7-0.1B-g1-respark-voice-tunable-ipa/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afcb4166f444542ac06f458442bb238555e729eaba9faa864fd34bd3828a963e
+size 626075280

rwkv7-0.1B-g1-respark-voice-tunable-ipa/model_converted.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39945eeb0927bbca576bbb85df3892c0289846e388561acf3f4586c79159bff8
+size 626155657

rwkv7-0.1B-g1-respark-voice-tunable-ipa/model_padded.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01fe9eb457d2e120bc21aa33f226bb06e2452c97e4321c3e82e15aad185e4684
+size 840365002

rwkv7-0.1B-g1-respark-voice-tunable-ipa/modeling_rwkvspeech.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from model.llm.spark_llm import RWKV7SpeechConfig,RWKV7ForSpeech
+from rwkvfla.models.rwkv7 import RWKV7Model
# Expose the speech model and its config under the generic RWKV7 names.
# RWKV7Model is already bound by the import above, so the former
# `RWKV7Model = RWKV7Model` self-assignment was a no-op and is omitted.
RWKV7ForCausalLM = RWKV7ForSpeech
RWKV7Config = RWKV7SpeechConfig

rwkv7-0.1B-g1-respark-voice-tunable-ipa/properties_util.py ADDED Viewed

	@@ -0,0 +1,221 @@

# Each table maps a human-readable property label to its "SPCT_<n>" control
# token. Labels are listed in token-number order; the numbering of a table
# starts at the `start` value given to enumerate().

# Speaking-rate labels -> SPCT_1 .. SPCT_5.
SPEED_MAP = {
    label: f"SPCT_{number}"
    for number, label in enumerate(
        ("very_slow", "slow", "medium", "fast", "very_fast"),
        start=1,
    )
}

# Pitch labels -> SPCT_6 .. SPCT_9.
PITCH_MAP = {
    label: f"SPCT_{number}"
    for number, label in enumerate(
        ("low_pitch", "medium_pitch", "high_pitch", "very_high_pitch"),
        start=6,
    )
}

# Age-group labels -> SPCT_13 .. SPCT_17.
AGE_MAP = {
    label: f"SPCT_{number}"
    for number, label in enumerate(
        ("child", "teenager", "youth-adult", "middle-aged", "elderly"),
        start=13,
    )
}

# Emotion labels (upper case) -> SPCT_21 .. SPCT_45.
EMOTION_MAP = {
    label: f"SPCT_{number}"
    for number, label in enumerate(
        (
            "UNKNOWN",
            "NEUTRAL",
            "ANGRY",
            "HAPPY",
            "SAD",
            "FEARFUL",
            "DISGUSTED",
            "SURPRISED",
            "SARCASTIC",
            "EXCITED",
            "SLEEPY",
            "CONFUSED",
            "EMPHASIS",
            "LAUGHING",
            "SINGING",
            "WORRIED",
            "WHISPER",
            "ANXIOUS",
            "NO-AGREEMENT",
            "APOLOGETIC",
            "CONCERNED",
            "ENUNCIATED",
            "ASSERTIVE",
            "ENCOURAGING",
            "CONTEMPT",
        ),
        start=21,
    )
}

# Gender labels -> SPCT_46 / SPCT_47. Only "female" and "male" are defined;
# there is no "unknown" entry, so any other gender raises KeyError on lookup.
GENDER_MAP = {
    label: f"SPCT_{number}"
    for number, label in enumerate(("female", "male"), start=46)
}
def convert_standard_properties_to_tokens(age: str, gender: str, emotion: str, pitch: str, speed: str) -> str:
    """Build the control-token string for already-categorised properties.

    Args:
        age: Key of ``AGE_MAP`` (case-insensitive).
        gender: Key of ``GENDER_MAP`` (case-insensitive).
        emotion: Key of ``EMOTION_MAP`` (case-insensitive).
        pitch: Key of ``PITCH_MAP`` (case-insensitive).
        speed: Key of ``SPEED_MAP`` (case-insensitive).

    Returns:
        A single string: ``"SPCT_0"`` followed by the age, gender, emotion,
        pitch and speed tokens, concatenated without separators. (The
        annotation previously said ``list``, which did not match.)

    Raises:
        KeyError: If any label is not present in its map.
    """
    return "".join((
        "SPCT_0",
        AGE_MAP[age.lower()],
        GENDER_MAP[gender.lower()],
        EMOTION_MAP[emotion.upper()],
        PITCH_MAP[pitch.lower()],
        SPEED_MAP[speed.lower()],
    ))
def convert_properties_to_tokens(age: str, gender: str, emotion: str, pitch: float, speed: float) -> str:
    """Build the control-token string from numeric pitch and speed.

    Pitch and speed are first bucketed with ``classify_pitch`` and
    ``classify_speed``; the resulting labels are then looked up like the
    categorical properties.

    Args:
        age: Key of ``AGE_MAP`` (case-insensitive).
        gender: Key of ``GENDER_MAP`` (case-insensitive).
        emotion: Key of ``EMOTION_MAP`` (case-insensitive).
        pitch: Numeric pitch value passed to ``classify_pitch``.
        speed: Numeric speed value passed to ``classify_speed``.

    Returns:
        A single string: ``"SPCT_0"`` followed by the age, gender, emotion,
        pitch and speed tokens, concatenated without separators. (The
        annotation previously said ``list``, which did not match.)

    Raises:
        KeyError: If age, gender or emotion is not present in its map.
    """
    return "".join((
        "SPCT_0",
        AGE_MAP[age.lower()],
        GENDER_MAP[gender.lower()],
        EMOTION_MAP[emotion.upper()],
        PITCH_MAP[classify_pitch(pitch, gender.lower(), age.lower())],
        SPEED_MAP[classify_speed(speed)],
    ))
def classify_speed(speed: float) -> str:
    """Bucket a numeric speaking speed into one of five labels.

    Buckets (upper bounds inclusive):
        speed <= 3.5        -> "very_slow"
        3.5 < speed <= 4.0  -> "slow"
        4.0 < speed <= 4.5  -> "medium"
        4.5 < speed <= 5.0  -> "fast"
        speed > 5.0         -> "very_fast"

    Exactly 4.0 used to fall through every branch and was reported as
    "very_fast"; it now belongs to "slow", consistent with the other
    inclusive upper bounds.

    Args:
        speed: Speed value. The unit is not stated in this file.

    Returns:
        One of the keys of ``SPEED_MAP``.
    """
    if speed <= 3.5:
        return "very_slow"
    elif speed <= 4.0:
        return "slow"
    elif speed <= 4.5:
        return "medium"
    elif speed <= 5.0:
        return "fast"
    else:  # speed > 5.0
        return "very_fast"
def classify_pitch(pitch: float, gender: str, age: str) -> str:
    """Bucket a pitch value into a label using gender/age specific cut-offs.

    The cut-offs follow the statistics recorded by the original author
    (overall: female mean 212.08 / median 208.76 / quartiles 187.40 and
    232.08; male mean 136.22 / median 129.65 / quartiles 113.76 and 151.42).

    A pitch below the first cut-off is "low_pitch", below the second
    "medium_pitch", below the third "high_pitch", otherwise
    "very_high_pitch". The female "child" group only has two cut-offs, so
    its top bucket is "high_pitch".

    Args:
        pitch: Pitch value to classify.
        gender: "female" or "male" (case-insensitive); anything else uses
            the gender-neutral cut-offs.
        age: Age-group label (case-insensitive); a label without its own
            row uses the default row of the given gender.

    Returns:
        One of "low_pitch", "medium_pitch", "high_pitch", "very_high_pitch".
    """
    labels = ("low_pitch", "medium_pitch", "high_pitch", "very_high_pitch")

    # Ascending upper bounds (exclusive) per age group.
    female_bounds = {
        # mean 280.12, median 279.34, range 216.91-324.25
        "child": (250, 290),
        # mean 240.61, median 238.43, quartiles 207.54 / 270.12
        "teenager": (208, 238, 270),
        # mean 213.26, median 210.99, quartiles 190.81 / 232.24
        "youth-adult": (191, 211, 232),
        # mean 197.68, median 195.01, quartiles 176.34 / 215.22
        "middle-aged": (176, 195, 215),
        # mean 194.91, median 189.90, quartiles 170.42 / 213.41
        "elderly": (170, 190, 213),
    }
    female_default = (187, 209, 232)

    male_bounds = {
        # mean 150.93, median 142.50, quartiles 121.47 / 165.55
        "teenager": (121, 143, 166),
        # mean 137.17, median 130.92, quartiles 114.70 / 153.18
        "youth-adult": (115, 131, 153),
        # mean 132.33, median 125.30, quartiles 110.31 / 146.55
        "middle-aged": (110, 125, 147),
        # mean 132.62, median 128.42, quartiles 114.69 / 141.57
        "elderly": (115, 128, 142),
    }
    male_default = (114, 130, 151)

    # Used when the gender is neither "female" nor "male".
    neutral_bounds = (130, 180, 220)

    gender = gender.lower()
    age = age.lower()

    if gender == "female":
        bounds = female_bounds.get(age, female_default)
    elif gender == "male":
        bounds = male_bounds.get(age, male_default)
    else:
        bounds = neutral_bounds

    # The first bound the pitch stays below decides the label.
    for bound, label in zip(bounds, labels):
        if pitch < bound:
            return label
    # At or above every bound: the label right after the last bound.
    return labels[len(bounds)]

rwkv7-0.1B-g1-respark-voice-tunable-ipa/ref_audio_utilities.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import onnxruntime as ort
+import numpy as np
+import librosa
+import soundfile as sf
+import soxr
+from pathlib import Path
+from typing import Tuple, Union, Optional
+import soundfile as sf
class RefAudioUtilities:
    """Audio helper that produces global and semantic tokens with ONNX models."""

    def __init__(self, onnx_model_path: str, wav2vec2_path,
                 ref_segment_duration: float = 6.0, latent_hop_length: int = 320):
        """Create the ONNX inference sessions.

        Args:
            onnx_model_path: Path of the tokenizer ONNX model.
            wav2vec2_path: Path of the wav2vec2 ONNX model. ``None`` skips
                loading it; feature extraction then raises RuntimeError.
            ref_segment_duration: Length of the reference clip in seconds.
            latent_hop_length: Hop length the reference clip length is
                rounded down to a multiple of.
        """
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        self.ort_session = ort.InferenceSession(onnx_model_path,
                                                providers=providers)
        print(f"🖥️ONNX Session actual providers: {self.ort_session.get_providers()}")
        self.sample_rate = 16000
        self.ref_segment_duration = ref_segment_duration
        self.latent_hop_length = latent_hop_length
        # Record the model's input and output names.
        self.input_names = [input_info.name for input_info in self.ort_session.get_inputs()]
        self.output_names = [output_info.name for output_info in self.ort_session.get_outputs()]
        print(f"模型输入: {self.input_names}")
        print(f"模型输出: {self.output_names}")
        # The wav2vec2 session is optional, as documented above; previously
        # passing None crashed inside InferenceSession.
        if wav2vec2_path is None:
            self.wav2vec2_session = None
        else:
            self.wav2vec2_session = ort.InferenceSession(wav2vec2_path,
                                                         providers=providers)
            print(f"🖥️Wav2Vec2 Session actual providers: {self.wav2vec2_session.get_providers()}")

    def load_audio(self, audio_path: Union[str, Path], target_sr: int = 16000,
                   volume_normalize: bool = False) -> np.ndarray:
        """Load an audio file as a mono array at ``target_sr``.

        Args:
            audio_path: Path of the audio file.
            target_sr: Sample rate the audio is resampled to.
            volume_normalize: Whether to apply volume normalisation.

        Returns:
            The audio samples as a one-dimensional array.
        """
        if isinstance(audio_path, str):
            audio_path = Path(audio_path)
        audio, sr = sf.read(audio_path)
        if len(audio.shape) > 1:
            audio = audio[:, 0]  # multi-channel input: keep the first channel
        if sr != target_sr:
            audio = soxr.resample(audio, sr, target_sr, quality="VHQ")
        if volume_normalize:
            audio = self._audio_volume_normalize(audio)
        return audio

    def _audio_volume_normalize(self, audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
        """Normalise the loudness of ``audio`` towards ``coeff``."""
        # Sorted absolute sample values of the input signal.
        temp = np.sort(np.abs(audio))
        # Very quiet input: scale so the peak becomes 0.1.
        if temp[-1] < 0.1:
            scaling_factor = max(
                temp[-1], 1e-3
            )  # floor avoids dividing by zero on silence
            audio = audio / scaling_factor * 0.1
        # Keep only the values above 0.01.
        temp = temp[temp > 0.01]
        L = temp.shape[0]
        # Ten or fewer significant values: no further processing.
        if L <= 10:
            return audio
        # Mean of the values between the 90th and 99th percentile positions.
        volume = np.mean(temp[int(0.9 * L) : int(0.99 * L)])
        # Scale towards the target level; the factor is clamped to [0.1, 10].
        audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)
        # Keep the peak absolute value at or below 1.
        max_value = np.max(np.abs(audio))
        if max_value > 1:
            audio = audio / max_value
        return audio

    def extract_mel_spectrogram(self, wav: np.ndarray, n_mels: int = 128,
                               n_fft: int = 1024, hop_length: int = 320,
                               win_length: int = 640) -> np.ndarray:
        """Compute a mel spectrogram with librosa.

        Args:
            wav: Audio samples.
            n_mels: Number of mel filter banks.
            n_fft: FFT size.
            hop_length: Hop between frames, in samples.
            win_length: Window length, in samples.

        Returns:
            The mel spectrogram returned by ``librosa.feature.melspectrogram``.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=wav,
            sr=self.sample_rate,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            power=1,
            norm="slaney",
            fmin=10,
        )
        return mel_spec

    def extract_wav2vec2_features(self, wav: np.ndarray) -> np.ndarray:
        """Run the wav2vec2 ONNX model on ``wav``.

        Args:
            wav: One-dimensional audio samples.

        Returns:
            The first model output with its batch dimension removed, as
            float32.

        Raises:
            RuntimeError: If no wav2vec2 model was loaded.
        """
        if self.wav2vec2_session is None:
            raise RuntimeError("wav2vec2模型未加载，请在初始化时提供wav2vec2_path参数")
        # Add the batch dimension: [1, sequence_length].
        input_data = wav[np.newaxis, :].astype(np.float32)
        inputs = {'input': input_data}
        outputs = self.wav2vec2_session.run(None, inputs)
        # Per the original author's notes the output is [1, time_steps, 1024]
        # and already averages hidden layers 11, 14 and 16 inside the ONNX
        # graph. The raw tensor is no longer dumped to stdout; only its
        # shape is reported.
        print(f'outputs: {outputs[0].shape}')
        features = outputs[0][0]  # drop the batch dimension
        return features.astype(np.float32)

    def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
        """Cut (or tile, then cut) ``wav`` to the reference clip length.

        The length is ``sample_rate * ref_segment_duration`` rounded down to
        a multiple of ``latent_hop_length``.

        Args:
            wav: Audio samples.

        Returns:
            The first reference-length samples; shorter audio is repeated
            until it is long enough.

        Raises:
            ValueError: If ``wav`` is empty and a non-empty clip is needed
                (previously a ZeroDivisionError).
        """
        ref_segment_length = (
            int(self.sample_rate * self.ref_segment_duration)
            // self.latent_hop_length
            * self.latent_hop_length
        )
        wav_length = len(wav)
        if ref_segment_length > wav_length:
            if wav_length == 0:
                raise ValueError("cannot build a reference clip from empty audio")
            # Too short: repeat the audio until it covers the clip length.
            repeat_times = ref_segment_length // wav_length + 1
            wav = np.tile(wav, repeat_times)
        return wav[:ref_segment_length]

    def process_audio(self, audio_path: Union[str, Path], volume_normalize: bool = False) -> Tuple[np.ndarray, np.ndarray]:
        """Load an audio file and derive its reference clip.

        Args:
            audio_path: Path of the audio file.
            volume_normalize: Whether to apply volume normalisation.

        Returns:
            ``(wav, ref_wav)``: the loaded audio and its reference clip.
        """
        wav = self.load_audio(audio_path, volume_normalize=volume_normalize)
        ref_wav = self.get_ref_clip(wav)
        return wav, ref_wav

    def tokenize(self, audio_path: Union[str, Path]) -> Tuple[np.ndarray, np.ndarray]:
        """Produce tokens for one audio file.

        Args:
            audio_path: Path of the audio file.

        Returns:
            ``(global_tokens, semantic_tokens)``.
        """
        wav, ref_wav = self.process_audio(audio_path)
        feat = self.extract_wav2vec2_features(wav)
        ref_mel = self.extract_mel_spectrogram(ref_wav)
        # Add the batch dimension to both model inputs.
        ref_mel_input = ref_mel[np.newaxis, :, :].astype(np.float32)
        feat_input = feat[np.newaxis, :, :].astype(np.float32)
        inputs = {
            'ref_wav_mel': ref_mel_input,
            'feat': feat_input
        }
        outputs = self.ort_session.run(self.output_names, inputs)
        # The model emits semantic tokens first and global tokens second;
        # the return order is swapped.
        semantic_tokens = outputs[0]
        global_tokens = outputs[1]
        return global_tokens, semantic_tokens

    def tokenize_batch(self, audio_paths: list) -> Tuple[list, list]:
        """Tokenize several audio files one after another.

        Args:
            audio_paths: List of audio file paths.

        Returns:
            ``(global_tokens_list, semantic_tokens_list)`` in input order.
        """
        global_tokens_list = []
        semantic_tokens_list = []
        for audio_path in audio_paths:
            global_tokens, semantic_tokens = self.tokenize(audio_path)
            global_tokens_list.append(global_tokens)
            semantic_tokens_list.append(semantic_tokens)
        return global_tokens_list, semantic_tokens_list
# Manual smoke test
def test_ref_audio_utilities():
    """Tokenize a sample audio file with RefAudioUtilities and print the result."""
    # Build the helper with a 6 second reference clip and a hop length of 320.
    utilities = RefAudioUtilities(
        '/Volumes/bigdata/models/RWKVTTS_WebRWKV/BiCodecTokenize.onnx',
        "/Volumes/bigdata/models/RWKVTTS_WebRWKV/wav2vec2-large-xlsr-53.onnx",
        ref_segment_duration=6.0,
        latent_hop_length=320,
    )

    # Sample audio, given relative to the working directory.
    test_audio_path = "demos/刘德华/dehua_zh.wav"
    if not Path(test_audio_path).exists():
        print(f"测试音频文件不存在: {test_audio_path}")
        print("请确保测试音频文件存在")
        return

    print(f"测试音频文件: {test_audio_path}")
    try:
        global_tokens, semantic_tokens = utilities.tokenize(test_audio_path)
        print(f"Global tokens shape: {global_tokens.shape}")
        print(f"Semantic tokens shape: {semantic_tokens.shape}")
        print(f"Global tokens: {global_tokens.flatten().tolist()}")
        print(f"Semantic tokens : {semantic_tokens.flatten().tolist()}")
    except Exception as e:
        # Report the failure instead of aborting the script.
        print(f"处理音频时出错: {e}")


if __name__ == "__main__":
    test_ref_audio_utilities()

rwkv7-0.1B-g1-respark-voice-tunable-ipa/rwkv_vocab_v20230424.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

rwkv7-0.1B-g1-respark-voice-tunable-ipa/spark_llm.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import torch
+import torch.nn as nn
+from typing import Optional, Union, Tuple, Dict, Unpack
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.utils.deprecation import deprecate_kwarg
+from rwkvfla.models.rwkv7.modeling_rwkv7 import RWKV7Model, RWKV7PreTrainedModel, Cache,RWKV7ForCausalLM
+from rwkvfla.models.rwkv7.modeling_rwkv7 import FusedLinearCrossEntropyLoss, FusedCrossEntropyLoss
+from transformers.generation.utils import GenerationMixin
+from rwkvfla.models.rwkv7.configuration_rwkv7 import RWKV7Config
+class RWKV7SpeechConfig(RWKV7Config):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.text_vocab_size = kwargs.get("text_vocab_size", kwargs.get("text_vocab_size"))
+        self.audio_global_vocab_size = kwargs.get("audio_global_vocab_size", kwargs.get("audio_global_vocab_size"))
+class RWKV7ForSpeech(RWKV7ForCausalLM):
+    config_class = RWKV7SpeechConfig
+    def __init__(self, config: RWKV7SpeechConfig):
+        super().__init__(config)
+        self.model = RWKV7Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)#Spark 0.5B vocab size is 8192 + 1 for eos resulting in 8193
+        self.criterion = None
+        self.text_embedder = nn.Embedding(config.text_vocab_size, config.hidden_size)
+        self.global_embedder = nn.Embedding(config.audio_global_vocab_size, config.hidden_size)#Spark 0.5B global token size is 4096
+        #TTS Tag includes GLOBAL=0, SEMANTIC=1,START_TTS=2
+        self.tts_tag_embedder = nn.Embedding(3, config.hidden_size)
+        # Initialize weights and apply final processing
+        self.post_init()
+        self.dropout = torch.nn.Dropout(0.02)
+    def get_input_embeddings(self):
+        return self.model.embeddings
+    def set_input_embeddings(self, value):
+        self.model.embeddings = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def generate(self, *args, **kwargs):
+        try:
+            return super().generate(*args, **kwargs)
+        except AttributeError as exception:
+            if 'past_key_values' in str(exception):
+                raise AttributeError(
+                    f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, "
+                    f"which is not supported for {self.__class__.__name__}. "
+                    f"Try another generation strategy instead. "
+                    f"For the available generation strategies, check this doc: "
+                    f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
+                )
+            else:
+                raise exception
+    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: bool = True,
+        logits_to_keep: Optional[int] = None,
+        **kwargs
+    ):
+        # only last token for `inputs_ids` if the `past_key_values` is not empty.
+        if past_key_values is not None and len(past_key_values) > 0:
+            input_ids = input_ids[:, -1:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and len(past_key_values) == 0:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+            # recompiles graphs as the stride of the inputs is a guard.
+            # Ref: https://github.com/huggingface/transformers/pull/29114
+            # TODO: use `next_tokens` directly instead.
+            model_inputs = {'input_ids': input_ids.contiguous()}
+        if logits_to_keep is not None:
+            model_inputs['logits_to_keep'] = logits_to_keep
+        model_inputs.update({
+            'past_key_values': past_key_values,
+            'use_cache': use_cache,
+            'attention_mask': attention_mask,
+            'logits_to_keep': logits_to_keep,
+        })
+        return model_inputs
+    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Optional[int] = 0,
+        **kwargs: Unpack[Dict]
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if self.training and inputs_embeds is not None:
+            inputs_embeds = self.dropout(inputs_embeds)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs
+        )
+        hidden_states = outputs[0]
+        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
+        loss, logits = None, None
+        if not fuse_linear_and_cross_entropy or labels is None:
+            logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
+        if labels is not None:
+            if getattr(self, 'criterion', None) is None:
+                if fuse_linear_and_cross_entropy:
+                    criterion = FusedLinearCrossEntropyLoss()
+                elif self.config.fuse_cross_entropy:
+                    criterion = FusedCrossEntropyLoss(inplace_backward=True)
+                else:
+                    criterion = nn.CrossEntropyLoss()
+            else:
+                criterion = self.criterion
+            # Enable model parallelism
+            labels = labels.to(hidden_states.device)
+            labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
+            if fuse_linear_and_cross_entropy:
+                loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias)
+            else:
+                loss = criterion(logits.view(labels.numel(), -1), labels.view(-1))
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def copy_state_dict(self, state_dict: dict):
+        """从源 state dict 复制参数到当前模型，排除 embeddings 和 lm_head
+        The state dict is from original RWKV7 language model
+        Args:
+            state_dict: 源 state dict
+        """
+        # 获取当前模型的 state dict
+        target_dict = self.state_dict()
+        # 创建新的 state dict 用于存储要复制的参数
+        new_state_dict = {}
+        # 遍历源 state dict 的键
+        for key in state_dict.keys():
+            # 跳过 embeddings 和 lm_head 相关的参数
+            if key == 'model.embeddings.weight':
+                new_state_dict['text_embedder.weight'] = state_dict[key]
+                continue
+            if 'embeddings' in key or 'lm_head' in key:
+                continue
+            # 如果键在当前模型中存在，则复制参数
+            if key in target_dict:
+                new_state_dict[key] = state_dict[key]
+        # 加载新的 state dict 到当前模型
+        info = self.load_state_dict(new_state_dict, strict=False)
+        print(info)
+        return self

rwkv7-0.1B-g1-respark-voice-tunable-ipa/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|rwkv_tokenizer_end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": "\n\n",
+  "pad_token": {
+    "content": "<|rwkv_tokenizer_end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|rwkv_tokenizer_end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

rwkv7-0.1B-g1-respark-voice-tunable-ipa/texts_utilities.py ADDED Viewed

File without changes

rwkv7-0.1B-g1-respark-voice-tunable-ipa/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,836 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|rwkv_tokenizer_end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "65530": {
+      "content": "\n\n",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "65531": {
+      "content": "SPCT_0",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65532": {
+      "content": "SPCT_1",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65533": {
+      "content": "SPCT_2",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65534": {
+      "content": "SPCT_3",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65535": {
+      "content": "SPCT_4",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65536": {
+      "content": "SPCT_5",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65537": {
+      "content": "SPCT_6",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65538": {
+      "content": "SPCT_7",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65539": {
+      "content": "SPCT_8",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65540": {
+      "content": "SPCT_9",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65541": {
+      "content": "SPCT_10",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65542": {
+      "content": "SPCT_11",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65543": {
+      "content": "SPCT_12",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65544": {
+      "content": "SPCT_13",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65545": {
+      "content": "SPCT_14",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65546": {
+      "content": "SPCT_15",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65547": {
+      "content": "SPCT_16",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65548": {
+      "content": "SPCT_17",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65549": {
+      "content": "SPCT_18",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65550": {
+      "content": "SPCT_19",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65551": {
+      "content": "SPCT_20",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65552": {
+      "content": "SPCT_21",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65553": {
+      "content": "SPCT_22",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65554": {
+      "content": "SPCT_23",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65555": {
+      "content": "SPCT_24",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65556": {
+      "content": "SPCT_25",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65557": {
+      "content": "SPCT_26",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65558": {
+      "content": "SPCT_27",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65559": {
+      "content": "SPCT_28",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65560": {
+      "content": "SPCT_29",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65561": {
+      "content": "SPCT_30",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65562": {
+      "content": "SPCT_31",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65563": {
+      "content": "SPCT_32",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65564": {
+      "content": "SPCT_33",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65565": {
+      "content": "SPCT_34",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65566": {
+      "content": "SPCT_35",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65567": {
+      "content": "SPCT_36",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65568": {
+      "content": "SPCT_37",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65569": {
+      "content": "SPCT_38",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65570": {
+      "content": "SPCT_39",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65571": {
+      "content": "SPCT_40",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65572": {
+      "content": "SPCT_41",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65573": {
+      "content": "SPCT_42",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65574": {
+      "content": "SPCT_43",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65575": {
+      "content": "SPCT_44",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65576": {
+      "content": "SPCT_45",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65577": {
+      "content": "SPCT_46",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65578": {
+      "content": "SPCT_47",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65579": {
+      "content": "SPCT_48",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65580": {
+      "content": "SPCT_49",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65581": {
+      "content": "SPCT_50",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65582": {
+      "content": "SPCT_51",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65583": {
+      "content": "SPCT_52",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65584": {
+      "content": "SPCT_53",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65585": {
+      "content": "SPCT_54",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65586": {
+      "content": "SPCT_55",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65587": {
+      "content": "SPCT_56",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65588": {
+      "content": "SPCT_57",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65589": {
+      "content": "SPCT_58",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65590": {
+      "content": "SPCT_59",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65591": {
+      "content": "SPCT_60",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65592": {
+      "content": "SPCT_61",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65593": {
+      "content": "SPCT_62",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65594": {
+      "content": "SPCT_63",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65595": {
+      "content": "SPCT_64",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65596": {
+      "content": "SPCT_65",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65597": {
+      "content": "SPCT_66",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65598": {
+      "content": "SPCT_67",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65599": {
+      "content": "SPCT_68",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65600": {
+      "content": "SPCT_69",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65601": {
+      "content": "SPCT_70",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65602": {
+      "content": "SPCT_71",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65603": {
+      "content": "SPCT_72",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65604": {
+      "content": "SPCT_73",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65605": {
+      "content": "SPCT_74",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65606": {
+      "content": "SPCT_75",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65607": {
+      "content": "SPCT_76",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65608": {
+      "content": "SPCT_77",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65609": {
+      "content": "SPCT_78",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65610": {
+      "content": "SPCT_79",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65611": {
+      "content": "SPCT_80",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65612": {
+      "content": "SPCT_81",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65613": {
+      "content": "SPCT_82",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65614": {
+      "content": "SPCT_83",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65615": {
+      "content": "SPCT_84",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65616": {
+      "content": "SPCT_85",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65617": {
+      "content": "SPCT_86",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65618": {
+      "content": "SPCT_87",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65619": {
+      "content": "SPCT_88",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65620": {
+      "content": "SPCT_89",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65621": {
+      "content": "SPCT_90",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65622": {
+      "content": "SPCT_91",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65623": {
+      "content": "SPCT_92",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65624": {
+      "content": "SPCT_93",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65625": {
+      "content": "SPCT_94",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65626": {
+      "content": "SPCT_95",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65627": {
+      "content": "SPCT_96",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65628": {
+      "content": "SPCT_97",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65629": {
+      "content": "SPCT_98",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "65630": {
+      "content": "SPCT_99",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "hf_rwkv_tokenizer.RwkvTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|rwkv_tokenizer_end_of_text|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "\n\n",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|rwkv_tokenizer_end_of_text|>",
+  "tokenizer_class": "RwkvTokenizer",
+  "unk_token": "<|rwkv_tokenizer_end_of_text|>",
+  "use_fast": false
+}

rwkv7-0.1B-g1-respark-voice-tunable-ipa/translation_data.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from tts_cli import TTSGenerator  # Demo script: synthesize a Chinese sentence, then an English one that reuses its global tokens, saving both as WAV files.
+import webrwkv_py
+import time
+from transformers import AutoTokenizer
+model_path = "/home/yueyulin/models/rwkvtts-respark-webrwkv/"  # Hard-coded local model directory; adjust for your machine.
+decoder_path = f'{model_path}/BiCodecDetokenize.onnx'
+device_idx = 0  # Passed to webrwkv_py.Model and TTSGenerator; presumably a device index (TODO confirm).
+webrwkv_model_path = f'{model_path}/webrwkv.safetensors'
+print(f"🔍 尝试加载模型文件: {webrwkv_model_path} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+model = webrwkv_py.Model(webrwkv_model_path, 'fp32', device_idx)  # 'fp32' is presumably the precision mode; confirm against webrwkv_py.
+print(f"✅ 模型加载成功: {webrwkv_model_path} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+runtime = model.create_thread_runtime()
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)  # trust_remote_code runs the tokenizer code shipped in model_path.
+print(f"✅ tokenizer 加载成功: {model_path} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+generator = TTSGenerator(runtime, tokenizer, decoder_path, device_idx, model_path)
+print(f"✅ generator 创建成功: {model_path} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+chinese_text = "一开始，很多人把这次危机比作一九八二年或一九七三年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。"
+english_text = "At the start of the crisis, many people likened it to 1982 or 1973, which was reassuring, because both dates refer to classical cyclical downturns."
+global_tokens, semantic_tokens, global_time, global_speed, semantic_time, semantic_speed = generator._generate_tokens(chinese_text,'middle-aged','male','happy','medium_pitch','medium')  # Positional args are presumably age, gender, emotion, pitch, speed; verify against TTSGenerator._generate_tokens.
+print(f"✅ 生成完成: {chinese_text} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+print(f"🎯 global_tokens: {global_tokens}")
+print(f"🎯 semantic_tokens: {semantic_tokens}")
+print(f"🎯 global_time: {global_time}")
+print(f"🎯 global_speed: {global_speed}")
+print(f"🎯 semantic_time: {semantic_time}")
+print(f"🎯 semantic_speed: {semantic_speed}")
+wav_data, audio_duration, decode_time, decode_speed = generator._decode_audio(global_tokens, semantic_tokens)
+print(f"✅ 解码完成: {audio_duration:.2f}s，耗时 {decode_time:.2f}s，速度 {decode_speed:.1f} tokens/s")
+generator._save_audio(wav_data, "chinese_text.wav", 16000)  # 16000 is presumably the sample rate in Hz (TODO confirm).
+generator.reset_runtime()  # Reset the runtime before the second generation.
+global_tokens, semantic_tokens, prefill_time, prefill_speed, semantic_time, semantic_speed = generator._generate_tokens_with_global_tokens(english_text, global_tokens)  # Reuses the global tokens generated for the Chinese text.
+print(f"✅ 生成完成: {english_text} time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}")
+print(f"🎯 global_tokens: {global_tokens}")
+print(f"🎯 semantic_tokens: {semantic_tokens}")
+print(f"🎯 prefill_time: {prefill_time}")
+print(f"🎯 prefill_speed: {prefill_speed}")
+print(f"🎯 semantic_time: {semantic_time}")
+print(f"🎯 semantic_speed: {semantic_speed}")
+wav_data, audio_duration, decode_time, decode_speed = generator._decode_audio(global_tokens, semantic_tokens)
+print(f"✅ 解码完成: {audio_duration:.2f}s，耗时 {decode_time:.2f}s，速度 {decode_speed:.1f} tokens/s")
+generator._save_audio(wav_data, "english_text.wav", 16000)

rwkv7-0.1B-g1-respark-voice-tunable-ipa/tts_cli.py ADDED Viewed

	@@ -0,0 +1,992 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+RWKV TTS 交互式音频生成工具
+使用 webrwkv_py 和 ONNX Runtime 进行音频生成
+"""
+import os
+import sys
+import re
+import time
+import warnings
+import logging
+from pathlib import Path
+from typing import Dict, Any, Tuple, List
+import numpy as np
+import soundfile as sf
+import click
+# 配置日志
+def setup_logging():
+    """设置日志配置"""
+    # 从环境变量获取日志级别，默认为WARNING
+    log_level_str = os.environ.get('LOG_LEVEL', 'INFO').upper()
+    log_level = getattr(logging, log_level_str, logging.WARNING)
+    # 配置日志格式
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+    return logging.getLogger(__name__)
+# Create the module-level logger (this also configures root logging)
+logger = setup_logging()
+# Silence UserWarning noise from third-party libraries and numpy FP warnings
+warnings.filterwarnings("ignore", category=UserWarning, module="numpy")
+warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime")
+warnings.filterwarnings("ignore", category=UserWarning, module="torch")
+warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
+np.seterr(all='ignore')
+# Check for and import the required libraries; each missing one logs
+# install instructions and terminates the process with exit code 1
+try:
+    import webrwkv_py
+    HAS_WEBRWKV = True
+except ImportError:
+    HAS_WEBRWKV = False
+    logger.error("❌ 错误: 需要安装 'webrwkv_py' 库")
+    logger.error("请运行: pip install webrwkv_py")
+    sys.exit(1)
+try:
+    import onnxruntime as ort
+    HAS_ONNX = True
+except ImportError:
+    HAS_ONNX = False
+    logger.error("❌ 错误: 需要安装 'onnxruntime' 库")
+    logger.error("请运行: pip install onnxruntime")
+    sys.exit(1)
+try:
+    from transformers import AutoTokenizer
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
+    logger.error("❌ 错误: 需要安装 'transformers' 库")
+    logger.error("请运行: pip install transformers")
+    sys.exit(1)
+# questionary is logged at warning level but is still mandatory: the
+# process exits when it cannot be imported
+try:
+    import questionary
+    HAS_QUESTIONARY = True
+except ImportError:
+    HAS_QUESTIONARY = False
+    logger.warning("⚠️  警告: 无法导入 questionary 库来使用交互式界面")
+    logger.warning("请运行: pip install questionary")
+    sys.exit(1)
+# Import the voice-property maps
+try:
+    from properties_util import (
+        SPEED_MAP, PITCH_MAP, AGE_MAP, GENDER_MAP, EMOTION_MAP
+    )
+    # Derive the selectable options from the map keys
+    age_choices = list(AGE_MAP.keys())
+    gender_choices = list(GENDER_MAP.keys())
+    emotion_choices = list(EMOTION_MAP.keys())
+    pitch_choices = list(PITCH_MAP.keys())
+    speed_choices = list(SPEED_MAP.keys())
+except ImportError:
+    logger.warning("⚠️  警告: 无法导入 properties_util，使用默认选项")
+    # Fallback options used when properties_util is unavailable
+    age_choices = ['child', 'teenager', 'youth-adult', 'middle-aged', 'elderly']
+    gender_choices = ['female', 'male']  # kept consistent with properties_util.py
+    emotion_choices = ['NEUTRAL', 'HAPPY', 'SAD', 'ANGRY', 'FEARFUL', 'DISGUSTED', 'SURPRISED']
+    pitch_choices = ['low_pitch', 'medium_pitch', 'high_pitch', 'very_high_pitch']
+    speed_choices = ['very_slow', 'slow', 'medium', 'fast', 'very_fast']
+def detect_token_lang(token: str) -> str:
+    """基于字符集合的简单词级语言检测。返回 'en' 或 'zh'。"""
+    if not token:
+        return 'en'
+    has_zh = re.search(r"[\u4e00-\u9fff]", token) is not None
+    has_en = re.search(r"[A-Za-z]", token) is not None
+    if has_zh and not has_en:
+        return 'zh'
+    if has_en and not has_zh:
+        return 'en'
+    if has_zh and has_en:
+        return 'zh'
+    return 'en'
+def sample_logits(logits, temperature=1.0, top_p=0.85, top_k=0):
+    """从logits中采样token"""
+    if temperature == 0:
+        temperature = 1.0
+        top_p = 0
+    if isinstance(logits, list):
+        logits = np.array(logits)
+    try:
+        from scipy import special
+        probs = special.softmax(logits, axis=-1)
+    except ImportError:
+        # 如果没有scipy，使用numpy的简单实现
+        exp_logits = np.exp(logits - np.max(logits))
+        probs = exp_logits / np.sum(exp_logits)
+    top_k = int(top_k)
+    sorted_ids = np.argsort(probs)
+    sorted_probs = probs[sorted_ids][::-1]
+    cumulative_probs = np.cumsum(sorted_probs)
+    cutoff_mask = cumulative_probs >= top_p
+    if np.any(cutoff_mask):
+        cutoff_idx = np.argmax(cutoff_mask)
+        cutoff = float(sorted_probs[cutoff_idx])
+        probs[probs < cutoff] = 0
+    if top_k < len(probs) and top_k > 0:
+        probs[sorted_ids[:-top_k]] = 0
+    if temperature != 1.0:
+        probs = probs ** (1.0 / temperature)
+    probs = probs / np.sum(probs)
+    out = np.random.choice(a=len(probs), size=1, p=probs)
+    return int(out[0])
+def get_unique_filename(output_dir, text, extension=".wav"):
+    """生成唯一的文件名，避免重名"""
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    prefix = text[:3] if len(text) >= 3 else text
+    prefix = re.sub(r'[\W\s]', '', prefix).strip()
+    base_name = prefix
+    index = 0
+    while True:
+        if index == 0:
+            filename = base_name + extension
+        else:
+            filename = f"{base_name}_{index}{extension}"
+        filepath = output_dir / filename
+        if not filepath.exists():
+            return str(filepath)
+        index += 1
+class TTSGenerator:
+    """TTS生成器类，负责音频生成和统计"""
+    def __init__(self, runtime, tokenizer, decoder_path, device, model_path):
+        self.runtime = runtime
+        self.tokenizer = tokenizer
+        self.decoder_path = decoder_path
+        self.device = device
+        self.model_path = model_path
+        # 初始化 RefAudioUtilities 实例
+        logger.info('🎿 开始加载音频编码器模型')
+        try:
+            audio_tokenizer_path = os.path.join(model_path, 'BiCodecTokenize.onnx')
+            wav2vec2_path = os.path.join(model_path, 'wav2vec2-large-xlsr-53.onnx')
+            from ref_audio_utilities import RefAudioUtilities
+            self.ref_audio_utilities = RefAudioUtilities(audio_tokenizer_path, wav2vec2_path)
+            logger.info('✅ 音频编码器模型加载成功')
+        except Exception as e:
+            logger.error(f'❌ 音频编码器模型加载失败: {e}')
+            self.ref_audio_utilities = None
+        # 缓存ONNX session
+        logger.info('🎿 开始加载ONNX模型')
+        try:
+            self.ort_session = ort.InferenceSession(decoder_path,
+                                                providers=['CUDAExecutionProvider','CPUExecutionProvider'])
+            logger.info(f"🖥️ONNX Session for generate wavform actual providers: {self.ort_session.get_providers()}")
+            logger.info('✅ ONNX模型加载成功')
+        except Exception as e:
+            logger.error(f'❌ ONNX模型加载失败: {e}')
+            raise
+        # 生成统计信息
+        self.generation_stats = {
+            'total_generations': 0,
+            'total_tokens': 0,
+            'total_time': 0.0,
+            'last_generation': {
+                'text': '',
+                'params': {},
+                'total_time': 0.0,
+                'total_tokens': 0,
+                'audio_duration': 0.0,
+                'rtf': 0.0,
+                'global_speed': 0.0,
+                'semantic_speed': 0.0,
+                'decode_speed': 0.0,
+                'timestamp': '',
+                'output_path': ''
+            }
+        }
+    def reset_runtime(self):
+        """重置runtime状态"""
+        try:
+            self.runtime.reset()
+            logger.info("🔄 Runtime状态已重置")
+        except Exception as e:
+            logger.warning(f"⚠️  Runtime重置失败: {e}")
+    def generate_audio(self, params: Dict[str, Any]) -> Tuple[np.ndarray, Dict[str, Any]]:
+        """生成音频"""
+        start_time = time.time()
+        # 重置runtime状态
+        self.reset_runtime()
+        # 获取参数
+        text = params['text']
+        # 检查是否为 zero shot 模式
+        if params.get('zero_shot', False):
+            # Zero shot 模式
+            ref_audio_path = params['ref_audio_path']
+            prompt_text = params.get('prompt_text', "希望你以后能够做的，比我还好呦！")
+            logger.info(f"🎯 开始生成音频 (Zero Shot 模式): {text}")
+            logger.info(f"📊 参数: 参考音频={ref_audio_path}, 提示文本={prompt_text}")
+            # 检测语言
+            lang = detect_token_lang(text)
+            logger.info(f"🌍 检测到语言: {lang}")
+            # 使用 zero shot 方法生成 tokens
+            global_tokens, semantic_tokens, semantic_time, semantic_speed = self._generate_tokens_zeroshot(text, ref_audio_path, prompt_text)
+        else:
+            # 传统模式
+            age = params['age']
+            gender = params['gender']
+            emotion = params['emotion']
+            pitch = params['pitch']
+            speed = params['speed']
+            logger.info(f"🎯 开始生成音频: {text}")
+            logger.info(f"📊 参数: 年龄={age}, 性别={gender}, 情感={emotion}, 音高={pitch}, 速度={speed}")
+            # 检测语言
+            lang = detect_token_lang(text)
+            logger.info(f"🌍 检测到语言: {lang}")
+            # 生成global tokens和semantic tokens
+            global_tokens, semantic_tokens, global_time, global_speed, semantic_time, semantic_speed = self._generate_tokens(text, age, gender, emotion, pitch, speed)
+        # 解码音频
+        logger.info("🎵 解码音频...")
+        # 使用抽象化的音频解码函数
+        wav_data, audio_duration, decode_time, decode_speed = self._decode_audio(global_tokens, semantic_tokens)
+        # 计算总耗时和RTF
+        total_time = time.time() - start_time
+        total_tokens = len(global_tokens) + len(semantic_tokens)
+        rtf = total_time / audio_duration if audio_duration > 0 else 0
+        logger.info(f"📊 总耗时: {total_time:.2f}s，RTF: {rtf:.2f}")
+        # 更新统计信息
+        self.generation_stats['total_generations'] += 1
+        self.generation_stats['total_tokens'] += total_tokens
+        self.generation_stats['total_time'] += total_time
+        self.generation_stats['last_generation'] = {
+            'text': text,
+            'params': params,
+            'total_time': total_time,
+            'total_tokens': total_tokens,
+            'audio_duration': audio_duration,
+            'rtf': rtf,
+            'semantic_speed': semantic_speed,
+            'decode_speed': decode_speed,
+            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+            'output_path': ''
+        }
+        return wav_data, self.generation_stats['last_generation']
+    def _generate_tokens(self, text: str, age: str, gender: str, emotion: str, pitch: str, speed: str) -> Tuple[List[int], List[int], float, float, float, float]:
+        """
+        生成global tokens和semantic tokens
+        Args:
+            text: 原始文本内容
+            age: 年龄参数
+            gender: 性别参数
+            emotion: 情感参数
+            pitch: 音高参数
+            speed: 速度参数
+        Returns:
+            Tuple: (global_tokens, semantic_tokens, global_time, global_speed, semantic_time, semantic_speed)
+        """
+        # 编码文本
+        logger.info("🔤 编码文本...")
+        tokens = self.tokenizer.encode(text)
+        logger.info(f"✅ 文本编码完成，共 {len(tokens)} 个token")
+        # 生成全局token
+        logger.info("🌐 生成全局token...")
+        global_start = time.time()
+        # 准备输入tokens
+        TTS_TAG_0 = 8193
+        TTS_TAG_1 = 8194
+        TTS_TAG_2 = 8195
+        # 构建属性tokens - 使用properties_util.py
+        from properties_util import convert_standard_properties_to_tokens
+        properties_text = convert_standard_properties_to_tokens(age, gender, emotion, pitch, speed)
+        logger.info(f'🔤 属性文本: {properties_text}')
+        properties_tokens = self.tokenizer.encode(properties_text, add_special_tokens=False)
+        properties_tokens = [i + 8196 + 4096 for i in properties_tokens]
+        # 构建文本tokens
+        text_tokens = [i + 8196 + 4096 for i in tokens]
+        # 组合所有tokens
+        all_idx = properties_tokens + [TTS_TAG_2] + text_tokens + [TTS_TAG_0]
+        logger.info(f'🔢 属性token: {properties_tokens}')
+        logger.info(f'🔢 文本token: {text_tokens}')
+        logger.info(f'🎯 组合后的tokens: {all_idx}')
+        # Prefill阶段
+        logger.info("💎 开始Prefill阶段...")
+        session = self.runtime.create_inference_session([all_idx],token_chunk_size=512)
+        step_count = 0
+        start = time.time()
+        while not session.is_complete():
+            step_count += 1
+            output = session.step()
+            if not output.batches[0].is_empty():
+                logits = output.batches[0].data
+                break
+        prefill_time = time.time() - start
+        logger.info(f"✅ Prefill完成，耗时 {step_count} 步")
+        logger.info(f"✅ Prefill完成，logits长度: {len(logits)}")
+        logger.info(f"✅ Prefill完成，耗时 {prefill_time:.2f}s {len(all_idx)/prefill_time:.1f} tokens/s")
+        # 生成全局token - 按照tts_gui_simple.py的逻辑
+        logger.info("🌍 开始生成全局token...")
+        global_tokens_size = 32
+        global_tokens = []
+        for i in range(global_tokens_size):
+            # 从logits中采样token
+            sampled_id = sample_logits(logits[0:4096], temperature=1.0, top_p=0.95, top_k=20)
+            global_tokens.append(sampled_id)
+            # 预测下一个token
+            sampled_id += 8196
+            logits = self.runtime.predict_next(sampled_id)
+        global_time = time.time() - global_start
+        global_speed = global_tokens_size / global_time if global_time > 0 else 0
+        logger.info(f"✅ 全局token生成完成，共 {len(global_tokens)} 个token，耗时 {global_time:.2f}s，速度 {global_speed:.1f} tokens/s")
+        logger.info(f'🎯 生成的全局token: {global_tokens}')
+        # 生成语义token
+        logger.info("🧠 生成语义token...")
+        semantic_start = time.time()
+        # 按照tts_gui_simple.py的逻辑生成语义token
+        x = self.runtime.predict_next(TTS_TAG_1)
+        semantic_tokens = []
+        for i in range(2048):  # 最大生成2048个token
+            sampled_id = sample_logits(x[0:8193], temperature=1.0, top_p=0.95, top_k=80)
+            if sampled_id == 8192:  # 遇到结束标记
+                logger.info(f"🛑 语义token生成结束，遇到结束标记，共生成 {len(semantic_tokens)} 个token")
+                break
+            semantic_tokens.append(sampled_id)
+            x = self.runtime.predict_next(sampled_id)
+        semantic_time = time.time() - semantic_start
+        semantic_speed = len(semantic_tokens) / semantic_time if semantic_time > 0 else 0
+        logger.info(f"✅ 语义token生成完成，共 {len(semantic_tokens)} 个token，耗时 {semantic_time:.2f}s，速度 {semantic_speed:.1f} tokens/s")
+        return global_tokens, semantic_tokens, global_time, global_speed, semantic_time, semantic_speed
+    def _generate_tokens_with_global_tokens(self, text: str, global_tokens: List[int]) -> Tuple[List[int], List[int], float, float, float, float]:
+        """
+        使用 global tokens 生成语义token
+        """
+        # 编码文本
+        logger.info("🔤 编码文本...")
+        text_tokens = self.tokenizer.encode(text, add_special_tokens=False)
+        text_tokens = [i + 8196 + 4096 for i in text_tokens]
+        logger.info(f"✅ 文本编码完成，共 {len(text_tokens)} 个token")
+        global_tokens = [int(i) + 8196 for i in global_tokens]
+        logger.info(f'🎯 参考音频 global_tokens: {global_tokens}')
+        start = time.time()
+        # 准备输入tokens
+        TTS_TAG_0 = 8193
+        TTS_TAG_1 = 8194
+        TTS_TAG_2 = 8195
+        # 组合所有tokens
+        all_idx = [TTS_TAG_2] + text_tokens + [TTS_TAG_0] + global_tokens + [TTS_TAG_1]
+        logger.info(f'🎯 组合后的tokens: {all_idx}')
+        # Prefill阶段
+        logger.info("💎 开始Prefill阶段...")
+        session = self.runtime.create_inference_session([all_idx],token_chunk_size=512)
+        step_count = 0
+        while not session.is_complete():
+            step_count += 1
+            output = session.step()
+            if not output.batches[0].is_empty():
+                logits = output.batches[0].data[0]
+                break
+        logger.info(f"✅ Prefill完成，耗时 {step_count} 步")
+        logger.info(f"✅ Prefill完成，速度 {step_count/output.time:.1f} tokens/s")
+        logger.info(f"✅ Prefill完成，logits长度: {len(logits)}")
+        prefill_time = time.time() - start
+        prefill_speed = len(all_idx) / prefill_time if prefill_time > 0 else 0
+        logger.info(f"✅ Prefill完成，耗时 {prefill_time:.2f}s，速度 {prefill_speed:.1f} tokens/s")
+        # 生成语义token
+        logger.info("🧠 生成语义token...")
+        semantic_start = time.time()
+        # 从当前logits开始生成语义token
+        x = logits
+        semantic_tokens = []
+        for i in range(2048):  # 最大生成2048个token
+            sampled_id = sample_logits(x[0:8193], temperature=1.0, top_p=0.95, top_k=80)
+            if sampled_id == 8192:  # 遇到结束标记
+                logger.info(f"🛑 语义token生成结束，遇到结束标记，共生成 {len(semantic_tokens)} 个token")
+                break
+            semantic_tokens.append(sampled_id)
+            x = self.runtime.predict_next(sampled_id)
+        semantic_time = time.time() - semantic_start
+        semantic_speed = len(semantic_tokens) / semantic_time if semantic_time > 0 else 0
+        logger.info(f"✅ 语义token生成完成，共 {len(semantic_tokens)} 个token，耗时 {semantic_time:.2f}s，速度 {semantic_speed:.1f} tokens/s")
+        return global_tokens, semantic_tokens, prefill_time, prefill_speed, semantic_time, semantic_speed
+    def _generate_tokens_zeroshot(self, text: str, ref_audio_path: str, prompt_text: str = "希望你以后能够做的，比我还好呦！") -> Tuple[List[int], List[int], float, float, float, float]:
+        """
+        使用 zero shot 方式生成global tokens和semantic tokens
+        Args:
+            text: 原始文本内容
+            ref_audio_path: 参考音频路径
+            prompt_text: 提示文本，默认为"希望你以后能够做的，比我还好呦！"
+        Returns:
+            Tuple: (global_tokens, semantic_tokens, global_time, global_speed, semantic_time, semantic_speed)
+        """
+        if self.ref_audio_utilities is None:
+            raise RuntimeError("RefAudioUtilities 未初始化，无法使用 zero shot 模式")
+        # 编码文本
+        logger.info("🔤 编码文本...")
+        text_tokens = self.tokenizer.encode(prompt_text + text, add_special_tokens=False)
+        text_tokens = [i + 8196 + 4096 for i in text_tokens]
+        logger.info(f"✅ 文本编码完成，共 {len(text_tokens)} 个token")
+        # 从参考音频获取 global tokens 和 semantic tokens
+        logger.info("🎵 处理参考音频...")
+        global_tokens, prompt_semantic_tokens = self.ref_audio_utilities.tokenize(ref_audio_path)
+        logger.info(f"✅ 参考音频处理完成")
+        # 直接使用flatten()展平数组并转换为Python一维数组
+        global_tokens = [int(i) + 8196 for i in global_tokens.flatten()]
+        prompt_semantic_tokens = [int(i) for i in prompt_semantic_tokens.flatten()]
+        logger.info(f'🎯 参考音频 global_tokens: {global_tokens}')
+        logger.info(f'🎯 参考音频 semantic_tokens: {prompt_semantic_tokens}')
+        # 生成全局token
+        logger.info("🌐 生成全局token...")
+        global_start = time.time()
+        # 准备输入tokens
+        TTS_TAG_0 = 8193
+        TTS_TAG_1 = 8194
+        TTS_TAG_2 = 8195
+        # 组合所有tokens
+        all_idx = [TTS_TAG_2] + text_tokens + [TTS_TAG_0] + global_tokens + [TTS_TAG_1] + prompt_semantic_tokens
+        logger.info(f'🎯 组合后的tokens: {all_idx}')
+        # Prefill阶段
+        logger.info("💎 开始Prefill阶段...")
+        session = self.runtime.create_inference_session([all_idx],token_chunk_size=512)
+        step_count = 0
+        start = time.time()
+        while not session.is_complete():
+            step_count += 1
+            output = session.step()
+            if not output.batches[0].is_empty():
+                logits = output.batches[0].data
+                break
+        prefill_time = time.time() - start
+        logger.info(f"✅ Prefill完成，logits长度: {len(logits)}")
+        logger.info(f"✅ Prefill完成，耗时 {step_count} 步")
+        logger.info(f"✅ Prefill完成，耗时 {prefill_time:.2f}s {len(all_idx)/prefill_time:.1f} tokens/s")
+        # 生成语义token
+        logger.info("🧠 生成语义token...")
+        semantic_start = time.time()
+        # 从当前logits开始生成语义token
+        x = logits
+        semantic_tokens = []
+        for i in range(2048):  # 最大生成2048个token
+            sampled_id = sample_logits(x[0:8193], temperature=1.0, top_p=0.95, top_k=80)
+            if sampled_id == 8192:  # 遇到结束标记
+                logger.info(f"🛑 语义token生成结束，遇到结束标记，共生成 {len(semantic_tokens)} 个token")
+                break
+            semantic_tokens.append(sampled_id)
+            x = self.runtime.predict_next(sampled_id)
+        semantic_time = time.time() - semantic_start
+        semantic_speed = len(semantic_tokens) / semantic_time if semantic_time > 0 else 0
+        logger.info(f"✅ 语义token生成完成，共 {len(semantic_tokens)} 个token，耗时 {semantic_time:.2f}s，速度 {semantic_speed:.1f} tokens/s")
+        global_tokens = [i - 8196 for i in global_tokens]
+        return global_tokens, semantic_tokens, semantic_time, semantic_speed
+    def _decode_audio(self, global_tokens: List[int], semantic_tokens: List[int]) -> Tuple[np.ndarray, float, float, float]:
+        """
+        解码音频的核心函数
+        Args:
+            global_tokens: 全局tokens列表
+            semantic_tokens: 语义tokens列表
+        Returns:
+            Tuple: (wav_data, audio_duration, decode_time, decode_speed)
+        """
+        # 开始计时
+        decode_start = time.time()
+        # 准备输入数据
+        logger.info("🔧 准备解码器输入数据...")
+        global_tokens_array = np.array(global_tokens, dtype=np.int64).reshape(1, 1, -1)
+        semantic_tokens_array = np.array(semantic_tokens, dtype=np.int64).reshape(1, -1)
+        logger.info(f'🎯 生成的全局token: {global_tokens}')
+        logger.info(f'🎯 生成的语义token: {semantic_tokens}')
+        logger.info(f'📊 解码器输入形状: global_tokens={global_tokens_array.shape}, semantic_tokens={semantic_tokens_array.shape}')
+        # 使用ONNX解码器生成音频
+        logger.info("🎵 开始ONNX解码器推理...")
+        outputs = self.ort_session.run(None, {
+                "global_tokens": global_tokens_array,
+                "semantic_tokens": semantic_tokens_array
+            })
+        wav_data = outputs[0].reshape(-1)
+        decode_time = time.time() - decode_start
+        # 计算音频时长和解码速度
+        audio_duration = len(wav_data) / 16000  # 采样率16kHz
+        decode_speed = len(semantic_tokens) / decode_time if decode_time > 0 else 0
+        logger.info(f"✅ 音频解码完成，时长 {audio_duration:.2f}s，耗时 {decode_time:.2f}s，速度 {decode_speed:.1f} tokens/s")
+        return wav_data, audio_duration, decode_time, decode_speed
+    def _save_audio(self, wav_data: np.ndarray, output_path: str, sample_rate: int = 16000) -> bool:
+        """
+        保存音频文件
+        Args:
+            wav_data: 音频数据
+            output_path: 输出文件路径
+            sample_rate: 采样率，默认16kHz
+        Returns:
+            bool: 保存是否成功
+        """
+        try:
+            sf.write(output_path, wav_data, sample_rate)
+            logger.info(f"💾 音频保存成功: {output_path}")
+            return True
+        except Exception as e:
+            logger.error(f"❌ 音频保存失败: {e}")
+            return False
+def display_stats(stats: Dict[str, Any]):
+    """显示生成统计信息"""
+    logger.info("\n" + "="*60)
+    logger.info("📊 生成统计信息")
+    logger.info("="*60)
+    if stats['text']:
+        logger.info(f"🎯 生成参数: {stats['params']}")
+        logger.info(f"📝 文本: {stats['text']}")
+        logger.info(f"⏱️  总耗时: {stats['total_time']:.2f}s")
+        logger.info(f"🎵 音频时长: {stats['audio_duration']:.2f}s")
+        logger.info(f"📈 RTF: {stats['rtf']:.2f}")
+        logger.info(f"🔢 总token数: {stats['total_tokens']}")
+        logger.info(f"🧠 语义token速度: {stats['semantic_speed']:.1f} tokens/s")
+        logger.info(f"🎵 解码速度: {stats['decode_speed']:.1f} tokens/s")
+        logger.info(f"🕐 时间: {stats['timestamp']}")
+        if stats['output_path']:
+            logger.info(f"💾 保存路径: {stats['output_path']}")
+    else:
+        logger.info("暂无生成记录")
+    logger.info("="*60)
+def interactive_parameter_selection(generator: TTSGenerator):
+    """交互式参数选择界面"""
+    logger.info("\n🎮 进入交互式配置界面")
+    logger.info("💡 使用方向键选择，回车确认，Ctrl+C退出")
+    while True:
+        try:
+            logger.info("\n" + "="*60)
+            logger.info("🎵 RWKV TTS 参数配置")
+            logger.info("="*60)
+            # 选择生成模式
+            generation_mode = questionary.select(
+                "🎯 请选择生成模式:",
+                choices=[
+                    "传统模式 (使用属性参数)",
+                    "Zero Shot 模式 (使用参考音频)"
+                ],
+                default="传统模式 (使用属性参数)"
+            ).ask()
+            if generation_mode is None:  # 用户按Ctrl+C
+                break
+            is_zero_shot = generation_mode == "Zero Shot 模式 (使用参考音频)"
+            # 文本输入
+            text = questionary.text(
+                "📝 请输入要转换的文本:",
+                default=generator.generation_stats['last_generation'].get('text', '你好，世界！')
+            ).ask()
+            if text is None:  # 用户按Ctrl+C
+                break
+            # 输出目录
+            output_dir = questionary.text(
+                "📁 请输入输出目录:",
+                default="./generated_audio"
+            ).ask()
+            if output_dir is None:
+                break
+            if is_zero_shot:
+                # Zero Shot 模式参数
+                ref_audio_path = questionary.text(
+                    "🎵 请输入参考音频路径:",
+                    default="zero_shot_prompt.wav"
+                ).ask()
+                if ref_audio_path is None:
+                    break
+                prompt_text = questionary.text(
+                    "💬 请输入提示文本 (可选，回车使用默认值):",
+                    default="希望你以后能够做的，能比我还好呦！"
+                ).ask()
+                if prompt_text is None:
+                    break
+                # 确认生成
+                confirm = questionary.confirm(
+                    f"🚀 确认生成音频 (Zero Shot 模式)?\n"
+                    f"文本: {text}\n"
+                    f"参考音频: {ref_audio_path}\n"
+                    f"提示文本: {prompt_text}\n"
+                    f"输出目录: {output_dir}",
+                    default=True
+                ).ask()
+                if confirm:
+                    # 准备参数
+                    params = {
+                        'text': text,
+                        'zero_shot': True,
+                        'ref_audio_path': ref_audio_path,
+                        'prompt_text': prompt_text,
+                        'output_dir': output_dir
+                    }
+                    # 生成音频
+                    try:
+                        wav_data, stats = generator.generate_audio(params)
+                        # 生成唯一文件名
+                        output_path = get_unique_filename(output_dir, text)
+                        # 保存音频
+                        if generator._save_audio(wav_data, output_path, 16000):
+                            stats['output_path'] = output_path
+                        else:
+                            logger.warning("⚠️ 音频保存失败，但生成统计已更新")
+                        logger.info(f"✅ 音频生成成功，保存至: {output_path}")
+                        stats['生成参数'] = f'参考音频={ref_audio_path}, 提示文本={prompt_text}'
+                        # 显示统计信息
+                        display_stats(stats)
+                    except Exception as e:
+                        logger.error(f"❌ 生成失败: {e}")
+                        import traceback
+                        traceback.print_exc()
+            else:
+                # 传统模式参数
+                # 年龄选择
+                age = questionary.select(
+                    "👶 请选择年龄:",
+                    choices=age_choices,
+                    default=age_choices[3]  # middle-aged
+                ).ask()
+                if age is None:
+                    break
+                # 性别选择
+                gender = questionary.select(
+                    "👤 请选择性别:",
+                    choices=gender_choices,
+                    default=gender_choices[0]  # female (第一个选项)
+                ).ask()
+                if gender is None:
+                    break
+                # 情感选择
+                emotion = questionary.select(
+                    "😊 请选择情感:",
+                    choices=emotion_choices,
+                    default=emotion_choices[1]  # NEUTRAL
+                ).ask()
+                if emotion is None:
+                    break
+                # 音高选择
+                pitch = questionary.select(
+                    "🎵 请选择音高:",
+                    choices=pitch_choices,
+                    default=pitch_choices[1]  # medium_pitch
+                ).ask()
+                if pitch is None:
+                    break
+                # 速度选择
+                speed = questionary.select(
+                    "⚡ 请选择速度:",
+                    choices=speed_choices,
+                    default=speed_choices[2]  # medium
+                ).ask()
+                if speed is None:
+                    break
+                # 确认生成
+                confirm = questionary.confirm(
+                    f"🚀 确认生成音频?\n"
+                    f"文本: {text}\n"
+                    f"参数: 年龄={age}, 性别={gender}, 情感={emotion}, 音高={pitch}, 速度={speed}\n"
+                    f"输出目录: {output_dir}",
+                    default=True
+                ).ask()
+                if confirm:
+                    # 准备参数
+                    params = {
+                        'text': text,
+                        'zero_shot': False,
+                        'age': age,
+                        'gender': gender,
+                        'emotion': emotion,
+                        'pitch': pitch,
+                        'speed': speed,
+                        'output_dir': output_dir
+                    }
+                    # 生成音频
+                    try:
+                        wav_data, stats = generator.generate_audio(params)
+                        # 生成唯一文件名
+                        output_path = get_unique_filename(output_dir, text)
+                        # 保存音频
+                        if generator._save_audio(wav_data, output_path, 16000):
+                            stats['output_path'] = output_path
+                        else:
+                            logger.warning("⚠️ 音频保存失败，但生成统计已更新")
+                        logger.info(f"✅ 音频生成成功，保存至: {output_path}")
+                        stats['生成参数'] = f'年龄={age}, 性别={gender}, 情感={emotion}, 音高={pitch}, 速度={speed}'
+                        # 显示统计信息
+                        display_stats(stats)
+                    except Exception as e:
+                        logger.error(f"❌ 生成失败: {e}")
+                        import traceback
+                        traceback.print_exc()
+            # 询问是否继续
+            continue_generation = questionary.confirm(
+                "🔄 是否继续生成音频?",
+                default=True
+            ).ask()
+            if not continue_generation:
+                break
+        except KeyboardInterrupt:
+            logger.info("\n👋 用户中断，退出程序")
+            break
+        except Exception as e:
+            logger.error(f"❌ 发生错误: {e}")
+            import traceback
+            traceback.print_exc()
+            break
+    logger.info("👋 感谢使用 RWKV TTS!")
+@click.command()
+@click.option('--model_path', required=True, help='RWKV模型路径')
+def main(model_path):
+    """RWKV TTS 主程序"""
+    # NOTE: the docstring above is intentionally left untouched -- click uses a
+    # command's docstring as its --help text, so it is user-facing output.
+    #
+    # Entry point of the interactive TTS tool. It validates the model directory,
+    # asks the user to pick a compute device, loads the webrwkv model, its
+    # runtime and the tokenizer, and finally hands control to the interactive
+    # parameter-selection loop. Every failure is logged and ends the command
+    # with a plain `return`; no exception is propagated from these steps.
+    #
+    # Args:
+    #     model_path: directory expected to contain webrwkv.safetensors,
+    #         BiCodecDetokenize.onnx and the tokenizer files.
+    logger.info("🚀 欢迎使用 RWKV TTS 交互式音频生成工具!")
+    # Make sure the model path exists.
+    if not os.path.exists(model_path):
+        logger.error(f"❌ 错误: 模型路径不存在: {model_path}")
+        return
+    # Derive the decoder path from the model directory.
+    decoder_path = os.path.join(model_path, "BiCodecDetokenize.onnx")
+    logger.info(f"🔍 自动设置解码器路径: {decoder_path}")
+    # List the contents of the model directory (diagnostic output only).
+    logger.info(f"🔍 检查模型目录: {model_path}")
+    try:
+        model_files = os.listdir(model_path)
+        logger.info(f"📁 模型目录中的文件:")
+        for file in model_files:
+            file_path = os.path.join(model_path, file)
+            if os.path.isfile(file_path):
+                size = os.path.getsize(file_path)
+                logger.info(f"   📄 {file} ({size:,} bytes)")
+            else:
+                logger.info(f"   📁 {file}/")
+    except Exception as e:
+        # A failed listing is not fatal: it is only logged as a warning.
+        logger.warning(f"⚠️  无法列出模型目录内容: {e}")
+    # The decoder file, in contrast, is mandatory.
+    if not os.path.exists(decoder_path):
+        logger.error(f"❌ 错误: 解码器路径不存在: {decoder_path}")
+        return
+    # Select the device.
+    logger.info("\n💎 选择设备 💎")
+    try:
+        devices = webrwkv_py.get_available_adapters_py()
+    except Exception as e:
+        logger.error(f"❌ 无法获取可用设备列表: {e}")
+        return
+    # Show each adapter with the index the user has to type.
+    for i, device in enumerate(devices):
+        logger.info(f"{i}: {device}")
+    device_choice = input("请选择设备: ")
+    try:
+        device_idx = int(device_choice)
+        if device_idx < 0 or device_idx >= len(devices):
+            logger.error("❌ 无效的设备选择")
+            return
+        # `device` (previously the loop variable) now holds the chosen adapter.
+        device = devices[device_idx]
+        logger.info(f"✅ 选择设备: {device}")
+    except ValueError:
+        # Raised by int() when the input is not an integer.
+        logger.error("❌ 无效的设备选择")
+        return
+    # Load the model.
+    logger.info("\n💎 加载模型 💎")
+    try:
+        # Candidate model file names, tried in order (currently only one).
+        possible_model_files = [
+            'webrwkv.safetensors',
+        ]
+        webrwkv_model_path = None
+        for model_file in possible_model_files:
+            test_path = os.path.join(model_path, model_file)
+            if os.path.exists(test_path):
+                webrwkv_model_path = test_path
+                logger.info(f"✅ 找到模型文件: {model_file}")
+                break
+        if webrwkv_model_path is None:
+            logger.error(f"❌ 未找到模型文件")
+            logger.info(f"💡 请检查模型目录 {model_path} 中是否包含以下文件之一:")
+            for model_file in possible_model_files:
+                logger.info(f"   - {model_file}")
+            return
+        logger.info(f"🔍 尝试加载模型文件: {webrwkv_model_path}")
+        # Try the new API.
+        # NOTE(review): 'fp32' is presumably the precision and device_idx the
+        # adapter index selected above -- confirm against the webrwkv_py API.
+        model = webrwkv_py.Model(webrwkv_model_path, 'fp32', device_idx)
+        logger.info(f"✅ 模型加载成功: {webrwkv_model_path}")
+    except Exception as e:
+        # webrwkv_model_path is bound at the top of the try block, before any
+        # call that can raise, so it is always available for this message.
+        logger.error(f"❌ 模型加载失败: {e}")
+        logger.info(f"💡 请检查:")
+        logger.info(f"   1. 模型文件路径是否正确: {webrwkv_model_path}")
+        logger.info(f"   2. 模型文件是否完整")
+        logger.info(f"   3. 设备索引是否正确: {device_idx}")
+        logger.info(f"   4. 模型文件格式是否支持")
+        return
+    # Create the runtime.
+    logger.info("\n💎 创建 runtime 💎")
+    try:
+        runtime = model.create_thread_runtime()
+        logger.info("✅ runtime 创建成功")
+    except Exception as e:
+        logger.error(f"❌ runtime 创建失败: {e}")
+        return
+    # Load the tokenizer.
+    logger.info("\n💎 加载 tokenizer 💎")
+    try:
+        # trust_remote_code=True lets transformers run tokenizer code shipped
+        # inside model_path, so only point this at a trusted directory.
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        logger.info(f"✅ tokenizer 加载成功: {model_path}")
+    except Exception as e:
+        logger.error(f"❌ tokenizer 加载失败: {e}")
+        logger.info(f"💡 请检查模型目录 {model_path} 中是否包含正确的tokenizer文件")
+        return
+    # Create the TTS generator.
+    generator = TTSGenerator(runtime, tokenizer, decoder_path, device, model_path)
+    # Start the interactive interface.
+    logger.info("\n🎯 启动交互式配置界面...")
+    interactive_parameter_selection(generator)
+# Run the click command only when this file is executed as a script;
+# click parses --model_path from the command line.
+if __name__ == "__main__":
+    main()

rwkv7-0.1B-g1-respark-voice-tunable-ipa/utilities.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import os
+import json
+import torch
+import numpy as np
+from transformers import AutoTokenizer
+from properties_util import convert_standard_properties_to_tokens
+def print_properties_info(age: str, gender: str, emotion: str, pitch: float, speed: float):
+    """
+    打印属性信息的辅助函数
+    Args:
+        age: 年龄
+        gender: 性别
+        emotion: 情感
+        pitch: 音调
+        speed: 速度
+    """
+    print(f'age: {age}, gender: {gender}, emotion: {emotion}, pitch: {pitch}, speed: {speed}')
+@torch.inference_mode()
+def extract_embeddings_for_global_tokens(model, tokenizer, text, age: str, gender: str, emotion: str, pitch: float, speed: float,global_tokens: list = None):
+    """
+    提取生成全局tokens所需的embedding
+    Args:
+        model: 模型实例
+        tokenizer: 分词器
+        text: 输入文本
+        age: 年龄
+        gender: 性别
+        emotion: 情感
+        pitch: 音调
+        speed: 速度
+        global_tokens: 全局tokens
+    Returns:
+        torch.Tensor: 拼接后的完整embedding
+    """
+    device = (next(model.parameters()).device)
+    properties_tokens = convert_standard_properties_to_tokens(age, gender, emotion, pitch, speed)
+    text_tokens = tokenizer.encode(text, add_special_tokens=False)
+    properties_tokens = tokenizer.encode(properties_tokens, add_special_tokens=False)
+    text_tokens_tensor = torch.tensor(text_tokens, dtype=torch.long, device=device)
+    properties_tokens_tensor = torch.tensor(properties_tokens, dtype=torch.long, device=device)
+    text_embs = model.text_embedder(text_tokens_tensor)
+    properties_embs = model.text_embedder(properties_tokens_tensor)
+    tag_0_emb = model.tts_tag_embedder(torch.tensor([0], dtype=torch.long, device=device))
+    tag_1_emb = model.tts_tag_embedder(torch.tensor([1], dtype=torch.long, device=device))
+    tag_2_emb = model.tts_tag_embedder(torch.tensor([2], dtype=torch.long, device=device))
+    full_embs_for_sample = torch.cat([
+        properties_embs,
+        tag_2_emb, text_embs, tag_0_emb,
+    ], dim=0)
+    if global_tokens is not None:
+        global_tokens_tensor = torch.tensor(global_tokens, dtype=torch.long, device=device)
+        global_embs = model.global_embedder(global_tokens_tensor)
+        full_embs_for_sample = torch.cat([
+            full_embs_for_sample,
+            global_embs,
+            tag_1_emb
+        ], dim=0)
+    return full_embs_for_sample
+def get_tokenizer(model_dir):
+    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+    special_tokens = {
+            'pad_token': '<|rwkv_tokenizer_end_of_text|>',
+            'additional_special_tokens': [
+                '<|endofprompt|>',
+                '[breath]', '<strong>', '</strong>', '[noise]',
+                '[laughter]', '[cough]', '[clucking]', '[accent]',
+                '[quick_breath]',
+                "<laughter>", "</laughter>",
+                "[hissing]", "[sigh]", "[vocalized-noise]",
+                "[lipsmack]", "[mn]"
+            ]
+        }
+    tokenizer.add_special_tokens(special_tokens)
+    return tokenizer
+def get_respark_tts_tokenizer(model_dir):
+    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+    original_vocab_size = tokenizer.vocab_size
+    added_tokens_file = os.path.join(os.path.dirname(__file__),'spark_tts_added_tokens.json')
+    with open(added_tokens_file, 'r') as f:
+        added_tokens = json.load(f)
+    tokenizer.add_special_tokens(added_tokens)
+    return tokenizer,original_vocab_size
+@torch.inference_mode()
+def generate_global_tokens(model, tokenizer, text, age: str, gender: str, emotion: str, pitch: float, speed: float,
+                           num_global_tokens: int = 4096):
+    full_embs_for_sample = extract_embeddings_for_global_tokens(model, tokenizer, text, age, gender, emotion, pitch, speed)
+    device = full_embs_for_sample.device
+    vocab_size = model.config.vocab_size
+    eos_token_id = vocab_size - 1
+    suppress_tokens = [id for id in range(num_global_tokens,vocab_size)]
+    gen_args = {
+        "inputs_embeds":full_embs_for_sample.unsqueeze(0),
+        "attention_mask":torch.ones((1, full_embs_for_sample.shape[1]),dtype=torch.long,device=device),
+        "max_new_tokens":32,
+        "min_new_tokens":32,
+        "do_sample":True,
+        "top_k":50,
+        "top_p":0.95,
+        "temperature":1.0,
+        "eos_token_id":eos_token_id,
+        "pad_token_id":tokenizer.pad_token_id,
+        "use_cache":True,
+        "suppress_tokens":suppress_tokens,
+        "return_dict_in_generate":True,
+    }
+    generated_outputs = model.generate(**gen_args)
+    return generated_outputs
+@torch.inference_mode()
+def generate_input_embeddings(model,tokenizer,text,global_tokens):
+    device = (next(model.parameters()).device)
+    text_tokens = tokenizer.encode(text, add_special_tokens=False)
+    text_tokens_tensor = torch.tensor(text_tokens, dtype=torch.long, device=device)
+    text_embs = model.text_embedder(text_tokens_tensor)
+    global_tokens_tensor = torch.tensor(global_tokens, dtype=torch.long, device=device)
+    global_embs = model.global_embedder(global_tokens_tensor)
+    tag_0_emb = model.tts_tag_embedder(torch.tensor([0], dtype=torch.long, device=device))
+    tag_1_emb = model.tts_tag_embedder(torch.tensor([1], dtype=torch.long, device=device))
+    tag_2_emb = model.tts_tag_embedder(torch.tensor([2], dtype=torch.long, device=device))
+    input_embs = torch.cat([tag_2_emb,text_embs,tag_0_emb,global_embs,tag_1_emb],dim=0)
+    return input_embs
+def generate_embeddings(model, tokenizer, text, bicodec, prompt_text=None, prompt_audio=None):
+    """
+    为 Spark LLM 生成预测所需的输入嵌入
+    Args:
+        model: Spark LLM 模型
+        tokenizer: 文本分词器
+        text: 要生成语音的文本
+        bicodec: BiCodecTokenizer 实例
+        prompt_text: 提示文本（可选）
+        prompt_audio: 提示音频数组（可选）
+    Returns:
+        dict: 包含 input_embs 的字典，用于模型预测
+    """
+    device = next(model.parameters()).device
+    # 1. 处理提示音频，提取 global_tokens 和 semantic_tokens
+    if prompt_audio is not None:
+        # 确保音频数据是 float32 类型
+        audio_data = np.array(prompt_audio, dtype=np.float32)
+        target_sample_rate = bicodec.config['sample_rate']
+        # 检查是否需要重采样
+        # 注意：这里假设 prompt_audio 已经是从 soundfile 加载的，采样率信息在外部处理
+        # BiCodecTokenizer 期望 16kHz 采样率的音频
+        print(f"BiCodecTokenizer 期望的采样率: {target_sample_rate}Hz")
+        print(f"音频数据形状: {audio_data.shape}")
+        # 使用 BiCodec 提取 tokens (返回顺序: global_tokens, semantic_tokens)
+        global_tokens, semantic_tokens = bicodec.tokenize(audio_data)
+        global_tokens = global_tokens.squeeze(0).squeeze(0).detach().cpu().tolist()
+        semantic_tokens = semantic_tokens.squeeze(0).squeeze(0).detach().cpu().tolist()
+    else:
+        global_tokens = []
+        semantic_tokens = []
+    # 2. 处理文本
+    if prompt_text is not None:
+        # 连接提示文本和目标文本
+        full_text = prompt_text + text
+        # 初始的 semantic tokens 等于 prompt_audio 提取的 semantic tokens
+        initial_semantic_tokens = semantic_tokens.copy()
+    else:
+        full_text = text
+        initial_semantic_tokens = []
+    # 3. 获取文本 tokens
+    text_tokens = tokenizer.encode(full_text, add_special_tokens=False)
+    # 4. 转换为张量
+    text_tokens_tensor = torch.tensor(text_tokens, dtype=torch.long, device=device)
+    global_tokens_tensor = torch.tensor(global_tokens, dtype=torch.long, device=device)
+    semantic_tokens_tensor = torch.tensor(initial_semantic_tokens, dtype=torch.long, device=device)
+    # 5. 获取嵌入
+    text_embs = model.text_embedder(text_tokens_tensor)
+    global_embs = model.global_embedder(global_tokens_tensor)
+    semantic_embs = model.model.embeddings(semantic_tokens_tensor)
+    # 6. 获取特殊标记嵌入
+    tag_0_emb = model.tts_tag_embedder(torch.tensor([0], dtype=torch.long, device=device))
+    tag_1_emb = model.tts_tag_embedder(torch.tensor([1], dtype=torch.long, device=device))
+    tag_2_emb = model.tts_tag_embedder(torch.tensor([2], dtype=torch.long, device=device))
+    # 7. 连接嵌入
+    input_embs = torch.cat([
+        tag_2_emb,
+        text_embs,
+        tag_0_emb,
+        global_embs,
+        tag_1_emb,
+        semantic_embs
+    ], dim=0)
+    # 8. 添加批次维度
+    input_embs = input_embs.unsqueeze(0)  # [1, seq_len, hidden_size]
+    return {
+        "input_embs": input_embs,
+        "global_tokens": global_tokens_tensor,
+    }

rwkv7-0.1B-g1-respark-voice-tunable-ipa/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

rwkv7-0.1B-g1-respark-voice-tunable-ipa/wav2vec2-large-xlsr-53.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0947d5aed2023e06b07a0180549e64a48977863b20f1156cbf33fd97ab6e3ad6
+size 858969041

rwkv7-0.1B-g1-respark-voice-tunable-ipa/webrwkv.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4136957bbb6335e26a0e4c073d9d87858fa9e85293be600ecfe2243e9fe6bf12
+size 420157752