MCplayer committed on
Commit
0b4c806
·
1 Parent(s): 4b0005e

pre-release version

Browse files
README.md CHANGED
@@ -2,25 +2,68 @@
2
  license: apache-2.0
3
  ---
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  ```python
7
  import torchaudio
8
  from transformers import AutoFeatureExtractor, AutoModel
9
 
10
- wav_form, sampling_rate = torchaudio.load("examples/zh_spk1_moon.wav")
11
  feature_extractor = AutoFeatureExtractor.from_pretrained("MCplayer/XY_Tokenizer", trust_remote_code=True)
12
  codec = AutoModel.from_pretrained("MCplayer/XY_Tokenizer", trust_remote_code=True, device_map="auto").eval()
13
 
 
 
 
14
  if sampling_rate != 16000:
15
- resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
16
- wav_form = resampler(wav_form)
17
 
 
18
  input_spectrum = feature_extractor(wav_form, sampling_rate=16000, return_attention_mask=True, return_tensors="pt")
 
19
  code = codec.encode(input_spectrum)
20
 
 
 
21
  output_wav = codec.decode(code["audio_codes"], overlap_seconds=10)
22
- for i, audio in enumerate(output_wav["audio_values"]):
23
- torchaudio.save(f"outputs/audio{i}.wav", audio.cpu(), 24000)
24
 
 
 
 
25
 
26
- ```
 
2
  license: apache-2.0
3
  ---
4
 
5
+ # **Introduction**
6
+
7
+ **`XY-Tokenizer`** is a speech codec that simultaneously models both semantic and acoustic aspects of speech, converting audio into discrete tokens and decoding them back to high-quality audio. It achieves efficient speech representation at only 1kbps with RVQ8 quantization at 12.5Hz frame rate.
8
+
9
+ - **Paper:** [Read on arXiv](https://arxiv.org/pdf/2506.23325)
10
+ - **Source Code:**
11
+ - [GitHub Repo](https://github.com/OpenMOSS/MOSS-TTSD/tree/main/XY_Tokenizer)
12
+ - [Hugging Face Repo](https://huggingface.co/spaces/fnlp/MOSS-TTSD/tree/main/XY_Tokenizer)
13
+
14
+ ## 📚 Related Project: **`MOSS-TTSD`**
15
+
16
+ **`XY-Tokenizer`** serves as the underlying neural codec for **`MOSS-TTSD`**, our 1.7B Audio Language Model. \
17
+ Explore **`MOSS-TTSD`** for advanced text-to-speech and other audio generation tasks on [GitHub](https://github.com/OpenMOSS/MOSS-TTSD), [Blog](http://www.open-moss.com/en/moss-ttsd/), [博客](https://www.open-moss.com/cn/moss-ttsd/), and [Space Demo](https://huggingface.co/spaces/fnlp/MOSS-TTSD).
18
+
19
+ ## ✨ Features
20
+
21
+ - **Dual-channel modeling**: Simultaneously captures semantic meaning and acoustic details
22
+ - **Efficient representation**: 1kbps bitrate with RVQ8 quantization at 12.5Hz
23
+ - **High-quality audio tokenization**: Convert speech to discrete tokens and back with minimal quality loss
24
+ - **Long audio support**: Process audio files longer than 30 seconds using chunking with overlap
25
+ - **Batch processing**: Efficiently process multiple audio files in batches
26
+ - **24kHz output**: Generate high-quality 24kHz audio output
27
+
28
+
29
+ ## 🚀 Installation
30
+
31
+ ```bash
32
+ git clone https://github.com/OpenMOSS/MOSS-TTSD.git
33
+ cd MOSS-TTSD
34
+ conda create -n xy_tokenizer python=3.10 -y && conda activate xy_tokenizer
35
+ pip install -r XY_Tokenizer/requirements.txt
36
+ ```
37
+
38
+ ## 💻 Quick Start
39
+
40
+ Here's how to use **`XY-Tokenizer`** with `transformers` to encode an audio file into discrete tokens and decode it back into a waveform.
41
 
42
  ```python
43
  import torchaudio
44
  from transformers import AutoFeatureExtractor, AutoModel
45
 
46
+ # 1. Load the feature extractor and the codec model
47
  feature_extractor = AutoFeatureExtractor.from_pretrained("MCplayer/XY_Tokenizer", trust_remote_code=True)
48
  codec = AutoModel.from_pretrained("MCplayer/XY_Tokenizer", trust_remote_code=True, device_map="auto").eval()
49
 
50
+ # 2. Load and preprocess the audio
51
+ # The model expects a 16kHz sample rate.
52
+ wav_form, sampling_rate = torchaudio.load("examples/zh_spk1_moon.wav")
53
  if sampling_rate != 16000:
54
+ wav_form = torchaudio.functional.resample(wav_form, orig_freq=sampling_rate, new_freq=16000)
 
55
 
56
+ # 3. Encode the audio into discrete codes
57
  input_spectrum = feature_extractor(wav_form, sampling_rate=16000, return_attention_mask=True, return_tensors="pt")
58
+ # The 'code' dictionary contains the discrete audio codes
59
  code = codec.encode(input_spectrum)
60
 
61
+ # 4. Decode the codes back to an audio waveform
62
+ # The output is high-quality 24kHz audio.
63
  output_wav = codec.decode(code["audio_codes"], overlap_seconds=10)
 
 
64
 
65
+ # 5. Save the reconstructed audio
66
+ for i, audio in enumerate(output_wav["audio_values"]):
67
+ torchaudio.save(f"outputs/audio_{i}.wav", audio.cpu(), 24000)
68
 
69
+ ```
config.json CHANGED
@@ -21,7 +21,6 @@
21
  "padding_side": "right",
22
  "padding_value": 0.0,
23
  "sampling_rate": 16000,
24
- "encoder_downsample_rate": 1280,
25
  "return_attention_mask": true,
26
  "return_tensors": "pt"
27
  },
@@ -120,5 +119,7 @@
120
  "hop_size": 240,
121
  "padding": "same"
122
  }
123
- }
 
 
124
  }
 
21
  "padding_side": "right",
22
  "padding_value": 0.0,
23
  "sampling_rate": 16000,
 
24
  "return_attention_mask": true,
25
  "return_tensors": "pt"
26
  },
 
119
  "hop_size": 240,
120
  "padding": "same"
121
  }
122
+ },
123
+ "torch_dtype": "float32",
124
+ "transformers_version": "4.51.0"
125
  }
feature_extraction_xy_tokenizer.py CHANGED
@@ -15,6 +15,7 @@
15
  """
16
  Feature extractor class for Whisper
17
  """
 
18
  from functools import partial
19
  from typing import List, Optional, Union
20
 
@@ -37,7 +38,6 @@ class ExtractorIterator:
37
  chunk_length=30,
38
  overlap_seconds=10,
39
  sampling_rate=16000,
40
- encoder_downsample_rate=1280,
41
  encode_func = None,
42
  ) -> None:
43
  self.data = data
@@ -45,12 +45,11 @@ class ExtractorIterator:
45
  self.chunk_length = chunk_length
46
  self.overlap_seconds = overlap_seconds
47
  self.sampling_rate = sampling_rate
48
- self.encoder_downsample_rate = encoder_downsample_rate
49
 
50
  # duration_size 是每次处理的有效音频长度
 
51
  self.duration_seconds = self.chunk_length - self.overlap_seconds
52
  self.duration_size = int(self.duration_seconds * self.sampling_rate)
53
- self.code_duration_length = self.duration_size // self.encoder_downsample_rate
54
  # 注意:这里我们只处理不带重叠的块,重叠将在外部处理(如果需要)
55
  # 或者在迭代器内部更明确地处理。为了简化,我们假设分块是基于 duration_size
56
 
@@ -66,26 +65,30 @@ class ExtractorIterator:
66
  batch_num = 0
67
 
68
  # 注意:chunk_and_pad_view 输出的块大小是 duration_size
69
- wav_tensor = torch.zeros(self.batch_size, 1, self.duration_size)
70
  input_lengths = torch.zeros(self.batch_size, dtype=torch.long)
71
  input_seq_no = torch.zeros(self.batch_size, dtype=torch.long)
72
 
73
- def chunk_and_pad_view(tensor, chunk_size, seq_no):
74
  x = tensor[0:1, :].unsqueeze(0)
 
 
 
75
  B, C, L = x.shape
76
- num_chunks = (L + chunk_size - 1) // chunk_size
77
- target_len = num_chunks * chunk_size
78
- pad_len = target_len - L
79
- padded_x = F.pad(x, (0, pad_len))
80
- output_tensor = padded_x.view(B, num_chunks, chunk_size).transpose(0, 1)
81
- output_lengths = torch.full((num_chunks,), chunk_size, dtype=torch.long)
82
- if pad_len > 0:
83
- output_lengths[-1] = chunk_size - pad_len
 
84
  output_seq_no = torch.full((num_chunks,), seq_no, dtype=torch.long)
85
  return output_tensor, output_lengths, output_seq_no
86
 
87
  for i, sample in enumerate(self.data):
88
- sample_chunks, sample_lengths, sample_seq_no = chunk_and_pad_view(sample, self.duration_size, i)
89
 
90
  processed_in_sample = 0
91
  while processed_in_sample < len(sample_chunks):
@@ -115,7 +118,6 @@ class ExtractorIterator:
115
  ]
116
  yield BatchFeature({
117
  **self.encode_func(list_x),
118
- "input_lengths": input_lengths.clone(),
119
  "chunk_seq_no": input_seq_no.clone(),
120
  })
121
 
@@ -133,7 +135,6 @@ class ExtractorIterator:
133
  ]
134
  yield BatchFeature({
135
  **self.encode_func(list_x),
136
- "input_lengths": input_lengths.clone(),
137
  "chunk_seq_no": input_seq_no[:batch_num].clone(),
138
  })
139
 
@@ -143,7 +144,6 @@ class XYTokenizerFeatureExtractor(WhisperFeatureExtractor):
143
  self,
144
  feature_size=80,
145
  sampling_rate=16000,
146
- encoder_downsample_rate=1280,
147
  hop_length=160,
148
  chunk_length=30,
149
  n_fft=400,
@@ -166,7 +166,6 @@ class XYTokenizerFeatureExtractor(WhisperFeatureExtractor):
166
  **kwargs,
167
  )
168
  self.max_frequency = max_frequency if max_frequency is not None else sampling_rate / 2
169
- self.encoder_downsample_rate = encoder_downsample_rate
170
  self.batch_size = batch_size
171
  self.mel_filters = mel_filter_bank(
172
  num_frequency_bins=1 + n_fft // 2,
@@ -204,7 +203,6 @@ class XYTokenizerFeatureExtractor(WhisperFeatureExtractor):
204
  chunk_length=self.chunk_length,
205
  overlap_seconds=overlap_seconds,
206
  sampling_rate=self.sampling_rate,
207
- encoder_downsample_rate=self.encoder_downsample_rate,
208
  encode_func=partial(
209
  super().__call__,
210
  truncation=truncation,
 
15
  """
16
  Feature extractor class for Whisper
17
  """
18
+ import math
19
  from functools import partial
20
  from typing import List, Optional, Union
21
 
 
38
  chunk_length=30,
39
  overlap_seconds=10,
40
  sampling_rate=16000,
 
41
  encode_func = None,
42
  ) -> None:
43
  self.data = data
 
45
  self.chunk_length = chunk_length
46
  self.overlap_seconds = overlap_seconds
47
  self.sampling_rate = sampling_rate
 
48
 
49
  # duration_size 是每次处理的有效音频长度
50
+ self.chunk_size = int(self.chunk_length * self.sampling_rate)
51
  self.duration_seconds = self.chunk_length - self.overlap_seconds
52
  self.duration_size = int(self.duration_seconds * self.sampling_rate)
 
53
  # 注意:这里我们只处理不带重叠的块,重叠将在外部处理(如果需要)
54
  # 或者在迭代器内部更明确地处理。为了简化,我们假设分块是基于 duration_size
55
 
 
65
  batch_num = 0
66
 
67
  # 注意:chunk_and_pad_view 输出的块大小是 duration_size
68
+ wav_tensor = torch.zeros(self.batch_size, 1, self.chunk_size)
69
  input_lengths = torch.zeros(self.batch_size, dtype=torch.long)
70
  input_seq_no = torch.zeros(self.batch_size, dtype=torch.long)
71
 
72
+ def chunk_and_pad_view(tensor, seq_no):
73
  x = tensor[0:1, :].unsqueeze(0)
74
+
75
+ stride = self.duration_size
76
+ kernel = self.chunk_size
77
  B, C, L = x.shape
78
+
79
+ num_chunks = math.ceil(L / stride)
80
+ target_len = (num_chunks - 1) * stride + kernel
81
+ padding_size = max(0, target_len - L)
82
+ x_padded = F.pad(x, (0, padding_size), "constant", 0)
83
+ output_tensor = x_padded.unfold(dimension=2, size=kernel, step=stride).squeeze(0).transpose(0, 1)
84
+ output_lengths = torch.full((num_chunks,), kernel, dtype=torch.long)
85
+ if padding_size > 0:
86
+ output_lengths[-1] = kernel - padding_size
87
  output_seq_no = torch.full((num_chunks,), seq_no, dtype=torch.long)
88
  return output_tensor, output_lengths, output_seq_no
89
 
90
  for i, sample in enumerate(self.data):
91
+ sample_chunks, sample_lengths, sample_seq_no = chunk_and_pad_view(sample, i)
92
 
93
  processed_in_sample = 0
94
  while processed_in_sample < len(sample_chunks):
 
118
  ]
119
  yield BatchFeature({
120
  **self.encode_func(list_x),
 
121
  "chunk_seq_no": input_seq_no.clone(),
122
  })
123
 
 
135
  ]
136
  yield BatchFeature({
137
  **self.encode_func(list_x),
 
138
  "chunk_seq_no": input_seq_no[:batch_num].clone(),
139
  })
140
 
 
144
  self,
145
  feature_size=80,
146
  sampling_rate=16000,
 
147
  hop_length=160,
148
  chunk_length=30,
149
  n_fft=400,
 
166
  **kwargs,
167
  )
168
  self.max_frequency = max_frequency if max_frequency is not None else sampling_rate / 2
 
169
  self.batch_size = batch_size
170
  self.mel_filters = mel_filter_bank(
171
  num_frequency_bins=1 + n_fft // 2,
 
203
  chunk_length=self.chunk_length,
204
  overlap_seconds=overlap_seconds,
205
  sampling_rate=self.sampling_rate,
 
206
  encode_func=partial(
207
  super().__call__,
208
  truncation=truncation,
modeling_xy_tokenizer.py CHANGED
@@ -120,11 +120,11 @@ class VectorQuantizerConfig:
120
  # ----------------------------------------------- #
121
  # All Helper Modules (Copied from source) #
122
  # ----------------------------------------------- #
123
- def sinusoids(length, channels, max_timescale=10000):
124
  assert channels % 2 == 0
125
  log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
126
  inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
127
- scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
128
  return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
129
 
130
 
@@ -840,6 +840,7 @@ class XYTokenizerModel(XYTokenizerPreTrainedModel):
840
  self.enhanced_vocos = Vocos(**params['vocos_kwargs'])
841
  self.feature_extractor = params['feature_extractor_kwargs']
842
  # Store some config values for easier access
 
843
  self.nq = params['quantizer_kwargs']['num_quantizers']
844
 
845
  # Initialize weights and apply final processing
@@ -893,7 +894,7 @@ class XYTokenizerModel(XYTokenizerPreTrainedModel):
893
 
894
  # 1. Iterate through chunks and store intermediate results
895
  for chunk_features in features:
896
- code_duration_length = features.code_duration_length
897
  # Always use return_dict=True for easier access to named outputs
898
  chunk_output = self._encode(chunk_features, n_quantizers, return_dict=True)
899
  valid_code_lengths = torch.clamp(chunk_output.codes_lengths, 0, code_duration_length)
@@ -972,10 +973,8 @@ class XYTokenizerModel(XYTokenizerPreTrainedModel):
972
  ) -> Union[XYTokenizerEncodeOutput, Tuple]:
973
  input_mel = features['input_features'].to(self.device, dtype=self.dtype)
974
  mel_attention_mask = features['attention_mask'].to(self.device)
975
- input_lengths = features['input_lengths'].to(self.device).unsqueeze(1)
976
- mel_output_length = mel_attention_mask.sum(dim=-1).long().unsqueeze(1)
977
- mel_output_length = torch.cat((mel_output_length, input_lengths), dim=1).min(dim=1).values
978
-
979
  # --- Encoder Path ---
980
  semantic_encoder_output, semantic_encoder_output_length = self.semantic_encoder(input_mel, mel_output_length)
981
  semantic_adapter_output, _ = self.semantic_encoder_adapter(semantic_encoder_output, semantic_encoder_output_length)
@@ -983,8 +982,8 @@ class XYTokenizerModel(XYTokenizerPreTrainedModel):
983
 
984
  concated_channel = torch.cat([semantic_adapter_output, acoustic_encoder_output], dim=1)
985
 
986
- pre_rvq_adapter_output, _ = self.pre_rvq_adapter(concated_channel, acoustic_encoder_output_length)
987
- downsample_output, downsample_output_length = self.downsample(pre_rvq_adapter_output, acoustic_encoder_output_length)
988
 
989
  n_quantizers = n_quantizers or self.quantizer.num_quantizers
990
  zq, codes, vq_loss, _, quantizer_output_length = self.quantizer(downsample_output, downsample_output_length, n_quantizers=n_quantizers)
 
120
  # ----------------------------------------------- #
121
  # All Helper Modules (Copied from source) #
122
  # ----------------------------------------------- #
123
+ def sinusoids(length, channels, max_timescale=10000, device=None):
124
  assert channels % 2 == 0
125
  log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
126
  inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
127
+ scaled_time = torch.arange(length, device=device)[:, np.newaxis] * inv_timescales[np.newaxis, :]
128
  return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
129
 
130
 
 
840
  self.enhanced_vocos = Vocos(**params['vocos_kwargs'])
841
  self.feature_extractor = params['feature_extractor_kwargs']
842
  # Store some config values for easier access
843
+ self.encoder_downsample_rate = config.encoder_downsample_rate
844
  self.nq = params['quantizer_kwargs']['num_quantizers']
845
 
846
  # Initialize weights and apply final processing
 
894
 
895
  # 1. Iterate through chunks and store intermediate results
896
  for chunk_features in features:
897
+ code_duration_length = features.duration_size // self.encoder_downsample_rate
898
  # Always use return_dict=True for easier access to named outputs
899
  chunk_output = self._encode(chunk_features, n_quantizers, return_dict=True)
900
  valid_code_lengths = torch.clamp(chunk_output.codes_lengths, 0, code_duration_length)
 
973
  ) -> Union[XYTokenizerEncodeOutput, Tuple]:
974
  input_mel = features['input_features'].to(self.device, dtype=self.dtype)
975
  mel_attention_mask = features['attention_mask'].to(self.device)
976
+ mel_output_length = mel_attention_mask.sum(dim=-1).long()
977
+
 
 
978
  # --- Encoder Path ---
979
  semantic_encoder_output, semantic_encoder_output_length = self.semantic_encoder(input_mel, mel_output_length)
980
  semantic_adapter_output, _ = self.semantic_encoder_adapter(semantic_encoder_output, semantic_encoder_output_length)
 
982
 
983
  concated_channel = torch.cat([semantic_adapter_output, acoustic_encoder_output], dim=1)
984
 
985
+ pre_rvq_adapter_output, pre_rvq_adapter_output_length = self.pre_rvq_adapter(concated_channel, acoustic_encoder_output_length)
986
+ downsample_output, downsample_output_length = self.downsample(pre_rvq_adapter_output, pre_rvq_adapter_output_length)
987
 
988
  n_quantizers = n_quantizers or self.quantizer.num_quantizers
989
  zq, codes, vq_loss, _, quantizer_output_length = self.quantizer(downsample_output, downsample_output_length, n_quantizers=n_quantizers)
preprocessor_config.json CHANGED
@@ -8,7 +8,6 @@
8
  "padding_side": "right",
9
  "padding_value": 0.0,
10
  "sampling_rate": 16000,
11
- "encoder_downsample_rate": 1280,
12
  "return_attention_mask": true,
13
  "return_tensors": "pt"
14
  }
 
8
  "padding_side": "right",
9
  "padding_value": 0.0,
10
  "sampling_rate": 16000,
 
11
  "return_attention_mask": true,
12
  "return_tensors": "pt"
13
  }