|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""MossAudioTokenizer model configuration""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
from transformers.configuration_utils import PreTrainedConfig |
|
|
from transformers.utils import logging |
|
|
|
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
|
|
|
class MossAudioTokenizerConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossAudioTokenizerModel`]. It is used to instantiate a
    MossAudioTokenizer model according to the specified arguments, defining the model architecture.

    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    [VoiceAgentGroup/moss_audio_tokenizer](https://huggingface.co/VoiceAgentGroup/moss_audio_tokenizer) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        version (`str`, *optional*):
            Optional version identifier stored alongside the configuration.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
        downsample_rate (`int`, *optional*, defaults to 1920):
            Total downsampling rate from waveform to tokens.
        causal_transformer_context_duration (`float`, *optional*, defaults to 10.0):
            Context duration in seconds for causal transformer.
        encoder_kwargs (`list[dict]`, *optional*):
            List of encoder module configurations. Each dict specifies a module type and its parameters.
        decoder_kwargs (`list[dict]`, *optional*):
            List of decoder module configurations in execution order.
        quantizer_type (`str`, *optional*, defaults to `"rlfq"`):
            Quantizer type. Options include `"rvq"`, `"spec_rvq"`, `"rlfq"`, `"random_prefix_rlfq"`.
        quantizer_kwargs (`dict`, *optional*):
            Configuration for the quantizer including `input_dim`, `rvq_dim`, `output_dim`, `num_quantizers`,
            `codebook_size`, and `codebook_dim`. A `quantizer_type` key inside this dict takes precedence
            over the standalone `quantizer_type` argument.

    Example:

    ```python
    >>> from transformers import MossAudioTokenizerModel, MossAudioTokenizerConfig

    >>> # Initializing a MossAudioTokenizer style configuration
    >>> configuration = MossAudioTokenizerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MossAudioTokenizerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "moss-audio-tokenizer"

    # Expose `config.sample_rate` as an alias of `config.sampling_rate`.
    attribute_map = {"sample_rate": "sampling_rate"}

    sampling_rate: int
    downsample_rate: int
    causal_transformer_context_duration: float
    encoder_kwargs: list[dict[str, Any]]
    decoder_kwargs: list[dict[str, Any]]
    quantizer_type: str
    quantizer_kwargs: dict[str, Any]

    @staticmethod
    def _patch_module(patch_size: int) -> dict[str, Any]:
        """Build a `PatchedPretransform` module config with the given patch size."""
        return {
            "module_type": "PatchedPretransform",
            "patch_size": patch_size,
        }

    @staticmethod
    def _transformer_module(
        input_dimension: int,
        output_dimension: int,
        d_model: int,
        num_heads: int,
        num_layers: int,
        dim_feedforward: int,
    ) -> dict[str, Any]:
        """Build a causal `Transformer` module config sharing the architecture-wide defaults.

        Key insertion order matches the historical hand-written dicts so that
        serialized configs remain byte-identical.
        """
        return {
            "module_type": "Transformer",
            "input_dimension": input_dimension,
            "output_dimension": output_dimension,
            "d_model": d_model,
            "num_heads": num_heads,
            "num_layers": num_layers,
            "dim_feedforward": dim_feedforward,
            "causal": True,
            "norm": "layer_norm",
            "positional_embedding": "rope",
            "max_period": 10000,
            "gating": "none",
            "layer_scale": 0.01,
            "conv_layout": True,
        }

    def __init__(
        self,
        version: str | None = None,
        sampling_rate: int = 24000,
        downsample_rate: int = 1920,
        causal_transformer_context_duration: float = 10.0,
        encoder_kwargs: list[dict[str, Any]] | None = None,
        decoder_kwargs: list[dict[str, Any]] | None = None,
        quantizer_type: str = "rlfq",
        quantizer_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        # `model_type` is a class attribute; drop any copy coming from a
        # serialized config dict so it is not forwarded to the superclass.
        kwargs.pop("model_type", None)

        self.version = version
        self.sampling_rate = sampling_rate
        self.downsample_rate = downsample_rate
        self.causal_transformer_context_duration = causal_transformer_context_duration

        if encoder_kwargs is None:
            # Default encoder: alternate patching (temporal downsampling) with
            # causal transformer stages; the final stage is the widest (d_model 1280).
            encoder_kwargs = [
                self._patch_module(240),
                self._transformer_module(240, 384, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(768, 384, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(768, 640, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(1280, 768, 1280, 20, 32, 5120),
            ]
        self.encoder_kwargs = encoder_kwargs

        if decoder_kwargs is None:
            # Default decoder mirrors the encoder in reverse: transformer stages
            # interleaved with patch-based upsampling back to the waveform rate.
            decoder_kwargs = [
                self._transformer_module(768, 1280, 1280, 20, 32, 5120),
                self._patch_module(2),
                self._transformer_module(640, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 240, 768, 12, 12, 3072),
                self._patch_module(240),
            ]
        self.decoder_kwargs = decoder_kwargs

        if quantizer_kwargs is None:
            quantizer_kwargs = {
                "input_dim": 768,
                "rvq_dim": 512,
                "output_dim": 768,
                "num_quantizers": 32,
                "codebook_size": 1024,
                "codebook_dim": 8,
                "quantizer_type": "rlfq",
            }
        else:
            # Shallow-copy so we never mutate the caller's dict below.
            quantizer_kwargs = dict(quantizer_kwargs)

        # A `quantizer_type` embedded in `quantizer_kwargs` (e.g. loaded from a
        # serialized config) wins over the standalone argument; otherwise the
        # argument is mirrored into the kwargs so the two stay consistent.
        kw_qtype = quantizer_kwargs.get("quantizer_type")
        if kw_qtype is not None:
            self.quantizer_type = kw_qtype
        else:
            self.quantizer_type = quantizer_type
            quantizer_kwargs["quantizer_type"] = quantizer_type

        self.quantizer_kwargs = quantizer_kwargs

        super().__init__(**kwargs)

    @property
    def num_quantizers(self) -> int:
        """Return the number of quantizers from quantizer_kwargs."""
        return self.quantizer_kwargs.get("num_quantizers", 32)

    @property
    def codebook_size(self) -> int:
        """Return the codebook size from quantizer_kwargs."""
        # Fallback matches the default `quantizer_kwargs` codebook_size (was
        # inconsistently 4096 before).
        return self.quantizer_kwargs.get("codebook_size", 1024)

    @property
    def frame_rate(self) -> float:
        """Return the frame rate (tokens per second)."""
        return self.sampling_rate / self.downsample_rate
|
|
|
|
|
|
|
|
__all__ = ["MossAudioTokenizerConfig"] |
|
|
|