|
|
from transformers import PretrainedConfig |
|
|
|
|
|
class SpeechEncoderConfig(PretrainedConfig):
    """Configuration for a GSLM-style speech encoder.

    Bundles the settings used to turn raw audio into discrete speech units:
    a HuBERT feature extractor (either a fairseq checkpoint or a HuggingFace
    model), a quantizer checkpoint, and post-processing options
    (de-duplication, BOS/EOS insertion, optional F0 extraction).

    Args:
        hubert_backend: Which HuBERT implementation to use. Defaults to
            ``"fairseq"``; the set of accepted values is defined by the model
            code that consumes this config (not validated here).
        hubert_ckpt: Filename/path of the fairseq HuBERT checkpoint.
        hubert_hf_name: HuggingFace Hub id of the HuBERT model (used by the
            non-fairseq backend — presumably; confirm against the model code).
        hubert_layer: Index of the HuBERT layer whose features are used.
        expected_sample_rate: Audio sample rate in Hz the encoder expects.
        code_hop_size: Audio samples per code frame (320 @ 16 kHz = 20 ms).
        quantizer_file: Filename/path of the quantizer checkpoint
            (e.g. a k-means model such as ``kmeans_100.pt``).
        quantizer_key: Key selecting the quantizer inside ``quantizer_file``;
            ``""`` means no sub-key.
        deduplicate: If True, collapse consecutive repeated codes.
        add_bos_eos: If True, add BOS/EOS ids around the code sequence.
        need_f0: If True, also extract F0 (pitch) alongside the codes.
        bos_id: Token id for BOS, or None if unset.
        eos_id: Token id for EOS, or None if unset.
        feature_norm: Optional feature-normalization mode; None disables it.
        **kwargs: Forwarded to ``PretrainedConfig``.
    """

    model_type = "gslm-speech-encoder"

    def __init__(
        self,
        hubert_backend: str = "fairseq",
        hubert_ckpt: str = "hubert_base_ls960.pt",
        hubert_hf_name: str = "facebook/hubert-base-ls960",
        hubert_layer: int = 9,
        expected_sample_rate: int = 16000,
        code_hop_size: int = 320,
        quantizer_file: str = "kmeans_100.pt",
        quantizer_key: str = "",
        deduplicate: bool = True,
        add_bos_eos: bool = False,
        need_f0: bool = False,
        bos_id: int | None = None,
        eos_id: int | None = None,
        feature_norm: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # HuBERT feature-extractor selection.
        self.hubert_backend = hubert_backend
        self.hubert_ckpt = hubert_ckpt
        self.hubert_hf_name = hubert_hf_name
        self.hubert_layer = int(hubert_layer)
        self.expected_sample_rate = int(expected_sample_rate)
        self.code_hop_size = int(code_hop_size)

        # Quantizer checkpoint location.
        self.quantizer_file = quantizer_file
        self.quantizer_key = quantizer_key

        # Post-processing flags.
        self.deduplicate = bool(deduplicate)
        self.add_bos_eos = bool(add_bos_eos)
        self.need_f0 = bool(need_f0)
        # Coerce to int for consistency with the other integer fields above,
        # while preserving None as "unset".
        self.bos_id = int(bos_id) if bos_id is not None else None
        self.eos_id = int(eos_id) if eos_id is not None else None

        self.feature_norm = feature_norm
|
|
|