from transformers import PretrainedConfig


class SpeechEncoderConfig(PretrainedConfig):
    """Configuration for a GSLM-style speech encoder.

    Bundles the settings needed to (1) extract dense HuBERT features from raw
    audio and (2) quantize them into discrete units with a K-Means codebook,
    mirroring the textless-lib ``SpeechEncoder`` interface.

    Args:
        hubert_backend: Backend for dense features — ``"fairseq"`` (local
            checkpoint) or ``"transformers"`` (HF hub model).
        hubert_ckpt: Filename of the HuBERT checkpoint when
            ``hubert_backend == "fairseq"`` (e.g. ``"hubert_base_ls960.pt"``).
        hubert_hf_name: HF model id when ``hubert_backend == "transformers"``
            (e.g. ``"facebook/hubert-base-ls960"``).
        hubert_layer: Index of the transformer layer whose features are used.
        expected_sample_rate: Required input audio sample rate in Hz
            (HuBERT-base default is 16 kHz).
        code_hop_size: Samples per output unit — 320 is 20 ms at 16 kHz.
        quantizer_file: File holding the K-Means cluster centers.
        quantizer_key: Optional key to locate the centers inside a .pt/.pkl
            container; empty string means the file is the centers themselves.
        deduplicate: Collapse consecutive repeated units if True.
        add_bos_eos: Prepend/append BOS/EOS ids to the unit sequence if True.
        need_f0: Placeholder flag for F0 extraction; keep False — the F0
            pipeline is not bundled here.
        bos_id: BOS token id; if None, it is placed at ``vocab_size``.
        eos_id: EOS token id; if None, it is placed at ``vocab_size + 1``.
        feature_norm: Feature normalization applied before K-Means —
            ``None``, ``"unit"``, or ``"layernorm"``.
        **kwargs: Forwarded to :class:`transformers.PretrainedConfig`.
    """

    model_type = "gslm-speech-encoder"

    def __init__(
        self,
        # Backend for dense features: "fairseq" (your uploaded .pt) or "transformers"
        hubert_backend: str = "fairseq",
        # If backend=fairseq: filename of your uploaded HuBERT checkpoint (e.g. "hubert_base_ls960.pt")
        hubert_ckpt: str = "hubert_base_ls960.pt",
        # If backend=transformers: HF model id (e.g. "facebook/hubert-base-ls960")
        hubert_hf_name: str = "facebook/hubert-base-ls960",
        hubert_layer: int = 9,  # which layer features to use
        expected_sample_rate: int = 16000,  # HuBERT-base default
        code_hop_size: int = 320,  # 20 ms at 16 kHz
        # Quantizer file (K-Means centers)
        quantizer_file: str = "kmeans_100.pt",
        # Optional key to find centers inside a .pt/.pkl
        quantizer_key: str = "",
        # Interface flags (mirror textless SpeechEncoder)
        deduplicate: bool = True,
        add_bos_eos: bool = False,
        need_f0: bool = False,  # keep False (F0 pipeline not bundled here)
        # BOS/EOS ids (if left None, we will place them at vocab_size and vocab_size+1)
        bos_id: int | None = None,
        eos_id: int | None = None,
        # Feature normalization before KMeans (None | "unit" | "layernorm")
        feature_norm: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hubert_backend = hubert_backend
        self.hubert_ckpt = hubert_ckpt
        self.hubert_hf_name = hubert_hf_name
        # Coerce numeric fields so configs loaded from JSON (where values may
        # arrive as floats/strings) are normalized to int.
        self.hubert_layer = int(hubert_layer)
        self.expected_sample_rate = int(expected_sample_rate)
        self.code_hop_size = int(code_hop_size)
        self.quantizer_file = quantizer_file
        self.quantizer_key = quantizer_key
        # Coerce flags so truthy JSON values become real booleans.
        self.deduplicate = bool(deduplicate)
        self.add_bos_eos = bool(add_bos_eos)
        self.need_f0 = bool(need_f0)
        self.bos_id = bos_id
        self.eos_id = eos_id
        self.feature_norm = feature_norm