# gslm-encoder / configuration_speech_encoder.py
# (Hugging Face Hub listing artifact: uploaded by klemenk,
#  commit df8c03d "Create configuration_speech_encoder.py", verified)
from transformers import PretrainedConfig
class SpeechEncoderConfig(PretrainedConfig):
    """Configuration for a GSLM speech encoder.

    The encoder extracts dense HuBERT features and discretizes them with a
    K-Means quantizer, mirroring the interface of the textless-lib
    ``SpeechEncoder`` (see the flag names below).

    Args:
        hubert_backend: Backend for dense features: ``"fairseq"`` (load the
            uploaded ``.pt`` checkpoint) or ``"transformers"`` (load an HF model).
        hubert_ckpt: Filename of the uploaded HuBERT checkpoint; used when
            ``hubert_backend == "fairseq"`` (e.g. ``"hubert_base_ls960.pt"``).
        hubert_hf_name: HF model id; used when ``hubert_backend == "transformers"``
            (e.g. ``"facebook/hubert-base-ls960"``).
        hubert_layer: Which transformer layer's features to use.
        expected_sample_rate: Expected input sample rate (HuBERT-base default
            is 16000 Hz).
        code_hop_size: Samples per code frame (320 samples = 20 ms at 16 kHz).
        quantizer_file: File holding the K-Means cluster centers.
        quantizer_key: Optional key locating the centers inside a ``.pt``/``.pkl``;
            the empty string means no key lookup is needed.
        deduplicate: Collapse runs of repeated codes (textless SpeechEncoder flag).
        add_bos_eos: Add BOS/EOS ids around the code sequence.
        need_f0: Keep ``False`` — the F0 pipeline is not bundled here.
        bos_id: BOS token id; if ``None``, it is placed at ``vocab_size``.
        eos_id: EOS token id; if ``None``, it is placed at ``vocab_size + 1``.
        feature_norm: Feature normalization applied before K-Means:
            ``None``, ``"unit"`` or ``"layernorm"``.

    Raises:
        ValueError: If ``hubert_backend`` or ``feature_norm`` is not one of
            the supported options.
    """

    model_type = "gslm-speech-encoder"

    def __init__(
        self,
        hubert_backend: str = "fairseq",
        hubert_ckpt: str = "hubert_base_ls960.pt",
        hubert_hf_name: str = "facebook/hubert-base-ls960",
        hubert_layer: int = 9,
        expected_sample_rate: int = 16000,
        code_hop_size: int = 320,
        quantizer_file: str = "kmeans_100.pt",
        quantizer_key: str = "",
        deduplicate: bool = True,
        add_bos_eos: bool = False,
        need_f0: bool = False,
        bos_id: int | None = None,
        eos_id: int | None = None,
        feature_norm: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fail fast on closed-set string options instead of erroring deep
        # inside the feature-extraction pipeline on a typo.
        if hubert_backend not in ("fairseq", "transformers"):
            raise ValueError(
                f"hubert_backend must be 'fairseq' or 'transformers', "
                f"got {hubert_backend!r}"
            )
        if feature_norm not in (None, "unit", "layernorm"):
            raise ValueError(
                f"feature_norm must be None, 'unit' or 'layernorm', "
                f"got {feature_norm!r}"
            )

        self.hubert_backend = hubert_backend
        self.hubert_ckpt = hubert_ckpt
        self.hubert_hf_name = hubert_hf_name
        # Coerce numeric options so values that round-tripped through JSON
        # (possibly as floats/strings) are stored as ints.
        self.hubert_layer = int(hubert_layer)
        self.expected_sample_rate = int(expected_sample_rate)
        self.code_hop_size = int(code_hop_size)
        self.quantizer_file = quantizer_file
        self.quantizer_key = quantizer_key
        self.deduplicate = bool(deduplicate)
        self.add_bos_eos = bool(add_bos_eos)
        self.need_f0 = bool(need_f0)
        self.bos_id = bos_id
        self.eos_id = eos_id
        self.feature_norm = feature_norm