File size: 2,123 Bytes
df8c03d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from transformers import PretrainedConfig

class SpeechEncoderConfig(PretrainedConfig):
    """Configuration for a GSLM-style speech encoder (HuBERT features + K-Means units).

    Parameters
    ----------
    hubert_backend:
        Backend for dense features: ``"fairseq"`` (an uploaded ``.pt`` checkpoint)
        or ``"transformers"`` (an HF hub model).
    hubert_ckpt:
        Filename of the uploaded HuBERT checkpoint (used when backend is
        ``"fairseq"``), e.g. ``"hubert_base_ls960.pt"``.
    hubert_hf_name:
        HF model id (used when backend is ``"transformers"``),
        e.g. ``"facebook/hubert-base-ls960"``.
    hubert_layer:
        Which transformer layer's features to use.
    expected_sample_rate:
        Expected input sample rate; 16 kHz is the HuBERT-base default.
    code_hop_size:
        Samples per code frame; 320 is 20 ms at 16 kHz.
    quantizer_file:
        File holding the K-Means cluster centers.
    quantizer_key:
        Optional key locating the centers inside a ``.pt``/``.pkl`` container
        (empty string means the file itself is the centers).
    deduplicate:
        Collapse consecutive duplicate units (mirrors textless SpeechEncoder).
    add_bos_eos:
        Prepend/append BOS/EOS ids to the unit sequence.
    need_f0:
        Keep ``False`` — the F0 pipeline is not bundled here.
    bos_id, eos_id:
        BOS/EOS ids; if left ``None`` they are placed at ``vocab_size`` and
        ``vocab_size + 1`` downstream.
    feature_norm:
        Feature normalization applied before K-Means:
        ``None``, ``"unit"``, or ``"layernorm"``.

    Raises
    ------
    ValueError
        If ``hubert_backend`` or ``feature_norm`` is not one of the allowed
        values, or a numeric field is out of range.
    """

    model_type = "gslm-speech-encoder"

    # Closed sets of accepted values (validated eagerly so misconfiguration
    # fails at construction time, not deep inside model loading).
    VALID_BACKENDS = ("fairseq", "transformers")
    VALID_FEATURE_NORMS = (None, "unit", "layernorm")

    def __init__(
        self,
        hubert_backend: str = "fairseq",
        hubert_ckpt: str = "hubert_base_ls960.pt",
        hubert_hf_name: str = "facebook/hubert-base-ls960",
        hubert_layer: int = 9,
        expected_sample_rate: int = 16000,
        code_hop_size: int = 320,
        quantizer_file: str = "kmeans_100.pt",
        quantizer_key: str = "",
        deduplicate: bool = True,
        add_bos_eos: bool = False,
        need_f0: bool = False,
        bos_id: int | None = None,
        eos_id: int | None = None,
        feature_norm: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if hubert_backend not in self.VALID_BACKENDS:
            raise ValueError(
                f"hubert_backend must be one of {self.VALID_BACKENDS}, "
                f"got {hubert_backend!r}"
            )
        if feature_norm not in self.VALID_FEATURE_NORMS:
            raise ValueError(
                f"feature_norm must be one of {self.VALID_FEATURE_NORMS}, "
                f"got {feature_norm!r}"
            )

        self.hubert_backend = hubert_backend
        self.hubert_ckpt = hubert_ckpt
        self.hubert_hf_name = hubert_hf_name

        # Numeric fields were already coerced with int() in the original
        # contract; keep that, and reject values that can never be valid.
        self.hubert_layer = int(hubert_layer)
        if self.hubert_layer < 0:
            raise ValueError(f"hubert_layer must be >= 0, got {self.hubert_layer}")
        self.expected_sample_rate = int(expected_sample_rate)
        if self.expected_sample_rate <= 0:
            raise ValueError(
                f"expected_sample_rate must be > 0, got {self.expected_sample_rate}"
            )
        self.code_hop_size = int(code_hop_size)
        if self.code_hop_size <= 0:
            raise ValueError(f"code_hop_size must be > 0, got {self.code_hop_size}")

        self.quantizer_file = quantizer_file
        self.quantizer_key = quantizer_key

        self.deduplicate = bool(deduplicate)
        self.add_bos_eos = bool(add_bos_eos)
        self.need_f0 = bool(need_f0)
        # Left as given (possibly None) — downstream code assigns vocab_size /
        # vocab_size + 1 when these are None.
        self.bos_id = bos_id
        self.eos_id = eos_id

        self.feature_norm = feature_norm