mazesmazes committed on
Commit
d14a2cd
·
verified ·
1 Parent(s): 282a137

Training in progress - step 500

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
asr_config.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import transformers
4
+
5
+
6
class ASRConfig(transformers.PretrainedConfig):
    """Configuration for a composite ASR model: a frozen audio encoder whose
    output is mapped into a (frozen) language model's embedding space by a
    trainable projector.

    Sub-configurations for the audio encoder (``audio_config``) and the LLM
    (``text_config``) are fetched from the Hub on first construction, and are
    reconstructed from plain dicts when the config is reloaded from a saved
    ``config.json``.
    """

    model_type = "asr_model"
    is_composition = True

    def __init__(
        self,
        audio_model_id: str = "openai/whisper-large-v3-turbo",
        text_model_id: str = "HuggingFaceTB/SmolLM3-3B",
        attn_implementation: str = "flash_attention_2",
        model_dtype: str = "bfloat16",
        num_beams: Optional[int] = None,
        system_prompt: str = "/no_think /system_override",
        user_prompt: str = "Transcribe: <audio>",
        encoder_dim: Optional[int] = None,
        llm_dim: Optional[int] = None,
        audio_sample_rate: int = 16000,
        projector_init_std: float = 0.02,
        projector_pool_stride: int = 4,
        downsample_rate: int = 5,  # Granite default
        projector_hidden_dim: Optional[int] = None,
        projector_type: str = "moe",  # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
        projector_num_layers: int = 2,  # Number of layers (for residual projector)
        projector_dropout: float = 0.0,  # Dropout rate for projector layers
        # MoE-specific configuration
        num_experts: int = 4,  # Number of experts in MoE projectors
        num_experts_per_tok: int = 2,  # Top-k experts per token
        router_aux_loss_coef: float = 0.01,  # Auxiliary loss coefficient for load balancing
        # QFormer-specific configuration (Granite defaults)
        qformer_window_size: int = 15,  # Window size for QFormer processing
        qformer_hidden_size: Optional[int] = None,  # QFormer hidden size (defaults to encoder_dim)
        qformer_num_layers: int = 2,  # Number of QFormer transformer layers
        qformer_num_heads: int = 16,  # Number of attention heads in QFormer
        qformer_intermediate_size: Optional[int] = None,  # FFN size (defaults to 4x hidden)
        label_smoothing: float = 0.0,  # Label smoothing for cross-entropy loss
        inference_warmup_tokens: int = 10,
        max_new_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        length_penalty: Optional[float] = None,
        no_repeat_ngram_size: Optional[int] = None,
        use_cache: Optional[bool] = None,
        **kwargs,
    ):
        # Default generation parameters (greedy decoding only).
        generation_defaults = {
            "num_beams": 1,
            "max_new_tokens": 96,
            "repetition_penalty": 1.0,
            "length_penalty": 1.0,
            "no_repeat_ngram_size": 0,
            "use_cache": True,
        }

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.attn_implementation = attn_implementation
        self.model_dtype = model_dtype
        self.system_prompt = system_prompt
        self.user_prompt = user_prompt
        self.encoder_dim = encoder_dim
        self.llm_dim = llm_dim
        self.audio_sample_rate = audio_sample_rate
        self.projector_init_std = projector_init_std
        self.projector_pool_stride = projector_pool_stride
        self.downsample_rate = downsample_rate
        self.projector_hidden_dim = projector_hidden_dim
        self.projector_type = projector_type
        self.projector_num_layers = projector_num_layers
        self.projector_dropout = projector_dropout
        # MoE-specific configuration
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.router_aux_loss_coef = router_aux_loss_coef
        # QFormer-specific configuration
        self.qformer_window_size = qformer_window_size
        self.qformer_hidden_size = qformer_hidden_size
        self.qformer_num_layers = qformer_num_layers
        self.qformer_num_heads = qformer_num_heads
        self.qformer_intermediate_size = qformer_intermediate_size
        self.label_smoothing = label_smoothing
        self.inference_warmup_tokens = inference_warmup_tokens

        # Resolve generation parameters: an explicitly passed (or deserialized)
        # value wins over the greedy-decoding default.
        self.num_beams = num_beams if num_beams is not None else generation_defaults["num_beams"]
        self.max_new_tokens = (
            max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
        )
        self.repetition_penalty = (
            repetition_penalty
            if repetition_penalty is not None
            else generation_defaults["repetition_penalty"]
        )
        self.length_penalty = (
            length_penalty if length_penalty is not None else generation_defaults["length_penalty"]
        )
        self.no_repeat_ngram_size = (
            no_repeat_ngram_size
            if no_repeat_ngram_size is not None
            else generation_defaults["no_repeat_ngram_size"]
        )
        self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]

        if "audio_config" not in kwargs:
            self.audio_config = transformers.AutoConfig.from_pretrained(audio_model_id)
            # Override dtype to match model_dtype
            self.audio_config.dtype = model_dtype
        else:
            self.audio_config = kwargs.pop("audio_config")

        if "text_config" not in kwargs:
            self.text_config = transformers.AutoConfig.from_pretrained(
                text_model_id, trust_remote_code=True
            )
            # Override dtype to match model_dtype
            self.text_config.dtype = model_dtype
        else:
            self.text_config = kwargs.pop("text_config")

        if isinstance(self.text_config, dict):
            # Reconstruct config from dict using the model_type stored in the dict
            model_type = self.text_config["model_type"]
            config_class = transformers.AutoConfig.for_model(model_type).__class__
            self.text_config = config_class(**self.text_config)

        if isinstance(self.audio_config, dict):
            model_type = self.audio_config.get("model_type")
            if model_type:
                config_class = transformers.AutoConfig.for_model(model_type).__class__
                self.audio_config = config_class(**self.audio_config)

        # Push the *resolved* generation values into kwargs so that
        # PretrainedConfig.__init__ — which also consumes these keys and would
        # otherwise fall back to its own library defaults — keeps them intact.
        # (Merging the raw defaults into kwargs up front, as done previously,
        # made super().__init__ clobber any explicitly passed value with the
        # default, e.g. ASRConfig(num_beams=5) ended up with num_beams == 1.)
        for key in generation_defaults:
            kwargs[key] = getattr(self, key)

        super().__init__(**kwargs)

        # Remote-code entry points so AutoModel / AutoProcessor resolve to the
        # classes shipped alongside this config on the Hub.
        self.auto_map = {
            "AutoConfig": "asr_config.ASRConfig",
            "AutoModel": "asr_modeling.ASRModel",
            "AutoModelForSpeechSeq2Seq": "asr_modeling.ASRModel",
            "AutoProcessor": "asr_processing.ASRProcessor",
        }
        self.custom_pipelines = {
            "automatic-speech-recognition": {
                "impl": "asr_pipeline.ASRPipeline",
                "pt": ["AutoModelForSpeechSeq2Seq"],
                "tf": [],
                "type": "audio",
            }
        }
        self.architectures = ["ASRModel"]
        self.pipeline_tag = "automatic-speech-recognition"
156
+
157
+
158
+ transformers.AutoConfig.register("asr_model", ASRConfig)
asr_modeling.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from transformers import (
8
+ AutoConfig,
9
+ AutoModel,
10
+ AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ PreTrainedModel,
13
+ )
14
+ from transformers.generation import GenerationMixin
15
+ from transformers.modeling_outputs import CausalLMOutputWithPast
16
+
17
+ try:
18
+ from .asr_config import ASRConfig
19
+ from .projectors import PROJECTOR_CLASSES
20
+ except ImportError:
21
+ from asr_config import ASRConfig # type: ignore[no-redef]
22
+ from projectors import PROJECTOR_CLASSES # type: ignore[no-redef]
23
+
24
+
25
class ASRModel(PreTrainedModel, GenerationMixin):
    """Audio-to-text model combining an audio encoder, projector, and language model.

    The audio encoder and the language model are loaded frozen; only the
    projector (which maps encoder hidden states into the LLM embedding space)
    is trainable, and `state_dict` is overridden so that checkpoints contain
    only those projector weights.
    """

    config_class = ASRConfig
    base_model_prefix = "model"
    main_input_name = "input_features"
    _supports_flash_attn_2 = True
    supports_gradient_checkpointing = True
    # Class-level flags used to coordinate sub-model loading during from_pretrained.
    _is_loading_from_pretrained: bool = False
    _pretrained_model_path: Optional[str] = None

    TRANSCRIBE_PROMPT = "Transcribe: "

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load model from pretrained, handling device placement correctly.

        Builds the model from config (which re-downloads the frozen encoder and
        LLM), then splices in only the projector weights stored in this repo's
        `model.safetensors` (see `state_dict`), loading with strict=False.
        """
        from safetensors.torch import load_file
        from transformers.utils.hub import cached_file

        config = kwargs.pop("config", None)
        if config is None:
            config = ASRConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        # Set flag to avoid device_map="auto" in sub-model loaders
        cls._is_loading_from_pretrained = True
        cls._pretrained_model_path = pretrained_model_name_or_path

        try:
            model = cls(config, **kwargs)

            # Load projector weights from safetensors
            subfolder = kwargs.get("subfolder")
            revision = kwargs.get("revision")
            cache_kwargs = {}
            if subfolder:
                cache_kwargs["subfolder"] = subfolder
            if revision:
                cache_kwargs["revision"] = revision

            # Returns None (instead of raising) when the file is absent, so a
            # config-only repo still loads with randomly initialized projector.
            model_file = cached_file(
                pretrained_model_name_or_path,
                "model.safetensors",
                _raise_exceptions_for_missing_entries=False,
                **cache_kwargs,
            )

            if model_file is not None:
                state_dict = load_file(model_file)
                # strict=False: the file holds only "projector.*" keys.
                model.load_state_dict(state_dict, strict=False)

            return model
        finally:
            # Always reset the class-level flags, even if construction fails.
            cls._is_loading_from_pretrained = False
            cls._pretrained_model_path = None

    def __init__(self, config: ASRConfig, **kwargs):
        """Build encoder, LLM, tokenizer, generation config, and projector.

        Order matters: the tokenizer must be initialized before the generation
        config (eos/pad ids) and before the projector (which is placed on the
        language model's device).
        """
        super().__init__(config)

        self.system_prompt = config.system_prompt
        target_dtype = getattr(torch, config.model_dtype)

        # Audio encoder (frozen)
        self.audio_tower = self._load_audio_encoder(config, target_dtype)

        # Language model (frozen)
        self.language_model = self._load_language_model(config, target_dtype)

        # Initialize tokenizer and special tokens
        self._init_tokenizer(config)

        # Set up generation config with greedy decoding defaults
        self.generation_config = self.language_model.generation_config
        self.generation_config.max_new_tokens = config.max_new_tokens
        self.generation_config.num_beams = config.num_beams
        self.generation_config.do_sample = False
        # Clear sampling params (inherited from LLM) since we use greedy decoding
        self.generation_config.temperature = None
        self.generation_config.top_p = None
        self.generation_config.top_k = None
        self.generation_config.use_cache = config.use_cache
        self.generation_config.length_penalty = config.length_penalty
        self.generation_config.repetition_penalty = config.repetition_penalty
        self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
        # NOTE(review): assumes a ChatML-style template with "<|im_end|>" in the
        # vocab (true for the SmolLM3 default) — confirm for other text models.
        self.generation_config.eos_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
        self.generation_config.pad_token_id = self.tokenizer.pad_token_id

        # Feature extractor for audio preprocessing
        self.feature_extractor = self._create_feature_extractor(config)

        # Audio projector (trainable)
        self.projector = self._create_projector(config, target_dtype)

        # For model parallelism
        self._no_split_modules = getattr(self.language_model, "_no_split_modules", [])

    def _create_feature_extractor(self, config: ASRConfig):
        """Create the appropriate feature extractor for the audio encoder."""
        from transformers import AutoFeatureExtractor

        return AutoFeatureExtractor.from_pretrained(config.audio_model_id)

    @classmethod
    def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
        """Load and freeze the audio encoder.

        For Whisper checkpoints only the encoder half is kept; the decoder is
        discarded immediately to free memory.
        """
        encoder_kwargs = {
            "attn_implementation": config.attn_implementation,
            "low_cpu_mem_usage": True,
            "dtype": dtype,
        }

        if "whisper" in config.audio_model_id.lower():
            from transformers import WhisperModel

            full_model = WhisperModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
            encoder = full_model.encoder
            del full_model
        else:
            encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)

        # Frozen: no gradients, eval mode (disables dropout etc.).
        encoder.requires_grad_(False)
        encoder.eval()
        return encoder

    @classmethod
    def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
        """Load and freeze the language model."""
        decoder_kwargs = {
            "attn_implementation": config.attn_implementation,
            "trust_remote_code": True,
            "tie_word_embeddings": True,
            "low_cpu_mem_usage": True,
            "dtype": dtype,
        }

        decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
        decoder.config.use_cache = getattr(config, "use_cache", True)
        decoder.requires_grad_(False)
        decoder.eval()
        return decoder

    def _create_projector(self, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
        """Create the trainable audio projector.

        Mutates `config.encoder_dim` / `config.llm_dim` in place when they are
        unset, so the detected dimensions are persisted on save.
        """
        # Auto-detect dimensions if not specified
        if config.encoder_dim is None:
            enc_cfg = self.audio_tower.config
            # Whisper-style configs use d_model; most others use hidden_size.
            config.encoder_dim = getattr(enc_cfg, "hidden_size", None) or getattr(
                enc_cfg, "d_model", None
            )
            if config.encoder_dim is None:
                raise ValueError("Could not auto-detect encoder_dim. Please specify in config.")

        if config.llm_dim is None:
            dec_cfg = self.language_model.config
            config.llm_dim = getattr(dec_cfg, "hidden_size", None) or getattr(
                dec_cfg, "d_model", None
            )
            if config.llm_dim is None:
                raise ValueError("Could not auto-detect llm_dim. Please specify in config.")

        # Select projector type based on config
        projector_type = getattr(config, "projector_type", "mlp")
        projector_class = PROJECTOR_CLASSES.get(projector_type)
        if projector_class is None:
            raise ValueError(
                f"Unknown projector_type: {projector_type}. "
                f"Valid options: {list(PROJECTOR_CLASSES.keys())}"
            )
        projector = projector_class(config)

        # Move projector to same device as language model (important when using quantization)
        device = next(self.language_model.parameters()).device
        return projector.to(device=device, dtype=dtype)

    def _init_tokenizer(self, config: ASRConfig):
        """Initialize tokenizer with audio token.

        Adds the "<audio>" placeholder token (resizing LLM embeddings to
        match), ensures a usable pad token distinct from EOS, and syncs the
        special-token ids onto the relevant configs.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)

        # Set pad token: only when missing or aliased to EOS, and only if the
        # dedicated pad token exists in this tokenizer's vocab.
        if (
            self.tokenizer.pad_token is None
            or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
        ) and "<|finetune_right_pad_id|>" in self.tokenizer.get_vocab():
            self.tokenizer.pad_token = "<|finetune_right_pad_id|>"

        # Add audio token
        existing_special = self.tokenizer.additional_special_tokens or []
        if "<audio>" not in existing_special:
            self.tokenizer.add_special_tokens(
                {"additional_special_tokens": existing_special + ["<audio>"]}
            )
            # mean_resizing=False: new embedding rows are never read as text —
            # every "<audio>" position is overwritten by projector output.
            self.language_model.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)

        self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
        self.tokenizer.padding_side = "right"

        # Sync token IDs to configs
        for cfg in [self.config.text_config, self.language_model.config, self.generation_config]:
            if cfg is not None:
                cfg.pad_token_id = self.tokenizer.pad_token_id
                cfg.eos_token_id = self.tokenizer.eos_token_id
                cfg.bos_token_id = self.tokenizer.bos_token_id

    def _init_weights(self, module):
        """Weight initialization (projector weights are initialized in MoEAudioProjector)."""
        pass

    def _set_gradient_checkpointing(self, enable: bool = True, gradient_checkpointing_func=None):
        """Enable/disable gradient checkpointing for the language model."""
        # The LLM still stores activations during forward for backprop to projector
        # Gradient checkpointing trades compute for memory by recomputing activations
        if hasattr(self.language_model, "_set_gradient_checkpointing"):
            self.language_model._set_gradient_checkpointing(enable, gradient_checkpointing_func)
        elif hasattr(self.language_model, "gradient_checkpointing_enable") and enable:
            self.language_model.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs={"use_reentrant": False}
            )
        elif hasattr(self.language_model, "gradient_checkpointing_disable") and not enable:
            self.language_model.gradient_checkpointing_disable()

    def get_input_embeddings(self):
        # Delegate to the wrapped LLM so generation utilities resolve embeddings.
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, value):
        self.language_model.set_output_embeddings(value)

    def get_processor(self):
        """Get the processor for this model."""
        try:
            from .asr_processing import ASRProcessor
        except ImportError:
            from asr_processing import ASRProcessor  # type: ignore[no-redef]

        return ASRProcessor(feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)

    def state_dict(self, *args, **kwargs):
        """Only save trainable projector weights.

        Overrides the standard state_dict so that checkpoints exclude the
        frozen encoder and LLM; `from_pretrained` above mirrors this by
        loading with strict=False.
        """
        return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}

    def _encode_audio(
        self,
        audio_features: torch.Tensor,
        audio_attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Encode audio and project to LLM embedding space.

        Args:
            audio_features: Mel spectrogram features (batch, n_mels, mel_len)
            audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)

        Returns:
            Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
        """
        # Encoder is frozen, so no grad is needed; the projector below still trains.
        with torch.no_grad():
            encoder_out = self.audio_tower(input_features=audio_features)
            hidden_states = encoder_out.last_hidden_state

        # Truncate to actual audio length (mel_frames -> encoder_frames via stride-2 conv)
        # NOTE(review): the //2 assumes a Whisper-style stride-2 downsampling in
        # the encoder — confirm if a non-Whisper audio_model_id is used.
        real_encoder_len = audio_attention_mask.sum(dim=-1) // 2
        max_real_len = int(real_encoder_len.max().item())
        hidden_states = hidden_states[:, :max_real_len]

        audio_embeds = self.projector(hidden_states)

        # Flatten: (batch, seq, hidden) -> (batch * seq, hidden)
        # This allows masked_scatter to do 1:1 replacement
        return audio_embeds.reshape(-1, audio_embeds.shape[-1])

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_features: Optional[torch.Tensor] = None,
        audio_attention_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Forward pass for training and inference.

        When both `input_features` and `input_ids` are given, every "<audio>"
        placeholder in `input_ids` is replaced by one projected audio
        embedding; the flattened projector output must therefore contain
        exactly as many rows as there are placeholder tokens in the batch.
        """
        # Get text embeddings if not provided
        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if input_features is not None and input_ids is not None:
            # Encode audio -> flattened (total_audio_tokens, hidden_dim)
            audio_embeds = self._encode_audio(input_features, audio_attention_mask)

            # Replace <audio> token placeholders with audio embeddings using masked_scatter
            audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device),
                audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
            )

        # Run through language model (let it compute loss if labels provided)
        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Add auxiliary loss from MoE projectors if available
        if outputs.loss is not None and hasattr(self.projector, "get_aux_loss"):
            aux_loss = self.projector.get_aux_loss()
            if aux_loss is not None and aux_loss.numel() > 0:
                outputs.loss = outputs.loss + aux_loss.to(outputs.loss.device)

        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        """Prepare inputs for generation, handling audio features for cached decoding."""
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = self.language_model.prepare_inputs_for_generation(*args, **kwargs)

        # Only pass audio features on the first generation step (cache_position[0] == 0)
        if cache_position is not None and cache_position[0] == 0 and input_features is not None:
            model_inputs["input_features"] = input_features

        return model_inputs

    def _get_num_audio_tokens(
        self,
        audio_attention_mask: torch.Tensor,
    ) -> int:
        """Calculate number of audio tokens based on actual audio length.

        Uses attention mask to get real audio length, then computes:
        mel_frames -> encoder_frames (stride-2) -> projector output tokens

        Must stay in sync with the truncation in `_encode_audio`, otherwise
        the placeholder count and the projector output length diverge.
        """
        mel_len = int(audio_attention_mask.sum(dim=-1).max().item())
        encoder_output_len = mel_len // 2
        return int(self.projector.get_output_length(encoder_output_len))

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_features: Optional[torch.Tensor] = None,
        audio_attention_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        system_prompt: Optional[str] = None,
        **generate_kwargs,
    ) -> torch.Tensor:
        """Generate transcription from audio input.

        Can be called in two ways:
        1. With input_ids containing <audio> tokens (from processor)
        2. With just audio, and we build the prompt internally

        Args:
            input_ids: Optional prompt token ids containing "<audio>" placeholders.
            input_features: Mel spectrogram features (required).
            audio_attention_mask: Mask over real vs padded mel frames (required).
            attention_mask: Optional mask for `input_ids`; rebuilt when the
                prompt is constructed internally.
            system_prompt: Overrides the configured system prompt when set.

        Returns:
            Generated token ids (new tokens only, since the LLM is driven via
            inputs_embeds).
        """
        if input_features is None:
            raise ValueError("input_features required for generation")
        if audio_attention_mask is None:
            raise ValueError("audio_attention_mask required for generation")

        device = input_features.device
        batch_size = input_features.shape[0]

        # Encode audio -> flattened embeddings
        audio_embeds = self._encode_audio(input_features, audio_attention_mask)

        # If input_ids not provided, build prompt with correct number of audio tokens
        if input_ids is None:
            num_audio_tokens = self._get_num_audio_tokens(audio_attention_mask)
            audio_placeholder = "<audio>" * num_audio_tokens

            system_prompt = system_prompt or self.system_prompt

            messages: list[dict[str, str]] = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})

            input_ids = self.tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(device)

            if input_ids.dim() == 1:
                input_ids = input_ids.unsqueeze(0)
            # Same prompt for every batch element (placeholder counts match
            # because _encode_audio pads all items to the batch max length).
            if input_ids.shape[0] == 1 and batch_size > 1:
                input_ids = input_ids.expand(batch_size, -1)

            attention_mask = torch.ones_like(input_ids)

        # Get text embeddings and replace audio tokens with audio embeddings
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
        inputs_embeds = inputs_embeds.masked_scatter(
            audio_token_mask.to(inputs_embeds.device),
            audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
        )

        # Generate using language model
        output = self.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            generation_config=self.generation_config,
            **generate_kwargs,
        )

        # When using inputs_embeds without input_ids, generate returns only new tokens
        if isinstance(output, torch.Tensor):
            return output
        return output.sequences

    def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
        """Save model, tokenizer, and processor.

        Also copies this repo's `asr_*.py` / `projectors.py` sources into the
        save directory so the checkpoint is loadable via trust_remote_code.
        """
        import shutil
        from pathlib import Path as PathlibPath

        save_dir = PathlibPath(save_directory)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Update config with actual vocab size
        self.config.vocab_size = self.language_model.config.vocab_size
        self.config.text_config.vocab_size = self.language_model.config.vocab_size

        if hasattr(self.audio_tower.config, "num_mel_bins"):
            self.config.audio_config.num_mel_bins = self.audio_tower.config.num_mel_bins

        # Save model (temporarily remove non-serializable attributes)
        tokenizer = self.tokenizer
        del self.tokenizer

        try:
            super().save_pretrained(save_dir, **kwargs)
        finally:
            # Restore even if the save raised.
            self.tokenizer = tokenizer

        # Save tokenizer and feature extractor
        self.tokenizer.save_pretrained(save_dir)
        self.feature_extractor.save_pretrained(save_dir)

        # Add processor auto_map to preprocessor_config.json
        config_path = save_dir / "preprocessor_config.json"
        if config_path.exists():
            with config_path.open() as f:
                processor_config = json.load(f)
        else:
            processor_config = {}

        processor_config.update(
            {
                "processor_class": "ASRProcessor",
                "auto_map": {"AutoProcessor": "asr_processing.ASRProcessor"},
            }
        )

        with config_path.open("w") as f:
            json.dump(processor_config, f, indent=2)

        # Copy source files for auto-loading
        src_dir = PathlibPath(__file__).parent
        for asr_file in src_dir.glob("asr_*.py"):
            shutil.copy(asr_file, save_dir / asr_file.name)
        # Copy projectors module
        shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
500
+
501
+
502
+ # Register with transformers Auto classes
503
+ AutoConfig.register("asr_model", ASRConfig)
504
+ AutoModel.register(ASRConfig, ASRModel)
asr_pipeline.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ import numpy as np
5
+ import torch
6
+ import transformers
7
+
8
+ try:
9
+ from .asr_modeling import ASRModel
10
+ except ImportError:
11
+ from asr_modeling import ASRModel # type: ignore[no-redef]
12
+
13
+
14
class ForcedAligner:
    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2.

    The wav2vec2 CTC model, its label set, and the char->index dictionary are
    built on first use and cached as class attributes, so repeated align()
    calls share one model instance.
    """

    # Lazily-populated class-level cache (see get_instance()).
    _bundle = None
    _model = None
    _labels = None
    _dictionary = None

    @classmethod
    def get_instance(cls, device: str = "cuda"):
        # Build and cache the wav2vec2 CTC model on first call.
        # NOTE(review): the model stays on the device of the FIRST call;
        # later calls with a different `device` receive the cached model
        # unchanged — confirm this is intended.
        if cls._model is None:
            import torchaudio

            cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
            cls._model = cls._bundle.get_model().to(device)
            cls._model.eval()
            cls._labels = cls._bundle.get_labels()
            # Map each CTC label character to its column in the emission matrix.
            cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
        return cls._model, cls._labels, cls._dictionary

    @classmethod
    def align(
        cls,
        audio: np.ndarray,
        text: str,
        sample_rate: int = 16000,
        language: str = "eng",
        batch_size: int = 16,
    ) -> list[dict]:
        """Align transcript to audio and return word-level timestamps.

        Args:
            audio: Audio waveform as numpy array
            text: Transcript text to align
            sample_rate: Audio sample rate (default 16000)
            language: ISO-639-3 language code (default "eng" for English, unused)
            batch_size: Batch size for alignment model (unused)

        Returns:
            List of dicts with 'word', 'start', 'end' keys
        """
        import torchaudio
        from torchaudio.functional import forced_align, merge_tokens

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model, labels, dictionary = cls.get_instance(device)

        # Convert audio to tensor (copy to ensure array is writable)
        if isinstance(audio, np.ndarray):
            waveform = torch.from_numpy(audio.copy()).float()
        else:
            waveform = audio.clone().float()

        # Ensure 2D (channels, time)
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Resample if needed (wav2vec2 expects 16kHz)
        if sample_rate != cls._bundle.sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, cls._bundle.sample_rate
            )

        waveform = waveform.to(device)

        # Frame-level log-probabilities over the CTC label set.
        with torch.inference_mode():
            emissions, _ = model(waveform)
            emissions = torch.log_softmax(emissions, dim=-1)

        emission = emissions[0].cpu()

        # Normalize text: uppercase (wav2vec2 labels are uppercase); characters
        # outside the label set are silently dropped below.
        transcript = text.upper()
        # Build CTC target ids from transcript; spaces map to the "|"
        # word-separator label.
        tokens = []
        for char in transcript:
            if char in dictionary:
                tokens.append(dictionary[char])
            elif char == " ":
                tokens.append(dictionary.get("|", dictionary.get(" ", 0)))

        # Nothing alignable (e.g. empty or fully out-of-vocabulary text).
        if not tokens:
            return []

        targets = torch.tensor([tokens], dtype=torch.int32)

        # Run forced alignment
        # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
        # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
        aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)

        # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
        token_spans = merge_tokens(aligned_tokens[0], scores[0])

        # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
        frame_duration = 320 / cls._bundle.sample_rate

        # Group token spans into words based on pipe separator.
        # NOTE(review): word_idx pairs separator-delimited spans with
        # text.split() positionally; a word whose characters were ALL dropped
        # above would desynchronize the pairing — confirm transcripts only
        # contain label-set characters.
        words = text.split()
        word_timestamps = []
        current_word_start = None
        current_word_end = None
        word_idx = 0

        for span in token_spans:
            token_char = labels[span.token]
            if token_char == "|":  # Word separator
                if current_word_start is not None and word_idx < len(words):
                    word_timestamps.append(
                        {
                            "word": words[word_idx],
                            "start": current_word_start * frame_duration,
                            "end": current_word_end * frame_duration,
                        }
                    )
                    word_idx += 1
                    current_word_start = None
                    current_word_end = None
            else:
                # Extend the current word's span; start is fixed at the first
                # non-separator token of the word.
                if current_word_start is None:
                    current_word_start = span.start
                current_word_end = span.end

        # Don't forget the last word
        if current_word_start is not None and word_idx < len(words):
            word_timestamps.append(
                {
                    "word": words[word_idx],
                    "start": current_word_start * frame_duration,
                    "end": current_word_end * frame_duration,
                }
            )

        return word_timestamps
149
+
150
+
151
+ class SpeakerDiarizer:
152
+ """Lazy-loaded speaker diarization using pyannote-audio."""
153
+
154
+ _pipeline = None
155
+
156
+ @classmethod
157
+ def get_instance(cls, hf_token: str | None = None):
158
+ """Get or create the diarization pipeline.
159
+
160
+ Args:
161
+ hf_token: HuggingFace token with access to pyannote models.
162
+ Can also be set via HF_TOKEN environment variable.
163
+ """
164
+ if cls._pipeline is None:
165
+ from pyannote.audio import Pipeline
166
+
167
+ cls._pipeline = Pipeline.from_pretrained(
168
+ "pyannote/speaker-diarization-3.1",
169
+ )
170
+
171
+ # Move to GPU if available
172
+ if torch.cuda.is_available():
173
+ cls._pipeline.to(torch.device("cuda"))
174
+ elif torch.backends.mps.is_available():
175
+ cls._pipeline.to(torch.device("mps"))
176
+
177
+ return cls._pipeline
178
+
179
+ @classmethod
180
+ def diarize(
181
+ cls,
182
+ audio: np.ndarray | str,
183
+ sample_rate: int = 16000,
184
+ num_speakers: int | None = None,
185
+ min_speakers: int | None = None,
186
+ max_speakers: int | None = None,
187
+ hf_token: str | None = None,
188
+ ) -> list[dict]:
189
+ """Run speaker diarization on audio.
190
+
191
+ Args:
192
+ audio: Audio waveform as numpy array or path to audio file
193
+ sample_rate: Audio sample rate (default 16000)
194
+ num_speakers: Exact number of speakers (if known)
195
+ min_speakers: Minimum number of speakers
196
+ max_speakers: Maximum number of speakers
197
+ hf_token: HuggingFace token for pyannote models
198
+
199
+ Returns:
200
+ List of dicts with 'speaker', 'start', 'end' keys
201
+ """
202
+ pipeline = cls.get_instance(hf_token)
203
+
204
+ # Prepare audio input
205
+ if isinstance(audio, np.ndarray):
206
+ # pyannote expects {"waveform": tensor, "sample_rate": int}
207
+ waveform = torch.from_numpy(audio).unsqueeze(0) # Add channel dim
208
+ if waveform.dim() == 1:
209
+ waveform = waveform.unsqueeze(0)
210
+ audio_input = {"waveform": waveform, "sample_rate": sample_rate}
211
+ else:
212
+ # File path
213
+ audio_input = audio
214
+
215
+ # Run diarization
216
+ diarization_args = {}
217
+ if num_speakers is not None:
218
+ diarization_args["num_speakers"] = num_speakers
219
+ if min_speakers is not None:
220
+ diarization_args["min_speakers"] = min_speakers
221
+ if max_speakers is not None:
222
+ diarization_args["max_speakers"] = max_speakers
223
+
224
+ diarization = pipeline(audio_input, **diarization_args)
225
+
226
+ # Handle different pyannote return types
227
+ # pyannote 3.x returns DiarizeOutput dataclass, older versions return Annotation
228
+ if hasattr(diarization, "itertracks"):
229
+ annotation = diarization
230
+ elif hasattr(diarization, "speaker_diarization"):
231
+ # pyannote 3.x DiarizeOutput dataclass
232
+ annotation = diarization.speaker_diarization
233
+ elif isinstance(diarization, tuple):
234
+ # Some versions return (annotation, embeddings) tuple
235
+ annotation = diarization[0]
236
+ else:
237
+ raise TypeError(f"Unexpected diarization output type: {type(diarization)}")
238
+
239
+ # Convert to simple format
240
+ segments = []
241
+ for turn, _, speaker in annotation.itertracks(yield_label=True):
242
+ segments.append(
243
+ {
244
+ "speaker": speaker,
245
+ "start": turn.start,
246
+ "end": turn.end,
247
+ }
248
+ )
249
+
250
+ return segments
251
+
252
+ @classmethod
253
+ def assign_speakers_to_words(
254
+ cls,
255
+ words: list[dict],
256
+ speaker_segments: list[dict],
257
+ ) -> list[dict]:
258
+ """Assign speaker labels to words based on timestamp overlap.
259
+
260
+ Args:
261
+ words: List of word dicts with 'word', 'start', 'end' keys
262
+ speaker_segments: List of speaker dicts with 'speaker', 'start', 'end' keys
263
+
264
+ Returns:
265
+ Words list with 'speaker' key added to each word
266
+ """
267
+ for word in words:
268
+ word_mid = (word["start"] + word["end"]) / 2
269
+
270
+ # Find the speaker segment that contains this word's midpoint
271
+ best_speaker = None
272
+ for seg in speaker_segments:
273
+ if seg["start"] <= word_mid <= seg["end"]:
274
+ best_speaker = seg["speaker"]
275
+ break
276
+
277
+ # If no exact match, find closest segment
278
+ if best_speaker is None and speaker_segments:
279
+ min_dist = float("inf")
280
+ for seg in speaker_segments:
281
+ seg_mid = (seg["start"] + seg["end"]) / 2
282
+ dist = abs(word_mid - seg_mid)
283
+ if dist < min_dist:
284
+ min_dist = dist
285
+ best_speaker = seg["speaker"]
286
+
287
+ word["speaker"] = best_speaker
288
+
289
+ return words
290
+
291
+
292
class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
    """ASR Pipeline for audio-to-text transcription.

    Extends the HF ASR pipeline with optional word-level timestamps (via
    ForcedAligner) and speaker diarization (via SpeakerDiarizer).

    NOTE(review): per-call audio is stashed on ``self._current_audio``, so a
    single pipeline instance is not safe for concurrent calls — confirm usage.
    """

    model: ASRModel

    def __init__(self, model: ASRModel, **kwargs):
        # Fall back to the model's own tokenizer / processor components when
        # the caller does not supply them explicitly.
        feature_extractor = kwargs.pop("feature_extractor", None)
        tokenizer = kwargs.pop("tokenizer", model.tokenizer)

        if feature_extractor is None:
            feature_extractor = model.get_processor().feature_extractor

        super().__init__(
            model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
        )
        # Holds the audio of the in-flight call for alignment/diarization.
        self._current_audio = None

    def _sanitize_parameters(self, **kwargs):
        """Intercept our custom parameters before parent class validates them."""
        # Remove our custom parameters so parent doesn't see them.
        # Note: popping "return_timestamps" here also disables the parent
        # pipeline's built-in timestamp handling; __call__ implements
        # timestamps via forced alignment instead.
        kwargs.pop("return_timestamps", None)
        kwargs.pop("return_speakers", None)
        kwargs.pop("num_speakers", None)
        kwargs.pop("min_speakers", None)
        kwargs.pop("max_speakers", None)
        kwargs.pop("hf_token", None)

        return super()._sanitize_parameters(**kwargs)

    def __call__(
        self,
        inputs,
        **kwargs,
    ):
        """Transcribe audio with optional word-level timestamps and speaker diarization.

        Args:
            inputs: Audio input (file path, dict with array/sampling_rate, etc.)
            return_timestamps: If True, return word-level timestamps using forced alignment
            return_speakers: If True, return speaker labels for each word
            num_speakers: Exact number of speakers (if known, for diarization)
            min_speakers: Minimum number of speakers (for diarization)
            max_speakers: Maximum number of speakers (for diarization)
            hf_token: HuggingFace token for pyannote models (or set HF_TOKEN env var)
            **kwargs: Additional arguments passed to the pipeline

        Returns:
            Dict with 'text' key, 'words' key if return_timestamps=True,
            and speaker labels on words if return_speakers=True
        """
        # Extract our params before super().__call__ (which will also call _sanitize_parameters)
        return_timestamps = kwargs.pop("return_timestamps", False)
        return_speakers = kwargs.pop("return_speakers", False)
        diarization_params = {
            "num_speakers": kwargs.pop("num_speakers", None),
            "min_speakers": kwargs.pop("min_speakers", None),
            "max_speakers": kwargs.pop("max_speakers", None),
            "hf_token": kwargs.pop("hf_token", None),
        }

        # Speaker assignment needs word timings, so force alignment on.
        if return_speakers:
            return_timestamps = True

        # Store audio for timestamp alignment and diarization
        if return_timestamps or return_speakers:
            self._current_audio = self._extract_audio(inputs)

        # Run standard transcription.
        # NOTE(review): the code below assumes a single-item call returning one
        # dict; a list of inputs would return a list and break result.get —
        # confirm callers pass single items.
        result = super().__call__(inputs, **kwargs)

        # Add timestamps if requested. Alignment failures degrade to an empty
        # word list plus a "timestamp_error" message rather than raising.
        if return_timestamps and self._current_audio is not None:
            text = result.get("text", "")
            if text:
                try:
                    words = ForcedAligner.align(
                        self._current_audio["array"],
                        text,
                        sample_rate=self._current_audio.get("sampling_rate", 16000),
                    )
                    result["words"] = words
                except Exception as e:
                    result["words"] = []
                    result["timestamp_error"] = str(e)
            else:
                result["words"] = []

        # Add speaker diarization if requested (same best-effort policy:
        # failures surface as "diarization_error" instead of raising).
        if return_speakers and self._current_audio is not None:
            try:
                # Run diarization
                speaker_segments = SpeakerDiarizer.diarize(
                    self._current_audio["array"],
                    sample_rate=self._current_audio.get("sampling_rate", 16000),
                    **{k: v for k, v in diarization_params.items() if v is not None},
                )
                result["speaker_segments"] = speaker_segments

                # Assign speakers to words
                if result.get("words"):
                    result["words"] = SpeakerDiarizer.assign_speakers_to_words(
                        result["words"],
                        speaker_segments,
                    )
            except Exception as e:
                result["speaker_segments"] = []
                result["diarization_error"] = str(e)

        # Clean up per-call state so stale audio never leaks into a later call.
        self._current_audio = None

        return result

    def _extract_audio(self, inputs) -> dict | None:
        """Extract audio array from various input formats using HF utilities.

        Returns a {"array", "sampling_rate"} dict, or None for unrecognized
        input types (e.g. a dict without "array"/"raw" keys).
        """
        from transformers.pipelines.audio_utils import ffmpeg_read

        if isinstance(inputs, dict):
            if "array" in inputs:
                return {
                    "array": inputs["array"],
                    "sampling_rate": inputs.get("sampling_rate", 16000),
                }
            if "raw" in inputs:
                return {
                    "array": inputs["raw"],
                    "sampling_rate": inputs.get("sampling_rate", 16000),
                }
        elif isinstance(inputs, str):
            # File path - load audio using ffmpeg (same as HF pipeline)
            with Path(inputs).open("rb") as f:
                audio = ffmpeg_read(f.read(), sampling_rate=16000)
            return {"array": audio, "sampling_rate": 16000}
        elif isinstance(inputs, bytes):
            audio = ffmpeg_read(inputs, sampling_rate=16000)
            return {"array": audio, "sampling_rate": 16000}
        elif isinstance(inputs, np.ndarray):
            # assumes a 16 kHz waveform — no rate info is available here; TODO confirm
            return {"array": inputs, "sampling_rate": 16000}

        return None

    def preprocess(self, inputs, **preprocess_params):
        # Normalize dataset-style {"array": ...} dicts into the {"raw": ...}
        # shape the parent pipeline expects.
        if isinstance(inputs, dict) and "array" in inputs:
            inputs = {
                "raw": inputs["array"],
                "sampling_rate": inputs.get("sampling_rate", self.feature_extractor.sampling_rate),
            }

        # Guarantee every yielded chunk carries an "is_last" flag, which
        # _forward/postprocess rely on.
        for item in super().preprocess(inputs, **preprocess_params):
            if "is_last" not in item:
                item["is_last"] = True
            yield item

    def _forward(self, model_inputs, **generate_kwargs) -> dict[str, Any]:
        # Extract audio features and is_last flag
        is_last = model_inputs.pop("is_last", True) if isinstance(model_inputs, dict) else True

        input_features = model_inputs["input_features"].to(self.model.device)
        audio_attention_mask = model_inputs["attention_mask"].to(self.model.device)

        # Delegate decoding to the model's generate() with the audio features.
        generated_ids = self.model.generate(
            input_features=input_features,
            audio_attention_mask=audio_attention_mask,
            **generate_kwargs,
        )

        return {"tokens": generated_ids, "is_last": is_last}

    def postprocess(self, model_outputs, **kwargs) -> dict[str, str]:
        # Handle list of outputs (from chunking).
        # NOTE(review): only the first element's tokens are decoded; text from
        # later chunks of long-form audio would be dropped — confirm intended.
        if isinstance(model_outputs, list):
            model_outputs = model_outputs[0] if model_outputs else {}

        tokens = model_outputs.get("tokens")
        if tokens is None:
            # Unexpected payload: fall back to the parent implementation.
            return super().postprocess(model_outputs, **kwargs)

        if torch.is_tensor(tokens):
            tokens = tokens.cpu()
        if tokens.dim() > 1:
            # Batched generate output: decode the first sequence only.
            tokens = tokens[0]

        text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
        return {"text": text}
asr_processing.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
+ import transformers
5
+ from transformers import ProcessorMixin
6
+
7
+ try:
8
+ from .asr_config import ASRConfig
9
+ except ImportError:
10
+ from asr_config import ASRConfig # type: ignore[no-redef]
11
+
12
+
13
class ASRProcessor(ProcessorMixin):
    """Processor for Whisper-based ASR models.

    Pairs a Whisper feature extractor (audio -> log-mel features) with the
    LLM tokenizer, and expands the ``<audio>`` placeholder token to match the
    number of audio embeddings the projector will emit.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"
    # Placeholder token later replaced by projected audio embeddings.
    AUDIO_TOKEN = "<audio>"
    TRANSCRIBE_PROMPT = "Transcribe: "

    def __init__(self, feature_extractor, tokenizer, projector=None):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        # Token id of AUDIO_TOKEN in the tokenizer vocabulary.
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
        # Needed to compute how many placeholder tokens a clip expands to;
        # required whenever __call__ receives audio.
        self.projector = projector

    def __call__(
        self,
        audio: Optional[Union[list, "torch.Tensor"]] = None,
        text: Optional[str] = None,
        system_prompt: Optional[str] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> dict:
        """Process audio and text inputs for inference.

        Args:
            audio: Raw audio waveform(s)
            text: Target transcription (optional, for training - but use DataCollator instead)
            system_prompt: Optional system prompt
            return_tensors: Return format ("pt" for PyTorch)

        Returns:
            Dict with input_features, input_ids, attention_mask (and
            audio_attention_mask when audio is given)

        Raises:
            ValueError: If audio is provided but no projector was configured.
        """
        result = {}

        # Process audio
        if audio is not None:
            if self.projector is None:
                # Fail fast with a clear message instead of the opaque
                # AttributeError that `None.get_output_length(...)` would
                # raise a few lines below.
                raise ValueError(
                    "ASRProcessor requires a projector to compute the number of "
                    "audio placeholder tokens; pass `projector=` at construction."
                )
            audio_inputs = self.feature_extractor(
                audio,
                sampling_rate=getattr(self.feature_extractor, "sampling_rate", 16000),
                return_attention_mask=True,
                return_tensors=return_tensors,
                **kwargs,
            )
            result["input_features"] = audio_inputs["input_features"]
            result["audio_attention_mask"] = audio_inputs["attention_mask"]

            # Use actual audio length (from attention mask) for token count.
            real_mel_len = audio_inputs["attention_mask"].sum(dim=-1).max().item()
            # The encoder halves the mel frame count before the projector
            # sees it, hence the // 2.
            encoder_output_len = real_mel_len // 2
            num_audio_tokens = self.projector.get_output_length(encoder_output_len)
        else:
            num_audio_tokens = 0

        # Build prompt with audio token placeholders
        user_content = self.TRANSCRIBE_PROMPT
        if num_audio_tokens > 0:
            user_content += self.AUDIO_TOKEN * num_audio_tokens

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": user_content})
        if text is not None:
            messages.append({"role": "assistant", "content": text})

        # Tokenize; append the generation prompt only when no target text is
        # supplied (i.e. inference rather than training).
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=(text is None),
            return_tensors=return_tensors,
        )

        # Normalize to a batched [1, seq_len] shape.
        if isinstance(input_ids, torch.Tensor) and input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)

        result["input_ids"] = input_ids
        result["attention_mask"] = torch.ones_like(input_ids)

        return result
95
+
96
+
97
+ ASRProcessor.register_for_auto_class()
98
+ transformers.AutoProcessor.register(ASRConfig, ASRProcessor)
chat_template.jinja ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# ───── defaults ───── #}
{%- if enable_thinking is not defined -%}
{%- set enable_thinking = true -%}
{%- endif -%}

{# ───── reasoning mode ───── #}
{%- if enable_thinking -%}
{%- set reasoning_mode = "/think" -%}
{%- else -%}
{%- set reasoning_mode = "/no_think" -%}
{%- endif -%}

{# ───── header (system message) ───── #}
{{- "<|im_start|>system\n" -}}

{# A leading system message can override the reasoning mode with /think or
   /no_think; those markers are stripped from the custom instructions. #}
{%- if messages[0].role == "system" -%}
{%- set system_message = messages[0].content -%}
{%- if "/no_think" in system_message -%}
{%- set reasoning_mode = "/no_think" -%}
{%- elif "/think" in system_message -%}
{%- set reasoning_mode = "/think" -%}
{%- endif -%}
{%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
{%- endif -%}

{# NOTE(review): `system_message` is only set when the first message is a
   system message; with no system message this membership test runs against
   an undefined variable — confirm the template environment tolerates it. #}
{%- if "/system_override" in system_message -%}
{{- custom_instructions.replace("/system_override", "").rstrip() -}}
{{- "<|im_end|>\n" -}}
{%- else -%}
{{- "## Metadata\n\n" -}}
{{- "Knowledge Cutoff Date: June 2025\n" -}}
{%- set today = strftime_now("%d %B %Y") -%}
{{- "Today Date: " ~ today ~ "\n" -}}
{{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}

{{- "## Custom Instructions\n\n" -}}
{%- if custom_instructions -%}
{{- custom_instructions + "\n\n" -}}
{%- elif reasoning_mode == "/think" -%}
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
{%- else -%}
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
{%- endif -%}

{%- if xml_tools or python_tools or tools -%}
{{- "### Tools\n\n" -}}
{%- if xml_tools or tools -%}
{%- if tools -%}
{%- set xml_tools = tools -%}
{%- endif -%}
{%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
{%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
{%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
{%- endfor -%}
{%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
{{- xml_tool_string -}}
{%- endif -%}
{%- if python_tools -%}
{%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
{%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
{%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
{%- endfor -%}
{%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
{{- python_tool_string -}}
{%- endif -%}
{{- "\n\n" -}}
{# NOTE(review): this closing <|im_end|> sits inside the tools branch, so when
   no tools are passed the system message is never terminated — confirm
   whether <|im_end|> should instead be emitted after the endif below. #}
{{- "<|im_end|>\n" -}}
{%- endif -%}
{%- endif -%}
{# ───── main loop ───── #}
{%- for message in messages -%}
{%- set content = message.content if message.content is string else "" -%}
{%- if message.role == "user" -%}
{{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
{%- elif message.role == "assistant" -%}
{% generation %}
{%- if reasoning_mode == "/think" -%}
{{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
{%- else -%}
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
{%- endif -%}
{% endgeneration %}
{%- elif message.role == "tool" -%}
{# Tool results are replayed to the model as user-role turns. #}
{{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
{%- endif -%}
{%- endfor -%}
{# ───── generation prompt ───── #}
{%- if add_generation_prompt -%}
{%- if reasoning_mode == "/think" -%}
{{ "<|im_start|>assistant\n" }}
{%- else -%}
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
{%- endif -%}
{%- endif -%}
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "ASRProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000,
15
+ "auto_map": {
16
+ "AutoProcessor": "asr_processing.ASRProcessor"
17
+ }
18
+ }
projectors.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio projector modules for bridging encoder and decoder embeddings.
2
+
3
+ This module contains all projector architectures:
4
+ - MLPAudioProjector: Simple 2-layer MLP with conv downsampling
5
+ - MoEAudioProjector: MOSA-style dense mixture of experts
6
+ - SwiGLUAudioProjector: SwiGLU-based projector with temporal pooling
7
+ - ResidualAudioProjector: Residual MLP blocks with linear projection
8
+ - SharedMoEAudioProjector: Shared expert + sparse routed experts
9
+ - QFormerAudioProjector: BLIP-2 QFormer with learnable queries (Granite-style)
10
+ """
11
+
12
+ import math
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F # noqa: N812
17
+ from transformers import AutoModel, Blip2QFormerConfig
18
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
19
+
20
+ # =============================================================================
21
+ # MLP Projector
22
+ # =============================================================================
23
+
24
+
25
class MLPAudioProjector(nn.Module):
    """Projects audio-encoder features into the LLM embedding space.

    A stride-2 Conv1d halves the temporal resolution, then a two-layer GELU
    MLP maps encoder_dim -> llm_dim. Attribute names (downsample, linear_1,
    act, linear_2) define the checkpoint layout and must not change.
    """

    def __init__(self, config):
        super().__init__()

        encoder_dim = getattr(config, "encoder_dim", 768)
        llm_dim = getattr(config, "llm_dim", 2048)

        # 2x temporal downsampling via a strided, bias-free convolution.
        self.downsample = nn.Conv1d(
            encoder_dim, encoder_dim, kernel_size=3, stride=2, padding=1, bias=False
        )
        self.linear_1 = nn.Linear(encoder_dim, llm_dim, bias=False)
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(llm_dim, llm_dim, bias=False)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Normal(0, 0.02) weights for linear and conv layers; conv bias (if
        # any were enabled) is zeroed.
        if not isinstance(module, (nn.Linear, nn.Conv1d)):
            return
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Conv1d) and module.bias is not None:
            nn.init.zeros_(module.bias)

    def get_output_length(self, input_length: int) -> int:
        """Sequence length after the stride-2 conv (kernel=3, padding=1)."""
        return (input_length + 1) // 2

    def forward(self, x):
        """Map [Batch, Seq_Len, encoder_dim] -> [Batch, ceil(Seq_Len/2), llm_dim]."""
        # Conv1d operates channels-first, so transpose around the conv.
        pooled = self.downsample(x.transpose(1, 2)).transpose(1, 2)
        return self.linear_2(self.act(self.linear_1(pooled)))
69
+
70
+
71
+ # =============================================================================
72
+ # MoE Projector (MOSA-style)
73
+ # =============================================================================
74
+
75
+
76
class SimpleAdapter(nn.Module):
    """Two-layer ReLU bottleneck adapter (expert from the MOSA paper).

    Computes fc2(relu(fc1(x))); attribute names fc1/act/fc2 define the
    checkpoint layout and are kept stable.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)
87
+
88
+
89
class SwiGLUExpert(nn.Module):
    """SwiGLU feed-forward expert: down(silu(gate(x)) * up(x)).

    All three projections are bias-free; attribute names gate_proj/up_proj/
    down_proj define the checkpoint layout and are kept stable.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
100
+
101
+
102
+ class MOSAProjector(nn.Module):
103
+ def __init__(self, config):
104
+ super().__init__()
105
+ self.encoder_dim = getattr(config, "encoder_dim", None) or 1280
106
+ self.llm_dim = getattr(config, "llm_dim", None) or 2048
107
+ self.num_experts = getattr(config, "num_experts", None) or 8
108
+ adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096
109
+
110
+ # Auxiliary loss coefficients (MOSA paper uses only cross-entropy, no aux losses)
111
+ self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.0)
112
+ self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.0)
113
+
114
+ # Store router state for aux loss computation
115
+ self.last_router_logits = None
116
+ self.last_routing_weights = None
117
+
118
+ # --- 1. Pre-Norms (CRITICAL for stability) ---
119
+ self.in_norm = LlamaRMSNorm(self.encoder_dim, eps=1e-8)
120
+
121
+ # --- 2. Convolutional Subsampling (Stride 4) ---
122
+ self.conv = nn.Sequential(
123
+ nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
124
+ nn.SiLU(),
125
+ nn.Conv1d(self.llm_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
126
+ nn.SiLU(),
127
+ )
128
+
129
+ # --- 3. Deep Router (ReLU per MOSA paper) ---
130
+ self.router = nn.Sequential(
131
+ nn.Linear(self.encoder_dim, 2560),
132
+ nn.ReLU(),
133
+ nn.Linear(2560, 5120),
134
+ nn.ReLU(),
135
+ nn.Linear(5120, 2560),
136
+ nn.ReLU(),
137
+ nn.Linear(2560, 1280),
138
+ nn.ReLU(),
139
+ nn.Linear(1280, self.num_experts),
140
+ )
141
+
142
+ # --- 4. Experts (Simple 2-layer ReLU adapters per MOSA paper) ---
143
+ self.experts = nn.ModuleList(
144
+ [
145
+ SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
146
+ for _ in range(self.num_experts)
147
+ ]
148
+ )
149
+
150
+ # --- 5. Output Norm ---
151
+ # Projects often drift in magnitude; this clamps them before the LLM.
152
+ self.out_norm = LlamaRMSNorm(self.llm_dim, eps=1e-8)
153
+
154
+ # Using PyTorch default initialization (like MOSA paper)
155
+
156
+ def forward(self, x):
157
+ # x: (B, S, 1280)
158
+ batch_size, seq_len, _ = x.shape
159
+
160
+ # Apply Input Norm
161
+ x = self.in_norm(x)
162
+
163
+ # --- 1. Conv Branch ---
164
+ x_trans = x.permute(0, 2, 1) # (B, D, S)
165
+ h_conv = self.conv(x_trans).permute(0, 2, 1) # (B, S//4, llm_dim)
166
+
167
+ # --- 2. Router Branch ---
168
+ pad_amt = (4 - (seq_len % 4)) % 4
169
+ x_padded = F.pad(x, (0, 0, 0, pad_amt)) if pad_amt > 0 else x
170
+
171
+ # Mean pool to align receptive fields
172
+ x_pooled = x_padded.view(batch_size, -1, 4, self.encoder_dim).mean(dim=2) # (B, S//4, D)
173
+
174
+ # Router Logits
175
+ router_logits = self.router(x_pooled) # (B, S//4, num_experts)
176
+
177
+ # Softmax for Dense MoE (Soft Mixing)
178
+ routing_weights = F.softmax(router_logits, dim=-1)
179
+
180
+ # Store for aux loss computation
181
+ self.last_router_logits = router_logits
182
+ self.last_routing_weights = routing_weights
183
+
184
+ # --- 3. Expert Mixture (Dense Execution) ---
185
+ # Warning: High VRAM usage. Runs all experts.
186
+ # h_conv: (B, S//4, llm_dim)
187
+
188
+ # Stack approach is clean but memory hungry.
189
+ # Checkpointing could be added here if OOM occurs.
190
+ expert_outputs = torch.stack([expert(h_conv) for expert in self.experts]) # (E, B, S//4, D)
191
+
192
+ # Weighted Sum
193
+ # (Experts, Batch, Seq, Dim) * (Batch, Seq, Experts) -> (Batch, Seq, Dim)
194
+ final_out = torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
195
+
196
+ return self.out_norm(final_out)
197
+
198
+ def get_output_length(self, input_length: int) -> int:
199
+ """Calculate output sequence length given input length."""
200
+ # Two conv layers with stride=2 each = stride 4 total
201
+ padded = input_length + (4 - input_length % 4) % 4
202
+ return padded // 4
203
+
204
+ def get_aux_loss(self) -> torch.Tensor:
205
+ """Compute auxiliary losses: load balancing + z-loss."""
206
+ if self.last_router_logits is None:
207
+ return torch.tensor(0.0, device=self.conv[0].weight.device)
208
+
209
+ # Flatten for loss computation: (B, S, E) -> (B*S, E)
210
+ logits_flat = self.last_router_logits.view(-1, self.num_experts)
211
+ probs_flat = self.last_routing_weights.view(-1, self.num_experts)
212
+
213
+ balance = load_balancing_loss(probs_flat, self.num_experts, top_k=self.num_experts)
214
+ z = z_loss(logits_flat)
215
+
216
+ return self.aux_loss_coef * balance + self.z_loss_coef * z
217
+
218
+
219
+ # =============================================================================
220
+ # SwiGLU Projector
221
+ # =============================================================================
222
+
223
+
224
+ class SwiGLU(nn.Module):
225
+ """SwiGLU activation block (Llama-style: SiLU(Gate) * Value -> Output)."""
226
+
227
+ def __init__(self, in_features, hidden_features, out_features):
228
+ super().__init__()
229
+ self.w1 = nn.Linear(in_features, hidden_features, bias=False) # Gate
230
+ self.w2 = nn.Linear(in_features, hidden_features, bias=False) # Value
231
+ self.w3 = nn.Linear(hidden_features, out_features, bias=False) # Output
232
+ self.act = nn.SiLU()
233
+
234
+ def forward(self, x):
235
+ return self.w3(self.act(self.w1(x)) * self.w2(x))
236
+
237
+
238
+ class SwiGLUAudioProjector(nn.Module):
239
+ """
240
+ Optimized for Frozen LLM + 2500h Data.
241
+ Target: 12.5 Hz Output (Stride 4) with 8/3 SwiGLU Expansion.
242
+ """
243
+
244
+ def __init__(self, config):
245
+ super().__init__()
246
+ self.k = getattr(config, "projector_pool_stride", 4)
247
+ encoder_dim = config.encoder_dim
248
+ llm_dim = config.llm_dim
249
+
250
+ # Conv Expansion (Compensating for Time Compression)
251
+ # We compress time by 4x, so we expand width by 2x to preserve info density.
252
+ hidden_dim = int(encoder_dim * 2)
253
+
254
+ # SwiGLU Internal Expansion (The 8/3 Ratio)
255
+ # To match standard FFN capacity: 4 * (2/3) = 8/3
256
+ swiglu_inner = int(hidden_dim * 8 / 3)
257
+
258
+ self.downsample = nn.Conv1d(
259
+ in_channels=encoder_dim,
260
+ out_channels=hidden_dim,
261
+ kernel_size=self.k,
262
+ stride=self.k,
263
+ padding=0,
264
+ )
265
+
266
+ self.norm = LlamaRMSNorm(hidden_dim, eps=1e-8)
267
+
268
+ self.proj = SwiGLU(hidden_dim, swiglu_inner, llm_dim)
269
+
270
+ self.apply(self._init_weights)
271
+
272
+ def _init_weights(self, m):
273
+ if isinstance(m, (nn.Linear, nn.Conv1d)):
274
+ nn.init.trunc_normal_(m.weight, std=0.02)
275
+ if m.bias is not None:
276
+ nn.init.constant_(m.bias, 0)
277
+
278
+ def forward(self, x):
279
+ # x: [Batch, Seq, Dim]
280
+ batch, seq, dim = x.shape
281
+
282
+ # Manual Padding (prevents frame dropping)
283
+ if seq % self.k != 0:
284
+ pad_len = self.k - (seq % self.k)
285
+ x = F.pad(x, (0, 0, 0, pad_len))
286
+
287
+ # [B, S, D] -> [B, D, S]
288
+ x = x.transpose(1, 2)
289
+
290
+ # Downsample (50Hz -> 12.5Hz)
291
+ x = self.downsample(x)
292
+
293
+ # [B, D, S] -> [B, S, D]
294
+ x = x.transpose(1, 2)
295
+
296
+ # Norm & Project
297
+ x = self.norm(x)
298
+ return self.proj(x)
299
+
300
+ def get_output_length(self, input_length: int) -> int:
301
+ return (input_length + self.k - 1) // self.k
302
+
303
+
304
+ # =============================================================================
305
+ # Shared MoE Projector
306
+ # =============================================================================
307
+
308
+
309
+ class SharedMoEBlock(nn.Module):
310
+ """MoE block with Shared + Sigmoid-Routed Experts."""
311
+
312
+ def __init__(
313
+ self,
314
+ input_dim: int,
315
+ hidden_dim: int,
316
+ output_dim: int,
317
+ num_experts: int = 4,
318
+ top_k: int = 2,
319
+ ):
320
+ super().__init__()
321
+ self.num_experts = num_experts
322
+ self.top_k = top_k
323
+ self.output_dim = output_dim
324
+
325
+ # RMSNorm before routing
326
+ self.norm = LlamaRMSNorm(input_dim, eps=1e-8)
327
+
328
+ self.router = nn.Linear(input_dim, num_experts, bias=False)
329
+ nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
330
+
331
+ self.shared_expert = SwiGLUExpert(input_dim, hidden_dim, output_dim)
332
+ self.experts = nn.ModuleList(
333
+ [SwiGLUExpert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)]
334
+ )
335
+
336
+ self.last_router_logits = None
337
+ self.last_router_probs = None
338
+
339
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
340
+ batch_size, seq_len, dim = hidden_states.shape
341
+
342
+ # 1. Apply Shared Expert
343
+ normed_states = self.norm(hidden_states)
344
+ shared_out = self.shared_expert(normed_states)
345
+
346
+ # 2. Router Logic (Sigmoid Style)
347
+ flat_hidden = normed_states.view(-1, dim)
348
+ router_logits = self.router(flat_hidden)
349
+
350
+ # Sigmoid routing
351
+ router_probs = torch.sigmoid(router_logits)
352
+
353
+ self.last_router_logits = router_logits
354
+ self.last_router_probs = router_probs
355
+
356
+ # 3. Top-K Selection
357
+ top_k_scores, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
358
+
359
+ # Normalize weights
360
+ top_k_weights = top_k_scores / (top_k_scores.sum(dim=-1, keepdim=True) + 1e-6)
361
+ top_k_weights = top_k_weights.to(hidden_states.dtype)
362
+
363
+ # 4. Dispatch
364
+ routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
365
+ routed_out = routed_out.view(batch_size, seq_len, -1)
366
+
367
+ return shared_out + routed_out
368
+
369
+ def _dispatch_experts(
370
+ self,
371
+ hidden_states: torch.Tensor,
372
+ top_k_indices: torch.Tensor,
373
+ top_k_weights: torch.Tensor,
374
+ ) -> torch.Tensor:
375
+ num_tokens = hidden_states.shape[0]
376
+ output = torch.zeros(
377
+ num_tokens, self.output_dim, device=hidden_states.device, dtype=hidden_states.dtype
378
+ )
379
+
380
+ for expert_idx, expert in enumerate(self.experts):
381
+ expert_mask = top_k_indices == expert_idx
382
+ if not expert_mask.any():
383
+ continue
384
+
385
+ token_indices, slot_indices = torch.where(expert_mask)
386
+ expert_input = hidden_states[token_indices]
387
+ expert_output = expert(expert_input).to(output.dtype)
388
+ weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
389
+ output.index_add_(0, token_indices, expert_output * weights)
390
+
391
+ return output
392
+
393
+
394
+ def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
395
+ """Auxiliary loss to encourage balanced expert usage."""
396
+ prob_per_expert = router_probs.mean(dim=0)
397
+ target_mean = prob_per_expert.mean()
398
+ return (prob_per_expert - target_mean).square().sum() * num_experts
399
+
400
+
401
+ def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
402
+ """Z-loss to prevent router logits from growing too large."""
403
+ return torch.logsumexp(router_logits.float(), dim=-1).square().mean()
404
+
405
+
406
+ class SharedMoEAudioProjector(nn.Module):
407
+ """Shared expert + sparse routed experts projector."""
408
+
409
+ def __init__(self, config):
410
+ super().__init__()
411
+
412
+ self.k = getattr(config, "projector_pool_stride", 4)
413
+ encoder_dim = config.encoder_dim
414
+
415
+ # Depthwise Conv for temporal mixing
416
+ self.temporal_conv = nn.Conv1d(
417
+ encoder_dim, encoder_dim, kernel_size=3, padding=1, groups=encoder_dim
418
+ )
419
+
420
+ in_dim = encoder_dim * self.k
421
+ out_dim = config.llm_dim
422
+ hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
423
+
424
+ self.num_experts = getattr(config, "num_experts", 4)
425
+ self.top_k = getattr(config, "num_experts_per_tok", 2)
426
+ self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.02)
427
+ self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
428
+
429
+ self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
430
+ self._init_weights()
431
+
432
+ def _init_weights(self):
433
+ with torch.no_grad():
434
+ nn.init.orthogonal_(self.moe.shared_expert.gate_proj.weight)
435
+ nn.init.orthogonal_(self.moe.shared_expert.up_proj.weight)
436
+ nn.init.orthogonal_(self.moe.shared_expert.down_proj.weight, gain=0.5)
437
+
438
+ for expert in self.moe.experts:
439
+ nn.init.orthogonal_(expert.gate_proj.weight)
440
+ nn.init.orthogonal_(expert.up_proj.weight)
441
+ nn.init.orthogonal_(expert.down_proj.weight, gain=0.01)
442
+
443
+ def get_output_length(self, input_length: int) -> int:
444
+ """Calculate output sequence length given input length."""
445
+ # Temporal pooling with stride k
446
+ if input_length % self.k:
447
+ input_length += self.k - input_length % self.k
448
+ return input_length // self.k
449
+
450
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
451
+ batch_size, seq_len, dim = x.size()
452
+
453
+ target_dtype = self.moe.shared_expert.gate_proj.weight.dtype
454
+ if x.dtype != target_dtype:
455
+ x = x.to(target_dtype)
456
+
457
+ # Temporal Context Injection
458
+ x_ctx = x.transpose(1, 2)
459
+ x_ctx = self.temporal_conv(x_ctx)
460
+ x = x + x_ctx.transpose(1, 2)
461
+
462
+ if seq_len % self.k:
463
+ x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
464
+
465
+ x = x.view(batch_size, -1, dim * self.k)
466
+
467
+ return self.moe(x)
468
+
469
+ def get_aux_loss(self) -> torch.Tensor:
470
+ if self.moe.last_router_logits is None:
471
+ return torch.tensor(0.0, device=self.moe.router.weight.device)
472
+
473
+ balance = load_balancing_loss(self.moe.last_router_probs, self.num_experts, self.top_k)
474
+ z = z_loss(self.moe.last_router_logits)
475
+
476
+ return self.aux_loss_coef * balance + self.z_loss_coef * z
477
+
478
+
479
+ # =============================================================================
480
+ # QFormer Projector (Granite-style)
481
+ # =============================================================================
482
+
483
+
484
+ class QFormerAudioProjector(nn.Module):
485
+ """
486
+ BLIP-2 QFormer projector with learnable queries.
487
+
488
+ Based on GraniteSpeechEncoderProjector - uses a QFormer model with learnable
489
+ query embeddings to compress and project audio encoder outputs. The audio
490
+ sequence is processed in windows and downsampled via cross-attention.
491
+ """
492
+
493
+ def __init__(self, config):
494
+ super().__init__()
495
+
496
+ encoder_dim = config.encoder_dim
497
+ llm_dim = config.llm_dim
498
+
499
+ # Window and downsampling parameters (Granite defaults: window=15, downsample=5)
500
+ self.window_size = getattr(config, "qformer_window_size", 15)
501
+ self.downsample_rate = getattr(config, "downsample_rate", 5)
502
+ self.num_queries = self.window_size // self.downsample_rate
503
+
504
+ # QFormer hidden size (matches encoder for cross-attention)
505
+ qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
506
+ qformer_num_layers = getattr(config, "qformer_num_layers", 2)
507
+ qformer_num_heads = getattr(config, "qformer_num_heads", 16)
508
+ qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (
509
+ qformer_hidden * 4
510
+ )
511
+
512
+ # Learnable query embeddings (Granite uses std=1.0)
513
+ self.query = nn.Parameter(torch.zeros(1, self.num_queries, qformer_hidden))
514
+ self.query.data.normal_(mean=0.0, std=1.0)
515
+
516
+ # Optional projection if encoder dim != qformer hidden
517
+ if encoder_dim != qformer_hidden:
518
+ self.encoder_proj = nn.Linear(encoder_dim, qformer_hidden, bias=False)
519
+ else:
520
+ self.encoder_proj = None
521
+
522
+ # Configure QFormer to match Granite's exact config
523
+ qformer_config = Blip2QFormerConfig(
524
+ hidden_size=qformer_hidden,
525
+ num_hidden_layers=qformer_num_layers,
526
+ num_attention_heads=qformer_num_heads,
527
+ intermediate_size=qformer_intermediate,
528
+ encoder_hidden_size=qformer_hidden,
529
+ cross_attention_frequency=1,
530
+ # Granite-specific settings
531
+ hidden_act="gelu",
532
+ attention_probs_dropout_prob=0.1,
533
+ hidden_dropout_prob=0.1,
534
+ layer_norm_eps=1e-12,
535
+ initializer_range=0.02,
536
+ )
537
+ self.qformer = AutoModel.from_config(qformer_config)
538
+
539
+ # Final projection to LLM dimension (Granite uses bias=True)
540
+ self.linear = nn.Linear(qformer_hidden, llm_dim)
541
+
542
+ def get_output_length(self, input_length: int) -> int:
543
+ """Calculate output sequence length given input length."""
544
+ # QFormer uses window-based processing with num_queries per window
545
+ nblocks = math.ceil(input_length / self.window_size)
546
+ return nblocks * self.num_queries
547
+
548
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
549
+ """
550
+ Args:
551
+ hidden_states: [batch_size, seq_len, encoder_dim]
552
+
553
+ Returns:
554
+ projected: [batch_size, num_output_tokens, llm_dim]
555
+ """
556
+ batch_size, seq_len, dim = hidden_states.size()
557
+
558
+ # Ensure float dtype for QFormer
559
+ target_dtype = self.query.dtype
560
+ if hidden_states.dtype != target_dtype:
561
+ hidden_states = hidden_states.to(target_dtype)
562
+
563
+ # Optional encoder projection
564
+ if self.encoder_proj is not None:
565
+ hidden_states = self.encoder_proj(hidden_states)
566
+
567
+ # Compute number of windows and pad to fit
568
+ nblocks = math.ceil(seq_len / self.window_size)
569
+ pad = nblocks * self.window_size - seq_len
570
+ if pad > 0:
571
+ hidden_states = F.pad(hidden_states, (0, 0, 0, pad), "constant", 0)
572
+
573
+ # Reshape to process each window: [batch*nblocks, window_size, dim]
574
+ effective_batch = batch_size * nblocks
575
+ hidden_states = hidden_states.view(effective_batch, self.window_size, -1)
576
+
577
+ # Expand queries to match batch size
578
+ query_embeds = self.query.expand(effective_batch, -1, -1)
579
+
580
+ # QFormer cross-attention
581
+ query_output = self.qformer(
582
+ query_embeds=query_embeds,
583
+ encoder_hidden_states=hidden_states,
584
+ return_dict=True,
585
+ )
586
+
587
+ # Reshape back: [batch, nblocks * num_queries, hidden]
588
+ output_tokens = nblocks * self.num_queries
589
+ query_proj = query_output.last_hidden_state.view(batch_size, output_tokens, -1)
590
+
591
+ # Project to LLM dimension
592
+ return self.linear(query_proj)
593
+
594
+
595
+ # =============================================================================
596
+ # Transformer Projector
597
+ # =============================================================================
598
+
599
+
600
+ class TransformerAudioProjector(nn.Module):
601
+ """
602
+ Transformer Projector (FunASR Style).
603
+ Projects to LLM dim first, then applies transformer blocks for context mixing.
604
+ """
605
+
606
+ def __init__(self, config):
607
+ super().__init__()
608
+ # Default stride 4: Whisper (2x) * Projector (4x) = 8x total → ~12.5 Hz
609
+ # Similar to FunASR's 6x total (~16.67 Hz)
610
+ self.k = getattr(config, "projector_pool_stride", 4)
611
+
612
+ encoder_dim = config.encoder_dim
613
+ llm_dim = config.llm_dim
614
+
615
+ # Input: Stacked frames (e.g. 1280 * 2 = 2560)
616
+ in_dim = encoder_dim * self.k
617
+
618
+ # FFN hidden dim for initial projection (FunASR default: 2048)
619
+ ffn_dim = getattr(config, "projector_hidden_dim", None) or 2048
620
+
621
+ # FunASR-style projection: linear1 -> relu -> linear2
622
+ self.linear1 = nn.Linear(in_dim, ffn_dim)
623
+ self.relu = nn.ReLU()
624
+ self.linear2 = nn.Linear(ffn_dim, llm_dim)
625
+
626
+ # Transformer blocks operating at llm_dim
627
+ num_layers = getattr(config, "projector_num_layers", 2)
628
+ if num_layers > 0:
629
+ encoder_layer = nn.TransformerEncoderLayer(
630
+ d_model=llm_dim,
631
+ nhead=getattr(config, "projector_num_heads", 8),
632
+ dim_feedforward=llm_dim // 4, # FunASR uses quarter size
633
+ dropout=0.0,
634
+ activation="relu",
635
+ batch_first=True,
636
+ norm_first=True,
637
+ )
638
+ self.blocks = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
639
+ else:
640
+ self.blocks = None
641
+
642
+ # Final Norm for stability when projecting to frozen LLM
643
+ self.norm = LlamaRMSNorm(llm_dim, eps=1e-8)
644
+
645
+ self.apply(self._init_weights)
646
+
647
+ def _init_weights(self, m):
648
+ if isinstance(m, nn.Linear):
649
+ nn.init.trunc_normal_(m.weight, std=0.02)
650
+ if m.bias is not None:
651
+ nn.init.zeros_(m.bias)
652
+
653
+ def forward(self, x):
654
+ # x: [Batch, Seq, Dim]
655
+ batch, seq, dim = x.shape
656
+
657
+ # Padding to multiple of k
658
+ chunk_num = (seq - 1) // self.k + 1
659
+ pad_num = chunk_num * self.k - seq
660
+ if pad_num > 0:
661
+ x = F.pad(x, (0, 0, 0, pad_num))
662
+
663
+ # Frame stacking: [B, S, D] -> [B, S/k, D*k]
664
+ x = x.contiguous().view(batch, chunk_num, dim * self.k)
665
+
666
+ # FunASR-style projection to LLM dim
667
+ x = self.linear1(x)
668
+ x = self.relu(x)
669
+ x = self.linear2(x)
670
+
671
+ # Transformer context mixing
672
+ if self.blocks is not None:
673
+ x = self.blocks(x)
674
+
675
+ return self.norm(x)
676
+
677
+ def get_output_length(self, input_length: int) -> int:
678
+ return (input_length - 1) // self.k + 1
679
+
680
+
681
+ # =============================================================================
682
+ # Projector Registry
683
+ # =============================================================================
684
+
685
+ PROJECTOR_CLASSES = {
686
+ "mlp": MLPAudioProjector,
687
+ "mosa": MOSAProjector,
688
+ "swiglu": SwiGLUAudioProjector,
689
+ "shared_moe": SharedMoEAudioProjector,
690
+ "qformer": QFormerAudioProjector,
691
+ "transformer": TransformerAudioProjector,
692
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<audio>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "eos_token": {
12
+ "content": "<|im_end|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "pad_token": "<|finetune_right_pad_id|>"
19
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4aeaf198f783cbf58d8cd59812baac429ffe49147bf9648f6618de20b8d4a4c
3
+ size 17209003
tokenizer_config.json ADDED
@@ -0,0 +1,2075 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "128000": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "128001": {
12
+ "content": "<|end_of_text|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "128002": {
20
+ "content": "<think>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "128003": {
28
+ "content": "</think>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "128004": {
36
+ "content": "<|finetune_right_pad_id|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128005": {
44
+ "content": "<|reserved_special_token_2|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128006": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128007": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128008": {
68
+ "content": "<|eom_id|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128009": {
76
+ "content": "<|eot_id|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128010": {
84
+ "content": "<|python_tag|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128011": {
92
+ "content": "<|im_start|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128012": {
100
+ "content": "<|im_end|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128013": {
108
+ "content": "<tool_response>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "128014": {
116
+ "content": "</tool_response>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "128015": {
124
+ "content": "<tool_call>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "128016": {
132
+ "content": "</tool_call>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "128017": {
140
+ "content": "<code>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "128018": {
148
+ "content": "</code>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "128019": {
156
+ "content": "<|reserved_special_token_11|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "128020": {
164
+ "content": "<|reserved_special_token_12|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "128021": {
172
+ "content": "<|reserved_special_token_13|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "128022": {
180
+ "content": "<|reserved_special_token_14|>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "128023": {
188
+ "content": "<|reserved_special_token_15|>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "128024": {
196
+ "content": "<|reserved_special_token_16|>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "128025": {
204
+ "content": "<|reserved_special_token_17|>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "128026": {
212
+ "content": "<|reserved_special_token_18|>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "128027": {
220
+ "content": "<|reserved_special_token_19|>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "128028": {
228
+ "content": "<|reserved_special_token_20|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "128029": {
236
+ "content": "<|reserved_special_token_21|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "128030": {
244
+ "content": "<|reserved_special_token_22|>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "128031": {
252
+ "content": "<|reserved_special_token_23|>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "128032": {
260
+ "content": "<|reserved_special_token_24|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "128033": {
268
+ "content": "<|reserved_special_token_25|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "128034": {
276
+ "content": "<|reserved_special_token_26|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "128035": {
284
+ "content": "<|reserved_special_token_27|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "128036": {
292
+ "content": "<|reserved_special_token_28|>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "128037": {
300
+ "content": "<|reserved_special_token_29|>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "128038": {
308
+ "content": "<|reserved_special_token_30|>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "128039": {
316
+ "content": "<|reserved_special_token_31|>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "128040": {
324
+ "content": "<|reserved_special_token_32|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "128041": {
332
+ "content": "<|reserved_special_token_33|>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "128042": {
340
+ "content": "<|reserved_special_token_34|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "128043": {
348
+ "content": "<|reserved_special_token_35|>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "128044": {
356
+ "content": "<|reserved_special_token_36|>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "128045": {
364
+ "content": "<|reserved_special_token_37|>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "128046": {
372
+ "content": "<|reserved_special_token_38|>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "128047": {
380
+ "content": "<|reserved_special_token_39|>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "128048": {
388
+ "content": "<|reserved_special_token_40|>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "128049": {
396
+ "content": "<|reserved_special_token_41|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "128050": {
404
+ "content": "<|reserved_special_token_42|>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "128051": {
412
+ "content": "<|reserved_special_token_43|>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "128052": {
420
+ "content": "<|reserved_special_token_44|>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "128053": {
428
+ "content": "<|reserved_special_token_45|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "128054": {
436
+ "content": "<|reserved_special_token_46|>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "128055": {
444
+ "content": "<|reserved_special_token_47|>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "128056": {
452
+ "content": "<|reserved_special_token_48|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "128057": {
460
+ "content": "<|reserved_special_token_49|>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "128058": {
468
+ "content": "<|reserved_special_token_50|>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "128059": {
476
+ "content": "<|reserved_special_token_51|>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "128060": {
484
+ "content": "<|reserved_special_token_52|>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "128061": {
492
+ "content": "<|reserved_special_token_53|>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "128062": {
500
+ "content": "<|reserved_special_token_54|>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "128063": {
508
+ "content": "<|reserved_special_token_55|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "128064": {
516
+ "content": "<|reserved_special_token_56|>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "128065": {
524
+ "content": "<|reserved_special_token_57|>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "128066": {
532
+ "content": "<|reserved_special_token_58|>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "128067": {
540
+ "content": "<|reserved_special_token_59|>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "128068": {
548
+ "content": "<|reserved_special_token_60|>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "128069": {
556
+ "content": "<|reserved_special_token_61|>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "128070": {
564
+ "content": "<|reserved_special_token_62|>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "128071": {
572
+ "content": "<|reserved_special_token_63|>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "128072": {
580
+ "content": "<|reserved_special_token_64|>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "128073": {
588
+ "content": "<|reserved_special_token_65|>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "128074": {
596
+ "content": "<|reserved_special_token_66|>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "128075": {
604
+ "content": "<|reserved_special_token_67|>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "128076": {
612
+ "content": "<|reserved_special_token_68|>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "128077": {
620
+ "content": "<|reserved_special_token_69|>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "128078": {
628
+ "content": "<|reserved_special_token_70|>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "128079": {
636
+ "content": "<|reserved_special_token_71|>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "128080": {
644
+ "content": "<|reserved_special_token_72|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "128081": {
652
+ "content": "<|reserved_special_token_73|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "128082": {
660
+ "content": "<|reserved_special_token_74|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "128083": {
668
+ "content": "<|reserved_special_token_75|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "128084": {
676
+ "content": "<|reserved_special_token_76|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "128085": {
684
+ "content": "<|reserved_special_token_77|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "128086": {
692
+ "content": "<|reserved_special_token_78|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "128087": {
700
+ "content": "<|reserved_special_token_79|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "128088": {
708
+ "content": "<|reserved_special_token_80|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "128089": {
716
+ "content": "<|reserved_special_token_81|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "128090": {
724
+ "content": "<|reserved_special_token_82|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "128091": {
732
+ "content": "<|reserved_special_token_83|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "128092": {
740
+ "content": "<|reserved_special_token_84|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "128093": {
748
+ "content": "<|reserved_special_token_85|>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "128094": {
756
+ "content": "<|reserved_special_token_86|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "128095": {
764
+ "content": "<|reserved_special_token_87|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "128096": {
772
+ "content": "<|reserved_special_token_88|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "128097": {
780
+ "content": "<|reserved_special_token_89|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "128098": {
788
+ "content": "<|reserved_special_token_90|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "128099": {
796
+ "content": "<|reserved_special_token_91|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "128100": {
804
+ "content": "<|reserved_special_token_92|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "128101": {
812
+ "content": "<|reserved_special_token_93|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "128102": {
820
+ "content": "<|reserved_special_token_94|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "128103": {
828
+ "content": "<|reserved_special_token_95|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "128104": {
836
+ "content": "<|reserved_special_token_96|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "128105": {
844
+ "content": "<|reserved_special_token_97|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "128106": {
852
+ "content": "<|reserved_special_token_98|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "128107": {
860
+ "content": "<|reserved_special_token_99|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "128108": {
868
+ "content": "<|reserved_special_token_100|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "128109": {
876
+ "content": "<|reserved_special_token_101|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "128110": {
884
+ "content": "<|reserved_special_token_102|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "128111": {
892
+ "content": "<|reserved_special_token_103|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "128112": {
900
+ "content": "<|reserved_special_token_104|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "128113": {
908
+ "content": "<|reserved_special_token_105|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "128114": {
916
+ "content": "<|reserved_special_token_106|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "128115": {
924
+ "content": "<|reserved_special_token_107|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "128116": {
932
+ "content": "<|reserved_special_token_108|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "128117": {
940
+ "content": "<|reserved_special_token_109|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "128118": {
948
+ "content": "<|reserved_special_token_110|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "128119": {
956
+ "content": "<|reserved_special_token_111|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "128120": {
964
+ "content": "<|reserved_special_token_112|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "128121": {
972
+ "content": "<|reserved_special_token_113|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "128122": {
980
+ "content": "<|reserved_special_token_114|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "128123": {
988
+ "content": "<|reserved_special_token_115|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "128124": {
996
+ "content": "<|reserved_special_token_116|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "128125": {
1004
+ "content": "<|reserved_special_token_117|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "128126": {
1012
+ "content": "<|reserved_special_token_118|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "128127": {
1020
+ "content": "<|reserved_special_token_119|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "128128": {
1028
+ "content": "<|reserved_special_token_120|>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "128129": {
1036
+ "content": "<|reserved_special_token_121|>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "128130": {
1044
+ "content": "<|reserved_special_token_122|>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "128131": {
1052
+ "content": "<|reserved_special_token_123|>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "128132": {
1060
+ "content": "<|reserved_special_token_124|>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "128133": {
1068
+ "content": "<|reserved_special_token_125|>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "128134": {
1076
+ "content": "<|reserved_special_token_126|>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "128135": {
1084
+ "content": "<|reserved_special_token_127|>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "128136": {
1092
+ "content": "<|reserved_special_token_128|>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "128137": {
1100
+ "content": "<|reserved_special_token_129|>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "128138": {
1108
+ "content": "<|reserved_special_token_130|>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "128139": {
1116
+ "content": "<|reserved_special_token_131|>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "128140": {
1124
+ "content": "<|reserved_special_token_132|>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "128141": {
1132
+ "content": "<|reserved_special_token_133|>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "128142": {
1140
+ "content": "<|reserved_special_token_134|>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "128143": {
1148
+ "content": "<|reserved_special_token_135|>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "128144": {
1156
+ "content": "<|reserved_special_token_136|>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "128145": {
1164
+ "content": "<|reserved_special_token_137|>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "128146": {
1172
+ "content": "<|reserved_special_token_138|>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "128147": {
1180
+ "content": "<|reserved_special_token_139|>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "128148": {
1188
+ "content": "<|reserved_special_token_140|>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ },
2051
+ "128256": {
2052
+ "content": "<audio>",
2053
+ "lstrip": false,
2054
+ "normalized": false,
2055
+ "rstrip": false,
2056
+ "single_word": false,
2057
+ "special": true
2058
+ }
2059
+ },
2060
+ "additional_special_tokens": [
2061
+ "<audio>"
2062
+ ],
2063
+ "bos_token": null,
2064
+ "clean_up_tokenization_spaces": true,
2065
+ "eos_token": "<|im_end|>",
2066
+ "extra_special_tokens": {},
2067
+ "fast": false,
2068
+ "model_input_names": [
2069
+ "input_ids",
2070
+ "attention_mask"
2071
+ ],
2072
+ "model_max_length": 131072,
2073
+ "pad_token": "<|finetune_right_pad_id|>",
2074
+ "tokenizer_class": "PreTrainedTokenizerFast"
2075
+ }