| from transformers import PretrainedConfig | |
| class SpeechUnitConfig(PretrainedConfig): | |
| model_type = "speechunit" | |
| def __init__( | |
| self, | |
| base_model_id: str = "meta-llama/Llama-3.2-1B", | |
| num_hidden_layers: int = 3, | |
| output_dim: int = 2048, | |
| num_heads: int = 8, | |
| initializer_range: float = 0.02, | |
| **kwargs, | |
| ): | |
| self.base_model_id = base_model_id | |
| self.num_hidden_layers = num_hidden_layers | |
| self.output_dim = output_dim | |
| self.num_heads = num_heads | |
| self.initializer_range = initializer_range | |
| super().__init__(**kwargs) |