from typing import List

from transformers import PretrainedConfig


class TimerS1Config(PretrainedConfig):
    """Configuration class for the Timer-S1 model, storing the hyperparameters
    used to instantiate the model architecture."""

    model_type = "Timer-S1"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        input_token_len: int = 16,
        hidden_size: int = 1024,
        intermediate_size: int = 4096,
        output_token_lens: List[int] = [16],
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        hidden_act: str = "silu",
        use_cache: bool = True,
        rope_theta: int = 10000,
        dropout_rate: float = 0.1,
        initializer_range: float = 0.02,
        max_position_embeddings: int = 12800,
        quantiles: List[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        num_experts: int = 32,
        num_experts_per_token: int = 2,
        num_mtp_tokens: int = 16,
        **kwargs,
    ):
        self.input_token_len = input_token_len
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.output_token_lens = output_token_lens
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.dropout_rate = dropout_rate
        self.initializer_range = initializer_range
        self.max_position_embeddings = max_position_embeddings
        self.quantiles = quantiles
        self.num_experts = num_experts
        self.num_experts_per_token = num_experts_per_token
        self.num_mtp_tokens = num_mtp_tokens
        super().__init__(**kwargs)
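

# Minimal usage sketch: construct the config with a couple of illustrative
# overrides and round-trip it through the generic PretrainedConfig
# to_dict()/from_dict() helpers. The specific override values below are
# arbitrary examples, not recommended settings.
if __name__ == "__main__":
    config = TimerS1Config(num_hidden_layers=8, hidden_size=512)
    print(config.model_type, config.num_hidden_layers, config.hidden_size)

    # Serialize to a plain dict and rebuild an equivalent config from it.
    restored = TimerS1Config.from_dict(config.to_dict())
    assert restored.hidden_size == config.hidden_size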