xiaoyi1734 committed on
Commit
42c8f08
·
verified ·
1 Parent(s): ec25524

Upload Kimi-Audio-Reaction/configuration_moonshot_kimia.py with huggingface_hub

Browse files
Kimi-Audio-Reaction/configuration_moonshot_kimia.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config


class KimiAudioConfig(Qwen2Config):
    """Qwen2 configuration extended with Kimi-Audio specific fields.

    The standard transformer hyper-parameters (``vocab_size`` through
    ``tie_word_embeddings``) are forwarded unchanged to ``Qwen2Config``,
    together with any extra ``**kwargs``. Every remaining argument is
    stored, unmodified, as an instance attribute of the same name.
    """

    def __init__(
        self,
        vocab_size=163840,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        rope_theta=10000.0,
        rope_scaling=None,
        tie_word_embeddings=False,
        kimia_mimo_layers: int = 6,
        kimia_mimo_audiodelaytokens: int = 5,
        kimia_mimo_transformer_from_layer_index: int = 21,
        kimia_audio_output_vocab: int = 16896,
        kimia_text_output_vocab: int = 152064,
        num_audio_special_tokens: int = 512,
        num_base_tokens: int = 151643,
        kimia_token_offset: int = 152064,
        use_whisper_feature: bool = True,
        kimia_adaptor_input_dim: int = 5120,
        kimia_media_begin: int = 151661,
        kimia_media_end: int = 151663,
        **kwargs,
    ):
        # Hyper-parameters handled by the Qwen2 base configuration.
        qwen2_settings = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "num_key_value_heads": num_key_value_heads,
            "hidden_act": hidden_act,
            "initializer_range": initializer_range,
            "rms_norm_eps": rms_norm_eps,
            "use_cache": use_cache,
            "tie_word_embeddings": tie_word_embeddings,
            "rope_theta": rope_theta,
            "rope_scaling": rope_scaling,
        }
        super().__init__(**qwen2_settings, **kwargs)

        # Kimi-Audio specific fields, assigned after the base initializer
        # has run. Insertion order matches the attribute order callers may
        # observe on the instance.
        kimia_settings = {
            "kimia_mimo_layers": kimia_mimo_layers,
            "kimia_mimo_audiodelaytokens": kimia_mimo_audiodelaytokens,
            # vocab
            "kimia_mimo_transformer_from_layer_index": (
                kimia_mimo_transformer_from_layer_index
            ),
            "kimia_audio_output_vocab": kimia_audio_output_vocab,
            "kimia_text_output_vocab": kimia_text_output_vocab,
            "num_audio_special_tokens": num_audio_special_tokens,
            "num_base_tokens": num_base_tokens,
            "kimia_token_offset": kimia_token_offset,
            "use_whisper_feature": use_whisper_feature,
            "kimia_adaptor_input_dim": kimia_adaptor_input_dim,
            # special tokens
            "kimia_media_begin": kimia_media_begin,
            "kimia_media_end": kimia_media_end,
        }
        for field_name, field_value in kimia_settings.items():
            setattr(self, field_name, field_value)