gbrabbit committed on
Commit
a3200aa
·
1 Parent(s): c8c097d

Auto commit at $(date '+m-H:S')

Browse files
Files changed (1) hide show
  1. configuration.py +125 -0
configuration.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.models.llama.configuration_llama import LlamaConfig
5
+ from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
6
+
7
+ logger = logging.getLogger("kanana-1.5-v")
8
+
9
+
10
class KananaVVisionConfig(PretrainedConfig):
    """Configuration of the Kanana-1.5-V visual encoder.

    Each constructor argument is stored on the instance under the same name;
    any additional keyword arguments are passed on to ``PretrainedConfig``.
    How the encoder interprets the individual fields is decided by the
    modeling code, which is not part of this file.

    Args:
        depth, embed_dim, mlp_ratio, num_heads, in_chans, hidden_size:
            Encoder size hyper-parameters (presumably a ViT-style backbone;
            confirm against the modeling code).
        patch_size, spatial_merge_size, spatial_patch_size, temporal_patch_size:
            Patching / merging hyper-parameters.
        initializer_range: Stored as given (default ``0.02``).
        image_size: Defaults to the string ``"dynamic"``.
        image_mean, image_std: Normalization statistics; default to the
            ``OPENAI_CLIP_MEAN`` / ``OPENAI_CLIP_STD`` constants.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "kanana-1.5-v-visual-encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        mlp_ratio=4,
        num_heads=16,
        in_chans=3,
        hidden_size=1280,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        initializer_range=0.02,
        image_size="dynamic",
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_chans = in_chans
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        self.initializer_range = initializer_range
        self.image_size = image_size
        # The defaults are module-level objects shared with the rest of the
        # process. Copy list inputs so that editing this config in place can
        # never alter those shared constants or a list owned by the caller.
        # Non-list values (e.g. tuples, None) are stored unchanged.
        self.image_mean = list(image_mean) if isinstance(image_mean, list) else image_mean
        self.image_std = list(image_std) if isinstance(image_std, list) else image_std
48
+
49
+
50
class KananaVVisualProjectorConfig(PretrainedConfig):
    """Configuration of the Kanana-1.5-V visual projector.

    A plain value holder: every constructor argument is stored on the
    instance under the same name, and any additional keyword arguments are
    passed on to ``PretrainedConfig``. The projector implementation that
    consumes these fields is not part of this file.
    """

    model_type = "kanana-1.5-v-visual_projector"
    base_config_key = "projector_config"

    def __init__(
        self,
        depth=2,
        encoder_hidden_size=1280,
        feature_layer_index=-1,
        hidden_size=1024,
        merge_size=2,
        mlp_depth=2,
        num_eos_tokens=0,
        output_hidden_size=2048,
        pos_emb=True,
        pos_emb_size=576,
        prenorm=False,
        projector_type="dynamic-c-abs",
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Projector-specific fields, stored exactly as received.
        self.depth = depth
        self.encoder_hidden_size = encoder_hidden_size
        self.feature_layer_index = feature_layer_index
        self.hidden_size = hidden_size
        self.merge_size = merge_size
        self.mlp_depth = mlp_depth
        self.num_eos_tokens = num_eos_tokens
        self.output_hidden_size = output_hidden_size
        self.pos_emb = pos_emb
        self.pos_emb_size = pos_emb_size
        self.prenorm = prenorm
        self.projector_type = projector_type
84
+
85
+
86
class KananaLanguageConfig(LlamaConfig):
    """Language-model configuration: a ``LlamaConfig`` with Kanana identifiers.

    Adds no fields of its own. It only overrides ``model_type`` and
    ``base_config_key`` and hands every keyword argument to ``LlamaConfig``.
    """

    model_type = "kanana-1.5-3b-instruct"
    base_config_key = "text_config"

    def __init__(self, **kwargs):
        # All options are interpreted by LlamaConfig.
        super().__init__(**kwargs)
95
+
96
+
97
class KananaVConfig(PretrainedConfig):
    """Top-level Kanana-1.5-V configuration.

    Bundles three sub-configurations, exposed as attributes:
    ``vision_config`` (``KananaVVisionConfig``), ``projector_config``
    (``KananaVVisualProjectorConfig``) and ``text_config``
    (``KananaLanguageConfig``).

    Args:
        vision_config: Keyword arguments for ``KananaVVisionConfig`` as a
            dict, an existing ``KananaVVisionConfig`` instance, or ``None``
            to use that class's defaults.
        projector_config: Same, for ``KananaVVisualProjectorConfig``.
        text_config: Same, for ``KananaLanguageConfig``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "kanana-1.5-v"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        projector_config=None,
        text_config=None,
        **kwargs,
    ):
        # ``None`` defaults replace the former shared ``{}`` default objects.
        super().__init__(**kwargs)

        # Vision config
        self.vision_config = self._build_sub_config(KananaVVisionConfig, vision_config)

        # Visual projector config
        self.projector_config = self._build_sub_config(
            KananaVVisualProjectorConfig, projector_config
        )

        # Language model config
        self.text_config = self._build_sub_config(KananaLanguageConfig, text_config)

    @staticmethod
    def _build_sub_config(config_cls, value):
        """Return ``value`` as an instance of ``config_cls``.

        ``None`` yields a default-constructed config, an existing instance is
        used as-is, and anything else is unpacked as keyword arguments.
        """
        if value is None:
            return config_cls()
        if isinstance(value, config_cls):
            return value
        return config_cls(**value)

    @property
    def num_visual_tokens(self):
        """Always the string ``"dynamic"``."""
        return "dynamic"

    @property
    def hidden_size(self):
        """Hidden size of the language model (``text_config.hidden_size``)."""
        return self.text_config.hidden_size