# (removed copy/paste artifacts from the source page: file-size line "3,233 Bytes",
#  commit hash "cb65f9f", and the page's gutter line numbers 1-79 — none of it is code)
from transformers.configuration_utils import PretrainedConfig
from transformers import AutoConfig
from transformers.activations import ACT2FN


class QualityLinearAdapterConfig(PretrainedConfig):
    """Configuration for the MLP ("linear") adapter that projects an encoder's
    hidden states into the backbone LLM's embedding space.

    Args:
        in_hidden_size: Hidden size of the upstream (vision/audio) encoder output.
        num_layers: Number of linear layers in the adapter MLP.
        intermediate_size: Width of the adapter's intermediate layer.
        out_hidden_size: Target hidden size, i.e. the LLM's hidden size.
        act_fn: Activation-function name; presumably must be a key of
            ``transformers.activations.ACT2FN`` — confirm against the adapter
            module that consumes this config.
    """

    model_type = "QualityvForCausalLM"
    adapter_type = "linear"

    def __init__(self,
                 in_hidden_size: int = 1024,
                 num_layers: int = 2,
                 intermediate_size: int = 2048,
                 # Fixed default: was 2028, almost certainly a typo for 2048
                 # (matches intermediate_size and hidden-size conventions; both
                 # in-file call sites pass this explicitly, so no caller relied
                 # on the old default).
                 out_hidden_size: int = 2048,
                 act_fn: str = "gelu",
                 **kwargs,
                 ) -> None:
        super().__init__(**kwargs)

        self.in_hidden_size = in_hidden_size
        self.num_layers = num_layers
        self.intermediate_size = intermediate_size
        self.out_hidden_size = out_hidden_size
        self.act_fn = act_fn
        
        
class QualityvConfig(PretrainedConfig):
    """Composite configuration for ``QualityvForCausalLM``.

    Bundles the backbone LLM config with optional vision and audio encoder
    configs, plus per-modality adapter configs that project each encoder's
    hidden size to the LLM's hidden size.

    Args:
        vision_model_name: HF model name/path of the vision encoder, or None.
        audio_model_name: HF model name/path of the audio encoder, or None.
        llm_model_name: HF model name/path of the backbone LLM, or None.
        image_token_id / video_token_id / audio_token_id: Placeholder token ids
            used to splice modality embeddings into the text sequence
            (semantics assumed from names — confirm against the model code).
        adapter_type: Adapter flavor; only "linear" is visible in this file.
        num_adapter_layers: Layer count forwarded to each adapter config.
    """

    model_type = "QualityvForCausalLM"

    def __init__(self,
                 vision_model_name: str = None,
                 audio_model_name: str = None,
                 llm_model_name: str = None,
                 image_token_id: int = None,
                 video_token_id: int = None,
                 audio_token_id: int = None,
                 adapter_type: str = "linear",
                 num_adapter_layers: int = 2,
                 **kwargs,
                 ) -> None:
        super().__init__(**kwargs)
        self.vision_model_name = vision_model_name
        self.audio_model_name = audio_model_name
        self.llm_model_name = llm_model_name
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.audio_token_id = audio_token_id
        self.adapter_type = adapter_type
        self.num_adapter_layers = num_adapter_layers

        # Define every sub-config up front so they always exist. The original
        # only assigned them inside the `llm_model_name is not None` branch, so
        # constructing this config without an LLM left e.g. `self.vision_config`
        # undefined and later reads raised AttributeError instead of seeing None.
        self.llm_config = None
        self.vision_config = None
        self.vision_adapter_config = None
        self.audio_config = None
        self.audio_adapter_config = None

        if llm_model_name is not None:
            self.llm_config = AutoConfig.from_pretrained(llm_model_name)
            # Mirror every LLM-config field onto this config so generic HF code
            # that reads e.g. `config.hidden_size` keeps working.
            # NOTE(review): this overwrites any identically named attribute set
            # above (including class-level `model_type`) — confirm intended.
            for key, value in self.llm_config.to_dict().items():
                setattr(self, key, value)
            # Adapters need the LLM hidden size as their output width, so the
            # encoder configs are only built when an LLM is configured.
            if vision_model_name is not None:
                self.vision_config = AutoConfig.from_pretrained(vision_model_name)
                self.vision_adapter_config = self._build_adapter_config(self.vision_config)
            if audio_model_name is not None:
                self.audio_config = AutoConfig.from_pretrained(audio_model_name)
                self.audio_adapter_config = self._build_adapter_config(self.audio_config)

    def _build_adapter_config(self, encoder_config):
        """Build an adapter config projecting ``encoder_config.hidden_size`` to
        the LLM's hidden size. Requires ``self.llm_config`` to be set."""
        return QualityLinearAdapterConfig(
            in_hidden_size=encoder_config.hidden_size,
            intermediate_size=encoder_config.hidden_size * 2,
            out_hidden_size=self.llm_config.hidden_size,
            num_layers=self.num_adapter_layers,
        )

    def get_vocab_size(self):
        """Vocabulary size of the backbone LLM (fails if no LLM is configured)."""
        return self.llm_config.vocab_size

    def get_text_config(self, **kwargs):
        """Delegate to the backbone LLM config's ``get_text_config``."""
        return self.llm_config.get_text_config(**kwargs)