| from transformers.configuration_utils import PretrainedConfig |
| from transformers import AutoConfig |
| from transformers.activations import ACT2FN |
|
|
|
|
class QualityLinearAdapterConfig(PretrainedConfig):
    """Configuration for a linear (MLP) adapter that projects encoder
    hidden states into the LLM embedding space.

    Args:
        in_hidden_size: Hidden size of the upstream encoder output.
        num_layers: Number of linear layers in the adapter stack.
        intermediate_size: Hidden size of the intermediate projection.
        out_hidden_size: Hidden size of the target LLM embeddings.
        act_fn: Activation function name (a key into ``ACT2FN``).
        **kwargs: Forwarded to :class:`PretrainedConfig`.
    """

    model_type = "QualityvForCausalLM"
    adapter_type = "linear"

    def __init__(
        self,
        in_hidden_size: int = 1024,
        num_layers: int = 2,
        intermediate_size: int = 2048,
        # Was 2028 — almost certainly a typo for 2048 (matches the
        # intermediate_size default; 2028 is not a standard hidden size).
        # Callers in this file always pass out_hidden_size explicitly,
        # so the default only affects standalone construction.
        out_hidden_size: int = 2048,
        act_fn: str = "gelu",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.in_hidden_size = in_hidden_size
        self.num_layers = num_layers
        self.intermediate_size = intermediate_size
        self.out_hidden_size = out_hidden_size
        self.act_fn = act_fn
| |
| |
class QualityvConfig(PretrainedConfig):
    """Top-level configuration for the Qualityv multimodal causal LM.

    Composes an LLM config with optional vision and audio encoder
    configs, each paired with a :class:`QualityLinearAdapterConfig`
    that maps encoder hidden states into the LLM hidden size.

    Args:
        vision_model_name: Hub name/path of the vision encoder, or None.
        audio_model_name: Hub name/path of the audio encoder, or None.
        llm_model_name: Hub name/path of the language model, or None.
        image_token_id: Token id marking image placeholders, or None.
        video_token_id: Token id marking video placeholders, or None.
        audio_token_id: Token id marking audio placeholders, or None.
        adapter_type: Adapter architecture identifier.
        num_adapter_layers: Number of layers in each adapter.
        **kwargs: Forwarded to :class:`PretrainedConfig`.

    Raises:
        ValueError: If a vision or audio encoder is configured without
            an LLM (the adapters need the LLM hidden size).
    """

    model_type = "QualityvForCausalLM"

    def __init__(
        self,
        vision_model_name: str = None,
        audio_model_name: str = None,
        llm_model_name: str = None,
        image_token_id: int = None,
        video_token_id: int = None,
        audio_token_id: int = None,
        adapter_type: str = "linear",
        num_adapter_layers: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.vision_model_name = vision_model_name
        self.audio_model_name = audio_model_name
        self.llm_model_name = llm_model_name
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.audio_token_id = audio_token_id
        self.adapter_type = adapter_type
        self.num_adapter_layers = num_adapter_layers

        if llm_model_name is not None:
            self.llm_config = AutoConfig.from_pretrained(llm_model_name)
            # Mirror the LLM's fields onto this config so downstream code
            # that reads e.g. `config.hidden_size` works transparently.
            # NOTE(review): this can overwrite attributes such as
            # `model_type` with the LLM's values — presumably intended,
            # but worth confirming.
            for key, value in self.llm_config.to_dict().items():
                setattr(self, key, value)
        else:
            # Original code left `llm_config` unset here, causing an
            # opaque AttributeError later; make the absence explicit.
            self.llm_config = None

        # The adapters project encoder outputs into the LLM hidden size,
        # so an encoder without an LLM is a configuration error. The
        # original code crashed here anyway (missing `llm_config`); fail
        # with an actionable message instead.
        if self.llm_config is None and (
            vision_model_name is not None or audio_model_name is not None
        ):
            raise ValueError(
                "llm_model_name must be set when vision_model_name or "
                "audio_model_name is provided."
            )

        if vision_model_name is not None:
            self.vision_config = AutoConfig.from_pretrained(vision_model_name)
            self.vision_adapter_config = QualityLinearAdapterConfig(
                in_hidden_size=self.vision_config.hidden_size,
                intermediate_size=self.vision_config.hidden_size * 2,
                out_hidden_size=self.llm_config.hidden_size,
                num_layers=num_adapter_layers,
            )
        else:
            self.vision_config = None
            # Keep adapter attributes defined in all configurations
            # (original left this attribute missing when unset).
            self.vision_adapter_config = None

        if audio_model_name is not None:
            self.audio_config = AutoConfig.from_pretrained(audio_model_name)
            self.audio_adapter_config = QualityLinearAdapterConfig(
                in_hidden_size=self.audio_config.hidden_size,
                intermediate_size=self.audio_config.hidden_size * 2,
                out_hidden_size=self.llm_config.hidden_size,
                num_layers=num_adapter_layers,
            )
        else:
            self.audio_config = None
            self.audio_adapter_config = None

    def get_vocab_size(self):
        """Return the vocabulary size of the underlying LLM config."""
        return self.llm_config.vocab_size

    def get_text_config(self, **kwargs):
        """Delegate to the LLM config's text-config accessor."""
        return self.llm_config.get_text_config(**kwargs)
|
|