from typing import Optional

from transformers import AutoConfig
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig


class QualityLinearAdapterConfig(PretrainedConfig):
    """Configuration for a linear (MLP) adapter that projects encoder hidden
    states into the LLM embedding space.

    Args:
        in_hidden_size: Hidden size of the upstream encoder (adapter input).
        num_layers: Number of linear layers in the adapter.
        intermediate_size: Hidden size of the adapter's intermediate layer(s).
        out_hidden_size: Hidden size of the target LLM (adapter output).
        act_fn: Name of the activation function (a key of ``ACT2FN``).
    """

    model_type = "QualityvForCausalLM"
    adapter_type = "linear"

    def __init__(
        self,
        in_hidden_size: int = 1024,
        num_layers: int = 2,
        intermediate_size: int = 2048,
        # NOTE(review): 2028 looks like a typo for 2048 — confirm before changing.
        # Kept as-is for backward compatibility; QualityvConfig always overrides it.
        out_hidden_size: int = 2028,
        act_fn: str = "gelu",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.in_hidden_size = in_hidden_size
        self.num_layers = num_layers
        self.intermediate_size = intermediate_size
        self.out_hidden_size = out_hidden_size
        self.act_fn = act_fn


class QualityvConfig(PretrainedConfig):
    """Top-level multimodal configuration tying a vision encoder, an audio
    encoder, and an LLM together via linear adapters.

    Each sub-config is resolved with ``AutoConfig.from_pretrained`` from its
    model name; encoders that are not supplied get a ``None`` config. For each
    supplied encoder a :class:`QualityLinearAdapterConfig` is built that maps
    the encoder's ``hidden_size`` to the LLM's ``hidden_size``.

    Args:
        vision_model_name: Pretrained vision encoder name/path, or ``None``.
        audio_model_name: Pretrained audio encoder name/path, or ``None``.
        llm_model_name: Pretrained LLM name/path, or ``None``.
        image_token_id: Token id used as the image placeholder, or ``None``.
        video_token_id: Token id used as the video placeholder, or ``None``.
        audio_token_id: Token id used as the audio placeholder, or ``None``.
        adapter_type: Adapter architecture identifier (only "linear" is used here).
        num_adapter_layers: Number of layers in each adapter.
    """

    model_type = "QualityvForCausalLM"

    def __init__(
        self,
        vision_model_name: Optional[str] = None,
        audio_model_name: Optional[str] = None,
        llm_model_name: Optional[str] = None,
        image_token_id: Optional[int] = None,
        video_token_id: Optional[int] = None,
        audio_token_id: Optional[int] = None,
        adapter_type: str = "linear",
        num_adapter_layers: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.vision_model_name = vision_model_name
        self.audio_model_name = audio_model_name
        self.llm_model_name = llm_model_name
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.audio_token_id = audio_token_id
        self.adapter_type = adapter_type
        self.num_adapter_layers = num_adapter_layers

        if llm_model_name is not None:
            self.llm_config = AutoConfig.from_pretrained(llm_model_name)
            # Mirror every LLM config field onto this config so downstream
            # code can read e.g. ``hidden_size`` directly from the top level.
            for key, value in self.llm_config.to_dict().items():
                setattr(self, key, value)
        else:
            # Bugfix: previously the attribute was left unset, so
            # get_vocab_size()/get_text_config() raised AttributeError instead
            # of the clearer failure on a None config. Also keeps this branch
            # consistent with vision_config/audio_config below.
            self.llm_config = None

        if vision_model_name is not None:
            self.vision_config = AutoConfig.from_pretrained(vision_model_name)
            # NOTE(review): assumes llm_model_name was also provided —
            # self.llm_config.hidden_size fails otherwise; confirm callers
            # always pass both.
            self.vision_adapter_config = QualityLinearAdapterConfig(
                in_hidden_size=self.vision_config.hidden_size,
                intermediate_size=self.vision_config.hidden_size * 2,
                out_hidden_size=self.llm_config.hidden_size,
                num_layers=num_adapter_layers,
            )
        else:
            self.vision_config = None

        if audio_model_name is not None:
            self.audio_config = AutoConfig.from_pretrained(audio_model_name)
            self.audio_adapter_config = QualityLinearAdapterConfig(
                in_hidden_size=self.audio_config.hidden_size,
                intermediate_size=self.audio_config.hidden_size * 2,
                out_hidden_size=self.llm_config.hidden_size,
                num_layers=num_adapter_layers,
            )
        else:
            self.audio_config = None

    def get_vocab_size(self):
        """Return the vocabulary size of the underlying LLM."""
        return self.llm_config.vocab_size

    def get_text_config(self, **kwargs):
        """Return the text (LLM) sub-config, delegating to the LLM config."""
        return self.llm_config.get_text_config(**kwargs)