from transformers.configuration_utils import PretrainedConfig
from transformers import AutoConfig
from transformers.activations import ACT2FN
class QualityLinearAdapterConfig(PretrainedConfig):
    """Configuration for a small MLP ("linear adapter") that projects encoder
    hidden states into the LLM's hidden space.

    Args:
        in_hidden_size: Width of the incoming encoder features.
        num_layers: Number of linear layers in the adapter MLP.
        intermediate_size: Width of the hidden layer(s) of the MLP.
        out_hidden_size: Width of the projected output (the LLM hidden size).
        act_fn: Activation function name (presumably a key of ACT2FN — the
            import suggests so, but the module using it is not visible here).
    """
    model_type = "QualityvForCausalLM"
    adapter_type = "linear"
    def __init__(self,
        in_hidden_size: int = 1024,
        num_layers: int = 2,
        intermediate_size: int = 2048,
        # Was 2028 — an apparent typo for 2048. Both in-file callers pass
        # out_hidden_size explicitly, so changing the default is safe here.
        out_hidden_size: int = 2048,
        act_fn: str = "gelu",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.in_hidden_size = in_hidden_size
        self.num_layers = num_layers
        self.intermediate_size = intermediate_size
        self.out_hidden_size = out_hidden_size
        self.act_fn = act_fn
class QualityvConfig(PretrainedConfig):
    """Top-level configuration for the Qualityv multimodal causal LM.

    Optionally composes a vision encoder config, an audio encoder config and
    an LLM config (each resolved via ``AutoConfig.from_pretrained``), plus one
    linear-adapter config per enabled modality mapping encoder features into
    the LLM hidden size.

    Args:
        vision_model_name: HF model name/path of the vision encoder, or None.
        audio_model_name: HF model name/path of the audio encoder, or None.
        llm_model_name: HF model name/path of the language model, or None.
        image_token_id: Token id marking image placeholders, or None.
        video_token_id: Token id marking video placeholders, or None.
        audio_token_id: Token id marking audio placeholders, or None.
        adapter_type: Adapter kind; only "linear" is used in this file.
        num_adapter_layers: Layer count for each modality adapter.

    Raises:
        ValueError: if a vision/audio model is configured without an LLM
            (the adapter output width is the LLM hidden size).
    """
    model_type = "QualityvForCausalLM"
    def __init__(self,
        vision_model_name: str = None,
        audio_model_name: str = None,
        llm_model_name: str = None,
        image_token_id: int = None,
        video_token_id: int = None,
        audio_token_id: int = None,
        adapter_type: str = "linear",
        num_adapter_layers: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.vision_model_name = vision_model_name
        self.audio_model_name = audio_model_name
        self.llm_model_name = llm_model_name
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.audio_token_id = audio_token_id
        self.adapter_type = adapter_type
        self.num_adapter_layers = num_adapter_layers
        # Default every sub-config so attribute access never raises
        # AttributeError: the original left llm_config undefined when
        # llm_model_name was None, and vision_adapter_config /
        # audio_adapter_config undefined in the else branches.
        self.llm_config = None
        self.vision_config = None
        self.vision_adapter_config = None
        self.audio_config = None
        self.audio_adapter_config = None
        if llm_model_name is not None:
            self.llm_config = AutoConfig.from_pretrained(llm_model_name)
            # Mirror all LLM config fields onto this config for flat access.
            # NOTE(review): this can shadow attributes set above (including
            # model_type) if the LLM config defines keys with the same names.
            for key, value in self.llm_config.to_dict().items():
                setattr(self, key, value)
        if vision_model_name is not None:
            self.vision_config = AutoConfig.from_pretrained(vision_model_name)
            self.vision_adapter_config = self._make_adapter_config(
                self.vision_config, "vision")
        if audio_model_name is not None:
            self.audio_config = AutoConfig.from_pretrained(audio_model_name)
            self.audio_adapter_config = self._make_adapter_config(
                self.audio_config, "audio")
    def _make_adapter_config(self, encoder_config, modality: str):
        """Build the encoder-to-LLM adapter config for one modality.

        Raises ValueError with an actionable message when no LLM config is
        available (the original crashed with a bare AttributeError instead).
        """
        if self.llm_config is None:
            raise ValueError(
                f"llm_model_name must be set to build the {modality} adapter "
                "config: its output width is the LLM hidden size."
            )
        return QualityLinearAdapterConfig(
            in_hidden_size=encoder_config.hidden_size,
            intermediate_size=encoder_config.hidden_size * 2,
            out_hidden_size=self.llm_config.hidden_size,
            num_layers=self.num_adapter_layers,
        )
    def get_vocab_size(self):
        """Return the vocabulary size of the underlying LLM config."""
        return self.llm_config.vocab_size
    def get_text_config(self, **kwargs):
        """Delegate to the LLM config's text config (HF convention)."""
        return self.llm_config.get_text_config(**kwargs)