| """VLV model configuration""" |
|
|
| from typing import Optional, Dict, Any |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| class VLV_Config(PretrainedConfig): |
| r""" |
| This is the configuration class to store the configuration of a [`VLV_MODEL`]. It is used to instantiate a VLV model |
| according to the specified arguments, defining the model architecture. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Args: |
| model_type (`str`, *optional*, defaults to "VLV_decoder"): |
| The model type identifier. |
| batch_size (`int`, *optional*, defaults to 1): |
| The batch size for inference. |
| deepspeed (`bool`, *optional*, defaults to True): |
| Whether to use deepspeed. |
| distributed (`bool`, *optional*, defaults to True): |
| Whether to use distributed training. |
| fp32 (`bool`, *optional*, defaults to True): |
| Whether to use fp32 precision. |
| guidance_scale (`float`, *optional*, defaults to 2.0): |
| The guidance scale for generation. |
| hidden_size (`int`, *optional*, defaults to 128): |
| The hidden size of the model. |
| image_size (`int`, *optional*, defaults to 768): |
| The size of input images. |
| learnable_token_length (`int`, *optional*, defaults to 77): |
| The length of learnable tokens. |
| local_rank (`int`, *optional*, defaults to 0): |
| The local rank for distributed training. |
| mixed_precision (`str`, *optional*, defaults to "bf16"): |
| The mixed precision mode. |
| num_inference_steps (`int`, *optional*, defaults to 50): |
| The number of inference steps. |
| torch_dtype (`str`, *optional*, defaults to "bfloat16"): |
| The torch dtype. |
| use_text_encoder (`bool`, *optional*, defaults to True): |
| Whether to use text encoder. |
| verbose (`bool`, *optional*, defaults to True): |
| Whether to enable verbose mode. |
| qwen_model (`str`, *optional*, defaults to "Qwen/Qwen2.5-3B"): |
| The Qwen model to use. |
| qwen2_config (`dict`, *optional*): |
| The Qwen2 configuration. |
| max_length (`int`, *optional*, defaults to 300): |
| Maximum length for generation. |
| num_beams (`int`, *optional*, defaults to 4): |
| Number of beams for beam search. |
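
    Example (a minimal sketch; the import path assumes this module is importable as `configuration_vlv`):

    ```python
    >>> from configuration_vlv import VLV_Config

    >>> # Initializing a configuration with default values
    >>> configuration = VLV_Config()

    >>> # Overriding selected generation settings
    >>> configuration = VLV_Config(guidance_scale=3.0, num_inference_steps=25)
    ```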
| """ |
|
|
| model_type = "VLV_decoder" |
| keys_to_ignore_at_inference = ["past_key_values"] |
|
|
| def __init__( |
| self, |
| model_type: str = "VLV_decoder", |
| batch_size: int = 1, |
| deepspeed: bool = True, |
| distributed: bool = True, |
| fp32: bool = True, |
| guidance_scale: float = 2.0, |
| hidden_size: int = 128, |
| image_size: int = 768, |
| learnable_token_length: int = 77, |
| local_rank: int = 0, |
| mixed_precision: str = "bf16", |
| num_inference_steps: int = 50, |
| torch_dtype: str = "bfloat16", |
| transformers_version: str = "4.51.1", |
| use_text_encoder: bool = True, |
| verbose: bool = True, |
| qwen_model: str = "Qwen/Qwen2.5-3B", |
| stable_diffusion_model_path: str = "stabilityai/stable-diffusion-2-1-base", |
| florence2_model_path: str = "microsoft/Florence-2-large", |
| qwen2_config: Optional[Dict[str, Any]] = None, |
| max_length: int = 300, |
| num_beams: int = 4, |
| **kwargs, |
| ): |
| self.model_type = model_type |
| self.batch_size = batch_size |
| self.deepspeed = deepspeed |
| self.distributed = distributed |
| self.fp32 = fp32 |
| self.guidance_scale = guidance_scale |
| self.hidden_size = hidden_size |
| self.image_size = image_size |
| self.learnable_token_length = learnable_token_length |
| self.local_rank = local_rank |
| self.mixed_precision = mixed_precision |
| self.num_inference_steps = num_inference_steps |
| self.torch_dtype = torch_dtype |
| self.transformers_version = transformers_version |
| self.use_text_encoder = use_text_encoder |
| self.verbose = verbose |
| self.qwen_model = qwen_model |
| self.stable_diffusion_model_path = stable_diffusion_model_path |
| self.florence2_model_path = florence2_model_path |
| self.qwen2_config = qwen2_config or self._get_default_qwen2_config() |
| self.max_length = max_length |
| self.num_beams = num_beams |
|
|
| super().__init__(**kwargs) |

    def _get_default_qwen2_config(self):
        """Get the default Qwen2 configuration (values mirror the published `Qwen/Qwen2.5-3B` config)."""
        return {
            "architectures": ["Qwen2ForCausalLM"],
            "attention_dropout": 0.0,
            "bos_token_id": 151643,
            "eos_token_id": 151643,
            "hidden_act": "silu",
            "hidden_size": 2048,
            "initializer_range": 0.02,
            "intermediate_size": 11008,
            "max_position_embeddings": 32768,
            "max_window_layers": 36,
            "model_type": "qwen2",
            "num_attention_heads": 16,
            "num_hidden_layers": 36,
            "num_key_value_heads": 2,
            "rms_norm_eps": 1e-06,
            "rope_theta": 1000000.0,
            "sliding_window": 32768,
            "tie_word_embeddings": True,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.40.1",
            "use_cache": True,
            "use_mrope": False,
            "use_sliding_window": False,
            "vocab_size": 151936,
        }


class CLIPDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for the CLIPDecoder model (legacy support).

    Args:
        input_dim (`int`, *optional*, defaults to 1024):
            The input feature dimension.
        bf16 (`bool`, *optional*, defaults to `False`):
            Whether to use bf16 precision.
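
    Example (a minimal sketch; the import path assumes this module is importable as `configuration_vlv`):

    ```python
    >>> from configuration_vlv import CLIPDecoderConfig

    >>> configuration = CLIPDecoderConfig(input_dim=1024, bf16=True)
    ```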
| """ |
| |
| model_type = "vlv_stage2" |
| |
| def __init__( |
| self, |
| input_dim: int = 1024, |
| bf16: bool = False, |
| **kwargs, |
| ): |
| self.input_dim = input_dim |
| self.bf16 = bf16 |
| super().__init__(**kwargs) |
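

# A minimal smoke test sketching how these configurations are typically used.
# Hypothetical usage only; the printed values follow from the defaults above.
if __name__ == "__main__":
    config = VLV_Config(batch_size=2, guidance_scale=3.0)
    print(config.model_type)  # "VLV_decoder"
    print(config.qwen2_config["hidden_size"])  # 2048, from the default Qwen2 config

    legacy = CLIPDecoderConfig(input_dim=1024, bf16=True)
    print(legacy.model_type)  # "vlv_stage2"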