from transformers import PretrainedConfig, Qwen3Config

try:
    from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
except ImportError:
    print('Please upgrade transformers to version 4.46.3 or higher')


class POINTSGUIConfig(PretrainedConfig):
    """Configuration class for `POINTSGUI`."""

    model_type = "points_gui"
    is_composition = True

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Allow empty instantiation (e.g. `POINTSGUIConfig()`) without
        # requiring the vision/LLM sub-configs.
        if not kwargs:
            return
        vision_config = kwargs.pop("vision_config", None)
        llm_config = kwargs.pop("llm_config", None)
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        else:
            self.vision_config = vision_config
        if isinstance(llm_config, dict):
            self.llm_config = Qwen3Config(**llm_config)
        else:
            # Accept an already-instantiated config object; fall back to its
            # dict form so the key-based copies below keep working.
            self.llm_config = llm_config
            llm_config = llm_config.to_dict()

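        # Mirror the core LLM hyper-parameters at the top level so downstream
        # code can read them directly from the composite config.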
        self.vocab_size = llm_config["vocab_size"]
        self.max_position_embeddings = llm_config["max_position_embeddings"]
        self.hidden_size = llm_config["hidden_size"]
        self.intermediate_size = llm_config["intermediate_size"]
        self.num_hidden_layers = llm_config["num_hidden_layers"]
        self.num_attention_heads = llm_config["num_attention_heads"]
        self.use_sliding_window = llm_config["use_sliding_window"]
        self.sliding_window = llm_config["sliding_window"]  # we check `use_sliding_window` in the modeling code
        self.max_window_layers = llm_config["max_window_layers"]

        # for backward compatibility
        if llm_config["num_key_value_heads"] is None:
            llm_config["num_key_value_heads"] = llm_config["num_attention_heads"]

        self.num_key_value_heads = llm_config["num_key_value_heads"]
        self.head_dim = llm_config["head_dim"]
        self.hidden_act = llm_config["hidden_act"]
        self.initializer_range = llm_config["initializer_range"]
        self.rms_norm_eps = llm_config["rms_norm_eps"]
        self.use_cache = llm_config["use_cache"]
        self.rope_theta = llm_config["rope_theta"]
        self.rope_scaling = llm_config["rope_scaling"]
        self.attention_bias = llm_config["attention_bias"]
        self.attention_dropout = llm_config["attention_dropout"]
        # Backward compatibility for rotary position embedding parameters:
        # older configs store the kind under "type"; mirror it into
        # "rope_type" and map the legacy "mrope" value to "default".
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

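        # Re-run the base initialisation so that `tie_word_embeddings` from
        # the LLM sub-config is registered on the composite config alongside
        # any remaining keyword arguments.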
        super().__init__(
            tie_word_embeddings=llm_config["tie_word_embeddings"],
            **kwargs,
        )
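

if __name__ == "__main__":
    # Minimal usage sketch (not part of the upstream file), assuming a
    # transformers release that ships both Qwen2VLVisionConfig and Qwen3Config:
    # build the composite config from the default sub-config dicts. A real
    # checkpoint ships its own vision_config / llm_config inside config.json
    # and would normally be loaded via `POINTSGUIConfig.from_pretrained(...)`.
    config = POINTSGUIConfig(
        vision_config=Qwen2VLVisionConfig().to_dict(),
        llm_config=Qwen3Config().to_dict(),
    )
    print(config.model_type, config.hidden_size, config.num_hidden_layers)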