Upload 2 files
Browse files
- configuration_centurio.py +1 -1
- modeling_centurio.py +7 -6
configuration_centurio.py
CHANGED
|
@@ -37,7 +37,7 @@ class CenturioConfig(PretrainedConfig):
|
|
| 37 |
ignore_index=-100,
|
| 38 |
image_token_index=32000,
|
| 39 |
adapter_type="multiscale-pool",
|
| 40 |
-
adapter_config=
|
| 41 |
**kwargs,
|
| 42 |
):
|
| 43 |
self.ignore_index = ignore_index
|
|
|
|
| 37 |
ignore_index=-100,
|
| 38 |
image_token_index=32000,
|
| 39 |
adapter_type="multiscale-pool",
|
| 40 |
+
adapter_config=dict(),
|
| 41 |
**kwargs,
|
| 42 |
):
|
| 43 |
self.ignore_index = ignore_index
|
modeling_centurio.py
CHANGED
|
@@ -74,7 +74,7 @@ class LlavaMultiModalAdapter(nn.Module):
|
|
| 74 |
class WindowMLPProjector(nn.Module):
|
| 75 |
def __init__(self, config: LlavaConfig):
|
| 76 |
super().__init__()
|
| 77 |
-
self.multi_scale =
|
| 78 |
self.linear_1 = nn.Linear(config.image_hidden_size, config.text_config.hidden_size, bias=True)
|
| 79 |
self.act = ACT2FN["gelu"]
|
| 80 |
self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
|
|
@@ -93,7 +93,7 @@ class WindowMLPProjector(nn.Module):
|
|
| 93 |
class WindowPoolProjector(nn.Module):
|
| 94 |
def __init__(self, config: LlavaConfig):
|
| 95 |
super().__init__()
|
| 96 |
-
self.multi_scale =
|
| 97 |
self.pool = nn.AdaptiveAvgPool2d(getattr(config, "adapter_pool", 8))
|
| 98 |
self.linear_1 = nn.Linear(config.image_hidden_size, config.text_config.hidden_size, bias=True)
|
| 99 |
self.act = ACT2FN["gelu"]
|
|
@@ -119,7 +119,7 @@ class WindowPoolProjector(nn.Module):
|
|
| 119 |
class WindowShuffelProjector(nn.Module):
|
| 120 |
def __init__(self, config: LlavaConfig):
|
| 121 |
super().__init__()
|
| 122 |
-
self.multi_scale =
|
| 123 |
self.scale_factor = getattr(config, "adapter_pool", 2)
|
| 124 |
self.pixel_unshuffel = nn.PixelUnshuffle(self.scale_factor)
|
| 125 |
self.linear_1 = nn.Linear(config.image_hidden_size*(self.scale_factor**2), config.text_config.hidden_size, bias=True)
|
|
@@ -148,7 +148,7 @@ class MultiscalePoolProjector(nn.Module):
|
|
| 148 |
def __init__(self, config: LlavaConfig):
|
| 149 |
super().__init__()
|
| 150 |
|
| 151 |
-
self.multi_scale = getattr(config, "adapter_multi_scale", 2)
|
| 152 |
self.pool = nn.AvgPool2d(self.multi_scale)
|
| 153 |
self.linear_1 = nn.Linear(config.image_hidden_size*2, config.text_config.hidden_size, bias=True)
|
| 154 |
self.act = ACT2FN["gelu"]
|
|
@@ -181,7 +181,7 @@ class MultiscaleShuffleProjector(nn.Module):
|
|
| 181 |
def __init__(self, config):
|
| 182 |
super().__init__()
|
| 183 |
|
| 184 |
-
self.multi_scale =
|
| 185 |
self.shuffle = nn.PixelUnshuffle(self.multi_scale)
|
| 186 |
|
| 187 |
inc, ouc = config.image_hidden_size*(1+self.multi_scale**2), config.text_config.hidden_size
|
|
@@ -447,7 +447,8 @@ class CenturioForConditionalGeneration(LlavaPreTrainedModel):
|
|
| 447 |
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
| 448 |
self.post_init()
|
| 449 |
|
| 450 |
-
|
|
|
|
| 451 |
|
| 452 |
def get_input_embeddings(self):
|
| 453 |
return self.language_model.get_input_embeddings()
|
|
|
|
| 74 |
class WindowMLPProjector(nn.Module):
|
| 75 |
def __init__(self, config: LlavaConfig):
|
| 76 |
super().__init__()
|
| 77 |
+
self.multi_scale = config.adapter_config.get("multi_scale", 2) #config.adapter_config.get("multi_scale")
|
| 78 |
self.linear_1 = nn.Linear(config.image_hidden_size, config.text_config.hidden_size, bias=True)
|
| 79 |
self.act = ACT2FN["gelu"]
|
| 80 |
self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
|
|
|
|
| 93 |
class WindowPoolProjector(nn.Module):
|
| 94 |
def __init__(self, config: LlavaConfig):
|
| 95 |
super().__init__()
|
| 96 |
+
self.multi_scale = config.adapter_config.get("multi_scale", 2) #config.adapter_config.get("multi_scale")
|
| 97 |
self.pool = nn.AdaptiveAvgPool2d(getattr(config, "adapter_pool", 8))
|
| 98 |
self.linear_1 = nn.Linear(config.image_hidden_size, config.text_config.hidden_size, bias=True)
|
| 99 |
self.act = ACT2FN["gelu"]
|
|
|
|
| 119 |
class WindowShuffelProjector(nn.Module):
|
| 120 |
def __init__(self, config: LlavaConfig):
|
| 121 |
super().__init__()
|
| 122 |
+
self.multi_scale = config.adapter_config.get("multi_scale", 2) #config.adapter_config.get("multi_scale")
|
| 123 |
self.scale_factor = getattr(config, "adapter_pool", 2)
|
| 124 |
self.pixel_unshuffel = nn.PixelUnshuffle(self.scale_factor)
|
| 125 |
self.linear_1 = nn.Linear(config.image_hidden_size*(self.scale_factor**2), config.text_config.hidden_size, bias=True)
|
|
|
|
| 148 |
def __init__(self, config: LlavaConfig):
|
| 149 |
super().__init__()
|
| 150 |
|
| 151 |
+
self.multi_scale = config.adapter_config.get("multi_scale", 2) #getattr(config.adapter_config, "adapter_multi_scale", 2)
|
| 152 |
self.pool = nn.AvgPool2d(self.multi_scale)
|
| 153 |
self.linear_1 = nn.Linear(config.image_hidden_size*2, config.text_config.hidden_size, bias=True)
|
| 154 |
self.act = ACT2FN["gelu"]
|
|
|
|
| 181 |
def __init__(self, config):
|
| 182 |
super().__init__()
|
| 183 |
|
| 184 |
+
self.multi_scale = config.adapter_config.get("multi_scale", 2) #config.adapter_config.get("multi_scale")
|
| 185 |
self.shuffle = nn.PixelUnshuffle(self.multi_scale)
|
| 186 |
|
| 187 |
inc, ouc = config.image_hidden_size*(1+self.multi_scale**2), config.text_config.hidden_size
|
|
|
|
| 447 |
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
| 448 |
self.post_init()
|
| 449 |
|
| 450 |
+
def tie_weights(self):
|
| 451 |
+
return self.language_model.tie_weights()
|
| 452 |
|
| 453 |
def get_input_embeddings(self):
|
| 454 |
return self.language_model.get_input_embeddings()
|