ccloud0525 committed on
Commit
b40a476
·
1 Parent(s): b11fb36

feat: "first commit"

Browse files
Files changed (2) hide show
  1. modality_connector.py +17 -28
  2. ts_generation_mixin.py +3 -14
modality_connector.py CHANGED
@@ -11,28 +11,22 @@ from .configuration_aurora import AuroraConfig
11
 
12
 
13
  class VisionEncoder(nn.Module):
 
14
  def __init__(self, config: AuroraConfig):
15
  super().__init__()
16
-
17
- base_dir = os.path.dirname(os.path.abspath(__file__))
18
- self.config_path = os.path.join(base_dir, "vit_config")
19
-
20
- self.processor = UnifiedImageProcessor(config, self.config_path)
21
-
22
- vit_config_file = os.path.join(self.config_path, "config.json")
23
-
24
- self.model = ViTModel(ViTConfig.from_json_file(vit_config_file))
25
-
26
  for param in self.model.parameters():
27
  param.requires_grad = False
28
-
29
  self.hidden_size = self.model.config.hidden_size
30
  self.output_dim = config.hidden_size
31
  self.num_distill = config.num_distill
32
 
33
  self.projection = nn.Linear(self.hidden_size, self.output_dim)
 
34
  self.target_vision_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
35
 
 
36
  self.cross_vision = nn.TransformerDecoder(
37
  nn.TransformerDecoderLayer(
38
  d_model=config.hidden_size,
@@ -74,16 +68,16 @@ class VisionEncoder(nn.Module):
74
 
75
 
76
  class UnifiedImageProcessor(nn.Module):
77
- def __init__(self, config: AuroraConfig, vit_config_path: str):
 
78
  super().__init__()
 
 
 
79
 
80
- self.config_path = vit_config_path
81
-
82
- processor_file = os.path.join(self.config_path, "preprocessor_config.json")
83
- self.vit_processor = ViTImageProcessor.from_json_file(processor_file)
84
-
85
- self.target_size = self.vit_processor.size["height"]
86
  self.pseudo_resizer = Resize((self.target_size, self.target_size))
 
87
  self.token_len = config.token_len
88
 
89
  def process_real_image(self, images):
@@ -113,7 +107,7 @@ class UnifiedImageProcessor(nn.Module):
113
  period = input_length
114
 
115
  padding_length = (period - (input_length %
116
- period)) % period
117
  x_pad = F.pad(x, (padding_length, 0))
118
  x_2d = einops.rearrange(x_pad, 'b (p f) -> b 1 f p', f=period)
119
 
@@ -130,25 +124,20 @@ class UnifiedImageProcessor(nn.Module):
130
 
131
 
132
  class TextEncoder(nn.Module):
 
133
  def __init__(self, config: AuroraConfig):
134
  super().__init__()
135
-
136
- base_dir = os.path.dirname(os.path.abspath(__file__))
137
- self.config_path = os.path.join(base_dir, "bert_config")
138
-
139
- bert_config_file = os.path.join(self.config_path, "config.json")
140
-
141
- self.model = BertModel(BertConfig.from_json_file(bert_config_file))
142
-
143
  for param in self.model.parameters():
144
  param.requires_grad = False
145
-
146
  self.hidden_size = self.model.config.hidden_size
147
  self.output_dim = config.hidden_size
148
  self.num_distill = config.num_distill
149
  self.max_length = 125
150
 
151
  self.projection = nn.Linear(self.hidden_size, self.output_dim)
 
 
152
  self.target_text_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
153
 
154
  self.cross_text = nn.TransformerDecoder(
 
11
 
12
 
13
  class VisionEncoder(nn.Module):
14
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'vit_config')
15
  def __init__(self, config: AuroraConfig):
16
  super().__init__()
17
+ self.processor = UnifiedImageProcessor(config)
18
+ self.model = ViTModel(ViTConfig.from_json_file(os.path.join(self.config_path, 'config.json')))
 
 
 
 
 
 
 
 
19
  for param in self.model.parameters():
20
  param.requires_grad = False
 
21
  self.hidden_size = self.model.config.hidden_size
22
  self.output_dim = config.hidden_size
23
  self.num_distill = config.num_distill
24
 
25
  self.projection = nn.Linear(self.hidden_size, self.output_dim)
26
+
27
  self.target_vision_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
28
 
29
+ # Cross-attention layer
30
  self.cross_vision = nn.TransformerDecoder(
31
  nn.TransformerDecoderLayer(
32
  d_model=config.hidden_size,
 
68
 
69
 
70
  class UnifiedImageProcessor(nn.Module):
71
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'vit_config')
72
+ def __init__(self, config: AuroraConfig):
73
  super().__init__()
74
+ # Load ViT preprocessor to get pretrained normalization parameters and target size
75
+ self.vit_processor = ViTImageProcessor.from_json_file(os.path.join(self.config_path, 'preprocessor_config.json'))
76
+ self.target_size = self.vit_processor.size["height"] # e.g., 224 (default ViT input size)
77
 
78
+ # Define resizer for pseudo-images (matches real image target size)
 
 
 
 
 
79
  self.pseudo_resizer = Resize((self.target_size, self.target_size))
80
+
81
  self.token_len = config.token_len
82
 
83
  def process_real_image(self, images):
 
107
  period = input_length
108
 
109
  padding_length = (period - (input_length %
110
+ period)) % period
111
  x_pad = F.pad(x, (padding_length, 0))
112
  x_2d = einops.rearrange(x_pad, 'b (p f) -> b 1 f p', f=period)
113
 
 
124
 
125
 
126
  class TextEncoder(nn.Module):
127
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bert_config')
128
  def __init__(self, config: AuroraConfig):
129
  super().__init__()
130
+ self.model = BertModel(BertConfig.from_json_file(os.path.join(self.config_path, 'config.json')))
 
 
 
 
 
 
 
131
  for param in self.model.parameters():
132
  param.requires_grad = False
 
133
  self.hidden_size = self.model.config.hidden_size
134
  self.output_dim = config.hidden_size
135
  self.num_distill = config.num_distill
136
  self.max_length = 125
137
 
138
  self.projection = nn.Linear(self.hidden_size, self.output_dim)
139
+
140
+ # Define learnable target tokens (shape: [num_distill_tokens, hidden_size])
141
  self.target_text_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
142
 
143
  self.cross_text = nn.TransformerDecoder(
ts_generation_mixin.py CHANGED
@@ -7,19 +7,9 @@ from transformers import GenerationMixin, LogitsProcessorList, StoppingCriteriaL
7
  from transformers.generation.utils import GenerationConfig, GenerateOutput
8
  from transformers.utils import ModelOutput
9
 
10
- class TSGenerationMixin(GenerationMixin):
11
- _tokenizer = None
12
-
13
- def _get_tokenizer(self):
14
- if self._tokenizer is None:
15
- base_dir = os.path.dirname(os.path.abspath(__file__))
16
- tokenizer_dir = os.path.join(base_dir, "bert_config")
17
 
18
- self._tokenizer = BertTokenizer.from_pretrained(
19
- tokenizer_dir,
20
- local_files_only=True
21
- )
22
- return self._tokenizer
23
 
24
  @torch.no_grad()
25
  def generate(
@@ -105,8 +95,7 @@ class TSGenerationMixin(GenerationMixin):
105
  }
106
 
107
  def _tokenize(self, texts, max_length):
108
- tokenizer = self._get_tokenizer()
109
- return tokenizer(
110
  texts,
111
  padding='max_length',
112
  truncation=True,
 
7
  from transformers.generation.utils import GenerationConfig, GenerateOutput
8
  from transformers.utils import ModelOutput
9
 
 
 
 
 
 
 
 
10
 
11
+ class TSGenerationMixin(GenerationMixin):
12
+ tokenizer = BertTokenizer.from_pretrained(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bert_config'), local_files_only=True)
 
 
 
13
 
14
  @torch.no_grad()
15
  def generate(
 
95
  }
96
 
97
  def _tokenize(self, texts, max_length):
98
+ return self.tokenizer(
 
99
  texts,
100
  padding='max_length',
101
  truncation=True,