peshk1n committed on
Commit
4e5aad1
·
verified ·
1 Parent(s): 1d133f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -111
app.py CHANGED
@@ -16,9 +16,8 @@ from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
16
  from tensorflow.keras.preprocessing import image
17
  from tensorflow.keras.models import Model
18
 
19
- os.environ["KERAS_BACKEND"] = "tensorflow"
20
 
21
- # Переменные ================================
22
  start_token = "[BOS]"
23
  end_token = "[EOS]"
24
  cls_token = "[CLS]"
@@ -43,17 +42,13 @@ attn_pool_dim = proj_dim
43
  attn_pool_heads = num_heads
44
  cap_query_num = 128
45
 
46
- #RNN
47
  rnn_embedding_dim = 256
48
  rnn_proj_dim = 512
49
 
50
- # =================================
51
 
52
- # Загрузка word_index
53
  with open('vocabs/word_index.json', 'r', encoding='utf-8') as f:
54
  word_index = {np.str_(word): np.int64(idx) for word, idx in json.load(f).items()}
55
 
56
- # Загрузка index_word
57
  with open('vocabs/index_word.json', 'r', encoding='utf-8') as f:
58
  index_word = {np.int64(idx): np.str_(word) for idx, word in json.load(f).items()}
59
 
@@ -81,7 +76,7 @@ class PositionalEmbedding(layers.Layer):
81
  return output
82
 
83
 
84
- class AttentionalPooling(tf.keras.layers.Layer):
85
  def __init__(self, embed_dim, num_heads=6):
86
  super().__init__()
87
  self.embed_dim = embed_dim
@@ -100,7 +95,7 @@ class AttentionalPooling(tf.keras.layers.Layer):
100
  return self.norm(attn_output)
101
 
102
 
103
- class TransformerBlock(tf.keras.layers.Layer):
104
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, is_multimodal=False, **kwargs):
105
  super().__init__(**kwargs)
106
  self.embed_dim = embed_dim
@@ -109,14 +104,12 @@ class TransformerBlock(tf.keras.layers.Layer):
109
  self.dropout_rate = dropout_rate
110
  self.ln_epsilon = ln_epsilon
111
 
112
- # Self-Attention
113
  self.self_attention = layers.MultiHeadAttention(
114
  num_heads=self.num_heads,
115
  key_dim=self.embed_dim,
116
  dropout=self.dropout_rate
117
  )
118
 
119
- # Cross-Attention
120
  if is_multimodal:
121
  self.norm2 = layers.LayerNormalization(epsilon=self.ln_epsilon)
122
  self.dropout2 = layers.Dropout(self.dropout_rate)
@@ -126,19 +119,15 @@ class TransformerBlock(tf.keras.layers.Layer):
126
  dropout=self.dropout_rate
127
  )
128
 
129
-
130
- # Feed-Forward Network
131
  self.dense_proj = tf.keras.Sequential([
132
  layers.Dense(self.dense_dim, activation="gelu"),
133
  layers.Dropout(self.dropout_rate),
134
  layers.Dense(self.embed_dim)
135
  ])
136
 
137
- # Layer Normalization
138
  self.norm1 = layers.LayerNormalization(epsilon=self.ln_epsilon)
139
  self.norm3 = layers.LayerNormalization(epsilon=self.ln_epsilon)
140
 
141
- # Dropout
142
  self.dropout1 = layers.Dropout(self.dropout_rate)
143
  self.dropout3 = layers.Dropout(self.dropout_rate)
144
 
@@ -148,11 +137,11 @@ class TransformerBlock(tf.keras.layers.Layer):
148
  causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len), tf.bool), -1, 0)
149
  return tf.expand_dims(causal_mask, 0)
150
 
151
-
152
  def get_combined_mask(self, causal_mask, padding_mask):
153
  padding_mask = tf.cast(padding_mask, tf.bool)
154
 
155
- padding_mask = tf.expand_dims(padding_mask, 1) # (B, 1, L)
156
  return causal_mask & padding_mask
157
 
158
 
@@ -161,31 +150,28 @@ class TransformerBlock(tf.keras.layers.Layer):
161
  if mask is not None:
162
  att_mask = self.get_combined_mask(att_mask, mask)
163
 
164
- # Self-Attention
165
  x = self.norm1(inputs)
166
  attention_output_1 = self.self_attention(
167
  query=x, key=x, value=x, attention_mask=att_mask
168
  )
169
  attention_output_1 = self.dropout1(attention_output_1)
170
- x = x + attention_output_1 # Add & Norm
171
-
172
- # Cross-Attention
173
  if encoder_outputs is not None:
174
  x_norm = self.norm2(x)
175
  attention_output_2 = self.cross_attention(
176
  query=x_norm, key=encoder_outputs, value=encoder_outputs
177
  )
178
  attention_output_2 = self.dropout2(attention_output_2)
179
- x = x + attention_output_2 # Add & Norm
180
 
181
- # Feed-Forward Network (FFN)
182
  x_norm = self.norm3(x)
183
  proj_output = self.dense_proj(x_norm)
184
  proj_output = self.dropout3(proj_output)
185
- return x + proj_output # Add & Norm
186
 
187
 
188
- class UnimodalTextDecoder(tf.keras.layers.Layer):
189
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, num_layers=4, **kwargs):
190
  super().__init__()
191
  self.embed_dim = embed_dim
@@ -201,15 +187,13 @@ class UnimodalTextDecoder(tf.keras.layers.Layer):
201
  ]
202
  self.norm = tf.keras.layers.LayerNormalization()
203
 
204
-
205
  def call(self, x, mask=None):
206
  for layer in self.layers:
207
  x = layer(inputs=x, mask=mask)
208
  return self.norm(x)
209
 
210
 
211
-
212
- class MultimodalTextDecoder(tf.keras.layers.Layer):
213
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, num_layers=4, **kwargs):
214
  super().__init__()
215
  self.embed_dim = embed_dim
@@ -225,7 +209,6 @@ class MultimodalTextDecoder(tf.keras.layers.Layer):
225
  ]
226
  self.norm = tf.keras.layers.LayerNormalization()
227
 
228
-
229
  def call(self, x, encoder_outputs, mask=None):
230
  for layer in self.layers:
231
  x = layer(inputs=x, encoder_outputs=encoder_outputs, mask=mask)
@@ -302,7 +285,6 @@ class CoCaEncoder(tf.keras.Model):
302
  name="cap_query"
303
  )
304
 
305
-
306
  def call(self, input, training=False):
307
  img_feature = self.vit(input).last_hidden_state
308
 
@@ -316,7 +298,6 @@ class CoCaEncoder(tf.keras.Model):
316
  return con_feature, cap_feature
317
 
318
 
319
-
320
  class CoCaDecoder(tf.keras.Model):
321
  def __init__(self,
322
  cls_token_id,
@@ -344,7 +325,6 @@ class CoCaDecoder(tf.keras.Model):
344
 
345
  self.norm = layers.LayerNormalization()
346
 
347
-
348
  def call(self, inputs, training=False):
349
  input_text, cap_feature = inputs
350
  batch_size = tf.shape(input_text)[0]
@@ -366,15 +346,12 @@ class CoCaDecoder(tf.keras.Model):
366
  return cls_token_feature, multimodal_logits
367
 
368
 
369
-
370
- # день 6
371
  class CoCaModel(tf.keras.Model):
372
  def __init__(self,
373
  vit,
374
  cls_token_id,
375
  num_heads,
376
  num_layers):
377
-
378
  super().__init__()
379
 
380
  self.encoder = CoCaEncoder(vit, name="coca_encoder")
@@ -384,34 +361,28 @@ class CoCaModel(tf.keras.Model):
384
  self.text_to_latents = EmbedToLatents(proj_dim)
385
 
386
  self.pad_id = 0
387
- self.temperature = 0.2 # 0.5 #0.9 #1.0
388
  self.caption_loss_weight = 1.0
389
  self.contrastive_loss_weight = 1.0
390
 
391
  self.perplexity = Perplexity()
392
 
393
-
394
  def call(self, inputs, training=False):
395
  image, text = inputs
396
-
397
  con_feature, cap_feature = self.encoder(image)
398
  cls_token_feature, multimodal_logits = self.decoder([text, cap_feature])
399
-
400
  return con_feature, cls_token_feature, multimodal_logits
401
 
402
-
403
  def compile(self, optimizer):
404
  super().compile()
405
  self.optimizer = optimizer
406
 
407
-
408
  def compute_caption_loss(self, multimodal_out, caption_target):
409
  caption_loss = tf.keras.losses.sparse_categorical_crossentropy(
410
  caption_target, multimodal_out, from_logits=True, ignore_class=self.pad_id)
411
 
412
  return tf.reduce_mean(caption_loss)
413
 
414
-
415
  def compute_contrastive_loss(self, con_feature, cls_feature):
416
  text_embeds = tf.squeeze(cls_feature, axis=1)
417
  image_embeds = tf.squeeze(con_feature, axis=1)
@@ -419,21 +390,17 @@ class CoCaModel(tf.keras.Model):
419
  text_latents = self.text_to_latents(text_embeds)
420
  image_latents = self.img_to_latents(image_embeds)
421
 
422
- # Матрица схожести
423
- sim = tf.matmul(text_latents, image_latents, transpose_b=True) / self.temperature # tf.exp(self.log_temp)
424
 
425
- # Метки
426
  batch_size = tf.shape(sim)[0]
427
  contrastive_labels = tf.range(batch_size)
428
 
429
- # Вычисление потерь
430
  loss1 = tf.keras.losses.sparse_categorical_crossentropy(contrastive_labels, sim, from_logits=True)
431
  loss2 = tf.keras.losses.sparse_categorical_crossentropy(contrastive_labels, tf.transpose(sim), from_logits=True)
432
  contrastive_loss = tf.reduce_mean((loss1 + loss2) * 0.5)
433
 
434
  return contrastive_loss
435
 
436
-
437
  def train_step(self, data):
438
  (images, caption_input), caption_target = data
439
 
@@ -457,7 +424,6 @@ class CoCaModel(tf.keras.Model):
457
  'perplexity': self.perplexity.result()
458
  }
459
 
460
-
461
  def test_step(self, data):
462
  (images, caption_input), caption_target = data
463
 
@@ -477,14 +443,10 @@ class CoCaModel(tf.keras.Model):
477
  'perplexity': self.perplexity.result()
478
  }
479
 
480
-
481
  def reset_metrics(self):
482
  self.perplexity.reset_state()
483
 
484
 
485
- # ===========================================
486
- # Загрузка весов для коки
487
-
488
  coca_model = CoCaModel(vit_tiny_model, cls_token_id=cls_token_id, num_heads=num_heads, num_layers=num_layers)
489
 
490
  dummy_features = tf.zeros((1, 3, img_size, img_size), dtype=tf.float32)
@@ -498,22 +460,19 @@ save_dir = "models/"
498
  model_name = "coca_007"
499
  coca_model.load_weights(f"{save_dir}/{model_name}.weights.h5")
500
 
501
- # ===========================================
502
- # RNN =======================================
503
  img_embed_dim = 2048
504
  reg_count = 7 * 7
505
 
506
  base_model = ResNet50(weights='imagenet', include_top=False)
507
  model = Model(inputs=base_model.input, outputs=base_model.output)
508
 
509
-
510
  def preprocess_image(img):
511
  img = tf.image.resize(img, (img_size, img_size))
512
  img = tf.convert_to_tensor(img)
513
  img = preprocess_input(img)
514
  return np.expand_dims(img, axis=0)
515
 
516
-
517
  def create_features(img):
518
  img = preprocess_image(img)
519
  features = model.predict(img, verbose=0)
@@ -539,7 +498,6 @@ class BahdanauAttention(layers.Layer):
539
  return context, alpha
540
 
541
 
542
-
543
  class ImageCaptioningModel(tf.keras.Model):
544
  def __init__(self, vocab_size, max_caption_len, embedding_dim=512, lstm_units=512, dropout_rate=0.5, **kwargs):
545
  super().__init__(**kwargs)
@@ -562,7 +520,6 @@ class ImageCaptioningModel(tf.keras.Model):
562
 
563
  self.concatenate = layers.Concatenate(axis=-1)
564
 
565
-
566
  def call(self, inputs):
567
  features, captions = inputs
568
 
@@ -588,7 +545,6 @@ class ImageCaptioningModel(tf.keras.Model):
588
  return self.fc(outputs)
589
 
590
 
591
-
592
  rnn_model = ImageCaptioningModel(vocab_size, sentence_length-1, rnn_embedding_dim, rnn_proj_dim)
593
  image_input = np.random.rand(batch_size, reg_count, img_embed_dim).astype(np.float32)
594
  text_input = np.random.randint(0, 10000, size=(batch_size, sentence_length))
@@ -605,9 +561,6 @@ model_name = "rnn_att_v4"
605
 
606
  rnn_model.load_weights(f"{save_dir}/{model_name}.weights.h5")
607
 
608
- # =====================================
609
- # Методы генерации
610
-
611
  beam_width=3
612
  max_length=sentence_length-1
613
  temperature=1.0
@@ -631,7 +584,6 @@ def has_repeated_ngrams(seq, n=2):
631
  return len(ngrams) != len(set(ngrams))
632
 
633
 
634
- # метод с улучшениями для коки
635
  def generate_caption_coca(image):
636
  img_processed = load_and_preprocess_image(image)
637
  _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
@@ -659,7 +611,6 @@ def generate_caption_coca(image):
659
  new_seq = seq + [token]
660
  new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
661
 
662
- # Штраф за повторения
663
  if has_repeated_ngrams(new_seq, n=2):
664
  new_log_prob -= 0.5
665
 
@@ -673,7 +624,6 @@ def generate_caption_coca(image):
673
  return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
674
 
675
 
676
- # метод с улучшениями для rnn
677
  def generate_caption_rnn(image):
678
  image_embedding = create_features(image)
679
  beams = [([word_index[start_token]], 0.0)]
@@ -698,7 +648,6 @@ def generate_caption_rnn(image):
698
  new_seq = seq + [token]
699
  new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
700
 
701
- # Штраф за повторения
702
  if has_repeated_ngrams(new_seq, n=2):
703
  new_log_prob -= 0.5
704
  new_beams.append((new_seq, new_log_prob))
@@ -717,25 +666,6 @@ def generate_both(image):
717
  return f"RNN: {caption1}\n\nCoCa: {caption2}"
718
 
719
 
720
- # interface = gr.Interface(
721
- # fn=generate_both,
722
- # inputs=gr.Image(type="pil", label="Изображение"),
723
- # outputs=gr.Textbox(label="Описания", autoscroll=True, show_copy_button=True),
724
- # title="Генератор описаний к изображениям",
725
- # allow_flagging="never",
726
- # submit_btn="Сгенерировать",
727
- # clear_btn="Очистить"
728
- # )
729
-
730
- #------------------------------
731
- css = """
732
- #hosted-by-hf {
733
- top: unset !important;
734
- bottom: 20px !important;
735
- right: 20px !important;
736
- }
737
- """
738
-
739
  interface = gr.Interface(
740
  fn=generate_both,
741
  inputs=gr.Image(type="pil", label="Изображение"),
@@ -750,33 +680,7 @@ with gr.Blocks(css=css) as demo:
750
  gr.Markdown("# 🖼️ Генератор описаний к изображениям")
751
  interface.render()
752
 
753
- # if __name__ == "__main__":
754
- # #interface.launch(ssr_mode=False)
755
- # demo.launch(ssr_mode=False)
756
-
757
-
758
- # custom_css = """
759
- # footer {visibility: hidden !important;}
760
- # .share-button {display: none !important;}
761
- # #component-1 {margin-top: -1.5rem !important;} # Уменьшаем отступ сверху
762
- # """
763
-
764
- # interface = gr.Interface(
765
- # fn=generate_both,
766
- # inputs=gr.Image(type="pil", label="Изображение"),
767
- # outputs=gr.Textbox(label="Описания", autoscroll=True, show_copy_button=True),
768
- # allow_flagging="never",
769
- # submit_btn="Сгенерировать",
770
- # clear_btn="Очистить"
771
- # )
772
-
773
- # with gr.Blocks(css=custom_css) as demo:
774
- # gr.Markdown("## 🖼️ Генератор описаний к изображениям")
775
- # interface.render()
776
 
777
  if __name__ == "__main__":
778
- demo.launch(
779
- ssr_mode=False,
780
- show_api=False
781
- )
782
 
 
16
  from tensorflow.keras.preprocessing import image
17
  from tensorflow.keras.models import Model
18
 
 
19
 
20
+ os.environ["KERAS_BACKEND"] = "tensorflow"
21
  start_token = "[BOS]"
22
  end_token = "[EOS]"
23
  cls_token = "[CLS]"
 
42
  attn_pool_heads = num_heads
43
  cap_query_num = 128
44
 
 
45
  rnn_embedding_dim = 256
46
  rnn_proj_dim = 512
47
 
 
48
 
 
49
  with open('vocabs/word_index.json', 'r', encoding='utf-8') as f:
50
  word_index = {np.str_(word): np.int64(idx) for word, idx in json.load(f).items()}
51
 
 
52
  with open('vocabs/index_word.json', 'r', encoding='utf-8') as f:
53
  index_word = {np.int64(idx): np.str_(word) for idx, word in json.load(f).items()}
54
 
 
76
  return output
77
 
78
 
79
+ class AttentionalPooling(layers.Layer):
80
  def __init__(self, embed_dim, num_heads=6):
81
  super().__init__()
82
  self.embed_dim = embed_dim
 
95
  return self.norm(attn_output)
96
 
97
 
98
+ class TransformerBlock(layers.Layer):
99
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, is_multimodal=False, **kwargs):
100
  super().__init__(**kwargs)
101
  self.embed_dim = embed_dim
 
104
  self.dropout_rate = dropout_rate
105
  self.ln_epsilon = ln_epsilon
106
 
 
107
  self.self_attention = layers.MultiHeadAttention(
108
  num_heads=self.num_heads,
109
  key_dim=self.embed_dim,
110
  dropout=self.dropout_rate
111
  )
112
 
 
113
  if is_multimodal:
114
  self.norm2 = layers.LayerNormalization(epsilon=self.ln_epsilon)
115
  self.dropout2 = layers.Dropout(self.dropout_rate)
 
119
  dropout=self.dropout_rate
120
  )
121
 
 
 
122
  self.dense_proj = tf.keras.Sequential([
123
  layers.Dense(self.dense_dim, activation="gelu"),
124
  layers.Dropout(self.dropout_rate),
125
  layers.Dense(self.embed_dim)
126
  ])
127
 
 
128
  self.norm1 = layers.LayerNormalization(epsilon=self.ln_epsilon)
129
  self.norm3 = layers.LayerNormalization(epsilon=self.ln_epsilon)
130
 
 
131
  self.dropout1 = layers.Dropout(self.dropout_rate)
132
  self.dropout3 = layers.Dropout(self.dropout_rate)
133
 
 
137
  causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len), tf.bool), -1, 0)
138
  return tf.expand_dims(causal_mask, 0)
139
 
140
+
141
  def get_combined_mask(self, causal_mask, padding_mask):
142
  padding_mask = tf.cast(padding_mask, tf.bool)
143
 
144
+ padding_mask = tf.expand_dims(padding_mask, 1)
145
  return causal_mask & padding_mask
146
 
147
 
 
150
  if mask is not None:
151
  att_mask = self.get_combined_mask(att_mask, mask)
152
 
 
153
  x = self.norm1(inputs)
154
  attention_output_1 = self.self_attention(
155
  query=x, key=x, value=x, attention_mask=att_mask
156
  )
157
  attention_output_1 = self.dropout1(attention_output_1)
158
+ x = x + attention_output_1
159
+
 
160
  if encoder_outputs is not None:
161
  x_norm = self.norm2(x)
162
  attention_output_2 = self.cross_attention(
163
  query=x_norm, key=encoder_outputs, value=encoder_outputs
164
  )
165
  attention_output_2 = self.dropout2(attention_output_2)
166
+ x = x + attention_output_2
167
 
 
168
  x_norm = self.norm3(x)
169
  proj_output = self.dense_proj(x_norm)
170
  proj_output = self.dropout3(proj_output)
171
+ return x + proj_output
172
 
173
 
174
+ class UnimodalTextDecoder(layers.Layer):
175
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, num_layers=4, **kwargs):
176
  super().__init__()
177
  self.embed_dim = embed_dim
 
187
  ]
188
  self.norm = tf.keras.layers.LayerNormalization()
189
 
 
190
  def call(self, x, mask=None):
191
  for layer in self.layers:
192
  x = layer(inputs=x, mask=mask)
193
  return self.norm(x)
194
 
195
 
196
+ class MultimodalTextDecoder(layers.Layer):
 
197
  def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1, ln_epsilon=1e-6, num_layers=4, **kwargs):
198
  super().__init__()
199
  self.embed_dim = embed_dim
 
209
  ]
210
  self.norm = tf.keras.layers.LayerNormalization()
211
 
 
212
  def call(self, x, encoder_outputs, mask=None):
213
  for layer in self.layers:
214
  x = layer(inputs=x, encoder_outputs=encoder_outputs, mask=mask)
 
285
  name="cap_query"
286
  )
287
 
 
288
  def call(self, input, training=False):
289
  img_feature = self.vit(input).last_hidden_state
290
 
 
298
  return con_feature, cap_feature
299
 
300
 
 
301
  class CoCaDecoder(tf.keras.Model):
302
  def __init__(self,
303
  cls_token_id,
 
325
 
326
  self.norm = layers.LayerNormalization()
327
 
 
328
  def call(self, inputs, training=False):
329
  input_text, cap_feature = inputs
330
  batch_size = tf.shape(input_text)[0]
 
346
  return cls_token_feature, multimodal_logits
347
 
348
 
 
 
349
  class CoCaModel(tf.keras.Model):
350
  def __init__(self,
351
  vit,
352
  cls_token_id,
353
  num_heads,
354
  num_layers):
 
355
  super().__init__()
356
 
357
  self.encoder = CoCaEncoder(vit, name="coca_encoder")
 
361
  self.text_to_latents = EmbedToLatents(proj_dim)
362
 
363
  self.pad_id = 0
364
+ self.temperature = 0.07
365
  self.caption_loss_weight = 1.0
366
  self.contrastive_loss_weight = 1.0
367
 
368
  self.perplexity = Perplexity()
369
 
 
370
  def call(self, inputs, training=False):
371
  image, text = inputs
 
372
  con_feature, cap_feature = self.encoder(image)
373
  cls_token_feature, multimodal_logits = self.decoder([text, cap_feature])
 
374
  return con_feature, cls_token_feature, multimodal_logits
375
 
 
376
  def compile(self, optimizer):
377
  super().compile()
378
  self.optimizer = optimizer
379
 
 
380
  def compute_caption_loss(self, multimodal_out, caption_target):
381
  caption_loss = tf.keras.losses.sparse_categorical_crossentropy(
382
  caption_target, multimodal_out, from_logits=True, ignore_class=self.pad_id)
383
 
384
  return tf.reduce_mean(caption_loss)
385
 
 
386
  def compute_contrastive_loss(self, con_feature, cls_feature):
387
  text_embeds = tf.squeeze(cls_feature, axis=1)
388
  image_embeds = tf.squeeze(con_feature, axis=1)
 
390
  text_latents = self.text_to_latents(text_embeds)
391
  image_latents = self.img_to_latents(image_embeds)
392
 
393
+ sim = tf.matmul(text_latents, image_latents, transpose_b=True) / self.temperature
 
394
 
 
395
  batch_size = tf.shape(sim)[0]
396
  contrastive_labels = tf.range(batch_size)
397
 
 
398
  loss1 = tf.keras.losses.sparse_categorical_crossentropy(contrastive_labels, sim, from_logits=True)
399
  loss2 = tf.keras.losses.sparse_categorical_crossentropy(contrastive_labels, tf.transpose(sim), from_logits=True)
400
  contrastive_loss = tf.reduce_mean((loss1 + loss2) * 0.5)
401
 
402
  return contrastive_loss
403
 
 
404
  def train_step(self, data):
405
  (images, caption_input), caption_target = data
406
 
 
424
  'perplexity': self.perplexity.result()
425
  }
426
 
 
427
  def test_step(self, data):
428
  (images, caption_input), caption_target = data
429
 
 
443
  'perplexity': self.perplexity.result()
444
  }
445
 
 
446
  def reset_metrics(self):
447
  self.perplexity.reset_state()
448
 
449
 
 
 
 
450
  coca_model = CoCaModel(vit_tiny_model, cls_token_id=cls_token_id, num_heads=num_heads, num_layers=num_layers)
451
 
452
  dummy_features = tf.zeros((1, 3, img_size, img_size), dtype=tf.float32)
 
460
  model_name = "coca_007"
461
  coca_model.load_weights(f"{save_dir}/{model_name}.weights.h5")
462
 
463
+
 
464
  img_embed_dim = 2048
465
  reg_count = 7 * 7
466
 
467
  base_model = ResNet50(weights='imagenet', include_top=False)
468
  model = Model(inputs=base_model.input, outputs=base_model.output)
469
 
 
470
  def preprocess_image(img):
471
  img = tf.image.resize(img, (img_size, img_size))
472
  img = tf.convert_to_tensor(img)
473
  img = preprocess_input(img)
474
  return np.expand_dims(img, axis=0)
475
 
 
476
  def create_features(img):
477
  img = preprocess_image(img)
478
  features = model.predict(img, verbose=0)
 
498
  return context, alpha
499
 
500
 
 
501
  class ImageCaptioningModel(tf.keras.Model):
502
  def __init__(self, vocab_size, max_caption_len, embedding_dim=512, lstm_units=512, dropout_rate=0.5, **kwargs):
503
  super().__init__(**kwargs)
 
520
 
521
  self.concatenate = layers.Concatenate(axis=-1)
522
 
 
523
  def call(self, inputs):
524
  features, captions = inputs
525
 
 
545
  return self.fc(outputs)
546
 
547
 
 
548
  rnn_model = ImageCaptioningModel(vocab_size, sentence_length-1, rnn_embedding_dim, rnn_proj_dim)
549
  image_input = np.random.rand(batch_size, reg_count, img_embed_dim).astype(np.float32)
550
  text_input = np.random.randint(0, 10000, size=(batch_size, sentence_length))
 
561
 
562
  rnn_model.load_weights(f"{save_dir}/{model_name}.weights.h5")
563
 
 
 
 
564
  beam_width=3
565
  max_length=sentence_length-1
566
  temperature=1.0
 
584
  return len(ngrams) != len(set(ngrams))
585
 
586
 
 
587
  def generate_caption_coca(image):
588
  img_processed = load_and_preprocess_image(image)
589
  _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
 
611
  new_seq = seq + [token]
612
  new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
613
 
 
614
  if has_repeated_ngrams(new_seq, n=2):
615
  new_log_prob -= 0.5
616
 
 
624
  return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
625
 
626
 
 
627
  def generate_caption_rnn(image):
628
  image_embedding = create_features(image)
629
  beams = [([word_index[start_token]], 0.0)]
 
648
  new_seq = seq + [token]
649
  new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
650
 
 
651
  if has_repeated_ngrams(new_seq, n=2):
652
  new_log_prob -= 0.5
653
  new_beams.append((new_seq, new_log_prob))
 
666
  return f"RNN: {caption1}\n\nCoCa: {caption2}"
667
 
668
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669
  interface = gr.Interface(
670
  fn=generate_both,
671
  inputs=gr.Image(type="pil", label="Изображение"),
 
680
  gr.Markdown("# 🖼️ Генератор описаний к изображениям")
681
  interface.render()
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
  if __name__ == "__main__":
685
+ demo.launch(ssr_mode=False, show_api=False)
 
 
 
686