peshk1n committed on
Commit
6520e8e
·
verified ·
1 Parent(s): 0d5fe01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -53
app.py CHANGED
@@ -599,77 +599,77 @@ def load_and_preprocess_image(img):
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
def generate_caption(image):
    """Generate a caption for *image* by greedy decoding with the CoCa model.

    Args:
        image: Raw input image; converted to a batched model input by
            ``load_and_preprocess_image``.

    Returns:
        str: The decoded caption as a space-joined string, with the
        start/end marker tokens stripped.
    """
    # load_and_preprocess_image already appends the batch dimension
    # (it returns np.expand_dims(img, axis=0)), so do NOT expand again —
    # the previous second np.expand_dims produced a (1, 1, ...) tensor.
    img_processed = load_and_preprocess_image(image)

    # Encoder returns a pair; only the caption features are needed here.
    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
    cap_features = cap_features.astype(np.float32)

    start_token_id = word_index[start_token]
    end_token_id = word_index[end_token]
    sequence = [start_token_id]
    # Token ids are integers — use an int dtype (the old float32 buffer
    # silently cast every id to float).
    text_input = np.zeros((1, sentence_length - 1), dtype=np.int32)

    for t in range(sentence_length - 1):
        # Left-align the tokens decoded so far; the rest stays zero-padded.
        text_input[0, :len(sequence)] = sequence

        _, logits = coca_model.decoder.predict(
            [text_input, cap_features],
            verbose=0
        )
        # Greedy decoding: most probable token at the current position t.
        next_token = int(np.argmax(logits[0, t, :]))

        sequence.append(next_token)
        if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
            break

    # Drop the start/end markers before joining into a sentence.
    caption = " ".join(
        index_word[token] for token in sequence
        if token not in {start_token_id, end_token_id}
    )

    return caption
633
 
634
 
635
- # def generate_caption_coca(image):
636
- # img_processed = load_and_preprocess_image(image)
637
- # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
638
 
639
- # beams = [([word_index[start_token]], 0.0)]
640
 
641
- # for _ in range(max_length):
642
- # new_beams = []
643
- # for seq, log_prob in beams:
644
- # if seq[-1] == word_index[end_token]:
645
- # new_beams.append((seq, log_prob))
646
- # continue
647
 
648
- # text_input = np.zeros((1, max_length), dtype=np.int32)
649
- # text_input[0, :len(seq)] = seq
650
 
651
- # predictions = coca_model.decoder.predict([text_input, cap_features], verbose=0)
652
- # _, logits = predictions
653
- # logits = logits[0, len(seq)-1, :] / temperature
654
- # probs = np.exp(logits - np.max(logits))
655
- # probs /= probs.sum()
656
 
657
- # top_k = np.argpartition(probs, -beam_width)[-beam_width:]
658
- # for token in top_k:
659
- # new_seq = seq + [token]
660
- # new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
661
 
662
- # if has_repeated_ngrams(new_seq, n=2):
663
- # new_log_prob -= 0.5
664
 
665
- # new_beams.append((new_seq, new_log_prob))
666
 
667
- # beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
668
- # if all(beam[0][-1] == word_index[end_token] for beam in beams):
669
- # break
670
 
671
- # best_seq = max(beams, key=lambda x: x[1])[0]
672
- # return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
673
 
674
 
675
  def generate_caption_rnn(image):
 
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
+ # def generate_caption(image):
603
+ # img_processed = load_and_preprocess_image(image)
604
+ # img_processed = np.expand_dims(img_processed, axis=0)
605
 
606
+ # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
607
+ # cap_features = cap_features.astype(np.float32)
608
 
609
+ # start_token_id = word_index[start_token]
610
+ # end_token_id = word_index[end_token]
611
+ # sequence = [start_token_id]
612
+ # text_input = np.zeros((1, sentence_length - 1), dtype=np.float32)
613
 
614
+ # for t in range(sentence_length - 1):
615
+ # text_input[0, :len(sequence)] = sequence
616
 
617
+ # _, logits = coca_model.decoder.predict(
618
+ # [text_input, cap_features],
619
+ # verbose=0
620
+ # )
621
+ # next_token = np.argmax(logits[0, t, :])
622
 
623
+ # sequence.append(next_token)
624
+ # if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
625
+ # break
626
 
627
+ # caption = " ".join(
628
+ # [index_word[token] for token in sequence
629
+ # if token not in {word_index[start_token], word_index[end_token]}]
630
+ # )
631
 
632
+ # return caption
633
 
634
 
635
def generate_caption_coca(image):
    """Caption *image* with the CoCa model using length-normalised beam search.

    The search keeps up to ``beam_width`` candidate sequences, scores each by
    the running mean of its token log-probabilities, and applies a fixed
    penalty to sequences containing repeated bigrams. The start/end marker
    tokens are stripped from the returned sentence.

    Args:
        image: Raw input image; preprocessed by ``load_and_preprocess_image``.

    Returns:
        str: The highest-scoring caption as a space-joined string.
    """
    features = coca_model.encoder.predict(
        load_and_preprocess_image(image), verbose=0
    )[1]

    start_id = word_index[start_token]
    end_id = word_index[end_token]
    # Each beam is (token sequence, length-normalised log-probability).
    beams = [([start_id], 0.0)]

    for _ in range(max_length):
        candidates = []
        for tokens, score in beams:
            # Finished beams carry over unchanged and keep competing.
            if tokens[-1] == end_id:
                candidates.append((tokens, score))
                continue

            # Left-aligned, zero-padded token buffer for the decoder.
            padded = np.zeros((1, max_length), dtype=np.int32)
            padded[0, :len(tokens)] = tokens

            step_logits = coca_model.decoder.predict(
                [padded, features], verbose=0
            )[1][0, len(tokens) - 1, :]
            # Numerically stable softmax over the vocabulary.
            probs = np.exp(step_logits - step_logits.max())
            probs = probs / probs.sum()

            # Expand with the beam_width most probable next tokens.
            for tok in np.argsort(-probs)[:beam_width]:
                extended = tokens + [tok]
                # Running mean of log-probs (length normalisation).
                extended_score = (
                    score * len(tokens) + np.log(probs[tok])
                ) / (len(tokens) + 1)
                # Discourage degenerate repetition.
                if has_repeated_ngrams(extended, n=2):
                    extended_score -= 0.5
                candidates.append((extended, extended_score))

        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:beam_width]
        # Stop early once every surviving beam has emitted the end token.
        if all(b[0][-1] == end_id for b in beams):
            break

    winner = max(beams, key=lambda b: b[1])[0]
    return " ".join(
        index_word[t] for t in winner if t not in {start_id, end_id}
    )
673
 
674
 
675
  def generate_caption_rnn(image):