peshk1n commited on
Commit
d0fc8f3
·
verified ·
1 Parent(s): 6520e8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -53
app.py CHANGED
@@ -599,77 +599,77 @@ def load_and_preprocess_image(img):
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
- # def generate_caption(image):
603
- # img_processed = load_and_preprocess_image(image)
604
- # img_processed = np.expand_dims(img_processed, axis=0)
605
 
606
- # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
607
- # cap_features = cap_features.astype(np.float32)
608
 
609
- # start_token_id = word_index[start_token]
610
- # end_token_id = word_index[end_token]
611
- # sequence = [start_token_id]
612
- # text_input = np.zeros((1, sentence_length - 1), dtype=np.float32)
613
 
614
- # for t in range(sentence_length - 1):
615
- # text_input[0, :len(sequence)] = sequence
616
 
617
- # _, logits = coca_model.decoder.predict(
618
- # [text_input, cap_features],
619
- # verbose=0
620
- # )
621
- # next_token = np.argmax(logits[0, t, :])
622
 
623
- # sequence.append(next_token)
624
- # if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
625
- # break
626
 
627
- # caption = " ".join(
628
- # [index_word[token] for token in sequence
629
- # if token not in {word_index[start_token], word_index[end_token]}]
630
- # )
631
 
632
- # return caption
633
 
634
 
635
def generate_caption_coca(image):
    """Caption *image* using beam search over the CoCa decoder.

    Keeps `beam_width` candidate sequences, scores each by a
    length-normalised mean log-probability, penalises repeated bigrams,
    and returns the best sequence as a space-joined word string with the
    start/end tokens stripped.
    """
    img_processed = load_and_preprocess_image(image)
    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)

    start_id = word_index[start_token]
    end_id = word_index[end_token]
    beams = [([start_id], 0.0)]

    for _ in range(max_length):
        candidates = []
        for seq, score in beams:
            # Finished hypotheses carry over unchanged.
            if seq[-1] == end_id:
                candidates.append((seq, score))
                continue

            text_input = np.zeros((1, max_length), dtype=np.int32)
            text_input[0, :len(seq)] = seq

            _, logits = coca_model.decoder.predict(
                [text_input, cap_features], verbose=0
            )
            step_logits = logits[0, len(seq) - 1, :]
            # Numerically stable softmax (subtract the max before exp).
            probs = np.exp(step_logits - np.max(step_logits))
            probs /= probs.sum()

            for token in np.argsort(-probs)[:beam_width]:
                extended = seq + [token]
                # Running mean of log-probs: normalises score by length so
                # longer captions are not unfairly penalised.
                mean_lp = (score * len(seq) + np.log(probs[token])) / (len(seq) + 1)
                # Flat penalty for any repeated bigram in the hypothesis.
                if has_repeated_ngrams(extended, n=2):
                    mean_lp -= 0.5
                candidates.append((extended, mean_lp))

        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:beam_width]
        # Stop early once every surviving beam has emitted the end token.
        if all(seq[-1] == end_id for seq, _ in beams):
            break

    best_seq, _ = max(beams, key=lambda b: b[1])
    return " ".join(
        index_word[i] for i in best_seq if i not in {start_id, end_id}
    )
673
 
674
 
675
  def generate_caption_rnn(image):
 
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
def generate_caption(image):
    """Caption *image* with greedy (argmax) decoding over the CoCa decoder.

    At each step the full partial sequence is fed to the decoder and the
    highest-logit token at the current position is appended, until the end
    token is produced or the maximum sentence length is reached. Returns the
    caption as a space-joined word string with start/end tokens stripped.
    """
    # load_and_preprocess_image already returns a batched array (it ends
    # with np.expand_dims(img, axis=0)), so expanding again here — as the
    # previous version did — adds a spurious extra batch dimension. The
    # beam-search variant passes img_processed straight to the encoder,
    # which confirms the input is already batched.
    img_processed = load_and_preprocess_image(image)

    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
    cap_features = cap_features.astype(np.float32)

    start_token_id = word_index[start_token]
    end_token_id = word_index[end_token]
    sequence = [start_token_id]
    # Token ids are integers; use int32 (matching the beam-search variant)
    # rather than float32 for the decoder's token-id input.
    text_input = np.zeros((1, sentence_length - 1), dtype=np.int32)

    for t in range(sentence_length - 1):
        text_input[0, :len(sequence)] = sequence

        _, logits = coca_model.decoder.predict(
            [text_input, cap_features],
            verbose=0,
        )
        # Greedy choice at position t; cast from np.int64 to plain int.
        next_token = int(np.argmax(logits[0, t, :]))

        sequence.append(next_token)
        if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
            break

    special = {start_token_id, end_token_id}
    return " ".join(index_word[token] for token in sequence if token not in special)
633
 
634
 
635
+ # def generate_caption_coca(image):
636
+ # img_processed = load_and_preprocess_image(image)
637
+ # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
638
 
639
+ # beams = [([word_index[start_token]], 0.0)]
640
 
641
+ # for _ in range(max_length):
642
+ # new_beams = []
643
+ # for seq, log_prob in beams:
644
+ # if seq[-1] == word_index[end_token]:
645
+ # new_beams.append((seq, log_prob))
646
+ # continue
647
 
648
+ # text_input = np.zeros((1, max_length), dtype=np.int32)
649
+ # text_input[0, :len(seq)] = seq
650
 
651
+ # predictions = coca_model.decoder.predict([text_input, cap_features], verbose=0)
652
+ # _, logits = predictions
653
+ # logits = logits[0, len(seq)-1, :]
654
+ # probs = np.exp(logits - np.max(logits))
655
+ # probs /= probs.sum()
656
 
657
+ # top_k = np.argsort(-probs)[:beam_width]
658
+ # for token in top_k:
659
+ # new_seq = seq + [token]
660
+ # new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
661
 
662
+ # if has_repeated_ngrams(new_seq, n=2):
663
+ # new_log_prob -= 0.5
664
 
665
+ # new_beams.append((new_seq, new_log_prob))
666
 
667
+ # beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
668
+ # if all(beam[0][-1] == word_index[end_token] for beam in beams):
669
+ # break
670
 
671
+ # best_seq = max(beams, key=lambda x: x[1])[0]
672
+ # return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
673
 
674
 
675
  def generate_caption_rnn(image):