peshk1n committed on
Commit
6520e8e
·
verified ·
1 Parent(s): 0d5fe01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -53
app.py CHANGED
@@ -599,77 +599,77 @@ def load_and_preprocess_image(img):
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
def generate_caption(image):
    """Generate a caption for *image* by greedy decoding with the CoCa model.

    Args:
        image: Raw input image; converted to a batched model input by
            ``load_and_preprocess_image``.

    Returns:
        str: The decoded caption as a space-joined string, with the
        start/end marker tokens stripped.
    """
    # load_and_preprocess_image already appends the batch dimension
    # (it returns np.expand_dims(img, axis=0)), so do NOT expand again —
    # the previous second np.expand_dims produced a (1, 1, ...) tensor.
    img_processed = load_and_preprocess_image(image)

    # Encoder returns a pair; only the caption features are needed here.
    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
    cap_features = cap_features.astype(np.float32)

    start_token_id = word_index[start_token]
    end_token_id = word_index[end_token]
    sequence = [start_token_id]
    # Token ids are integers — use an int dtype (the old float32 buffer
    # silently cast every id to float).
    text_input = np.zeros((1, sentence_length - 1), dtype=np.int32)

    for t in range(sentence_length - 1):
        # Left-align the tokens decoded so far; the rest stays zero-padded.
        text_input[0, :len(sequence)] = sequence

        _, logits = coca_model.decoder.predict(
            [text_input, cap_features],
            verbose=0
        )
        # Greedy decoding: most probable token at the current position t.
        next_token = int(np.argmax(logits[0, t, :]))

        sequence.append(next_token)
        if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
            break

    # Drop the start/end markers before joining into a sentence.
    caption = " ".join(
        index_word[token] for token in sequence
        if token not in {start_token_id, end_token_id}
    )

    return caption
633
 
634
 
635
- # def generate_caption_coca(image):
636
- # img_processed = load_and_preprocess_image(image)
637
- # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
638
 
639
- # beams = [([word_index[start_token]], 0.0)]
640
 
641
- # for _ in range(max_length):
642
- # new_beams = []
643
- # for seq, log_prob in beams:
644
- # if seq[-1] == word_index[end_token]:
645
- # new_beams.append((seq, log_prob))
646
- # continue
647
 
648
- # text_input = np.zeros((1, max_length), dtype=np.int32)
649
- # text_input[0, :len(seq)] = seq
650
 
651
- # predictions = coca_model.decoder.predict([text_input, cap_features], verbose=0)
652
- # _, logits = predictions
653
- # logits = logits[0, len(seq)-1, :] / temperature
654
- # probs = np.exp(logits - np.max(logits))
655
- # probs /= probs.sum()
656
 
657
- # top_k = np.argpartition(probs, -beam_width)[-beam_width:]
658
- # for token in top_k:
659
- # new_seq = seq + [token]
660
- # new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
661
 
662
- # if has_repeated_ngrams(new_seq, n=2):
663
- # new_log_prob -= 0.5
664
 
665
- # new_beams.append((new_seq, new_log_prob))
666
 
667
- # beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
668
- # if all(beam[0][-1] == word_index[end_token] for beam in beams):
669
- # break
670
 
671
- # best_seq = max(beams, key=lambda x: x[1])[0]
672
- # return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
673
 
674
 
675
  def generate_caption_rnn(image):
 
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
+ # def generate_caption(image):
603
+ # img_processed = load_and_preprocess_image(image)
604
+ # img_processed = np.expand_dims(img_processed, axis=0)
605
 
606
+ # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
607
+ # cap_features = cap_features.astype(np.float32)
608
 
609
+ # start_token_id = word_index[start_token]
610
+ # end_token_id = word_index[end_token]
611
+ # sequence = [start_token_id]
612
+ # text_input = np.zeros((1, sentence_length - 1), dtype=np.float32)
613
 
614
+ # for t in range(sentence_length - 1):
615
+ # text_input[0, :len(sequence)] = sequence
616
 
617
+ # _, logits = coca_model.decoder.predict(
618
+ # [text_input, cap_features],
619
+ # verbose=0
620
+ # )
621
+ # next_token = np.argmax(logits[0, t, :])
622
 
623
+ # sequence.append(next_token)
624
+ # if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
625
+ # break
626
 
627
+ # caption = " ".join(
628
+ # [index_word[token] for token in sequence
629
+ # if token not in {word_index[start_token], word_index[end_token]}]
630
+ # )
631
 
632
+ # return caption
633
 
634
 
635
def generate_caption_coca(image):
    """Caption *image* with the CoCa model using length-normalised beam search.

    The search keeps up to ``beam_width`` candidate sequences, scores each by
    the running mean of its token log-probabilities, and applies a fixed
    penalty to sequences containing repeated bigrams. The start/end marker
    tokens are stripped from the returned sentence.

    Args:
        image: Raw input image; preprocessed by ``load_and_preprocess_image``.

    Returns:
        str: The highest-scoring caption as a space-joined string.
    """
    features = coca_model.encoder.predict(
        load_and_preprocess_image(image), verbose=0
    )[1]

    start_id = word_index[start_token]
    end_id = word_index[end_token]
    # Each beam is (token sequence, length-normalised log-probability).
    beams = [([start_id], 0.0)]

    for _ in range(max_length):
        candidates = []
        for tokens, score in beams:
            # Finished beams carry over unchanged and keep competing.
            if tokens[-1] == end_id:
                candidates.append((tokens, score))
                continue

            # Left-aligned, zero-padded token buffer for the decoder.
            padded = np.zeros((1, max_length), dtype=np.int32)
            padded[0, :len(tokens)] = tokens

            step_logits = coca_model.decoder.predict(
                [padded, features], verbose=0
            )[1][0, len(tokens) - 1, :]
            # Numerically stable softmax over the vocabulary.
            probs = np.exp(step_logits - step_logits.max())
            probs = probs / probs.sum()

            # Expand with the beam_width most probable next tokens.
            for tok in np.argsort(-probs)[:beam_width]:
                extended = tokens + [tok]
                # Running mean of log-probs (length normalisation).
                extended_score = (
                    score * len(tokens) + np.log(probs[tok])
                ) / (len(tokens) + 1)
                # Discourage degenerate repetition.
                if has_repeated_ngrams(extended, n=2):
                    extended_score -= 0.5
                candidates.append((extended, extended_score))

        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:beam_width]
        # Stop early once every surviving beam has emitted the end token.
        if all(b[0][-1] == end_id for b in beams):
            break

    winner = max(beams, key=lambda b: b[1])[0]
    return " ".join(
        index_word[t] for t in winner if t not in {start_id, end_id}
    )
673
 
674
 
675
  def generate_caption_rnn(image):