peshk1n commited on
Commit
d0fc8f3
·
verified ·
1 Parent(s): 6520e8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -53
app.py CHANGED
@@ -599,77 +599,77 @@ def load_and_preprocess_image(img):
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
- # def generate_caption(image):
603
- # img_processed = load_and_preprocess_image(image)
604
- # img_processed = np.expand_dims(img_processed, axis=0)
605
 
606
- # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
607
- # cap_features = cap_features.astype(np.float32)
608
 
609
- # start_token_id = word_index[start_token]
610
- # end_token_id = word_index[end_token]
611
- # sequence = [start_token_id]
612
- # text_input = np.zeros((1, sentence_length - 1), dtype=np.float32)
613
 
614
- # for t in range(sentence_length - 1):
615
- # text_input[0, :len(sequence)] = sequence
616
 
617
- # _, logits = coca_model.decoder.predict(
618
- # [text_input, cap_features],
619
- # verbose=0
620
- # )
621
- # next_token = np.argmax(logits[0, t, :])
622
 
623
- # sequence.append(next_token)
624
- # if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
625
- # break
626
 
627
- # caption = " ".join(
628
- # [index_word[token] for token in sequence
629
- # if token not in {word_index[start_token], word_index[end_token]}]
630
- # )
631
 
632
- # return caption
633
 
634
 
635
def generate_caption_coca(image):
    """Caption *image* using beam search over the CoCa decoder.

    Keeps `beam_width` candidate sequences, scores each by a
    length-normalised mean log-probability, penalises repeated bigrams,
    and returns the best sequence as a space-joined word string with the
    start/end tokens stripped.
    """
    img_processed = load_and_preprocess_image(image)
    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)

    start_id = word_index[start_token]
    end_id = word_index[end_token]
    beams = [([start_id], 0.0)]

    for _ in range(max_length):
        candidates = []
        for seq, score in beams:
            # Finished hypotheses carry over unchanged.
            if seq[-1] == end_id:
                candidates.append((seq, score))
                continue

            text_input = np.zeros((1, max_length), dtype=np.int32)
            text_input[0, :len(seq)] = seq

            _, logits = coca_model.decoder.predict(
                [text_input, cap_features], verbose=0
            )
            step_logits = logits[0, len(seq) - 1, :]
            # Numerically stable softmax (subtract the max before exp).
            probs = np.exp(step_logits - np.max(step_logits))
            probs /= probs.sum()

            for token in np.argsort(-probs)[:beam_width]:
                extended = seq + [token]
                # Running mean of log-probs: normalises score by length so
                # longer captions are not unfairly penalised.
                mean_lp = (score * len(seq) + np.log(probs[token])) / (len(seq) + 1)
                # Flat penalty for any repeated bigram in the hypothesis.
                if has_repeated_ngrams(extended, n=2):
                    mean_lp -= 0.5
                candidates.append((extended, mean_lp))

        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:beam_width]
        # Stop early once every surviving beam has emitted the end token.
        if all(seq[-1] == end_id for seq, _ in beams):
            break

    best_seq, _ = max(beams, key=lambda b: b[1])
    return " ".join(
        index_word[i] for i in best_seq if i not in {start_id, end_id}
    )
673
 
674
 
675
  def generate_caption_rnn(image):
 
599
  return np.expand_dims(img, axis=0)
600
 
601
 
602
def generate_caption(image):
    """Caption *image* with greedy (argmax) decoding over the CoCa decoder.

    At each step the full partial sequence is fed to the decoder and the
    highest-logit token at the current position is appended, until the end
    token is produced or the maximum sentence length is reached. Returns the
    caption as a space-joined word string with start/end tokens stripped.
    """
    # load_and_preprocess_image already returns a batched array (it ends
    # with np.expand_dims(img, axis=0)), so expanding again here — as the
    # previous version did — adds a spurious extra batch dimension. The
    # beam-search variant passes img_processed straight to the encoder,
    # which confirms the input is already batched.
    img_processed = load_and_preprocess_image(image)

    _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
    cap_features = cap_features.astype(np.float32)

    start_token_id = word_index[start_token]
    end_token_id = word_index[end_token]
    sequence = [start_token_id]
    # Token ids are integers; use int32 (matching the beam-search variant)
    # rather than float32 for the decoder's token-id input.
    text_input = np.zeros((1, sentence_length - 1), dtype=np.int32)

    for t in range(sentence_length - 1):
        text_input[0, :len(sequence)] = sequence

        _, logits = coca_model.decoder.predict(
            [text_input, cap_features],
            verbose=0,
        )
        # Greedy choice at position t; cast from np.int64 to plain int.
        next_token = int(np.argmax(logits[0, t, :]))

        sequence.append(next_token)
        if next_token == end_token_id or len(sequence) >= (sentence_length - 1):
            break

    special = {start_token_id, end_token_id}
    return " ".join(index_word[token] for token in sequence if token not in special)
633
 
634
 
635
+ # def generate_caption_coca(image):
636
+ # img_processed = load_and_preprocess_image(image)
637
+ # _, cap_features = coca_model.encoder.predict(img_processed, verbose=0)
638
 
639
+ # beams = [([word_index[start_token]], 0.0)]
640
 
641
+ # for _ in range(max_length):
642
+ # new_beams = []
643
+ # for seq, log_prob in beams:
644
+ # if seq[-1] == word_index[end_token]:
645
+ # new_beams.append((seq, log_prob))
646
+ # continue
647
 
648
+ # text_input = np.zeros((1, max_length), dtype=np.int32)
649
+ # text_input[0, :len(seq)] = seq
650
 
651
+ # predictions = coca_model.decoder.predict([text_input, cap_features], verbose=0)
652
+ # _, logits = predictions
653
+ # logits = logits[0, len(seq)-1, :]
654
+ # probs = np.exp(logits - np.max(logits))
655
+ # probs /= probs.sum()
656
 
657
+ # top_k = np.argsort(-probs)[:beam_width]
658
+ # for token in top_k:
659
+ # new_seq = seq + [token]
660
+ # new_log_prob = (log_prob * len(seq) + np.log(probs[token])) / (len(seq) + 1)
661
 
662
+ # if has_repeated_ngrams(new_seq, n=2):
663
+ # new_log_prob -= 0.5
664
 
665
+ # new_beams.append((new_seq, new_log_prob))
666
 
667
+ # beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
668
+ # if all(beam[0][-1] == word_index[end_token] for beam in beams):
669
+ # break
670
 
671
+ # best_seq = max(beams, key=lambda x: x[1])[0]
672
+ # return " ".join(index_word[i] for i in best_seq if i not in {word_index[start_token], word_index[end_token]})
673
 
674
 
675
  def generate_caption_rnn(image):