Update flickr30k.py
flickr30k.py +155 -207
flickr30k.py CHANGED
@@ -1,207 +1,155 @@
 import os
 import cv2
 import numpy as np
 import pickle
 from PIL import Image
 import matplotlib.pyplot as plt
 import tensorflow as tf
 from tensorflow.keras import layers
 from tensorflow.keras.models import load_model, Model
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
 from tensorflow.keras.applications import EfficientNetV2B0
 from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.image import img_to_array
 from tqdm import tqdm
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
 import random
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from PIL import Image
 import pickle
 
 
 
 # -----------------------------
 # Custom attention layers
 # -----------------------------
 
 class ChannelAttention(layers.Layer):
     def __init__(self, ratio=8, **kwargs):
         super(ChannelAttention, self).__init__(**kwargs)
         self.ratio = ratio
 
     def build(self, input_shape):
         self.gap = layers.GlobalAveragePooling1D()
         self.gmp = layers.GlobalMaxPooling1D()
         self.shared_mlp = tf.keras.Sequential([
             layers.Dense(units=1280 // self.ratio, activation='relu'),
             layers.Dense(units=1280)
         ])
         self.sigmoid = layers.Activation('sigmoid')
         super(ChannelAttention, self).build(input_shape)
 
     def call(self, inputs):
         gap = self.gap(inputs)
         gmp = self.gmp(inputs)
         gap_mlp = self.shared_mlp(gap)
         gmp_mlp = self.shared_mlp(gmp)
         channel_attention = self.sigmoid(gap_mlp + gmp_mlp)
         return inputs * tf.expand_dims(channel_attention, axis=1)
 
     def get_config(self):
         config = super(ChannelAttention, self).get_config()
         config.update({'ratio': self.ratio})
         return config
 
     @classmethod
     def from_config(cls, config):
         return cls(**config)
 
 
 
 class SpatialAttention(layers.Layer):
     def __init__(self, **kwargs):
         super(SpatialAttention, self).__init__(**kwargs)
 
     def build(self, input_shape):
         self.conv = layers.Conv1D(1, kernel_size=3, padding='same', activation='sigmoid')
         super(SpatialAttention, self).build(input_shape)
 
     def call(self, inputs):
         spatial_attention = self.conv(inputs)
         return inputs * spatial_attention
 
     def get_config(self):
         return super(SpatialAttention, self).get_config()
 
     @classmethod
     def from_config(cls, config):
         return cls(**config)
 
 
 
 # -----------------------------
 # Load model + tokenizer
 # -----------------------------
 
 def load_caption_model(model_path='best_model.keras'):
     custom_objects = {
         'ChannelAttention': ChannelAttention,
         'SpatialAttention': SpatialAttention
     }
     model = load_model(model_path, custom_objects=custom_objects)
     print("✅ Model loaded successfully!")
     return model
 
 
 def load_tokenizer_and_config():
     with open('tokenizer.pkl', 'rb') as f:
         tokenizer = pickle.load(f)
     with open('model_config.pkl', 'rb') as f:
         config = pickle.load(f)
     return tokenizer, config['max_length'], config['vocab_size']
 
 
 # -----------------------------
 # Feature extractor - EfficientNetV2B0
 # -----------------------------
 
 def load_feature_extractor():
     base_model = EfficientNetV2B0(include_top=False, weights='imagenet', pooling='avg')
     return Model(inputs=base_model.input, outputs=base_model.output)
 
 
 def extract_features_from_image(image_path, extractor):
     image = cv2.imread(image_path)
     if image is None:
         print(f"❌ Could not read image: {image_path}")
         return None
     image = cv2.resize(image, (224, 224))
     image = img_to_array(image)
     image = np.expand_dims(image, axis=0)
     image = efficientnet_preprocess(image)
     feature = extractor.predict(image, verbose=0)
     return feature
 
 
 # -----------------------------
 # Generate caption
 # -----------------------------
 
 def generate_caption(model, tokenizer, image_features, max_length):
     in_text = 'startseq'
     for _ in range(max_length):
         sequence = tokenizer.texts_to_sequences([in_text])[0]
         sequence = pad_sequences([sequence], maxlen=max_length)
         yhat = model.predict([image_features, sequence], verbose=0)
         yhat = np.argmax(yhat)
         word = tokenizer.index_word.get(yhat)
         if word is None or word == 'endseq':
             break
         in_text += ' ' + word
     return in_text.replace('startseq ', '')
 
 
 # -----------------------------
 # Display image and caption
 # -----------------------------
 
 def display_caption(image_path, caption):
     img = Image.open(image_path)
     img = img.resize((1024, 768))  # Resize for better display
     plt.imshow(img)
     plt.axis('off')
     plt.title(f"Caption: {caption}", fontsize=14, pad=10)
     plt.show()
-
-
-# -----------------------------
-# Run test
-# -----------------------------
-
-if __name__ == '__main__':
-    image_path = 'running.jpg'
-
-    model = load_caption_model()
-    tokenizer, max_length, vocab_size = load_tokenizer_and_config()
-    extractor = load_feature_extractor()
-
-    features = extract_features_from_image(image_path, extractor)
-    if features is not None:
-        caption = generate_caption(model, tokenizer, features, max_length)
-        print("Caption:", caption)
-        display_caption(image_path, caption)
-
-def evaluate_model(model, tokenizer, test_ids, captions, max_length, sample_size=500):
-    actual, predicted = [], []
-    test_subset = test_ids[:sample_size]
-
-    for image_id in tqdm(test_subset, desc="Evaluating"):
-        features = feature_extractor.extract_features(image_path, image_id)
-        if features is None:
-            continue
-
-        yhat = generate_caption(model, tokenizer, features, max_length)
-        references = [c.replace('startseq ', '').replace(' endseq', '') for c in captions[image_id]]
-
-        actual.append(references)
-        predicted.append(yhat)
-
-    bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0), smoothing_function=smoothie)
-    bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
-    bleu3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0), smoothing_function=smoothie)
-    bleu4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
-
-    print("\nModel Evaluation Results:")
-    print(f"BLEU-1: {bleu1:.4f}")
-    print(f"BLEU-2: {bleu2:.4f}")
-    print(f"BLEU-3: {bleu3:.4f}")
-    print(f"BLEU-4: {bleu4:.4f}")
-
-    return bleu1, bleu2, bleu3, bleu4
-
-
-
-print("\nEvaluating model on test set...")
-bleu_scores = evaluate_model(model, tokenizer, test_ids, captions, max_length)
-print("\nTraining and evaluation complete!")
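
The retained ChannelAttention / SpatialAttention pair implements a CBAM-style attention gate over (batch, steps, channels) tensors; note that the shared MLP hardcodes 1280 units, EfficientNetV2B0's pooled feature width, so the layers only accept 1280-channel inputs. A minimal smoke test, assuming the module is importable as flickr30k (an illustrative sketch, not part of the commit):

import tensorflow as tf
from flickr30k import ChannelAttention, SpatialAttention

# Dummy (batch, steps, channels) tensor; channels must be 1280 because the
# shared MLP inside ChannelAttention hardcodes that width.
x = tf.random.normal((2, 1, 1280))
out = SpatialAttention()(ChannelAttention(ratio=8)(x))
print(out.shape)  # (2, 1, 1280)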
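
Because the commit deletes the __main__ test driver, flickr30k.py no longer runs standalone. A caller mirroring the removed block would look roughly like this (a sketch assuming best_model.keras, tokenizer.pkl, model_config.pkl, and a test image such as the old running.jpg sit next to the script):

from flickr30k import (load_caption_model, load_tokenizer_and_config,
                       load_feature_extractor, extract_features_from_image,
                       generate_caption, display_caption)

model = load_caption_model('best_model.keras')
tokenizer, max_length, vocab_size = load_tokenizer_and_config()
extractor = load_feature_extractor()

features = extract_features_from_image('running.jpg', extractor)
if features is not None:
    caption = generate_caption(model, tokenizer, features, max_length)
    print("Caption:", caption)
    display_caption('running.jpg', caption)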
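
The deleted evaluate_model also carried three latent bugs: it called an undefined feature_extractor.extract_features(...), passed an undefined smoothie to corpus_bleu, and fed untokenized strings to corpus_bleu, which expects lists of tokens. A corrected sketch built on the module's own helpers; test_ids, captions (image id → list of 'startseq ... endseq' strings), and image_dir are hypothetical placeholder names:

import os
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm
from flickr30k import extract_features_from_image, generate_caption

def evaluate_model(model, tokenizer, extractor, test_ids, captions, image_dir,
                   max_length, sample_size=500):
    smoothie = SmoothingFunction().method1  # was undefined in the deleted code
    actual, predicted = [], []
    for image_id in tqdm(test_ids[:sample_size], desc="Evaluating"):
        features = extract_features_from_image(os.path.join(image_dir, image_id), extractor)
        if features is None:
            continue
        yhat = generate_caption(model, tokenizer, features, max_length)
        references = [c.replace('startseq ', '').replace(' endseq', '').split()
                      for c in captions[image_id]]
        actual.append(references)
        # corpus_bleu scores tokenized hypotheses, hence the split()
        predicted.append(yhat.split())
    weights = [(1.0, 0, 0, 0), (0.5, 0.5, 0, 0),
               (0.3, 0.3, 0.3, 0), (0.25, 0.25, 0.25, 0.25)]
    return [corpus_bleu(actual, predicted, weights=w, smoothing_function=smoothie)
            for w in weights]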