Spaces:

slyviee
/

Img_captioning

Sleeping

App Files Files Community

slyviee committed on Aug 29, 2025

Commit

a25283a

verified ·

1 Parent(s): 237057f

Upload 4 files

Browse files

Files changed (4) hide show

README.md +41 -13
app.py +86 -0
flickr30k.py +207 -0
requirements.txt +16 -0

README.md CHANGED Viewed

@@ -1,13 +1,41 @@
----
-title: Img Captioning
-emoji: 📚
-colorFrom: gray
-colorTo: purple
-sdk: gradio
-sdk_version: 5.44.1
-app_file: app.py
-pinned: false
-short_description: Upload an image, return a caption
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Image Captioning — Hugging Face Space
+Triển khai inference cho mô hình image captioning dùng TensorFlow/Keras, EfficientNetV2B0 và giao diện Gradio.
+## Cấu trúc tệp cần có
+```text
+.
+├── app.py                      # UI Gradio cho Hugging Face Space
+├── flickr30k.py                # Logic model + tiền xử lý (đã cung cấp)
+├── best_model.keras            # Trọng số mô hình (đặt cùng thư mục)
+├── tokenizer.pkl               # Tokenizer đã fit
+├── model_config.pkl            # Chứa max_length, vocab_size
+├── requirements.txt
+└── README.md
+```
+Các hàm sử dụng trực tiếp từ `flickr30k.py`: `load_caption_model`, `load_tokenizer_and_config`, `load_feature_extractor`, `extract_features_from_image`, `generate_caption`.
+## Chạy cục bộ
+```bash
+python -m venv .venv
+. .venv/bin/activate  # Windows: .venv\Scripts\activate
+pip install --upgrade pip
+pip install -r requirements.txt
+# Đảm bảo 3 tệp đã có:
+# best_model.keras, tokenizer.pkl, model_config.pkl
+python app.py
+```
+Mở URL Gradio hiển thị trong terminal.
+## Triển khai lên Hugging Face Spaces
+1) Tạo Space mới: SDK = Gradio, chọn CPU hoặc GPU tùy trọng số.
+2) Đẩy các tệp: `app.py`, `flickr30k.py`, `requirements.txt`, `README.md`, và 3 tệp trọng số/cấu hình.
+3) Sau khi build hoàn tất, Space sẽ mở UI upload ảnh và trả caption.
+## Ghi chú tương thích
+- Mặc định dùng `tensorflow==2.12.0`. Nếu bạn dùng trọng số huấn luyện ở phiên bản khác, cần đồng bộ phiên bản TensorFlow/Keras tương ứng.
+- Sử dụng `opencv-python-headless` thay vì `opencv-python` để tránh lỗi GUI trên môi trường server.
+- Nếu thiếu tài nguyên trên Space Free, hạ kích thước mô hình hoặc chuyển phần cứng sang GPU trả phí.

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import tempfile
+import traceback
+from pathlib import Path
+import gradio as gr
+from PIL import Image
+# Import các hàm từ source gốc
+from flickr30k import (
+    load_caption_model,
+    load_tokenizer_and_config,
+    load_feature_extractor,
+    extract_features_from_image,
+    generate_caption,
+)
+# Global resources initialised once when the app starts.
+# Each artefact path can be overridden through an environment variable.
+MODEL_PATH = os.environ.get("CAP_MODEL_PATH", "best_model.keras")
+# NOTE(review): TOKENIZER_PATH and CONFIG_PATH are only used by _startup() for
+# the existence check; load_tokenizer_and_config() is called without path
+# arguments there, so overriding them does not redirect the actual load.
+TOKENIZER_PATH = os.environ.get("CAP_TOKENIZER_PATH", "tokenizer.pkl")
+CONFIG_PATH = os.environ.get("CAP_CONFIG_PATH", "model_config.pkl")
+# Populated by _startup(); they stay None if start-up fails.
+model = None
+tokenizer = None
+max_length = None
+vocab_size = None
+extractor = None
+# `ready` gates predict(); `startup_error` holds the message shown when not ready.
+ready = False
+startup_error = ""
+def _startup():
+    global model, tokenizer, max_length, vocab_size, extractor, ready, startup_error
+    try:
+        # Kiểm tra sự tồn tại của các tệp cần thiết
+        missing = [p for p in [MODEL_PATH, TOKENIZER_PATH, CONFIG_PATH] if not Path(p).exists()]
+        if missing:
+            startup_error = "Thiếu tệp: " + ", ".join(missing)
+            ready = False
+            return
+        model = load_caption_model(MODEL_PATH)
+        tokenizer, max_length, vocab_size = load_tokenizer_and_config()
+        extractor = load_feature_extractor()
+        ready = True
+    except Exception as e:
+        startup_error = f"Khởi tạo lỗi: {e}\n{traceback.format_exc()}"
+        ready = False
+_startup()
+def predict(pil_image: Image.Image):
+    if not ready:
+        return f"Hệ thống chưa sẵn sàng. {startup_error or 'Thiếu model/tokenizer/config.'}"
+    try:
+        # Lưu ảnh tạm để tái sử dụng hàm extract_features_from_image (đọc bằng cv2)
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+            pil_image.convert("RGB").save(tmp.name, format="JPEG")
+            tmp_path = tmp.name
+        features = extract_features_from_image(tmp_path, extractor)
+        os.unlink(tmp_path)
+        if features is None:
+            return "Không đọc được ảnh đầu vào."
+        caption = generate_caption(model, tokenizer, features, max_length)
+        return caption
+    except Exception as e:
+        return f"Lỗi suy luận: {e}"
+# Text passed to the interface as `description` below.
+DESCRIPTION = (
+    "Upload ảnh và nhận caption sinh ra bởi mô hình. "
+    "Cần có các tệp: best_model.keras, tokenizer.pkl, model_config.pkl."
+)
+# One image input (delivered to `predict` as a PIL image) and one text output.
+demo = gr.Interface(
+    fn=predict,
+    inputs=gr.Image(type="pil", label="Ảnh vào"),
+    outputs=gr.Textbox(label="Caption"),
+    title="Image Captioning — Gradio",
+    description=DESCRIPTION,
+    # NOTE(review): newer Gradio releases rename this option to
+    # `flagging_mode` — confirm against the Gradio version actually installed.
+    allow_flagging="never",
+)
+# Launch the server only when this file is executed directly.
+if __name__ == "__main__":
+    demo.launch()

flickr30k.py ADDED Viewed

	@@ -0,0 +1,207 @@

+import os
+import cv2
+import numpy as np
+import pickle
+from PIL import Image
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow.keras import layers
+from tensorflow.keras.models import load_model, Model
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
+from tensorflow.keras.applications import EfficientNetV2B0
+from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.image import img_to_array
+from tqdm import tqdm
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
+import random
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from PIL import Image
+import pickle
+# -----------------------------
+# Custom attention layers
+# -----------------------------
+class ChannelAttention(layers.Layer):
+    """Custom Keras layer that rescales the channels of its input.
+
+    The input is summarised with global average pooling and global max
+    pooling, both summaries go through the same two-layer MLP, the results are
+    added and squashed with a sigmoid, and the input is multiplied by that
+    gate.
+
+    The MLP output width is hard-coded to 1280.
+    # assumes inputs are (batch, steps, 1280) — TODO confirm against the model
+
+    NOTE(review): the sublayers are created in build() under fixed attribute
+    names; renaming or reordering them may break loading of already-saved
+    weights — confirm before refactoring.
+    """
+    def __init__(self, ratio=8, **kwargs):
+        super(ChannelAttention, self).__init__(**kwargs)
+        # Reduction factor: the MLP hidden width is 1280 // ratio.
+        self.ratio = ratio
+    def build(self, input_shape):
+        """Create the pooling layers, the shared MLP and the sigmoid gate."""
+        self.gap = layers.GlobalAveragePooling1D()
+        self.gmp = layers.GlobalMaxPooling1D()
+        # One MLP instance is applied to both pooled summaries (shared weights).
+        self.shared_mlp = tf.keras.Sequential([
+            layers.Dense(units=1280 // self.ratio, activation='relu'),
+            layers.Dense(units=1280)
+        ])
+        self.sigmoid = layers.Activation('sigmoid')
+        super(ChannelAttention, self).build(input_shape)
+    def call(self, inputs):
+        """Return `inputs` multiplied by the per-channel attention gate."""
+        gap = self.gap(inputs)
+        gmp = self.gmp(inputs)
+        gap_mlp = self.shared_mlp(gap)
+        gmp_mlp = self.shared_mlp(gmp)
+        channel_attention = self.sigmoid(gap_mlp + gmp_mlp)
+        # Re-insert axis 1 so the gate broadcasts against `inputs`.
+        return inputs * tf.expand_dims(channel_attention, axis=1)
+    def get_config(self):
+        """Include `ratio` so the layer can be rebuilt from a saved config."""
+        config = super(ChannelAttention, self).get_config()
+        config.update({'ratio': self.ratio})
+        return config
+    @classmethod
+    def from_config(cls, config):
+        """Rebuild the layer from the dict produced by get_config()."""
+        return cls(**config)
+class SpatialAttention(layers.Layer):
+    """Custom Keras layer that gates its input with a 1-filter Conv1D.
+
+    A Conv1D with one filter, kernel size 3, 'same' padding and a sigmoid
+    activation is applied to the input; the input is then multiplied by the
+    result.
+    # assumes inputs are rank-3 (batch, steps, channels) as Conv1D expects — TODO confirm
+
+    NOTE(review): the `conv` attribute name may be tied to saved weights —
+    confirm before renaming.
+    """
+    def __init__(self, **kwargs):
+        super(SpatialAttention, self).__init__(**kwargs)
+    def build(self, input_shape):
+        """Create the gating convolution."""
+        self.conv = layers.Conv1D(1, kernel_size=3, padding='same', activation='sigmoid')
+        super(SpatialAttention, self).build(input_shape)
+    def call(self, inputs):
+        """Return `inputs` multiplied by the convolutional gate."""
+        spatial_attention = self.conv(inputs)
+        # The gate has a single channel and broadcasts over the last axis.
+        return inputs * spatial_attention
+    def get_config(self):
+        """No extra hyper-parameters; defer to the base layer config."""
+        return super(SpatialAttention, self).get_config()
+    @classmethod
+    def from_config(cls, config):
+        """Rebuild the layer from the dict produced by get_config()."""
+        return cls(**config)
+# -----------------------------
+# Load model + tokenizer
+# -----------------------------
+def load_caption_model(model_path='best_model.keras'):
+    custom_objects = {
+        'ChannelAttention': ChannelAttention,
+        'SpatialAttention': SpatialAttention
+    }
+    model = load_model(model_path, custom_objects=custom_objects)
+    print("✅ Đã load model thành công!")
+    return model
+def load_tokenizer_and_config():
+    with open('tokenizer.pkl', 'rb') as f:
+        tokenizer = pickle.load(f)
+    with open('model_config.pkl', 'rb') as f:
+        config = pickle.load(f)
+    return tokenizer, config['max_length'], config['vocab_size']
+# -----------------------------
+# Feature extractor - EfficientNetV2B0
+# -----------------------------
+def load_feature_extractor():
+    base_model = EfficientNetV2B0(include_top=False, weights='imagenet', pooling='avg')
+    return Model(inputs=base_model.input, outputs=base_model.output)
+def extract_features_from_image(image_path, extractor):
+    image = cv2.imread(image_path)
+    if image is None:
+        print(f"❌ Không đọc được ảnh: {image_path}")
+        return None
+    image = cv2.resize(image, (224, 224))
+    image = img_to_array(image)
+    image = np.expand_dims(image, axis=0)
+    image = efficientnet_preprocess(image)
+    feature = extractor.predict(image, verbose=0)
+    return feature
+# -----------------------------
+# Generate caption
+# -----------------------------
+def generate_caption(model, tokenizer, image_features, max_length):
+    in_text = 'startseq'
+    for _ in range(max_length):
+        sequence = tokenizer.texts_to_sequences([in_text])[0]
+        sequence = pad_sequences([sequence], maxlen=max_length)
+        yhat = model.predict([image_features, sequence], verbose=0)
+        yhat = np.argmax(yhat)
+        word = tokenizer.index_word.get(yhat)
+        if word is None or word == 'endseq':
+            break
+        in_text += ' ' + word
+    return in_text.replace('startseq ', '')
+# -----------------------------
+# Hiển thị ảnh và caption
+# -----------------------------
+def display_caption(image_path, caption):
+    img = Image.open(image_path)
+    img = img.resize((1024, 768))  # Resize for better display
+    plt.imshow(img)
+    plt.axis('off')
+    plt.title(f"Caption: {caption}", fontsize=14, pad=10)
+    plt.show()
+# -----------------------------
+# Chạy test
+# -----------------------------
+# Manual smoke test: caption one local image and display it. Runs only when
+# this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    image_path = 'running.jpg'
+    # These names are bound at module level.
+    model = load_caption_model()
+    tokenizer, max_length, vocab_size = load_tokenizer_and_config()
+    extractor = load_feature_extractor()
+    features = extract_features_from_image(image_path, extractor)
+    # `features` is None when the image could not be read.
+    if features is not None:
+        caption = generate_caption(model, tokenizer, features, max_length)
+        print("Caption:", caption)
+        display_caption(image_path, caption)
+def evaluate_model(model, tokenizer, test_ids, captions, max_length, sample_size=500):
+    actual, predicted = [], []
+    test_subset = test_ids[:sample_size]
+    for image_id in tqdm(test_subset, desc="Evaluating"):
+        features = feature_extractor.extract_features(image_path, image_id)
+        if features is None:
+            continue
+        yhat = generate_caption(model, tokenizer, features, max_length)
+        references = [c.replace('startseq ', '').replace(' endseq', '') for c in captions[image_id]]
+        actual.append(references)
+        predicted.append(yhat)
+    bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0), smoothing_function=smoothie)
+    bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
+    bleu3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0), smoothing_function=smoothie)
+    bleu4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
+    print("\nModel Evaluation Results:")
+    print(f"BLEU-1: {bleu1:.4f}")
+    print(f"BLEU-2: {bleu2:.4f}")
+    print(f"BLEU-3: {bleu3:.4f}")
+    print(f"BLEU-4: {bleu4:.4f}")
+    return bleu1, bleu2, bleu3, bleu4
+print("\nEvaluating model on test set...")
+bleu_scores = evaluate_model(model, tokenizer, test_ids, captions, max_length)
+print("\nTraining and evaluation complete!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+# Core runtime
+tensorflow==2.12.0
+h5py>=3.8.0
+numpy>=1.22,<1.24  # tensorflow 2.12.0 requires numpy<1.24; ">=1.24.0" cannot be resolved
+pillow>=9.5.0
+opencv-python-headless==4.9.0.80
+matplotlib>=3.7.0
+# NLP + metrics
+nltk>=3.8.1
+# UI
+gradio>=4.40.0
+# Progress bars
+tqdm>=4.66.0