slyviee committed on
Commit
a25283a
·
verified ·
1 Parent(s): 237057f

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +41 -13
  2. app.py +86 -0
  3. flickr30k.py +207 -0
  4. requirements.txt +16 -0
README.md CHANGED
@@ -1,13 +1,41 @@
1
- ---
2
- title: Img Captioning
3
- emoji: 📚
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: Upload an image, return a caption
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Image Captioning — Hugging Face Space
2
+
3
+ Triển khai inference cho mô hình image captioning dùng TensorFlow/Keras, EfficientNetV2B0 và giao diện Gradio.
4
+
5
+ ## Cấu trúc tệp cần có
6
+ ```text
7
+ .
8
+ ├── app.py # UI Gradio cho Hugging Face Space
9
+ ├── flickr30k.py # Logic model + tiền xử lý (đã cung cấp)
10
+ ├── best_model.keras # Trọng số mô hình (đặt cùng thư mục)
11
+ ├── tokenizer.pkl # Tokenizer đã fit
12
+ ├── model_config.pkl # Chứa max_length, vocab_size
13
+ ├── requirements.txt
14
+ └── README.md
15
+ ```
16
+
17
+ Các hàm sử dụng trực tiếp từ `flickr30k.py`: `load_caption_model`, `load_tokenizer_and_config`, `load_feature_extractor`, `extract_features_from_image`, `generate_caption`.
18
+
19
+ ## Chạy cục bộ
20
+ ```bash
21
+ python -m venv .venv
22
+ . .venv/bin/activate # Windows: .venv\Scripts\activate
23
+ pip install --upgrade pip
24
+ pip install -r requirements.txt
25
+
26
+ # Đảm bảo 3 tệp đã có:
27
+ # best_model.keras, tokenizer.pkl, model_config.pkl
28
+
29
+ python app.py
30
+ ```
31
+ Mở URL Gradio hiển thị trong terminal.
32
+
33
+ ## Triển khai lên Hugging Face Spaces
34
+ 1) Tạo Space mới: SDK = Gradio, chọn CPU hoặc GPU tùy trọng số.
35
+ 2) Đẩy các tệp: `app.py`, `flickr30k.py`, `requirements.txt`, `README.md`, và 3 tệp trọng số/cấu hình.
36
+ 3) Sau khi build hoàn tất, Space sẽ mở UI upload ảnh và trả caption.
37
+
38
+ ## Ghi chú tương thích
39
+ - Mặc định dùng `tensorflow==2.12.0`. Nếu bạn dùng trọng số huấn luyện ở phiên bản khác, cần đồng bộ phiên bản TensorFlow/Keras tương ứng.
40
+ - Sử dụng `opencv-python-headless` thay vì `opencv-python` để tránh lỗi GUI trên môi trường server.
41
+ - Nếu thiếu tài nguyên trên Space Free, hạ kích thước mô hình hoặc chuyển phần cứng sang GPU trả phí.
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import traceback
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ from PIL import Image
8
+
9
+ # Import các hàm từ source gốc
10
+ from flickr30k import (
11
+ load_caption_model,
12
+ load_tokenizer_and_config,
13
+ load_feature_extractor,
14
+ extract_features_from_image,
15
+ generate_caption,
16
+ )
17
+
18
+ # Khởi tạo tài nguyên toàn cục khi app start
19
# Artifact locations; each can be overridden through an environment variable.
MODEL_PATH = os.environ.get("CAP_MODEL_PATH", "best_model.keras")
TOKENIZER_PATH = os.environ.get("CAP_TOKENIZER_PATH", "tokenizer.pkl")
CONFIG_PATH = os.environ.get("CAP_CONFIG_PATH", "model_config.pkl")

# Shared inference state, populated once by _startup().
model = None
tokenizer = None
max_length = None
vocab_size = None
extractor = None
ready = False
startup_error = ""


def _startup():
    """Load the model, tokenizer/config and feature extractor into module globals.

    On success ``ready`` becomes True. If a required file is missing or any
    loader raises, ``ready`` stays False and ``startup_error`` holds the
    message (including the traceback for unexpected exceptions) so the UI
    can report it instead of crashing.
    """
    global model, tokenizer, max_length, vocab_size, extractor, ready, startup_error
    try:
        required = (MODEL_PATH, TOKENIZER_PATH, CONFIG_PATH)
        missing = [path for path in required if not Path(path).exists()]
        if not missing:
            model = load_caption_model(MODEL_PATH)
            # NOTE(review): this loader is called without arguments, so
            # TOKENIZER_PATH / CONFIG_PATH are only used for the existence
            # check above, not for the actual load.
            tokenizer, max_length, vocab_size = load_tokenizer_and_config()
            extractor = load_feature_extractor()
            ready = True
            return

        startup_error = "Thiếu tệp: " + ", ".join(missing)
        ready = False
    except Exception as exc:
        startup_error = f"Khởi tạo lỗi: {exc}\n{traceback.format_exc()}"
        ready = False


_startup()
50
+
51
def predict(pil_image: Image.Image):
    """Return a caption for ``pil_image``, or a user-facing error message.

    The image is written to a temporary JPEG because
    ``extract_features_from_image`` reads its input from a file path.

    Args:
        pil_image: Image supplied by the Gradio ``Image`` component; may be
            ``None`` when the user submits without uploading anything.

    Returns:
        The generated caption, or a message describing why no caption could
        be produced. This function never raises.
    """
    if not ready:
        return f"Hệ thống chưa sẵn sàng. {startup_error or 'Thiếu model/tokenizer/config.'}"

    # No upload: answer with the existing "cannot read input" message rather
    # than surfacing an AttributeError through the generic handler below.
    if pil_image is None:
        return "Không đọc được ảnh đầu vào."

    tmp_path = None
    try:
        # mkstemp + close: the file must not be held open while PIL writes to
        # it by name (re-opening an open temp file fails on Windows).
        fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
        os.close(fd)
        pil_image.convert("RGB").save(tmp_path, format="JPEG")

        features = extract_features_from_image(tmp_path, extractor)
        if features is None:
            return "Không đọc được ảnh đầu vào."
        return generate_caption(model, tokenizer, features, max_length)
    except Exception as e:
        return f"Lỗi suy luận: {e}"
    finally:
        # Always remove the temp file, including when extraction or
        # captioning raised; a failed delete must not mask the result.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
70
+
71
# Text shown under the title of the Gradio page.
DESCRIPTION = (
    "Upload ảnh và nhận caption sinh ra bởi mô hình. Cần có các tệp: "
    "best_model.keras, tokenizer.pkl, model_config.pkl."
)

# Single-image in, single-text out interface around predict().
_interface_options = {
    "fn": predict,
    "inputs": gr.Image(type="pil", label="Ảnh vào"),
    "outputs": gr.Textbox(label="Caption"),
    "title": "Image Captioning — Gradio",
    "description": DESCRIPTION,
    "allow_flagging": "never",
}
demo = gr.Interface(**_interface_options)

if __name__ == "__main__":
    demo.launch()
flickr30k.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import pickle
5
+ from PIL import Image
6
+ import matplotlib.pyplot as plt
7
+ import tensorflow as tf
8
+ from tensorflow.keras import layers
9
+ from tensorflow.keras.models import load_model, Model
10
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
11
+ from tensorflow.keras.applications import EfficientNetV2B0
12
+ from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
13
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
14
+ from tensorflow.keras.preprocessing.image import img_to_array
15
+ from tqdm import tqdm
16
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
17
+ import random
18
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
19
+ from PIL import Image
20
+ import pickle
21
+
22
+
23
+
24
+ # -----------------------------
25
+ # Custom attention layers
26
+ # -----------------------------
27
+
28
class ChannelAttention(layers.Layer):
    """Channel-attention gate for sequence-shaped feature tensors.

    The input is pooled along axis 1 with both global average and global max
    pooling, each pooled vector goes through the same two-layer MLP, and the
    sigmoid of their sum rescales the input channel-wise.

    Args:
        ratio: Reduction factor for the hidden layer of the shared MLP.
    """

    # Channel count used when the input shape does not expose a static last
    # dimension; matches the value that used to be hard-coded.
    _DEFAULT_CHANNELS = 1280

    def __init__(self, ratio=8, **kwargs):
        super().__init__(**kwargs)
        self.ratio = ratio

    def build(self, input_shape):
        # Size the MLP from the actual input instead of assuming 1280
        # channels, so the layer also works on other feature widths.
        last_dim = input_shape[-1]
        channels = int(last_dim) if last_dim is not None else self._DEFAULT_CHANNELS

        self.gap = layers.GlobalAveragePooling1D()
        self.gmp = layers.GlobalMaxPooling1D()
        self.shared_mlp = tf.keras.Sequential([
            layers.Dense(units=max(channels // self.ratio, 1), activation='relu'),
            layers.Dense(units=channels),
        ])
        self.sigmoid = layers.Activation('sigmoid')
        super().build(input_shape)

    def call(self, inputs):
        avg_branch = self.shared_mlp(self.gap(inputs))
        max_branch = self.shared_mlp(self.gmp(inputs))
        channel_attention = self.sigmoid(avg_branch + max_branch)
        # Re-insert axis 1 so the per-channel weights broadcast over it.
        return inputs * tf.expand_dims(channel_attention, axis=1)

    def get_config(self):
        config = super().get_config()
        config.update({'ratio': self.ratio})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)
59
+
60
+
61
+
62
class SpatialAttention(layers.Layer):
    """Position-wise attention gate.

    A single-filter ``Conv1D`` with sigmoid activation produces one weight
    per position, and the input is multiplied by that weight map.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.conv = layers.Conv1D(
            1,
            kernel_size=3,
            padding='same',
            activation='sigmoid',
        )
        super().build(input_shape)

    def call(self, inputs):
        # The conv output has one channel and broadcasts across the input's
        # channels in the multiplication.
        return inputs * self.conv(inputs)

    def get_config(self):
        base_config = super().get_config()
        return base_config

    @classmethod
    def from_config(cls, config):
        return cls(**config)
80
+
81
+
82
+
83
+ # -----------------------------
84
+ # Load model + tokenizer
85
+ # -----------------------------
86
+
87
def load_caption_model(model_path='best_model.keras'):
    """Load the saved captioning model from ``model_path``.

    The custom attention layers are registered so Keras can rebuild them
    while deserialising. Prints a confirmation message and returns the model.
    """
    attention_layers = {
        layer_cls.__name__: layer_cls
        for layer_cls in (ChannelAttention, SpatialAttention)
    }
    caption_model = load_model(model_path, custom_objects=attention_layers)
    print("✅ Đã load model thành công!")
    return caption_model
95
+
96
+
97
def load_tokenizer_and_config(tokenizer_path='tokenizer.pkl', config_path='model_config.pkl'):
    """Load the pickled tokenizer and model configuration.

    Args:
        tokenizer_path: Path to the pickled tokenizer. Defaults to the file
            name used previously, so calls without arguments are unchanged.
        config_path: Path to the pickled config mapping, which must contain
            the keys ``'max_length'`` and ``'vocab_size'``.

    Returns:
        A ``(tokenizer, max_length, vocab_size)`` tuple.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the config lacks ``'max_length'`` or ``'vocab_size'``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # these paths at trusted artifacts produced by the training pipeline.
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    with open(config_path, 'rb') as f:
        config = pickle.load(f)
    return tokenizer, config['max_length'], config['vocab_size']
103
+
104
+
105
+ # -----------------------------
106
+ # Feature extractor - EfficientNetV2B0
107
+ # -----------------------------
108
+
109
def load_feature_extractor():
    """Build the image feature extractor.

    Uses EfficientNetV2B0 with ImageNet weights, no classification head and
    global average pooling, wrapped in a ``Model`` from its input to its
    pooled output.
    """
    backbone = EfficientNetV2B0(
        weights='imagenet',
        include_top=False,
        pooling='avg',
    )
    extractor = Model(inputs=backbone.input, outputs=backbone.output)
    return extractor
112
+
113
+
114
def extract_features_from_image(image_path, extractor):
    """Read an image from disk and return its features from ``extractor``.

    The image is resized to 224x224, converted to an array, given a leading
    batch axis, preprocessed and passed through ``extractor.predict``.

    Returns:
        The array returned by ``extractor.predict``, or ``None`` (after
        printing a message) when OpenCV cannot read the file.
    """
    # NOTE(review): cv2.imread yields BGR channel order and no conversion to
    # RGB happens here. Presumably the caption model was trained on features
    # from this same pipeline; confirm before changing the channel order.
    raw = cv2.imread(image_path)
    if raw is None:
        print(f"❌ Không đọc được ảnh: {image_path}")
        return None

    resized = cv2.resize(raw, (224, 224))
    batch = np.expand_dims(img_to_array(resized), axis=0)
    batch = efficientnet_preprocess(batch)
    return extractor.predict(batch, verbose=0)
125
+
126
+
127
+ # -----------------------------
128
+ # Generate caption
129
+ # -----------------------------
130
+
131
def generate_caption(model, tokenizer, image_features, max_length):
    """Greedily decode a caption for ``image_features``.

    Starting from the ``'startseq'`` marker, the text so far is tokenised and
    padded to ``max_length``, the model predicts the next word, and the
    highest-scoring index is appended. Decoding stops at ``'endseq'``, at an
    index missing from ``tokenizer.index_word``, or after ``max_length`` steps.

    Args:
        model: Captioning model called as ``model.predict([features, seq])``.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        image_features: Features returned by the image feature extractor.
        max_length: Padded sequence length and the maximum number of steps.

    Returns:
        The generated words joined by single spaces, without the start/end
        markers. An empty string is returned when no word was generated;
        previously the bare ``'startseq'`` marker leaked out in that case.
    """
    start_token = 'startseq'
    words = []
    for _ in range(max_length):
        in_text = ' '.join([start_token] + words)
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([image_features, sequence], verbose=0)
        # Plain int so the lookup does not depend on NumPy scalar hashing.
        word = tokenizer.index_word.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
    return ' '.join(words)
143
+
144
+
145
+ # -----------------------------
146
+ # Hiển thị ảnh và caption
147
+ # -----------------------------
148
+
149
def display_caption(image_path, caption):
    """Show the image at ``image_path`` with ``caption`` as the plot title.

    Args:
        image_path: Path of the image file to display.
        caption: Text rendered after the ``"Caption: "`` prefix in the title.
    """
    # The context manager closes the file handle; resize() returns a new
    # in-memory image, so it stays usable after the block.
    with Image.open(image_path) as img:
        resized = img.resize((1024, 768))  # Resize for better display
    plt.imshow(resized)
    plt.axis('off')
    plt.title(f"Caption: {caption}", fontsize=14, pad=10)
    plt.show()
156
+
157
+
158
+ # -----------------------------
159
+ # Chạy test
160
+ # -----------------------------
161
+
162
if __name__ == '__main__':
    # Manual smoke test: caption one local image and display it.
    image_path = 'running.jpg'

    # Load the captioning model, tokenizer/config and CNN backbone.
    model = load_caption_model()
    tokenizer, max_length, vocab_size = load_tokenizer_and_config()
    extractor = load_feature_extractor()

    # features is None when the image cannot be read; nothing is shown then.
    features = extract_features_from_image(image_path, extractor)
    if features is not None:
        caption = generate_caption(
            model,
            tokenizer,
            features,
            max_length,
        )
        print("Caption:", caption)
        display_caption(image_path, caption)
174
+
175
def evaluate_model(model, tokenizer, test_ids, captions, max_length, sample_size=500,
                   extractor=None, image_dir=None, smoothing_function=None):
    """Compute corpus BLEU-1..4 for generated captions on a test subset.

    Args:
        model: Captioning model passed to ``generate_caption``.
        tokenizer: Tokenizer passed to ``generate_caption``.
        test_ids: Sequence of image identifiers; only the first
            ``sample_size`` entries are evaluated.
        captions: Mapping from image identifier to its reference captions.
        max_length: Maximum caption length passed to ``generate_caption``.
        sample_size: Number of leading ``test_ids`` to evaluate.
        extractor: Image feature extractor; built with
            ``load_feature_extractor()`` when omitted.
        image_dir: Directory containing the images. When omitted, each
            identifier is used as the image path itself.
            NOTE(review): assumes identifiers are file names including the
            extension; confirm against the dataset layout.
        smoothing_function: NLTK BLEU smoothing function; defaults to
            ``SmoothingFunction().method1``.

    Returns:
        Tuple ``(bleu1, bleu2, bleu3, bleu4)``. All zeros when no image
        could be processed.
    """
    # The original body referenced undefined globals (`feature_extractor`,
    # `smoothie`) and a `__main__`-only `image_path`; they are now explicit,
    # optional parameters.
    if extractor is None:
        extractor = load_feature_extractor()
    if smoothing_function is None:
        smoothing_function = SmoothingFunction().method1

    actual, predicted = [], []
    test_subset = test_ids[:sample_size]

    for image_id in tqdm(test_subset, desc="Evaluating"):
        image_file = os.path.join(image_dir, str(image_id)) if image_dir else str(image_id)
        features = extract_features_from_image(image_file, extractor)
        if features is None:
            continue

        yhat = generate_caption(model, tokenizer, features, max_length)
        # corpus_bleu expects token lists; passing raw strings would score
        # character n-grams instead of word n-grams.
        references = [
            c.replace('startseq ', '').replace(' endseq', '').split()
            for c in captions[image_id]
        ]

        actual.append(references)
        predicted.append(yhat.split())

    if not predicted:
        # Nothing to score; avoid a division by zero inside corpus_bleu.
        print("\nNo captions were generated; skipping BLEU computation.")
        return 0.0, 0.0, 0.0, 0.0

    bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0), smoothing_function=smoothing_function)
    bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)
    bleu3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0), smoothing_function=smoothing_function)
    bleu4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)

    print("\nModel Evaluation Results:")
    print(f"BLEU-1: {bleu1:.4f}")
    print(f"BLEU-2: {bleu2:.4f}")
    print(f"BLEU-3: {bleu3:.4f}")
    print(f"BLEU-4: {bleu4:.4f}")

    return bleu1, bleu2, bleu3, bleu4
202
+
203
+
204
+
205
# Run the evaluation only when this file is executed as a script. At module
# level these statements ran on import and referenced names that are not
# defined in that situation, so importing this module raised NameError.
if __name__ == '__main__':
    # The test split and reference captions are not loaded by this module;
    # evaluate only if something has made them available as globals.
    _test_ids = globals().get('test_ids')
    _captions = globals().get('captions')
    if _test_ids is None or _captions is None:
        print("\nSkipping evaluation: test_ids/captions are not loaded.")
    else:
        print("\nEvaluating model on test set...")
        bleu_scores = evaluate_model(model, tokenizer, _test_ids, _captions, max_length)
        print("\nTraining and evaluation complete!")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core runtime
2
+ tensorflow==2.12.0
3
+ h5py>=3.8.0
4
+ numpy>=1.24.0
5
+ pillow>=9.5.0
6
+ opencv-python-headless==4.9.0.80
7
+ matplotlib>=3.7.0
8
+
9
+ # NLP + metrics
10
+ nltk>=3.8.1
11
+
12
+ # UI
13
+ gradio>=4.40.0
14
+
15
+ # Progress bars
16
+ tqdm>=4.66.0