File size: 2,163 Bytes
cac8567
b258952
 
 
 
 
375c40b
b258952
 
 
 
 
 
 
15fc9af
375c40b
 
 
 
 
 
b258952
6f3dcaa
f086606
 
b258952
 
 
 
f086606
b258952
 
e2c4b41
b258952
375c40b
b258952
e2c4b41
b258952
 
 
 
 
 
 
 
 
 
 
d15da10
 
 
8db8540
d15da10
a9b7c42
8686796
b258952
 
 
 
8db8540
 
 
b258952
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import tempfile

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Paths to the Flickr8k artifacts and the trained captioning model.
# NOTE(review): token/train/test split files are referenced here but not
# read in this chunk — presumably used elsewhere; confirm before removing.
token_path = 'saved_models/Flickr8k.token.txt'
train_images_path = 'saved_models/Flickr_8k.trainImages.txt'
test_images_path = 'saved_models/Flickr_8k.testImages.txt'
model_path = 'saved_models/Final_Image_Captioning.h5'

def preprocess_image(image_file):
    """Persist an uploaded image to disk and preprocess it for InceptionV3.

    Parameters
    ----------
    image_file : object with a ``save(path)`` method (e.g. a web-framework
        file upload). Assumed to hold a JPEG-compatible image — TODO confirm.

    Returns
    -------
    numpy.ndarray
        Batch of one image, shape (1, 299, 299, 3), scaled by
        ``inception_v3.preprocess_input``.
    """
    # Bug fix: NamedTemporaryFile(delete=True) keeps the file open, and on
    # Windows an open temp file cannot be reopened by name, so both
    # image_file.save() and load_img() would fail. Use mkstemp and clean up
    # explicitly instead.
    fd, temp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)  # we only need the path; image_file.save writes it itself
    try:
        image_file.save(temp_path)
        img = image.load_img(temp_path, target_size=(299, 299))
        arr = image.img_to_array(img)
        arr = np.expand_dims(arr, axis=0)  # add batch dimension
        arr = preprocess_input(arr)        # InceptionV3 expects [-1, 1] scaling
    finally:
        os.remove(temp_path)
    return arr


def generate_captions(image_file, sequence):
    """Generate a caption for an uploaded image.

    Parameters
    ----------
    image_file : object with a ``save(path)`` method (uploaded image).
    sequence : list[int]
        Extra token ids appended to the decoder input each step
        (see ``generate_caption_from_image``).

    Returns
    -------
    str
        The generated caption with start/end tokens removed.
    """
    # Perf fix: load_model() reads the whole .h5 from disk and rebuilds the
    # graph — doing that on every request is very expensive. Cache the loaded
    # model on the function object so it is loaded at most once per process.
    model = getattr(generate_captions, "_cached_model", None)
    if model is None:
        model = load_model(model_path)
        generate_captions._cached_model = model

    # Preprocess the image into an InceptionV3-ready batch of one.
    img = preprocess_image(image_file)

    # Greedy-decode the caption.
    return generate_caption_from_image(model, img, sequence)

def generate_caption_from_image(model, img, sequence):
    """Greedy-decode a caption for a preprocessed image.

    Parameters
    ----------
    model : keras.Model
        Captioning model; ``predict`` takes ``[img, padded_seq, h, c]`` and
        returns ``(logits, h, c)`` — assumed LSTM state size 256, TODO confirm.
    img : numpy.ndarray
        Preprocessed image batch, shape (1, 299, 299, 3).
    sequence : list[int]
        Extra token ids appended to the decoder input each step.

    Returns
    -------
    str
        Caption text with the "<start>" and "<end>" tokens stripped.
    """
    max_length = 34
    start_token = "<start>"
    end_token = "<end>"
    # Vocabulary lookups saved as pickled dicts (word -> index and back).
    wordtoix = np.load("wordtoix.npy", allow_pickle=True).item()
    ixtoword = np.load("ixtoword.npy", allow_pickle=True).item()

    # Zeroed initial LSTM hidden/cell state (batch of 1, 256 units).
    initial_state = [np.zeros((1, 256)), np.zeros((1, 256))]

    # Greedy search: feed the caption-so-far, take the argmax word each step.
    caption = start_token
    for _ in range(max_length):
        seq = [wordtoix[word] for word in caption.split() if word in wordtoix]
        seq += sequence  # append the caller-supplied sequence
        seq = pad_sequences([seq], maxlen=max_length)

        y_pred, h, c = model.predict([img, seq] + initial_state)
        y_pred = np.argmax(y_pred, axis=-1)
        word = ixtoword[y_pred[0][0]]
        caption += " " + word
        if word == end_token:
            break

        # Carry the recurrent state forward to the next step.
        initial_state = [h, c]

    # Bug fix: the original did caption.split()[1:-1], which unconditionally
    # drops the last token — so when decoding hit max_length without emitting
    # "<end>", a real word was silently discarded. Strip the end token only
    # when it is actually present.
    words = caption.split()[1:]  # drop "<start>"
    if words and words[-1] == end_token:
        words = words[:-1]
    return " ".join(words)