# image-caption-generator / inference.py
# Param20h's picture
# Upload folder using huggingface_hub
# d31183e verified
import torch
import torchvision.transforms as transforms
from PIL import Image
from model import CNNtoRNN
import pickle
import argparse
import sys
def _select_device():
    """Return the preferred torch device: CUDA first, then MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # `torch.backends.mps` is absent on some PyTorch builds; probe for it so
    # those installs fall back to CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def predict_caption(image_path, model_path, vocab_path):
    """Generate, print and return a caption for a single image.

    Args:
        image_path: Path to the image file to caption.
        model_path: Path to the saved model state dict.
        vocab_path: Path to the pickled vocabulary object.

    Returns:
        The caption tokens joined with single spaces. The capitalized form
        is only printed, not returned.

    Raises:
        SystemExit: With status 1 when the vocabulary file or model file
            does not exist, or when the image cannot be opened.
    """
    device = _select_device()

    # Load Vocabulary
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point this at vocabulary files from a trusted source.
    try:
        with open(vocab_path, "rb") as f:
            vocab = pickle.load(f)
    except FileNotFoundError:
        print(f"Vocabulary file {vocab_path} not found. Please train the model first.")
        sys.exit(1)

    vocab_size = len(vocab)
    # NOTE(review): these sizes presumably have to match the values used when
    # the checkpoint was trained, otherwise load_state_dict will reject it.
    embed_size = 256
    hidden_size = 256
    num_layers = 1

    # Load Model
    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    # SECURITY: torch.load may unpickle arbitrary objects depending on the
    # PyTorch version's defaults; only load checkpoints you trust.
    try:
        model.load_state_dict(torch.load(model_path, map_location=device))
    except FileNotFoundError:
        print(f"Model file {model_path} not found. Please train the model first.")
        sys.exit(1)
    model.eval()

    # Load and Transform Image
    try:
        # The context manager releases the underlying file handle as soon as
        # the RGB copy has been materialized.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        print(f"Error opening image: {e}")
        sys.exit(1)

    transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    # unsqueeze(0) adds the leading batch dimension for the single image.
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Predict caption
    # Inference only: make sure autograd tracking is off here, regardless of
    # whether caption_image disables it internally.
    with torch.no_grad():
        caption_list = model.caption_image(image_tensor, vocab)
    caption = " ".join(caption_list)

    print("--------------------------------------------------")
    print(f"Generated Caption: {caption.capitalize()}")
    print("--------------------------------------------------")
    return caption
if __name__ == "__main__":
    # Script entry point: read the image, model and vocabulary paths from
    # the command line and run a single caption prediction.
    cli = argparse.ArgumentParser(description="Image Caption Generator Inference")
    cli.add_argument("--image", required=True, type=str, help="Path to the image file")

    # The two optional flags differ only in name, default and help text.
    optional_flags = (
        ("--model", "caption_model.pth", "Path to trained model"),
        ("--vocab", "vocab.pkl", "Path to saved vocabulary"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, type=str, default=fallback, help=description)

    options = cli.parse_args()
    predict_caption(
        image_path=options.image,
        model_path=options.model,
        vocab_path=options.vocab,
    )