import torch from PIL import Image import torchvision.transforms as transforms from model import load_model # Import the load_model function from your model.py # Set device (GPU if available) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Specify the checkpoint path (local path where the checkpoint will be downloaded) checkpoint_path = "checkpoint.pth" # Load the model and tokenizer using the helper function model, tokenizer = load_model(checkpoint_path, device) # Define the image transformation (should match what was used during training) transform = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) ]) # Specify the path to an example image (update this to a valid image file path on your system) image_path = "/home/rishabh/coding/minor_project/for deployment/image-captionator/GJwtW4JGdR4.jpg" # Open and preprocess the image image = Image.open(image_path).convert("RGB") image_tensor = transform(image).unsqueeze(0).to(device) # Use the model's generate function to produce a caption output_ids = model.generate(pixel_values=image_tensor, max_length=30, num_beams=4) caption = tokenizer.decode(output_ids[0], skip_special_tokens=True) # Print the generated caption print("Generated Caption:", caption)