import streamlit as st
from transformers import VisionEncoderDecoderModel, GPT2Tokenizer
import torch
from PIL import Image
from torchvision import transforms

# Load the fine-tuned captioning model and the GPT-2 tokenizer used for decoding
model = VisionEncoderDecoderModel.from_pretrained("ashok2216/vit-gpt2-image-captioning_COCO_FineTuned")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define manual preprocessing matching the ViT encoder's expected 224x224 input
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Streamlit app setup
st.title("Image Captioning with ViT-GPT2")
st.write("Upload an image to generate a caption.")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Convert to RGB so grayscale or RGBA uploads match the 3-channel normalization
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Preprocess the image manually and add a batch dimension
    inputs = preprocess(image).unsqueeze(0)

    # Generate the caption
    with st.spinner("Generating caption..."):
        output = model.generate(inputs)
        caption = tokenizer.decode(output[0], skip_special_tokens=True)

    st.success("Generated Caption:")
    st.write(caption)
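The `model.generate(inputs)` call above uses the default greedy decoding. As a minimal sketch, assuming the standard `transformers` generation parameters (the specific values below are illustrative, not taken from the original app), beam search with a length cap can often produce more fluent captions:

```python
# Illustrative decoding settings (values are assumptions, not from the original Space)
output = model.generate(
    inputs,
    max_length=32,        # cap the caption length in tokens
    num_beams=4,          # beam search instead of greedy decoding
    early_stopping=True,  # stop once all beams have produced a finished sequence
)
caption = tokenizer.decode(output[0], skip_special_tokens=True)
```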