# Source: Hugging Face Space "app.py" by RohitCSharp (commit 42db30a, 1.03 kB).
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load the CLIP vision-language model and its preprocessor once at startup
# so every request reuses the same weights.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
def generate_caption(image):
    """Produce a short caption for *image* via zero-shot CLIP classification.

    The original implementation computed image features and then discarded
    them, returning a constant string for every input. Instead, we score the
    image against a small set of candidate descriptions with CLIP's joint
    image-text model and report the best match.

    Args:
        image: A PIL image (Gradio supplies this with ``type="pil"``).

    Returns:
        A one-sentence caption string naming the best-matching concept.
    """
    candidate_labels = [
        "a person",
        "an animal",
        "food",
        "a landscape",
        "a building",
        "a vehicle",
        "text or a document",
        "an indoor scene",
    ]
    # Encode both the image and the candidate texts in one pass; padding is
    # required because the text prompts have different token lengths.
    inputs = clip_processor(
        text=candidate_labels, images=image, return_tensors="pt", padding=True
    )
    # Inference only — disable autograd to save memory and time.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # logits_per_image has shape (1, num_labels): similarity of the image
    # to each candidate text. Pick the highest-scoring label.
    best = outputs.logits_per_image.softmax(dim=-1).argmax(dim=-1).item()
    return f"A photo showing {candidate_labels[best]}."
# Wire the captioning function into a simple Gradio UI: one image input,
# one text output. launch() starts the local web server (blocking).
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Captioning with CLIP & GPT-style Generation",
    description="Upload an image to get a descriptive caption. Based on CLIP for vision understanding.",
)
demo.launch()