import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Load CLIP for zero-shot image-text matching
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Candidate concept labels scored against the image; an illustrative list
# that can be extended freely
CONCEPT_LABELS = [
    "a photo of a person",
    "a photo of an animal",
    "a photo of food",
    "a photo of a landscape",
    "a photo of a building",
    "a photo of a vehicle",
]

def generate_caption(image: Image.Image) -> str:
    # Score the image against each concept label with CLIP
    inputs = clip_processor(text=CONCEPT_LABELS, images=image,
                            return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # logits_per_image holds one image-text similarity score per label
    probs = outputs.logits_per_image.softmax(dim=1)
    best_label = CONCEPT_LABELS[probs.argmax().item()]
    # Build a template caption from the top concept label. In a full
    # implementation, the top labels would be passed to a GPT-like model
    # to generate fluent text.
    return f"This image most closely matches: {best_label}."

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Captioning with CLIP & GPT-style Generation",
    description="Upload an image to get a descriptive caption. Based on CLIP for vision understanding.",
)

demo.launch()
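
# --- Optional: real GPT-style caption generation ---
# The comment in generate_caption() alludes to passing CLIP's output to a
# GPT-like model. A minimal sketch of that generation step, using a pretrained
# captioning model (the Salesforce/blip-image-captioning-base checkpoint is an
# assumption, not part of the original demo). To use it, define this block
# above gr.Interface(...) and pass fn=generate_caption_blip instead.

from transformers import BlipProcessor, BlipForConditionalGeneration

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def generate_caption_blip(image: Image.Image) -> str:
    # Encode the image, then autoregressively decode a caption
    blip_inputs = blip_processor(images=image, return_tensors="pt")
    generated_ids = blip_model.generate(**blip_inputs, max_new_tokens=30)
    return blip_processor.decode(generated_ids[0], skip_special_tokens=True)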