```python
import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load CLIP for zero-shot vision understanding
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Candidate concept labels; the list is illustrative and can be extended
CONCEPTS = ["a dog", "a cat", "a person", "a landscape", "food",
            "a building", "a vehicle", "a plant"]

def generate_caption(image):
    # Score the image against each concept using CLIP's standard prompt template
    prompts = [f"a photo of {c}" for c in CONCEPTS]
    inputs = clip_processor(text=prompts, images=image,
                            return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # logits_per_image has shape (1, num_concepts); softmax gives probabilities
    probs = outputs.logits_per_image.softmax(dim=1)
    best = CONCEPTS[probs.argmax().item()]
    # In a full pipeline, the top concepts would be handed to a GPT-style
    # model to compose a fluent caption; here we template one directly.
    return f"A photo showing {best}."

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Captioning with CLIP & GPT-style Generation",
    description="Upload an image to get a descriptive caption. "
                "Based on CLIP for vision understanding.",
)

demo.launch()
```
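CLIP on its own only scores an image against candidate text; it does not generate free-form captions. For comparison, here is a minimal sketch of what the "GPT-style generation" step could look like using the `transformers` image-to-text pipeline with BLIP. The model choice (`Salesforce/blip-image-captioning-base`) and the helper name `generate_caption_blip` are assumptions for illustration, not part of the original Space.

```python
# Sketch: real caption generation with BLIP via the transformers
# image-to-text pipeline (model choice is an assumption, not from the Space).
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def generate_caption_blip(image):
    # The pipeline returns a list of dicts, e.g. [{"generated_text": "..."}]
    return captioner(image)[0]["generated_text"]
```

Passing `fn=generate_caption_blip` to `gr.Interface` instead of the CLIP-based function would give genuinely generated captions at the cost of loading a second, larger model.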