# Source: Hugging Face Space "dschandra/AI_Image_Caption" (status at capture: Sleeping).
# File size: 2,458 Bytes.
# Scrape residue removed: file-viewer gutter listing commits ea1dace and 4942d79.

import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image, ImageDraw

# Hugging Face Hub checkpoint ids, named once so processor and model stay paired.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
_DETR_CHECKPOINT = "facebook/detr-resnet-50"

# BLIP processor + model, used for caption generation.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

# DETR processor + model, used for object detection.
detr_processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
detr_model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT)

# Object names intended for the dynamic description.
# NOTE(review): nothing in the visible code reads this list.
objects_of_interest = ["tree", "water", "mountain", "beach"]

def generate_caption(image):
    """Caption an image with BLIP and name the objects DETR detects in it.

    Args:
        image: A PIL image (the Gradio input is declared with ``type="pil"``),
            or ``None`` when the form is submitted without an upload.

    Returns:
        A two-line string: the BLIP caption, a newline, then a sentence that
        lists the distinct object classes DETR reported with a score above
        0.9 (or a generic phrase when nothing was detected). When ``image``
        is ``None`` a short prompt asking for an upload is returned instead.
    """
    # An empty submission arrives as None; answer with a message instead of
    # failing inside the image processor.
    if image is None:
        return "Please upload an image."

    # Normalise to 3 channels so both models receive the same kind of input;
    # this is a no-op for images that are already RGB.
    image = image.convert("RGB")

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        caption_inputs = processor(images=image, return_tensors="pt")
        caption_ids = model.generate(**caption_inputs)

        detection_inputs = detr_processor(images=image, return_tensors="pt")
        detection_outputs = detr_model(**detection_inputs)

    caption = processor.decode(caption_ids[0], skip_special_tokens=True)

    # PIL reports (width, height); the post-processor is given the reverse.
    target_sizes = torch.tensor([image.size[::-1]])
    results = detr_processor.post_process_object_detection(
        detection_outputs, target_sizes=target_sizes, threshold=0.9
    )[0]

    # Resolve class ids through the checkpoint's own label map rather than
    # hard-coded numbers: the ids previously treated as tree/water/mountain
    # (23, 8, 72) name unrelated classes in this model's label map.
    id2label = detr_model.config.id2label
    # dict.fromkeys de-duplicates while keeping first-seen order, so several
    # instances of one class are mentioned only once.
    detected_objects = list(
        dict.fromkeys(id2label[label.item()] for label in results["labels"])
    )

    if detected_objects:
        contents = ", ".join(detected_objects)
    else:
        contents = "various elements of nature"

    # A single period ends the first sentence in both branches (the fallback
    # text used to produce "nature.. It provides ...").
    description = (
        "This image includes "
        + contents
        + ". It provides a beautiful view that invites relaxation and exploration."
    )

    return caption + "\n" + description

# Gradio Interface: one PIL image in, one text box out, served by generate_caption.
image_input = gr.Image(type="pil")
text_output = gr.Textbox()

iface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=text_output,
    title="Dynamic Image Caption Generator",
    description="Upload any image and get a detailed description of its contents.",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()