# File size: 3,332 Bytes
# 3a06621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import os
import json
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont, ImageColor
import google.generativeai as genai
from dotenv import load_dotenv

# =========================
# 1. SETUP API KEY
# =========================
# Read a .env file (when one exists) into the process environment before the
# lookup below, so a key defined there is visible to it.
load_dotenv()
# The lookup yields None when "Gemini_API_Key" is not set; the value is handed
# to the Gemini client library as-is.
api_key = os.environ.get("Gemini_API_Key")
genai.configure(api_key=api_key)

# =========================
# 2. MODEL CONFIG
# =========================
# System prompt sent with every request. It asks for a JSON array of labelled
# boxes, forbids masks and code fencing, and caps the result at 25 objects.
# NOTE(review): the prompt does not name the "box_2d" / "label" keys that
# plot_bounding_boxes reads — presumably the model's default output format
# supplies them; confirm against the Gemini documentation.
bounding_box_system_instructions = """

Return bounding boxes as a JSON array with labels.

Never return masks or code fencing.

Limit to 25 objects.

If an object appears multiple times, use unique labels.

"""

# Shared model handle used by detect_objects; the system prompt above is
# attached once here rather than on every call.
model = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    system_instruction=bounding_box_system_instructions
)

# Sampling settings passed to each generate_content call in detect_objects.
generation_config = genai.types.GenerationConfig(
    temperature=0.5
)

# =========================
# 3. HELPERS
# =========================
def parse_json(json_output):
    """Strip Markdown code fencing from a model reply, leaving the JSON text.

    Args:
        json_output: Raw reply text, with or without a fenced block.

    Returns:
        The text inside the first fenced block, up to the closing fence (or
        to the end of the text when the fence is never closed). Text that
        contains no fence is returned unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if "```" not in line:
            continue

        # Whatever follows the opening fence on the same line is either just
        # a language tag ("json") or the payload itself, as in
        # "```json [ ... ]```". Keep it only from the first JSON opener
        # onward so a bare tag is still dropped.
        tail = line.split("```", 1)[1]
        openers = [pos for pos in (tail.find("["), tail.find("{")) if pos != -1]
        inline = [tail[min(openers):]] if openers else []

        fenced = "\n".join(inline + lines[i + 1:])
        # Cut at the closing fence, if there is one.
        return fenced.split("```")[0]
    return json_output


def plot_bounding_boxes(im, bounding_boxes):
    """Draw labelled bounding boxes on a copy of ``im``.

    Args:
        im: PIL image to annotate. It is copied; the argument itself is not
            drawn on.
        bounding_boxes: JSON text holding an array of objects (a single
            object is accepted too). Each object carries ``"box_2d"`` as
            ``[y1, x1, y2, x2]`` on a 0–1000 scale and, optionally, a
            ``"label"``.

    Returns:
        The annotated copy of the image.

    Raises:
        json.JSONDecodeError: If ``bounding_boxes`` is not valid JSON.
        KeyError: If an entry has no ``"box_2d"`` key.
    """
    im = im.copy()
    width, height = im.size
    draw = ImageDraw.Draw(im)

    # One named colour per box, cycling when there are more boxes than names.
    colors = list(ImageColor.colormap)
    font = ImageFont.load_default()

    boxes = json.loads(bounding_boxes)
    if isinstance(boxes, dict):
        # A lone object instead of an array: treat it as a one-element list
        # rather than iterating over its keys.
        boxes = [boxes]

    for i, box in enumerate(boxes):
        color = colors[i % len(colors)]
        y1, x1, y2, x2 = box["box_2d"]

        # Convert from 0–1000 scale to image pixels
        x1 = int(x1 / 1000 * width)
        x2 = int(x2 / 1000 * width)
        y1 = int(y1 / 1000 * height)
        y2 = int(y2 / 1000 * height)

        # The model occasionally returns the corners swapped; Pillow can
        # raise ValueError when the second corner precedes the first.
        x1, x2 = sorted((x1, x2))
        y1, y2 = sorted((y1, y2))

        draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=4)

        # A box without a label is still worth drawing; just skip the text.
        label = box.get("label")
        if label is not None:
            draw.text((x1 + 6, y1 + 6), str(label), fill=color, font=font)

    return im


# =========================
# 4. MAIN FUNCTION (GRADIO)
# =========================
def detect_objects(user_prompt, image):
    """Ask Gemini for bounding boxes and return the image with them drawn.

    Args:
        user_prompt: Free-text request from the UI. Blank or ``None`` falls
            back to a generic "label everything" prompt.
        image: The uploaded image (the UI delivers it as a PIL image), or
            ``None`` when nothing has been uploaded.

    Returns:
        An annotated copy of ``image``, or ``None`` when no image was given.

    Raises:
        gr.Error: If the model reply carries no text, or its text cannot be
            parsed and drawn as bounding boxes.
    """
    if image is None:
        return None

    # Guard against None as well as whitespace-only input.
    prompt = (user_prompt or "").strip()
    if not prompt:
        prompt = "Identify and label the objects in the image."

    response = model.generate_content(
        [prompt, image],
        generation_config=generation_config
    )

    # The .text accessor raises ValueError when the reply has no text part
    # (for example a blocked response); show that as a readable UI error.
    try:
        reply_text = response.text
    except ValueError as err:
        raise gr.Error("The model returned no usable text for this image.") from err

    # Model output is not guaranteed to be well-formed: invalid JSON
    # (ValueError), missing keys (KeyError) or wrongly shaped entries
    # (TypeError) are reported to the user instead of a bare traceback.
    try:
        return plot_bounding_boxes(image, parse_json(reply_text))
    except (ValueError, KeyError, TypeError) as err:
        raise gr.Error(
            "Could not read bounding boxes from the model response."
        ) from err


# =========================
# 5. GRADIO UI
# =========================
# Page layout: one row with two columns — inputs on the left, result on the
# right. Components appear in the order they are created inside each column.
with gr.Blocks(title="Gemini Bounding Box Detector") as demo:
    gr.Markdown("## Gemini Vision – Object Detection (Bounding Boxes Only)")

    with gr.Row():
        with gr.Column():
            # type="pil" makes the upload arrive in detect_objects as a PIL image.
            image_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="e.g. Detect cookies and plates"
            )
            # NOTE(review): the label ends with a trailing space — possibly an
            # emoji lost in transit; confirm the intended text.
            submit_btn = gr.Button("Detect Objects ")

        with gr.Column():
            image_output = gr.Image(label="Image with Bounding Boxes")

    # The inputs list order matches detect_objects(user_prompt, image).
    submit_btn.click(
        fn=detect_objects,
        inputs=[prompt_input, image_input],
        outputs=image_output
    )

# NOTE(review): launch() is called unconditionally at module level, so merely
# importing this file starts the app.
demo.launch()