"""Gradio demo that asks Gemini for object bounding boxes and draws them.

The model is instructed to answer with a JSON array of objects carrying a
``box_2d`` entry (``[y1, x1, y2, x2]`` on a 0-1000 scale) and a ``label``.
The boxes are rescaled to pixel coordinates and drawn on a copy of the
uploaded image.
"""

import json
import os
from io import BytesIO

import google.generativeai as genai
import gradio as gr
from dotenv import load_dotenv
from PIL import Image, ImageColor, ImageDraw, ImageFont

# =========================
# 1. SETUP API KEY
# =========================
load_dotenv()
api_key = os.getenv("Gemini_API_Key")
genai.configure(api_key=api_key)

# =========================
# 2. MODEL CONFIG
# =========================
bounding_box_system_instructions = """
Return bounding boxes as a JSON array with labels.
Never return masks or code fencing.
Limit to 25 objects.
If an object appears multiple times, use unique labels.
"""

model = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    system_instruction=bounding_box_system_instructions,
)

generation_config = genai.types.GenerationConfig(
    temperature=0.5,
)

# Coordinates in ``box_2d`` are expressed on a 0-1000 scale per axis.
_BOX_SCALE = 1000

# Prompt used when the user leaves the prompt box empty.
_DEFAULT_PROMPT = "Identify and label the objects in the image."


# =========================
# 3. HELPERS
# =========================
def parse_json(json_output: str) -> str:
    """Return *json_output* with a surrounding Markdown code fence removed.

    If any line contains a triple-backtick fence, everything after the first
    such line and before the next fence is returned. Text without a fence is
    returned unchanged.
    """
    lines = json_output.splitlines()
    for index, line in enumerate(lines):
        if "```" in line:
            fenced_body = "\n".join(lines[index + 1:])
            return fenced_body.split("```")[0]
    return json_output


def _to_pixel_box(raw_box, width: int, height: int):
    """Convert a ``[y1, x1, y2, x2]`` 0-1000 box to ``(x0, y0, x1, y1)`` pixels.

    Returns ``None`` when *raw_box* is not a sequence of exactly four numbers.
    Corners are sorted so that ``x0 <= x1`` and ``y0 <= y1``.
    """
    try:
        y1, x1, y2, x2 = raw_box
        xs = sorted(int(v / _BOX_SCALE * width) for v in (x1, x2))
        ys = sorted(int(v / _BOX_SCALE * height) for v in (y1, y2))
    except (TypeError, ValueError, OverflowError):
        # Wrong length, non-numeric entries, or NaN/Infinity coordinates.
        return None
    return xs[0], ys[0], xs[1], ys[1]


def plot_bounding_boxes(im: Image.Image, bounding_boxes: str) -> Image.Image:
    """Draw labelled bounding boxes on a copy of *im*.

    Args:
        im: Source image; it is not modified.
        bounding_boxes: JSON text holding an array of objects, each with a
            ``box_2d`` list (``[y1, x1, y2, x2]`` on a 0-1000 scale) and a
            ``label``. A single object is accepted in place of an array.

    Returns:
        A copy of *im* with one rectangle and label per valid box. Entries
        without a usable ``box_2d`` are skipped.

    Raises:
        json.JSONDecodeError: If *bounding_boxes* is not valid JSON.
    """
    annotated = im.copy()
    width, height = annotated.size
    draw = ImageDraw.Draw(annotated)
    colors = list(ImageColor.colormap.keys())
    font = ImageFont.load_default()

    parsed = json.loads(bounding_boxes)
    if isinstance(parsed, dict):
        parsed = [parsed]
    elif not isinstance(parsed, list):
        parsed = []

    for index, box in enumerate(parsed):
        if not isinstance(box, dict):
            continue
        pixel_box = _to_pixel_box(box.get("box_2d"), width, height)
        if pixel_box is None:
            continue
        x0, y0, x1, y1 = pixel_box
        # Color is chosen by position in the model output so that a given
        # response always maps to the same colors.
        color = colors[index % len(colors)]
        # Corners are pre-sorted: ImageDraw.rectangle rejects boxes whose
        # second corner lies above or to the left of the first.
        draw.rectangle([(x0, y0), (x1, y1)], outline=color, width=4)
        draw.text(
            (x0 + 6, y0 + 6),
            str(box.get("label", "")),
            fill=color,
            font=font,
        )

    return annotated


# =========================
# 4. MAIN FUNCTION (GRADIO)
# =========================
def detect_objects(user_prompt, image):
    """Ask the model for bounding boxes and return the annotated image.

    Args:
        user_prompt: Free-text prompt from the UI; a default prompt is used
            when it is empty, whitespace-only, or ``None``.
        image: PIL image from the UI, or ``None`` when nothing was uploaded.

    Returns:
        A copy of *image* with bounding boxes drawn, or ``None`` when no
        image was supplied.

    Raises:
        gr.Error: If the model request fails or its answer is not valid JSON.
    """
    if image is None:
        return None

    prompt = (user_prompt or "").strip() or _DEFAULT_PROMPT

    try:
        response = model.generate_content(
            [prompt, image],
            generation_config=generation_config,
        )
        # Accessing ``.text`` can itself raise (e.g. when no text candidate
        # was returned), so it stays inside the same guarded section.
        raw_text = response.text
    except Exception as exc:  # UI boundary: surface any failure to the user.
        raise gr.Error(f"Gemini request failed: {exc}") from exc

    try:
        return plot_bounding_boxes(image, parse_json(raw_text))
    except ValueError as exc:  # includes json.JSONDecodeError
        raise gr.Error(
            "The model response could not be parsed as bounding boxes."
        ) from exc


# =========================
# 5. GRADIO UI
# =========================
with gr.Blocks(title="Gemini Bounding Box Detector") as demo:
    gr.Markdown("## Gemini Vision – Object Detection (Bounding Boxes Only)")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="e.g. Detect cookies and plates",
            )
            submit_btn = gr.Button("Detect Objects ")

        with gr.Column():
            image_output = gr.Image(label="Image with Bounding Boxes")

    submit_btn.click(
        fn=detect_objects,
        inputs=[prompt_input, image_input],
        outputs=image_output,
    )

# Only start the server when executed as a script, so the module can be
# imported (e.g. by tooling) without launching the UI.
if __name__ == "__main__":
    demo.launch()