MIM / app.py
hamada056's picture
Upload app.py
3a06621 verified
import gradio as gr
import os
import json
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont, ImageColor
import google.generativeai as genai
from dotenv import load_dotenv
# =========================
# 1. SETUP API KEY
# =========================
# Run python-dotenv's loader first so that anything it adds to the process
# environment is visible to the lookup below.
load_dotenv()

# The key is read under the exact variable name "Gemini_API_Key". If that
# variable is unset the lookup yields None, and None is what gets handed to
# genai.configure().
api_key = os.environ.get("Gemini_API_Key")
genai.configure(api_key=api_key)
# =========================
# 2. MODEL CONFIG
# =========================
# Sampling settings; passed to model.generate_content() by detect_objects().
generation_config = genai.types.GenerationConfig(temperature=0.5)

# Text supplied to the model as its system instruction. It asks for a JSON
# array of labelled boxes with no masks and no code fencing.
bounding_box_system_instructions = """
Return bounding boxes as a JSON array with labels.
Never return masks or code fencing.
Limit to 25 objects.
If an object appears multiple times, use unique labels.
"""

# Single shared model handle used for every detection request.
model = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    system_instruction=bounding_box_system_instructions,
)
# =========================
# 3. HELPERS
# =========================
def parse_json(json_output):
    """Extract the payload of the first Markdown code fence in *json_output*.

    The model is told not to use code fencing, but may still wrap its answer
    in a fenced block. This helper returns the text between the first fence
    marker and the next one. Text without any fence is returned unchanged.

    Args:
        json_output: Raw text returned by the model.

    Returns:
        The fenced payload (not yet parsed), or *json_output* itself when it
        contains no fence marker.
    """
    import string  # local import: only needed for the language-tag strip

    fence = "```"
    lines = json_output.splitlines()
    for index, line in enumerate(lines):
        _, marker, tail = line.partition(fence)
        if not marker:
            continue

        body_lines = lines[index + 1:]
        # Drop the optional language tag that follows the fence (e.g. "json").
        # Whatever is left on the fence line is payload: this covers replies
        # such as "```json [ ... ] ```" where fence and JSON share a line,
        # which would otherwise be thrown away entirely.
        inline = tail.lstrip(string.ascii_letters)
        if inline.strip():
            body_lines = [inline, *body_lines]

        # Cut at the closing fence, if there is one.
        return "\n".join(body_lines).split(fence)[0]
    return json_output
def plot_bounding_boxes(im, bounding_boxes):
    """Draw labelled bounding boxes on a copy of *im*.

    Args:
        im: Source image; it is copied, so the caller's image is not modified.
        bounding_boxes: JSON text holding an array of objects. Each object
            must carry ``"box_2d"`` as ``[y1, x1, y2, x2]`` on a 0-1000 scale
            and may carry a ``"label"``. A single object is also accepted.

    Returns:
        The annotated copy of the image.

    Raises:
        json.JSONDecodeError: If *bounding_boxes* is not valid JSON.
        KeyError, TypeError, ValueError: If an entry lacks a usable
            ``"box_2d"`` of four numbers.
    """
    im = im.copy()
    width, height = im.size
    draw = ImageDraw.Draw(im)
    colors = list(ImageColor.colormap)
    font = ImageFont.load_default()

    boxes = json.loads(bounding_boxes)
    if isinstance(boxes, dict):
        # A lone object instead of an array: treat it as a one-element list
        # rather than iterating over its keys.
        boxes = [boxes]

    for i, box in enumerate(boxes):
        color = colors[i % len(colors)]
        y1, x1, y2, x2 = box["box_2d"]

        # Convert from 0–1000 scale to image pixels
        x1 = int(x1 / 1000 * width)
        x2 = int(x2 / 1000 * width)
        y1 = int(y1 / 1000 * height)
        y2 = int(y2 / 1000 * height)

        # The model can emit the corners in reverse order; recent Pillow
        # releases reject a rectangle whose second corner precedes the first,
        # so normalise to (top-left, bottom-right) before drawing.
        x1, x2 = sorted((x1, x2))
        y1, y2 = sorted((y1, y2))

        draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=4)

        # A missing label should not abort the whole drawing pass.
        label = box.get("label")
        if label is not None:
            draw.text((x1 + 6, y1 + 6), str(label), fill=color, font=font)
    return im
# =========================
# 4. MAIN FUNCTION (GRADIO)
# =========================
def detect_objects(user_prompt, image):
    """Gradio handler: ask the model for bounding boxes and draw them.

    Args:
        user_prompt: Free-text instruction from the UI. ``None``, empty or
            whitespace-only input falls back to a generic detection prompt.
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A copy of *image* with the detected boxes drawn on it, or ``None``
        when no image was supplied.

    Raises:
        gr.Error: If the model's reply cannot be read, parsed as JSON, or
            does not have the expected box structure.
    """
    if image is None:
        return None

    # Guard against a None prompt (e.g. when the handler is invoked without a
    # textbox value) before stripping.
    prompt = (user_prompt or "").strip()
    if not prompt:
        prompt = "Identify and label the objects in the image."

    response = model.generate_content(
        [prompt, image],
        generation_config=generation_config,
    )

    try:
        # response.text is inside the try on purpose: the SDK is expected to
        # raise ValueError there when the reply holds no text part (for
        # example a blocked response) -- NOTE(review): confirm against the
        # installed SDK version.
        bounding_boxes = parse_json(response.text)
        return plot_bounding_boxes(image, bounding_boxes)
    except (ValueError, KeyError, TypeError) as err:
        # json.JSONDecodeError is a ValueError; KeyError/TypeError cover
        # entries with a missing or malformed "box_2d".
        raise gr.Error(
            "The model did not return usable bounding boxes. "
            "Please try again or rephrase the prompt."
        ) from err
# =========================
# 5. GRADIO UI
# =========================
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the component order (two columns inside one row, the
# click wiring inside the Blocks context, launch after it) -- confirm.
with gr.Blocks(title="Gemini Bounding Box Detector") as demo:
    gr.Markdown("## Gemini Vision – Object Detection (Bounding Boxes Only)")

    with gr.Row():
        # First column: the inputs and the trigger button.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="e.g. Detect cookies and plates",
            )
            submit_btn = gr.Button("Detect Objects ")

        # Second column: the annotated result.
        with gr.Column():
            image_output = gr.Image(label="Image with Bounding Boxes")

    # The input order must match detect_objects(user_prompt, image).
    submit_btn.click(
        fn=detect_objects,
        inputs=[prompt_input, image_input],
        outputs=image_output,
    )

demo.launch()