mahmoudalyosify committed on
Commit
7aefb49
·
verified ·
1 Parent(s): 65e8cd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -101
app.py CHANGED
@@ -1,141 +1,179 @@
1
  import gradio as gr
2
  import os
3
- from io import BytesIO
4
- from PIL import Image, ImageDraw, ImageFont
5
- from PIL import ImageColor
6
  import json
 
 
7
  import google.generativeai as genai
8
  from google.generativeai import types
9
  from dotenv import load_dotenv
10
 
11
-
12
- # 1. SETUP API KEY
13
- # ----------------
14
  load_dotenv()
15
- api_key = os.getenv("Gemini_API_Key")
16
- # Configure the Google AI library
17
- genai.configure(api_key=api_key)
18
-
19
 
20
- # 2. DEFINE MODEL AND INSTRUCTIONS
 
 
 
 
 
21
 
22
  bounding_box_system_instructions = """
23
- Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to 25 objects.
24
- If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc..).
25
- """
26
- model = genai.GenerativeModel( model_name='gemini-2.5-flash', system_instruction=bounding_box_system_instructions , safety_settings=[ types.SafetySettingDict( category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH", ) ],)
27
- generation_config = genai.types.GenerationConfig(
28
- temperature=0.5,
29
-
30
- )
31
-
32
-
33
- def generate_bounding_boxes(prompt, image):
34
- image = image.resize((1024, int(1024 * image.height / image.width)))
35
- response = model.generate_content([prompt, image], generation_config=generation_config)
36
- bounding_boxes = parse_json(response.text)
37
- img=plot_bounding_boxes(image, bounding_boxes)
38
- return img
39
-
40
-
41
- def parse_json(json_output):
 
 
42
  lines = json_output.splitlines()
43
  for i, line in enumerate(lines):
44
- if line == "```json":
45
- json_output = "\n".join(lines[i+1:]) # Remove everything before "```json"
46
- json_output = json_output.split("```")[0] # Remove everything after the closing "```"
47
  break
48
- return json_output
 
 
 
49
 
 
 
 
50
  def plot_bounding_boxes(im, bounding_boxes):
51
- """
52
- Plots bounding boxes on an image with labels.
53
- """
54
- additional_colors = [colorname for (colorname, colorcode) in ImageColor.colormap.items()]
55
-
56
  im = im.copy()
57
  width, height = im.size
58
  draw = ImageDraw.Draw(im)
 
59
  colors = [
60
  'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'cyan',
61
  'lime', 'magenta', 'violet', 'gold', 'silver'
62
  ] + additional_colors
63
 
64
  try:
65
- # Use a default font if NotoSansCJK is not available
66
- try:
67
- font = ImageFont.load_default()
68
- except OSError:
69
- print("NotoSansCJK-Regular.ttc not found. Using default font.")
70
- font = ImageFont.load_default()
71
-
72
- bounding_boxes_json = json.loads(bounding_boxes)
73
- for i, bounding_box in enumerate(bounding_boxes_json):
74
- color = colors[i % len(colors)]
75
- abs_y1 = int(bounding_box["box_2d"][0] / 1000 * height)
76
- abs_x1 = int(bounding_box["box_2d"][1] / 1000 * width)
77
- abs_y2 = int(bounding_box["box_2d"][2] / 1000 * height)
78
- abs_x2 = int(bounding_box["box_2d"][3] / 1000 * width)
79
-
80
- if abs_x1 > abs_x2:
81
- abs_x1, abs_x2 = abs_x2, abs_x1
82
-
83
- if abs_y1 > abs_y2:
84
- abs_y1, abs_y2 = abs_y2, abs_y1
85
-
86
- # Draw bounding box and label
87
- draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4)
88
- if "label" in bounding_box:
89
- draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)
90
- except Exception as e:
91
- print(f"Error drawing bounding boxes: {e}")
92
 
93
  return im
94
- def gradio_interface():
95
- """
96
- Gradio app interface for bounding box generation with example pairs.
97
- """
98
- # Example image + prompt pairs
99
- examples = [
100
- ["cookies.jpg", "Detect the cookies and label their types."],
101
- ["messed_room.jpg", "Find the unorganized item and suggest action in label in the image to fix them."],
102
- ["yoga.jpg", "Show the different yoga poses and name them."],
103
- ["zoom_face.png", "Label the tired faces in the image."]
104
- ]
105
-
106
- with gr.Blocks(gr.themes.Glass(secondary_hue= "rose")) as demo:
107
- gr.Markdown("# Gemini Bounding Box Generator")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  with gr.Row():
110
  with gr.Column():
111
- gr.Markdown("### Input Section")
112
- input_image = gr.Image(type="pil", label="Input Image")
113
- input_prompt = gr.Textbox(lines=2, label="Input Prompt", placeholder="Describe what to detect.")
114
- submit_btn = gr.Button("Generate")
 
 
 
 
115
 
116
  with gr.Column():
117
- gr.Markdown("### Output Section")
118
- output_image = gr.Image(type="pil", label="Output Image")
119
- #output_json = gr.Textbox(label="Bounding Boxes JSON")
 
 
 
 
 
 
 
120
 
121
- gr.Markdown("### Examples")
 
 
 
 
 
 
122
  gr.Examples(
123
  examples=examples,
124
- inputs=[input_image, input_prompt],
125
- label="Example Images with Prompts"
126
- )
127
-
128
- # Event to generate bounding boxes
129
- submit_btn.click(
130
- generate_bounding_boxes,
131
- inputs=[input_prompt, input_image],
132
- outputs=[output_image]
133
  )
134
 
135
  return demo
136
 
137
-
138
-
 
139
  if __name__ == "__main__":
140
- app = gradio_interface()
141
- app.launch()
 
1
  import gradio as gr
2
  import os
 
 
 
3
  import json
4
+ from io import BytesIO
5
+ from PIL import Image, ImageDraw, ImageFont, ImageColor
6
  import google.generativeai as genai
7
  from google.generativeai import types
8
  from dotenv import load_dotenv
9
 
10
+ # -----------------------------
11
+ # 1. LOAD API KEY
12
+ # -----------------------------
13
  load_dotenv()
14
+ DEFAULT_API_KEY = os.getenv("Gemini_API_Key") # fallback if user doesn't input
 
 
 
15
 
16
+ # -----------------------------
17
+ # 2. MODEL SETTINGS
18
+ # -----------------------------
19
+ DEFAULT_MODEL = "gemini-2.5-flash"
20
+ DEFAULT_TEMPERATURE = 0.5
21
+ DEFAULT_MAX_TOKENS = 500
22
 
23
  bounding_box_system_instructions = """
24
+ Return bounding boxes as a JSON array with labels. Never return masks or code fencing.
25
+ Limit to 25 objects. If an object is present multiple times, name them according to their unique characteristics
26
+ (colors, size, position, unique features, etc.). Also provide actionable suggestions for each object if applicable.
27
+ """
28
+
29
# -----------------------------
# 3. IMAGE PREPROCESSING
# -----------------------------
def preprocess_image(image, max_dim=1024):
    """Normalize an input image before sending it to the model.

    Converts to RGB (dropping alpha/palette modes) and, when either side
    exceeds *max_dim*, downscales so the longest side is exactly *max_dim*
    pixels while preserving aspect ratio. Images already within bounds keep
    their original size (no upscaling).

    Args:
        image: PIL.Image-like object (uses .convert/.resize/.width/.height).
        max_dim: maximum allowed width/height in pixels (default 1024,
            matching the previous hard-coded limit).

    Returns:
        The converted (and possibly resized) image.
    """
    image = image.convert("RGB")
    if image.width > max_dim or image.height > max_dim:
        # Scale both sides by the same ratio so aspect ratio is preserved.
        ratio = min(max_dim / image.width, max_dim / image.height)
        new_size = (int(image.width * ratio), int(image.height * ratio))
        image = image.resize(new_size)
    return image
40
+
41
# -----------------------------
# 4. PARSE JSON OUTPUT
# -----------------------------
def parse_json(json_output):
    """Extract and decode the JSON payload from a model response.

    The system prompt forbids code fencing, but models still sometimes wrap
    the payload in ```json ... ``` or a bare ``` ... ``` block. Both fence
    styles are stripped before decoding (previously only the exact "```json"
    marker was handled, so a bare fence made json.loads fail).

    Args:
        json_output: raw response text from the model.

    Returns:
        The decoded JSON value (normally a list of bounding-box dicts),
        or [] when the text is not valid JSON.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        fence = line.strip()
        # Accept both "```json" and a plain "```" opening fence.
        if fence == "```json" or fence == "```":
            json_output = "\n".join(lines[i + 1:])
            json_output = json_output.split("```")[0]
            break
    try:
        return json.loads(json_output)
    except json.JSONDecodeError:
        # Best-effort: callers treat an empty list as "nothing to draw".
        return []
55
 
56
# -----------------------------
# 5. PLOT BOUNDING BOXES
# -----------------------------
def plot_bounding_boxes(im, bounding_boxes):
    """Draw labelled bounding boxes (plus optional suggestions) on a copy of *im*.

    Args:
        im: source PIL image; it is left untouched, a copy is annotated.
        bounding_boxes: list of dicts as returned by parse_json(). Each dict
            may carry "box_2d" (Gemini emits [ymin, xmin, ymax, xmax],
            normalized to a 0-1000 grid), "label" and "suggestion".

    Returns:
        A new PIL image with the annotations drawn.
    """
    im = im.copy()
    width, height = im.size
    draw = ImageDraw.Draw(im)

    # Named colors cycled per box; extended with every color Pillow knows.
    additional_colors = list(ImageColor.colormap.keys())
    colors = [
        'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'cyan',
        'lime', 'magenta', 'violet', 'gold', 'silver'
    ] + additional_colors

    # load_default() ships with Pillow and does not raise OSError, so the
    # previous try/except with an identical fallback was dead code.
    font = ImageFont.load_default()

    for i, bbox in enumerate(bounding_boxes):
        color = colors[i % len(colors)]
        # FIX: Gemini's box_2d is [ymin, xmin, ymax, xmax]; the previous
        # unpacking read it as [x1, y1, x2, y2], swapping the axes on any
        # non-square image.
        y1, x1, y2, x2 = bbox.get("box_2d", [0, 0, 0, 0])
        abs_x1 = int(x1 / 1000 * width)
        abs_y1 = int(y1 / 1000 * height)
        abs_x2 = int(x2 / 1000 * width)
        abs_y2 = int(y2 / 1000 * height)

        # Normalize inverted coordinates so rectangle() always gets
        # top-left <= bottom-right.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1

        draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=3)
        label = bbox.get("label", "")
        suggestion = bbox.get("suggestion", "")
        if label:
            draw.text((abs_x1 + 5, abs_y1 + 5), f"{label}", fill=color, font=font)
        if suggestion:
            # Suggestion rendered on a second line below the label.
            draw.text((abs_x1 + 5, abs_y1 + 20), f"{suggestion}", fill=color, font=font)

    return im
94
+
95
# -----------------------------
# 6. GENERATE RESPONSE
# -----------------------------
def generate_response(
    user_prompt,
    user_image=None,
    api_key_input=None,
    model_choice=DEFAULT_MODEL,
    temperature=DEFAULT_TEMPERATURE,
    max_tokens=DEFAULT_MAX_TOKENS
):
    """Run one Gemini request, optionally annotating a supplied image.

    Args:
        user_prompt: text prompt describing what to detect or answer.
        user_image: optional PIL image; when given, bounding boxes parsed
            from the response are drawn onto a preprocessed copy of it.
        api_key_input: per-request API key; falls back to the .env key.
        model_choice: Gemini model name to use.
        temperature: sampling temperature for generation.
        max_tokens: max_output_tokens cap for the response.

    Returns:
        Tuple (response_text, annotated_image_or_None).

    Raises:
        ValueError: when no API key is available from either source.
    """
    api_key_to_use = api_key_input if api_key_input else DEFAULT_API_KEY
    if not api_key_to_use:
        # Fail fast with a clear message instead of letting the SDK fail
        # later with a cryptic authentication error.
        raise ValueError(
            "No API key available. Enter one in the UI or set Gemini_API_Key in .env."
        )
    genai.configure(api_key=api_key_to_use)

    model = genai.GenerativeModel(
        model_name=model_choice,
        system_instruction=bounding_box_system_instructions,
        safety_settings=[types.SafetySettingDict(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH")]
    )
    generation_config = types.GenerationConfig(
        temperature=temperature,
        max_output_tokens=max_tokens
    )

    # Explicit None check: PIL images are always truthy, so `if user_image:`
    # worked, but `is not None` states the intent unambiguously.
    if user_image is not None:
        user_image = preprocess_image(user_image)
        response = model.generate_content([user_prompt, user_image], generation_config=generation_config)
        bboxes = parse_json(response.text)
        output_image = plot_bounding_boxes(user_image, bboxes)
        return response.text, output_image
    else:
        # Text-only request: no image to annotate.
        response = model.generate_content([user_prompt], generation_config=generation_config)
        return response.text, None
128
+
129
# -----------------------------
# 7. GRADIO INTERFACE
# -----------------------------
def build_ui():
    """Build and return the Gradio Blocks app for the assistant.

    Lays out the input column (prompt, optional image, API key, model and
    sampling controls), the output column (model text + annotated image),
    wires the Run button to generate_response, and registers example pairs.

    Returns:
        The assembled gr.Blocks demo (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Multi-Modal Assistant with Bounding Boxes & Suggestions")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### User Inputs")
                text_input = gr.Textbox(lines=3, label="Prompt")
                image_input = gr.Image(type="pil", label="Optional Image")
                api_key_input = gr.Textbox(label="Google API Key (Optional)", placeholder="Enter your API key")
                # FIX: "gemini-2.0" is not a published model id; use
                # "gemini-2.0-flash" so the second choice actually works.
                model_choice = gr.Radio(["gemini-2.5-flash", "gemini-2.0-flash"], label="Select Model", value=DEFAULT_MODEL)
                temperature_slider = gr.Slider(0, 1, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_tokens_slider = gr.Slider(50, 2000, value=DEFAULT_MAX_TOKENS, step=50, label="Max Tokens")
                run_btn = gr.Button("Run")

            with gr.Column():
                gr.Markdown("### Outputs")
                chatbot_output = gr.Textbox(label="Model Output (Text)", lines=15)
                output_image = gr.Image(type="pil", label="Output Image with Bounding Boxes (if image provided)")

        # Event: run one generation, filling both outputs.
        run_btn.click(
            generate_response,
            inputs=[text_input, image_input, api_key_input, model_choice, temperature_slider, max_tokens_slider],
            outputs=[chatbot_output, output_image]
        )

        gr.Markdown("### Examples (Optional)")
        # Each example pair is [image_path, prompt].
        examples = [
            ["cookies.jpg", "Detect types of cookies and provide suggestions."],
            ["messed_room.jpg", "Identify unorganized items and suggest actions."],
            ["yoga.jpg", "Label the different yoga poses."],
        ]
        gr.Examples(
            examples=examples,
            # FIX: inputs must match the example column order. It was
            # [text_input, image_input], which put the image path into the
            # prompt box and the prompt into the image component.
            inputs=[image_input, text_input],
            label="Example Prompts & Images"
        )

    return demo
173
 
174
# -----------------------------
# 8. RUN APP
# -----------------------------
if __name__ == "__main__":
    # Assemble the Gradio Blocks UI and start serving it.
    build_ui().launch()