mhamoody committed on
Commit
1f70db6
·
verified ·
1 Parent(s): 8069f71
Files changed (1) hide show
  1. app.py +175 -13
app.py CHANGED
@@ -4,6 +4,8 @@ from typing import List, Tuple, Optional
4
  import google.genai as genai
5
  import gradio as gr
6
  from PIL import Image
 
 
7
 
8
  GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
9
 
@@ -12,8 +14,11 @@ IMAGE_WIDTH = 512
12
  system_instruction_analysis = "You are an expert of the given topic. Analyze the provided text with a focus on the topic, identifying recent issues, recent insights, or improvements relevant to academic standards and effectiveness. Offer actionable advice for enhancing knowledge and suggest real-life examples."
13
  model_name = "gemini-2.5-flash"
14
 
15
- # Initialize model (will be configured with API key in bot function)
16
- model = None
 
 
 
17
 
18
  # Helper Functions
19
  def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
@@ -111,6 +116,95 @@ def bot(
111
  except Exception as e:
112
  chatbot[-1]["content"] = f"Error processing response: {str(e)}"
113
  yield chatbot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  # Components
115
  google_key_component = gr.Textbox(
116
  label="Google API Key",
@@ -127,6 +221,8 @@ text_prompt_component = gr.Textbox(
127
  lines=3
128
  )
129
  run_button_component = gr.Button("Submit")
 
 
130
  temperature_component = gr.Slider(
131
  minimum=0,
132
  maximum=1.0,
@@ -168,7 +264,7 @@ example_scenarios = [
168
  "Describe Multimodal AI",
169
  "What are the difference between multiagent llm and multiagent system",
170
  "Why it's difficult to integrate multimodality in prompt"]
171
- example_images = [["ex1.png"],["ex2.png"]]
172
 
173
  # Gradio Interface
174
  user_inputs = [text_prompt_component, chatbot_component]
@@ -184,19 +280,79 @@ bot_inputs = [
184
  ]
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  with gr.Blocks() as demo:
188
- gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini 2.5 Multimodal Chatbot</h1>")
189
  with gr.Row():
190
  google_key_component.render()
191
  with gr.Row():
192
  chatbot_component.render()
193
  with gr.Row():
194
  with gr.Column(scale=1):
195
- text_prompt_component.render()
 
196
  with gr.Column(scale=1):
197
- image_prompt_component.render()
198
  with gr.Column(scale=1):
199
  run_button_component.render()
 
 
 
200
  with gr.Accordion("🧪Example Text 💬", open=False):
201
  example_radio = gr.Radio(
202
  choices=example_scenarios,
@@ -207,12 +363,6 @@ with gr.Blocks() as demo:
207
  fn=lambda query: query if query else "No query selected.",
208
  inputs=[example_radio],
209
  outputs=[text_prompt_component])
210
- with gr.Accordion("🧪Example Image 🩻", open=False):
211
- gr.Examples(
212
- examples=example_images,
213
- inputs=[image_prompt_component],
214
- label="Example Figures",
215
- )
216
  with gr.Accordion("🛠️Customize", open=False):
217
  temperature_component.render()
218
  max_output_tokens_component.render()
@@ -223,7 +373,19 @@ with gr.Blocks() as demo:
223
  run_button_component.click(
224
  fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
225
  ).then(
226
- fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
 
 
 
 
 
 
 
 
 
 
 
 
227
  )
228
 
229
  if __name__ == "__main__":
 
4
  import google.genai as genai
5
  import gradio as gr
6
  from PIL import Image
7
+ from PIL import ImageDraw, ImageFont, ImageColor
8
+ import json
9
 
10
  GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
11
 
 
14
  system_instruction_analysis = "You are an expert of the given topic. Analyze the provided text with a focus on the topic, identifying recent issues, recent insights, or improvements relevant to academic standards and effectiveness. Offer actionable advice for enhancing knowledge and suggest real-life examples."
15
  model_name = "gemini-2.5-flash"
16
 
17
+ # Bounding box system instruction
18
+ bounding_box_system_instructions = (
19
+ "Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to 25 objects. "
20
+ "If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc.)."
21
+ )
22
 
23
  # Helper Functions
24
  def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
 
116
  except Exception as e:
117
  chatbot[-1]["content"] = f"Error processing response: {str(e)}"
118
  yield chatbot
119
+
120
+
121
+ def _strip_codefence_json(text: str) -> str:
122
+ """Strip markdown code fences and return the JSON payload portion."""
123
+ if not text:
124
+ return ""
125
+ lines = text.splitlines()
126
+ for i, line in enumerate(lines):
127
+ if line.strip().startswith("```json"):
128
+ payload = "\n".join(lines[i+1:])
129
+ payload = payload.split("```")[0]
130
+ return payload.strip()
131
+ # fallback: try to find first '[' or '{'
132
+ idx = min((text.find("{") if text.find("{")!=-1 else len(text)), (text.find("[") if text.find("[")!=-1 else len(text)))
133
+ return text[idx:].strip() if idx < len(text) else text.strip()
134
+
135
+
136
def generate_bounding_boxes(google_key: str, prompt: str, image: Optional[Image.Image]):
    """Generate bounding boxes from the model and return a PIL image with boxes drawn.

    Args:
        google_key: API key entered in the UI; falls back to the
            GEMINI_API_KEY environment variable (module-level GOOGLE_API_KEY).
        prompt: User text describing what to detect.
        image: Source image; ``None`` short-circuits to ``None``.

    Returns:
        A copy of *image* with labelled rectangles drawn on it, or ``None``
        when there is no image, the API call fails, or the response cannot
        be parsed as a JSON list of boxes.

    Raises:
        ValueError: If no API key is available from either source.
    """
    google_key = google_key or GOOGLE_API_KEY
    if not google_key:
        raise ValueError("GOOGLE_API_KEY is not set. Please set it up.")

    # Nothing to process; also guard degenerate images that would break
    # the aspect-ratio division below.
    if image is None or not image.width or not image.height:
        return None

    client = genai.Client(api_key=google_key)

    # Resize image for generation (1024px wide, aspect ratio preserved).
    img_for_model = image.resize((1024, int(1024 * image.height / image.width)))

    try:
        response = client.models.generate_content(
            model=model_name,
            contents=[prompt, img_for_model],
            config=genai.types.GenerateContentConfig(
                system_instruction=bounding_box_system_instructions,
                temperature=0.3,
                max_output_tokens=1024,
            ),
        )
    except Exception as e:
        print("Error generating bounding boxes:", e)
        return None

    json_text = _strip_codefence_json(getattr(response, "text", "") or "")
    try:
        bounding_boxes = json.loads(json_text)
    except Exception as e:
        print("Failed to parse bounding box JSON:", e)
        return None

    # The model is instructed to return a JSON array; anything else
    # (e.g. a single dict) is unusable for the drawing loop below.
    if not isinstance(bounding_boxes, list):
        print("Unexpected bounding box payload (expected a list):",
              type(bounding_boxes).__name__)
        return None

    # Draw boxes
    try:
        out = image.copy()
        draw = ImageDraw.Draw(out)
        width, height = out.size

        # Font is best-effort; drawing falls back to PIL's default rendering.
        try:
            font = ImageFont.load_default()
        except Exception:
            font = None

        colors = list(ImageColor.colormap.keys())
        for i, bb in enumerate(bounding_boxes):
            # Expecting box_2d as [y1, x1, y2, x2] in 0-1000 scale.
            box = bb.get("box_2d") if isinstance(bb, dict) else None
            if not box or len(box) < 4:
                # Skip malformed entries instead of discarding every box.
                continue

            color = colors[i % len(colors)]
            y1 = int(box[0] / 1000 * height)
            x1 = int(box[1] / 1000 * width)
            y2 = int(box[2] / 1000 * height)
            x2 = int(box[3] / 1000 * width)

            # Normalize so (x1, y1) is the top-left corner.
            if x1 > x2:
                x1, x2 = x2, x1
            if y1 > y2:
                y1, y2 = y2, y1

            draw.rectangle(((x1, y1), (x2, y2)), outline=color, width=4)
            label = bb.get("label") or bb.get("name") or ""
            if label:
                draw.text((x1 + 6, y1 + 4), label, fill=color, font=font)

        return out
    except Exception as e:
        print("Error drawing bounding boxes:", e)
        return None
208
  # Components
209
  google_key_component = gr.Textbox(
210
  label="Google API Key",
 
221
  lines=3
222
  )
223
  run_button_component = gr.Button("Submit")
224
+ bbox_mode_component = gr.Checkbox(label="Bounding box mode (detect & label objects)", value=False)
225
+ output_image_component = gr.Image(type="pil", label="Output Image")
226
  temperature_component = gr.Slider(
227
  minimum=0,
228
  maximum=1.0,
 
264
  "Describe Multimodal AI",
265
  "What are the difference between multiagent llm and multiagent system",
266
  "Why it's difficult to integrate multimodality in prompt"]
267
+
268
 
269
  # Gradio Interface
270
  user_inputs = [text_prompt_component, chatbot_component]
 
280
  ]
281
 
282
 
283
def handle_submit(
    google_key: str,
    image_prompt: Optional[Image.Image],
    temperature: float,
    max_output_tokens: int,
    stop_sequences: str,
    top_k: int,
    top_p: float,
    chatbot: List,
    bbox_mode: bool,
):
    """Route submission: run the bounding-box generator when bbox mode is
    on (or detection keywords appear alongside an image); otherwise stream
    a text reply via `bot`.

    Yields:
        ``(chatbot, image)`` tuples — the updated chat history and either
        the annotated image (bounding-box path) or ``None`` (text path).
    """
    # Extract the last user text from the messages-format chat history.
    content = chatbot[-1]["content"] if chatbot else None
    text_prompt = None
    if isinstance(content, str):
        text_prompt = content.strip() if content else None
    elif isinstance(content, list) and len(content) > 0:
        for item in content:
            if isinstance(item, str):
                text_prompt = item.strip()
                break

    # Trigger detection: explicit checkbox, or detection keywords in the
    # prompt when an image is attached.  NOTE(review): substring matching
    # is deliberately loose ("box" also matches "inbox"), matching the
    # original behavior.
    bbox_triggers = ("detect", "bounding", "box", "label", "find the")
    trigger = bool(bbox_mode)
    if not trigger and image_prompt is not None and text_prompt:
        low = text_prompt.lower()
        trigger = any(kw in low for kw in bbox_triggers)

    if trigger and image_prompt is not None:
        out_img = generate_bounding_boxes(
            google_key, text_prompt or "Detect objects in the image", image_prompt
        )
        # Report failure honestly instead of claiming success with no image.
        if out_img is None:
            chatbot.append({
                "role": "assistant",
                "content": "Could not generate bounding boxes for this image.",
            })
        else:
            chatbot.append({
                "role": "assistant",
                "content": "Generated bounding boxes (see image).",
            })
        yield chatbot, out_img
        return

    # Fallback to text generation: stream from bot, keep image output empty.
    for chat_state in bot(
        google_key,
        image_prompt,
        temperature,
        max_output_tokens,
        stop_sequences,
        top_k,
        top_p,
        chatbot,
    ):
        yield chat_state, None
337
+
338
+
339
  with gr.Blocks() as demo:
340
+ gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini 2.0 Multimodal Chatbot</h1>")
341
  with gr.Row():
342
  google_key_component.render()
343
  with gr.Row():
344
  chatbot_component.render()
345
  with gr.Row():
346
  with gr.Column(scale=1):
347
+ text_prompt_component.render()
348
+ bbox_mode_component.render()
349
  with gr.Column(scale=1):
350
+ image_prompt_component.render()
351
  with gr.Column(scale=1):
352
  run_button_component.render()
353
+ with gr.Row():
354
+ with gr.Column(scale=1):
355
+ output_image_component.render()
356
  with gr.Accordion("🧪Example Text 💬", open=False):
357
  example_radio = gr.Radio(
358
  choices=example_scenarios,
 
363
  fn=lambda query: query if query else "No query selected.",
364
  inputs=[example_radio],
365
  outputs=[text_prompt_component])
 
 
 
 
 
 
366
  with gr.Accordion("🛠️Customize", open=False):
367
  temperature_component.render()
368
  max_output_tokens_component.render()
 
373
  run_button_component.click(
374
  fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
375
  ).then(
376
+ fn=handle_submit,
377
+ inputs=[
378
+ google_key_component,
379
+ image_prompt_component,
380
+ temperature_component,
381
+ max_output_tokens_component,
382
+ stop_sequences_component,
383
+ top_k_component,
384
+ top_p_component,
385
+ chatbot_component,
386
+ bbox_mode_component,
387
+ ],
388
+ outputs=[chatbot_component, output_image_component],
389
  )
390
 
391
  if __name__ == "__main__":