mahmoudalyosify committed on
Commit
10ac47c
·
verified ·
1 Parent(s): 4c17176

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -132
app.py CHANGED
@@ -8,172 +8,119 @@ from google.generativeai import types
8
  from dotenv import load_dotenv
9
 
10
# -----------------------------
# 1. LOAD API KEY
# -----------------------------
# Read a local .env file so development runs pick up the key without
# exporting it manually; in hosted environments the variable comes from
# the process environment (e.g. Hugging Face Secrets).
load_dotenv()
DEFAULT_API_KEY = os.getenv("Gemini_API_Key") # fallback if user doesn't input

# -----------------------------
# 2. MODEL SETTINGS
# -----------------------------
DEFAULT_MODEL = "gemini-2.5-flash"  # default Gemini model offered in the UI
DEFAULT_TEMPERATURE = 0.5           # moderate sampling randomness
DEFAULT_MAX_TOKENS = 500            # cap on generated output tokens

# System prompt steering the model to answer with a plain JSON array of
# boxes (no code fences, no masks) plus per-object suggestions.
bounding_box_system_instructions = """
Return bounding boxes as a JSON array with labels. Never return masks or code fencing.
Limit to 25 objects. If an object is present multiple times, name them according to their unique characteristics
(colors, size, position, unique features, etc.). Also provide actionable suggestions for each object if applicable.
"""
 
 
 
 
 
28
 
29
- # -----------------------------
30
- # 3. IMAGE PREPROCESSING
31
- # -----------------------------
32
- def preprocess_image(image):
33
- image = image.convert("RGB")
34
- max_dim = 1024
35
- if image.width > max_dim or image.height > max_dim:
36
- ratio = min(max_dim / image.width, max_dim / image.height)
37
- new_size = (int(image.width * ratio), int(image.height * ratio))
38
- image = image.resize(new_size)
39
- return image
40
 
41
# -----------------------------
# 4. PARSE JSON OUTPUT
# -----------------------------
def parse_json(json_output):
    """Extract and decode JSON from model text that may be ```json-fenced.

    When a line consisting of ```json is found, the payload is the text
    between that fence and the next ``` marker. Returns the decoded Python
    object, or [] when the (unfenced or extracted) text is not valid JSON.
    """
    raw_lines = json_output.splitlines()
    for idx, raw in enumerate(raw_lines):
        if raw.strip() == "```json":
            fenced = "\n".join(raw_lines[idx + 1:])
            json_output = fenced.split("```")[0]
            break
    try:
        return json.loads(json_output)
    except json.JSONDecodeError:
        return []
55
 
56
# -----------------------------
# 5. PLOT BOUNDING BOXES
# -----------------------------
def plot_bounding_boxes(im, bounding_boxes):
    """Draw labelled bounding boxes (plus optional suggestions) on a copy of *im*.

    Parameters
    ----------
    im : PIL.Image.Image
        Source image; never modified (a copy is annotated).
    bounding_boxes : list[dict]
        Parsed model output. Each entry may carry "box_2d" — per the Gemini
        convention, [ymin, xmin, ymax, xmax] normalized to a 0-1000 grid —
        plus optional "label" and "suggestion" strings.

    Returns
    -------
    PIL.Image.Image
        The annotated copy.
    """
    im = im.copy()
    width, height = im.size
    draw = ImageDraw.Draw(im)
    # Base palette extended with every named color PIL knows, so 13+ boxes
    # still get distinct colors.
    colors = [
        'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'cyan',
        'lime', 'magenta', 'violet', 'gold', 'silver'
    ] + list(ImageColor.colormap)

    # BUG FIX: the original wrapped this call in `except OSError` with an
    # identical fallback (dead code); a single call suffices.
    font = ImageFont.load_default()

    for i, bbox in enumerate(bounding_boxes):
        color = colors[i % len(colors)]
        # BUG FIX: Gemini emits box_2d as [ymin, xmin, ymax, xmax]; the
        # original unpacked it as [x1, y1, x2, y2], drawing transposed
        # boxes on any non-square image.
        y1, x1, y2, x2 = bbox.get("box_2d", [0, 0, 0, 0])
        # Coordinates are normalized to 0-1000; scale to pixel space.
        abs_x1 = int(x1 / 1000 * width)
        abs_y1 = int(y1 / 1000 * height)
        abs_x2 = int(x2 / 1000 * width)
        abs_y2 = int(y2 / 1000 * height)

        # Guard against reversed corners so rectangle() never receives a
        # negative-extent box.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1

        draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=3)
        label = bbox.get("label", "")
        suggestion = bbox.get("suggestion", "")
        if label:
            draw.text((abs_x1 + 5, abs_y1 + 5), f"{label}", fill=color, font=font)
        if suggestion:
            draw.text((abs_x1 + 5, abs_y1 + 20), f"{suggestion}", fill=color, font=font)

    return im
94
 
 
 
 
 
 
 
 
95
# -----------------------------
# 6. GENERATE RESPONSE
# -----------------------------
def generate_response(
    user_prompt,
    user_image=None,
    api_key_input=None,
    model_choice=DEFAULT_MODEL,
    temperature=DEFAULT_TEMPERATURE,
    max_tokens=DEFAULT_MAX_TOKENS
):
    """Send the prompt (and optional image) to Gemini.

    Returns a tuple of (response_text, annotated_image). The image slot is
    None for text-only requests; otherwise it is a copy of the preprocessed
    input with the model's bounding boxes drawn on it.
    """
    # Prefer a user-supplied key; fall back to the one from the environment.
    genai.configure(api_key=api_key_input or DEFAULT_API_KEY)

    gemini = genai.GenerativeModel(
        model_name=model_choice,
        system_instruction=bounding_box_system_instructions,
        safety_settings=[types.SafetySettingDict(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH")]
    )
    config = types.GenerationConfig(
        temperature=temperature,
        max_output_tokens=max_tokens
    )

    # Text-only request: nothing to annotate.
    if not user_image:
        reply = gemini.generate_content([user_prompt], generation_config=config)
        return reply.text, None

    # Multimodal request: normalize the image, then draw the returned boxes.
    user_image = preprocess_image(user_image)
    reply = gemini.generate_content([user_prompt, user_image], generation_config=config)
    boxes = parse_json(reply.text)
    return reply.text, plot_bounding_boxes(user_image, boxes)
128
 
129
# -----------------------------
# 7. GRADIO INTERFACE
# -----------------------------
def build_ui():
    """Assemble the Gradio Blocks app: inputs and settings on the left, outputs on the right."""
    with gr.Blocks() as demo:
        gr.Markdown("# Multi-Modal Assistant with Bounding Boxes & Suggestions")

        with gr.Row():
            # Left column: prompt, optional image, API key, and model settings.
            with gr.Column():
                gr.Markdown("### User Inputs")
                prompt_box = gr.Textbox(lines=3, label="Prompt")
                picture = gr.Image(type="pil", label="Optional Image")
                key_box = gr.Textbox(label="Google API Key (Optional)", placeholder="Enter your API key")
                model_picker = gr.Radio(["gemini-2.5-flash", "gemini-2.0"], label="Select Model", value=DEFAULT_MODEL)
                temp_slider = gr.Slider(0, 1, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                tokens_slider = gr.Slider(50, 2000, value=DEFAULT_MAX_TOKENS, step=50, label="Max Tokens")
                submit = gr.Button("Run")

            # Right column: model text plus the annotated image (when one was given).
            with gr.Column():
                gr.Markdown("### Outputs")
                text_out = gr.Textbox(label="Model Output (Text)", lines=15)
                image_out = gr.Image(type="pil", label="Output Image with Bounding Boxes (if image provided)")

        # Wire the Run button to the generation function.
        submit.click(
            generate_response,
            inputs=[prompt_box, picture, key_box, model_picker, temp_slider, tokens_slider],
            outputs=[text_out, image_out]
        )

        # Canned prompt/image pairs users can click to try the app.
        # NOTE(review): each example lists a filename first while `inputs`
        # lists the prompt textbox first — the values look swapped relative
        # to the components; confirm intended ordering.
        gr.Markdown("### Examples (Optional)")
        gr.Examples(
            examples=[
                ["cookies.jpg", "Detect types of cookies and provide suggestions."],
                ["messed_room.jpg", "Identify unorganized items and suggest actions."],
                ["yoga.jpg", "Label the different yoga poses."],
            ],
            inputs=[prompt_box, picture],
            label="Example Prompts & Images"
        )

    return demo
173
 
174
# -----------------------------
# 8. RUN APP
# -----------------------------
# Script entry point: build the Gradio UI and start the local web server.
if __name__ == "__main__":
    app = build_ui()
    app.launch()
 
8
  from dotenv import load_dotenv
9
 
10
# -----------------------------
# 1. SETUP API KEY
# -----------------------------
# Read a local .env file (development); in hosting the variable comes from
# the process environment.
load_dotenv()
api_key = os.getenv("Gemini_API_Key")  # the key must be set in Hugging Face Secrets
genai.configure(api_key=api_key)

# -----------------------------
# 2. DEFINE MODELS
# -----------------------------
# Text & Web Search Model
TEXT_MODEL_ID = "gemini-2.5-flash"

# Image / Bounding Box Model
# System prompt steering the vision model to emit plain JSON boxes
# (no code fences, no masks, at most 25 objects).
bounding_box_system_instructions = """
Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to 25 objects.
If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc..).

"""
# Vision model configured once at import time and reused for every request.
IMAGE_MODEL = genai.GenerativeModel(
    model_name='gemini-2.5-flash',
    system_instruction=bounding_box_system_instructions,
    safety_settings=[types.SafetySettingDict(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH")],
)

# Shared sampling settings for image requests.
GEN_CONFIG = genai.types.GenerationConfig(temperature=0.5)
 
 
 
 
 
 
 
 
 
 
35
 
36
# -----------------------------
# 3. IMAGE FUNCTIONS
# -----------------------------
def parse_json(json_output):
    """Strip a Markdown ```json fence from *json_output*, if present.

    Returns the fenced payload (the text between the first ```json line and
    the next ``` marker), or the input unchanged when no fence is found.
    The result is still a string; callers decode it with json.loads().
    """
    all_lines = json_output.splitlines()
    for idx, current in enumerate(all_lines):
        if current.strip() != "```json":
            continue
        remainder = "\n".join(all_lines[idx + 1:])
        return remainder.split("```")[0]
    return json_output
 
 
 
47
 
 
 
 
48
def plot_bounding_boxes(im, bounding_boxes):
    """Decode the model's JSON box list and draw it on a copy of *im*.

    Parameters
    ----------
    im : PIL.Image.Image
        Source image; a copy is annotated and returned.
    bounding_boxes : str
        JSON text: an array of objects with "box_2d" — per the Gemini
        convention, [ymin, xmin, ymax, xmax] normalized to 0-1000 — and an
        optional "label".

    Returns
    -------
    PIL.Image.Image
        The annotated copy; returned unannotated when the JSON is invalid.
    """
    im = im.copy()
    width, height = im.size
    draw = ImageDraw.Draw(im)
    # Base palette extended with every named color PIL knows.
    colors = ['red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple',
              'cyan', 'lime', 'magenta', 'violet', 'gold', 'silver'] + list(ImageColor.colormap)

    # IMPROVEMENT: the original wrapped font loading, JSON parsing and ALL
    # drawing in one broad try/except Exception, so a malformed payload
    # masked drawing bugs and one bad box aborted every remaining box.
    # Parse up front with the specific exception instead.
    try:
        boxes = json.loads(bounding_boxes)
    except json.JSONDecodeError as e:
        print(f"Error drawing bounding boxes: {e}")
        return im

    font = ImageFont.load_default()
    for i, bounding_box in enumerate(boxes):
        color = colors[i % len(colors)]
        try:
            # box_2d is [ymin, xmin, ymax, xmax] on a 0-1000 grid; scale to pixels.
            abs_y1 = int(bounding_box["box_2d"][0] / 1000 * height)
            abs_x1 = int(bounding_box["box_2d"][1] / 1000 * width)
            abs_y2 = int(bounding_box["box_2d"][2] / 1000 * height)
            abs_x2 = int(bounding_box["box_2d"][3] / 1000 * width)
            # Guard against reversed corners.
            if abs_x1 > abs_x2:
                abs_x1, abs_x2 = abs_x2, abs_x1
            if abs_y1 > abs_y2:
                abs_y1, abs_y2 = abs_y2, abs_y1
            draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4)
            if "label" in bounding_box:
                draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)
        except Exception as e:
            # Skip a malformed entry but keep drawing the rest.
            print(f"Error drawing bounding boxes: {e}")

    return im
71
 
72
def generate_bounding_boxes(prompt, image):
    """Run the bounding-box model on *image* and return the annotated image.

    Parameters
    ----------
    prompt : str
        What to detect; passed verbatim to the model.
    image : PIL.Image.Image
        Input image from the Gradio component.

    Returns
    -------
    PIL.Image.Image
        Copy of the (possibly downscaled) image with predicted boxes drawn.
    """
    # IMPROVEMENT: only downscale. The original resized every image to width
    # 1024, needlessly upscaling (and blurring) smaller inputs. max(1, ...)
    # guards against a zero pixel height for extremely wide images.
    if image.width > 1024:
        image = image.resize((1024, max(1, int(1024 * image.height / image.width))))
    response = IMAGE_MODEL.generate_content([prompt, image], generation_config=GEN_CONFIG)
    bounding_boxes = parse_json(response.text)
    return plot_bounding_boxes(image, bounding_boxes)
78
+
79
# -----------------------------
# 4. TEXT / SEARCH FUNCTION
# -----------------------------
def text_search_query(question):
    """Answer *question* with Gemini, grounded in Google Search.

    Returns a tuple (answer_text, rendered_search_html). On any failure the
    first element is an "Error: ..." string and the second is empty, so the
    Gradio outputs always receive two values.
    """
    try:
        # BUG FIX: the previous code called genai.models.generate_content(...)
        # with types.GenerateContentConfig — that is the *new* google-genai
        # SDK API and does not exist on the legacy google.generativeai module
        # imported here, so the call always raised AttributeError. Use the
        # legacy GenerativeModel API with Search grounding instead.
        # NOTE(review): "google_search_retrieval" is the legacy SDK's grounding
        # tool name — confirm it is supported for the configured model.
        model = genai.GenerativeModel(TEXT_MODEL_ID)
        response = model.generate_content(question, tools="google_search_retrieval")
        ai_response = response.text
        # Grounding metadata may be absent when the model answered without
        # searching; degrade to an empty sources panel rather than erroring.
        try:
            search_results = response.candidates[0].grounding_metadata.search_entry_point.rendered_content
        except (AttributeError, IndexError):
            search_results = ""
        return ai_response, search_results
    except Exception as e:
        return f"Error: {str(e)}", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
# -----------------------------
# 5. GRADIO INTERFACE
# -----------------------------
def gradio_interface():
    """Build the two-tab Gradio app: text + web search, and image bounding boxes."""
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Gemini Assistant")

        # Tab 1: free-form questions answered with Google Search grounding.
        with gr.Tab("Text & Web Search"):
            with gr.Row():
                with gr.Column():
                    question_box = gr.Textbox(lines=2, label="Ask a Question")
                    ask_button = gr.Button("Submit")
                with gr.Column():
                    answer_box = gr.Textbox(label="AI Response")
                    sources_html = gr.HTML(label="Search Results")
            ask_button.click(text_search_query, inputs=question_box, outputs=[answer_box, sources_html])

        # Tab 2: object detection with boxes drawn on the uploaded image.
        with gr.Tab("Image Bounding Boxes"):
            with gr.Row():
                with gr.Column():
                    picture = gr.Image(type="pil", label="Input Image")
                    detect_prompt = gr.Textbox(lines=2, label="Input Prompt", placeholder="Describe what to detect")
                    detect_button = gr.Button("Generate")
                with gr.Column():
                    annotated = gr.Image(type="pil", label="Output Image")
            detect_button.click(generate_bounding_boxes, inputs=[detect_prompt, picture], outputs=annotated)

    return demo
123
 
 
 
 
124
# Script entry point: build the UI and launch the web server.
if __name__ == "__main__":
    app = gradio_interface()
    # NOTE(review): share=True requests a public gradio.live tunnel; on
    # Hugging Face Spaces it is ignored (the Space is already public) — confirm
    # it is intended for local runs.
    app.launch(share=True)