multimodal-chatbot

Sleeping

App Files Community

elsayedelmandoh committed on Feb 11

Commit

a1660ff

1 Parent(s): d821914

upload project

Browse files

Files changed (8) hide show

README.md +1 -1
app.py +125 -120
requirements.txt +2 -1
research/notebook.ipynb +22 -0
setup.py +0 -1
src/config/settings.py +3 -2
src/utils/__init__.py +0 -5
src/utils/helpers.py +48 -46

README.md CHANGED Viewed

@@ -67,7 +67,7 @@ conda activate multimodal-chatbot
 conda install pip -y
 pip install -r requirements.txt
 ```
-You may use a .env loader or store vars in the Hugging Face Space secrets.
 ### Run locally
 ```bash

 conda install pip -y
 pip install -r requirements.txt
 ```
+Create a .env file at the project root or store vars in the Hugging Face Space secrets.
 ### Run locally
 ```bash

app.py CHANGED Viewed

@@ -1,129 +1,134 @@
-from src.config.settings import GEMINI_API_KEY
-import os
-import time
-from typing import List, Tuple, Optional
-import google.generativeai as genai
 import gradio as gr
-from PIL import Image
-import tempfile
-import os
-# Components
-gemini_key_component = gr.Textbox(
-    label="Gemini API Key",
-    type="password",
-    placeholder="Enter your Gemini API Key",
-    visible=GEMINI_API_KEY is None
-)
-image_prompt_component = gr.Image(type="pil", label="Input Image (Optional: Figure/Graph)")
-chatbot_component = gr.Chatbot(label="Chatbot", bubble_full_width=False)
-text_prompt_component = gr.Textbox(
-    placeholder="Type your question here...",
-    label="Ask",
-    lines=3
-)
-run_button_component = gr.Button("Submit")
-temperature_component = gr.Slider(
-    minimum=0,
-    maximum=1.0,
-    value=0.4,
-    step=0.05,
-    label="Creativity (Temperature)",
-    info="Controls the randomness of the response. Higher values result in more creative answers."
-)
-max_output_tokens_component = gr.Slider(
-    minimum=1,
-    maximum=2048,
-    value=1024,
-    step=1,
-    label="Response Length (Token Limit)",
-    info="Sets the maximum number of tokens in the output response."
-)
-stop_sequences_component = gr.Textbox(
-    label="Stop Sequences (Optional)",
-    placeholder="Enter stop sequences, e.g., STOP, END",
-    info="Specify sequences to stop the generation."
-)
-top_k_component = gr.Slider(
-    minimum=1,
-    maximum=40,
-    value=32,
-    step=1,
-    label="Top-K Sampling",
-    info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs."
-)
-top_p_component = gr.Slider(
-    minimum=0,
-    maximum=1,
-    value=1,
-    step=0.01,
-    label="Top-P Sampling",
-    info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs."
-)
-example_scenarios = [
-    "Describe Multimodal AI",
-    "What are the difference between muliagent llm and multiagent system",
-"Why it's difficult to intgrate multimodality in prompt"]
-example_images = [["ex1.png"],["ex2.png"]]
-# Gradio Interface
-user_inputs = [text_prompt_component, chatbot_component]
-bot_inputs = [
-    gemini_key_component,
-    image_prompt_component,
-    temperature_component,
-    max_output_tokens_component,
-    stop_sequences_component,
-    top_k_component,
-    top_p_component,
-    chatbot_component,
-]
-with gr.Blocks(theme="earneleh/paris") as demo:
-    gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini 2.0 Multimodal Chatbot</h1>")
-    with gr.Row():
-        gemini_key_component.render()
-    with gr.Row():
-        chatbot_component.render()
-    with gr.Row():
-        with gr.Column(scale=0.5):
-           text_prompt_component.render()
-        with gr.Column(scale=0.5):
-           image_prompt_component.render()
-        with gr.Column(scale=0.5):
-            run_button_component.render()
-    with gr.Accordion("🧪Example Text 💬", open=False):
-        example_radio = gr.Radio(
-        choices=example_scenarios,
-        label="Example Queries",
-        info="Select an example query.")
-        # Debug callback
-        example_radio.change(
-        fn=lambda query: query if query else "No query selected.",
-        inputs=[example_radio],
-        outputs=[text_prompt_component])
-       # Custom examples section with blue styling
-    with gr.Accordion("🧪Example Image 🩻", open=False):
-        gr.Examples(
-        examples=example_images,
-        inputs=[image_prompt_component],
-        label="Example Figures",
-        )
-    with gr.Accordion("🛠️Customize", open=False):
-        temperature_component.render()
-        max_output_tokens_component.render()
-        stop_sequences_component.render()
-        top_k_component.render()
-        top_p_component.render()
-    run_button_component.click(
-        fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
-    ).then(
-        fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
-    )
-demo.launch()

+from src.config.settings import MODEL_ID, MODEL_OPTIONS
+from src.utils.helpers import bot, user
 import gradio as gr
+def gradio_interface() -> gr.Blocks:
+    # Components
+    image_prompt_component = gr.Image(
+        type="pil",
+        label="Input Image (Optional: Figure/Graph)"
+    )
+    chatbot_component = gr.Chatbot(
+        label="Chatbot",
+    )
+    text_prompt_component = gr.Textbox(
+        placeholder="Type your question here...",
+        label="Ask",
+        lines=3
+    )
+    run_button_component = gr.Button("Submit")
+    temperature_component = gr.Slider(
+        minimum=0,
+        maximum=1.0,
+        value=0.4,
+        step=0.05,
+        label="Creativity (Temperature)",
+        info="Controls the randomness of the response. Higher values result in more creative answers."
+    )
+    max_output_tokens_component = gr.Slider(
+        minimum=1,
+        maximum=2048,
+        value=1024,
+        step=1,
+        label="Response Length (Token Limit)",
+        info="Sets the maximum number of tokens in the output response."
+    )
+    model_name_component = gr.Dropdown(
+            choices=MODEL_OPTIONS,
+            value=MODEL_ID,
+            label="Model Selection",
+            info="Choose the Gemini model to use for generation."
+    )
+    stop_sequences_component = gr.Textbox(
+        label="Stop Sequences (Optional)",
+        placeholder="Enter stop sequences, e.g., STOP, END",
+        info="Specify sequences to stop the generation."
+    )
+    top_k_component = gr.Slider(
+        minimum=1,
+        maximum=40,
+        value=32,
+        step=1,
+        label="Top-K Sampling",
+        info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs."
+    )
+    top_p_component = gr.Slider(
+        minimum=0,
+        maximum=1,
+        value=1,
+        step=0.01,
+        label="Top-P Sampling",
+        info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs."
+    )
+    example_scenarios = [
+        "Describe Multimodal AI",
+        "What are the differences between multi-agent LLMs and multi-agent systems",
+        "Why is it difficult to integrate multimodality in a prompt",
+    ]
+    example_images = [["research/ex1.png"], ["research/ex2.png"]]
+    # Gradio Interface
+    user_inputs = [text_prompt_component, chatbot_component]
+    bot_inputs = [
+        model_name_component,
+        image_prompt_component,
+        temperature_component,
+        max_output_tokens_component,
+        stop_sequences_component,
+        top_k_component,
+        top_p_component,
+        chatbot_component,
+    ]
+    with gr.Blocks() as app:
+        gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini Multimodal Chatbot</h1>")
+        with gr.Row():
+            chatbot_component.render()
+        with gr.Row():
+            with gr.Column(scale=1):
+                text_prompt_component.render()
+            with gr.Column(scale=1):
+                image_prompt_component.render()
+            with gr.Column(scale=1):
+                run_button_component.render()
+        with gr.Accordion("🧪Example Text 💬", open=False):
+            example_radio = gr.Radio(
+                choices=example_scenarios,
+                label="Example Queries",
+                info="Select an example query."
+            )
+            # Debug callback
+            example_radio.change(
+                fn=lambda query: query if query else "No query selected.",
+                inputs=[example_radio],
+                outputs=[text_prompt_component]
+            )
+            # Custom examples section with blue styling
+        with gr.Accordion("🧪Example Image 🩻", open=False):
+            gr.Examples(
+                examples=example_images,
+                inputs=[image_prompt_component],
+                label="Example Figures",
+            )
+        with gr.Accordion("🛠️Customize", open=False):
+            model_name_component.render()
+            temperature_component.render()
+            max_output_tokens_component.render()
+            stop_sequences_component.render()
+            top_k_component.render()
+            top_p_component.render()
+        run_button_component.click(
+            fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
+        ).then(
+            fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
+        )
+    return app
+if __name__ == "__main__":
+    gradio_interface().launch(share=True, theme="earneleh/paris")

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-google-generativeai==0.8.6
 gradio==6.5.1
 python-dotenv==1.2.1
 imageio==2.37.2
 requests==2.32.5

+google-genai==1.62.0
 gradio==6.5.1
 python-dotenv==1.2.1
 imageio==2.37.2
 requests==2.32.5
+pillow==12.1.1

research/notebook.ipynb CHANGED Viewed

@@ -39,6 +39,28 @@
     ")\n",
     "print(response.text)"
    ]
   }
  ],
  "metadata": {

     ")\n",
     "print(response.text)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "00cfccb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict, List, Optional\n",
+    "from PIL import Image\n",
+    "from google import genai\n",
+    "from google.genai import types\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c01ae4a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

setup.py DELETED Viewed

	@@ -1 +0,0 @@
1	-

src/config/settings.py CHANGED Viewed

@@ -4,9 +4,10 @@ import os
 load_dotenv()
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-CHATBOT_NAME = os.getenv("CHATBOT_NAME", "gemini-2.5-flash")
 MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
 MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
 IMAGE_WIDTH = 512
 IMAGE_HEIGHT = 512
-SYSTEM_INSTRUCTION_ANALYSIS = "You are an expert in image analysis and computer vision. Analyze any uploaded image in detail, providing specific descriptions of visual elements, composition, and content. Explain how this image can be effectively used in AI applications, including potential use cases for machine learning, computer vision tasks, and multimodal AI systems. Provide actionable insights for optimal image utilization."

 load_dotenv()
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+CHATBOT_NAME = os.getenv("CHATBOT_NAME", "Gemini Multimodal Chatbot")
 MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
+MODEL_TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", "0.7"))
 MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
 IMAGE_WIDTH = 512
 IMAGE_HEIGHT = 512
+SYSTEM_INSTRUCTION = "You are an expert in image analysis and computer vision. Analyze any uploaded image in detail, providing specific descriptions of visual elements, composition, and content. Explain how this image can be effectively used in AI applications, including potential use cases for machine learning, computer vision tasks, and multimodal AI systems. Provide actionable insights for optimal image utilization."

src/utils/__init__.py CHANGED Viewed

@@ -1,5 +0,0 @@
-from .preprocess import preprocess_stop_sequences, preprocess_image
-from .prompts import user
-from .generator import bot
-__all__ = ["preprocess_stop_sequences", "preprocess_image", "user", "bot"]

src/utils/helpers.py CHANGED Viewed

@@ -1,19 +1,9 @@
-from typing import List, Optional, Tuple, Tuple
 from PIL import Image
-import google.generativeai as genai
 import time
-from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION_ANALYSIS
-def initialize_model(api_key: Optional[str] = None):
-    """Initialize the Gemini generative model."""
-    if api_key:
-        genai.configure(api_key=api_key)
-    elif GEMINI_API_KEY:
-        genai.configure(api_key=GEMINI_API_KEY)
-    model = genai.GenerativeModel(CHATBOT_NAME, system_instruction=SYSTEM_INSTRUCTION_ANALYSIS)
-    return model
 def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
     return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
@@ -22,65 +12,77 @@ def preprocess_image(image: Image.Image) -> Image.Image:
     image_height = int(image.height * IMAGE_WIDTH / image.width)
     return image.resize((IMAGE_WIDTH, image_height))
-def user(text_prompt: str, chatbot: List[Tuple[str, str]]):
-    return "", chatbot + [[text_prompt, None]]
 def bot(
-    gemini_key: str,
     image_prompt: Optional[Image.Image],
     temperature: float,
     max_output_tokens: int,
     stop_sequences: str,
     top_k: int,
     top_p: float,
-    chatbot: List[Tuple[str, str]]
 ):
-    gemini_key = gemini_key or GEMINI_API_KEY
-    if not gemini_key:
-        raise ValueError("GEMINI_API_KEY is not set. Please set it up.")
-    text_prompt = chatbot[-1][0].strip() if chatbot[-1][0] else None
-    # Handle cases for text and/or image input
     if not text_prompt and not image_prompt:
-        chatbot[-1][1] = "Prompt cannot be empty. Please provide input text or an image."
         yield chatbot
         return
     elif image_prompt and not text_prompt:
-        # If only an image is provided
         text_prompt = "Describe the image"
     elif image_prompt and text_prompt:
-        # If both text and image are provided, combine them
         text_prompt = f"{text_prompt}. Also, analyze the provided image."
-    # Configure the model
-    genai.configure(api_key=gemini_key)
-    generation_config = genai.types.GenerationConfig(
         temperature=temperature,
         max_output_tokens=max_output_tokens,
         stop_sequences=preprocess_stop_sequences(stop_sequences),
         top_k=top_k,
         top_p=top_p,
     )
-    # Prepare inputs
-    inputs = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
-    # Generate response
     try:
-        response = model.generate_content(inputs, stream=True, generation_config=generation_config)
-        response.resolve()
     except Exception as e:
-        chatbot[-1][1] = f"Error occurred: {str(e)}"
         yield chatbot
         return
-    # Stream the response back to the chatbot
-    chatbot[-1][1] = ""
-    for chunk in response:
-        for i in range(0, len(chunk.text), 10):
-            chatbot[-1][1] += chunk.text[i:i + 10]
-            time.sleep(0.01)
-            yield chatbot

+from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_TEMPERATURE, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION
+from typing import Dict, List, Optional
 from PIL import Image
+from google import genai
+from google.genai import types
 import time
 def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
     return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
     image_height = int(image.height * IMAGE_WIDTH / image.width)
     return image.resize((IMAGE_WIDTH, image_height))
+def user(text_prompt: str, chatbot: List[Dict[str, str]]):
+    return "", chatbot + [{"role": "user", "content": text_prompt}]
 def bot(
+    model_name: str,
     image_prompt: Optional[Image.Image],
     temperature: float,
     max_output_tokens: int,
     stop_sequences: str,
     top_k: int,
     top_p: float,
+    chatbot: List[Dict[str, str]]
 ):
+    if not GEMINI_API_KEY:
+        chatbot.append({"role": "assistant", "content": "GEMINI_API_KEY is not set. Please add it to your .env file."})
+        yield chatbot
+        return
+    client = genai.Client(api_key=GEMINI_API_KEY)
+    # Gradio v6 may store content as a list of parts or a plain string
+    raw_content = chatbot[-1].get("content") if chatbot else None
+    if isinstance(raw_content, list):
+        text_prompt = " ".join(
+            part.get("text", "") if isinstance(part, dict) else str(part)
+            for part in raw_content
+        ).strip() or None
+    elif isinstance(raw_content, str):
+        text_prompt = raw_content.strip() or None
+    else:
+        text_prompt = None
     if not text_prompt and not image_prompt:
+        chatbot.append({"role": "assistant", "content": "Prompt cannot be empty. Please provide input text or an image."})
         yield chatbot
         return
     elif image_prompt and not text_prompt:
         text_prompt = "Describe the image"
     elif image_prompt and text_prompt:
         text_prompt = f"{text_prompt}. Also, analyze the provided image."
+    contents = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
+    config = types.GenerateContentConfig(
+        system_instruction=SYSTEM_INSTRUCTION,
         temperature=temperature,
         max_output_tokens=max_output_tokens,
         stop_sequences=preprocess_stop_sequences(stop_sequences),
         top_k=top_k,
         top_p=top_p,
+        safety_settings=[
+            types.SafetySetting(
+                category="HARM_CATEGORY_DANGEROUS_CONTENT",
+                threshold="BLOCK_ONLY_HIGH",
+            )
+        ],
     )
+    chatbot.append({"role": "assistant", "content": ""})
     try:
+        for chunk in client.models.generate_content_stream(
+            model=model_name,
+            contents=contents,
+            config=config,
+        ):
+            if chunk.text:
+                for i in range(0, len(chunk.text), 10):
+                    chatbot[-1]["content"] += chunk.text[i:i + 10]
+                    time.sleep(0.01)
+                    yield chatbot
     except Exception as e:
+        chatbot[-1]["content"] = f"Error occurred: {str(e)}"
         yield chatbot
         return