elsayedelmandoh committed on
Commit
a1660ff
·
1 Parent(s): d821914

upload project

Browse files
README.md CHANGED
@@ -67,7 +67,7 @@ conda activate multimodal-chatbot
67
  conda install pip -y
68
  pip install -r requirements.txt
69
  ```
70
- You may use a .env loader or store vars in the Hugging Face Space secrets.
71
 
72
  ### Run locally
73
  ```bash
 
67
  conda install pip -y
68
  pip install -r requirements.txt
69
  ```
70
+ Create a `.env` file at the project root, or store the variables as Hugging Face Space secrets.
71
 
72
  ### Run locally
73
  ```bash
app.py CHANGED
@@ -1,129 +1,134 @@
1
- from src.config.settings import GEMINI_API_KEY
2
-
3
- import os
4
- import time
5
- from typing import List, Tuple, Optional
6
- import google.generativeai as genai
7
  import gradio as gr
8
- from PIL import Image
9
- import tempfile
10
- import os
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Components
14
- gemini_key_component = gr.Textbox(
15
- label="Gemini API Key",
16
- type="password",
17
- placeholder="Enter your Gemini API Key",
18
- visible=GEMINI_API_KEY is None
19
- )
 
 
 
 
 
20
 
21
- image_prompt_component = gr.Image(type="pil", label="Input Image (Optional: Figure/Graph)")
22
- chatbot_component = gr.Chatbot(label="Chatbot", bubble_full_width=False)
23
- text_prompt_component = gr.Textbox(
24
- placeholder="Type your question here...",
25
- label="Ask",
26
- lines=3
27
- )
28
- run_button_component = gr.Button("Submit")
29
- temperature_component = gr.Slider(
30
- minimum=0,
31
- maximum=1.0,
32
- value=0.4,
33
- step=0.05,
34
- label="Creativity (Temperature)",
35
- info="Controls the randomness of the response. Higher values result in more creative answers."
36
- )
37
- max_output_tokens_component = gr.Slider(
38
- minimum=1,
39
- maximum=2048,
40
- value=1024,
41
- step=1,
42
- label="Response Length (Token Limit)",
43
- info="Sets the maximum number of tokens in the output response."
44
- )
45
- stop_sequences_component = gr.Textbox(
46
- label="Stop Sequences (Optional)",
47
- placeholder="Enter stop sequences, e.g., STOP, END",
48
- info="Specify sequences to stop the generation."
49
- )
50
- top_k_component = gr.Slider(
51
- minimum=1,
52
- maximum=40,
53
- value=32,
54
- step=1,
55
- label="Top-K Sampling",
56
- info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs."
57
- )
58
- top_p_component = gr.Slider(
59
- minimum=0,
60
- maximum=1,
61
- value=1,
62
- step=0.01,
63
- label="Top-P Sampling",
64
- info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs."
65
- )
66
- example_scenarios = [
67
- "Describe Multimodal AI",
68
- "What are the difference between muliagent llm and multiagent system",
69
- "Why it's difficult to intgrate multimodality in prompt"]
70
- example_images = [["ex1.png"],["ex2.png"]]
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # Gradio Interface
74
- user_inputs = [text_prompt_component, chatbot_component]
75
- bot_inputs = [
76
- gemini_key_component,
77
- image_prompt_component,
78
- temperature_component,
79
- max_output_tokens_component,
80
- stop_sequences_component,
81
- top_k_component,
82
- top_p_component,
83
- chatbot_component,
84
- ]
 
85
 
 
 
 
 
 
86
 
87
- with gr.Blocks(theme="earneleh/paris") as demo:
88
- gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini 2.0 Multimodal Chatbot</h1>")
89
- with gr.Row():
90
- gemini_key_component.render()
91
- with gr.Row():
92
- chatbot_component.render()
93
- with gr.Row():
94
- with gr.Column(scale=0.5):
95
- text_prompt_component.render()
96
- with gr.Column(scale=0.5):
97
- image_prompt_component.render()
98
- with gr.Column(scale=0.5):
99
- run_button_component.render()
100
- with gr.Accordion("🧪Example Text 💬", open=False):
101
- example_radio = gr.Radio(
102
- choices=example_scenarios,
103
- label="Example Queries",
104
- info="Select an example query.")
105
- # Debug callback
106
- example_radio.change(
107
- fn=lambda query: query if query else "No query selected.",
108
- inputs=[example_radio],
109
- outputs=[text_prompt_component])
110
- # Custom examples section with blue styling
111
- with gr.Accordion("🧪Example Image 🩻", open=False):
112
- gr.Examples(
113
- examples=example_images,
114
- inputs=[image_prompt_component],
115
- label="Example Figures",
116
- )
117
- with gr.Accordion("🛠️Customize", open=False):
118
- temperature_component.render()
119
- max_output_tokens_component.render()
120
- stop_sequences_component.render()
121
- top_k_component.render()
122
- top_p_component.render()
123
 
124
- run_button_component.click(
125
- fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
126
- ).then(
127
- fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
128
- )
129
- demo.launch()
 
1
+ from src.config.settings import MODEL_ID, MODEL_OPTIONS
2
+ from src.utils.helpers import bot, user
 
 
 
 
3
  import gradio as gr
 
 
 
4
 
5
def gradio_interface() -> gr.Blocks:
    """Build the Gradio Blocks UI for the multimodal chatbot.

    Components are created first and rendered later inside the layout, so the
    same objects can be listed as callback inputs/outputs.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # NOTE(review): the layout nesting below was reconstructed from a view of
    # the file with its indentation stripped; confirm it against the repository.

    # --- Prompt / conversation components ---
    image_prompt_component = gr.Image(
        type="pil",
        label="Input Image (Optional: Figure/Graph)",
    )
    chatbot_component = gr.Chatbot(
        label="Chatbot",
    )
    text_prompt_component = gr.Textbox(
        placeholder="Type your question here...",
        label="Ask",
        lines=3,
    )
    run_button_component = gr.Button("Submit")

    # --- Generation settings ---
    temperature_component = gr.Slider(
        minimum=0,
        maximum=1.0,
        value=0.4,
        step=0.05,
        label="Creativity (Temperature)",
        info="Controls the randomness of the response. Higher values result in more creative answers.",
    )
    max_output_tokens_component = gr.Slider(
        minimum=1,
        maximum=2048,
        value=1024,
        step=1,
        label="Response Length (Token Limit)",
        info="Sets the maximum number of tokens in the output response.",
    )

    # MODEL_ID can be overridden through the environment, so it is not
    # guaranteed to be one of MODEL_OPTIONS. Make sure the default value is
    # always a valid choice for the dropdown.
    model_choices = list(MODEL_OPTIONS)
    if MODEL_ID not in model_choices:
        model_choices.insert(0, MODEL_ID)
    model_name_component = gr.Dropdown(
        choices=model_choices,
        value=MODEL_ID,
        label="Model Selection",
        info="Choose the Gemini model to use for generation.",
    )

    stop_sequences_component = gr.Textbox(
        label="Stop Sequences (Optional)",
        placeholder="Enter stop sequences, e.g., STOP, END",
        info="Specify sequences to stop the generation.",
    )
    top_k_component = gr.Slider(
        minimum=1,
        maximum=40,
        value=32,
        step=1,
        label="Top-K Sampling",
        info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs.",
    )
    top_p_component = gr.Slider(
        minimum=0,
        maximum=1,
        value=1,
        step=0.01,
        label="Top-P Sampling",
        info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs.",
    )

    # --- Example content offered to the user ---
    example_scenarios = [
        "Describe Multimodal AI",
        "What are the differences between multi-agent LLMs and multi-agent systems",
        "Why is it difficult to integrate multimodality in a prompt",
    ]
    example_images = [["research/ex1.png"], ["research/ex2.png"]]

    # --- Callback wiring ---
    # The order of bot_inputs must match the positional parameters of `bot`.
    user_inputs = [text_prompt_component, chatbot_component]
    bot_inputs = [
        model_name_component,
        image_prompt_component,
        temperature_component,
        max_output_tokens_component,
        stop_sequences_component,
        top_k_component,
        top_p_component,
        chatbot_component,
    ]

    with gr.Blocks() as app:
        gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini Multimodal Chatbot</h1>")
        with gr.Row():
            chatbot_component.render()
        with gr.Row():
            with gr.Column(scale=1):
                text_prompt_component.render()
            with gr.Column(scale=1):
                image_prompt_component.render()
            with gr.Column(scale=1):
                run_button_component.render()

        with gr.Accordion("🧪Example Text 💬", open=False):
            example_radio = gr.Radio(
                choices=example_scenarios,
                label="Example Queries",
                info="Select an example query.",
            )
            # Copy the selected example into the prompt textbox.
            example_radio.change(
                fn=lambda query: query if query else "No query selected.",
                inputs=[example_radio],
                outputs=[text_prompt_component],
            )

        with gr.Accordion("🧪Example Image 🩻", open=False):
            gr.Examples(
                examples=example_images,
                inputs=[image_prompt_component],
                label="Example Figures",
            )

        with gr.Accordion("🛠️Customize", open=False):
            model_name_component.render()
            temperature_component.render()
            max_output_tokens_component.render()
            stop_sequences_component.render()
            top_k_component.render()
            top_p_component.render()

        # First record the user's message (and clear the textbox), then
        # stream the model's reply into the chat history.
        run_button_component.click(
            fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
        ).then(
            fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
        )

    return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
if __name__ == "__main__":
    # Script entry point: build the interface, then start the server with a
    # public share link and the "earneleh/paris" theme.
    app = gradio_interface()
    app.launch(share=True, theme="earneleh/paris")
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- google-generativeai==0.8.6
2
  gradio==6.5.1
3
  python-dotenv==1.2.1
4
  imageio==2.37.2
5
  requests==2.32.5
 
 
1
+ google-genai==1.62.0
2
  gradio==6.5.1
3
  python-dotenv==1.2.1
4
  imageio==2.37.2
5
  requests==2.32.5
6
+ pillow==12.1.1
research/notebook.ipynb CHANGED
@@ -39,6 +39,28 @@
39
  ")\n",
40
  "print(response.text)"
41
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  ],
44
  "metadata": {
 
39
  ")\n",
40
  "print(response.text)"
41
  ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 2,
46
+ "id": "00cfccb8",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "from typing import Dict, List, Optional\n",
51
+ "from PIL import Image\n",
52
+ "from google import genai\n",
53
+ "from google.genai import types\n",
54
+ "import time"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "9c01ae4a",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": []
64
  }
65
  ],
66
  "metadata": {
setup.py DELETED
@@ -1 +0,0 @@
1
-
 
 
src/config/settings.py CHANGED
@@ -4,9 +4,10 @@ import os
4
  load_dotenv()
5
 
6
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
7
- CHATBOT_NAME = os.getenv("CHATBOT_NAME", "gemini-2.5-flash")
8
  MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
 
9
  MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
10
  IMAGE_WIDTH = 512
11
  IMAGE_HEIGHT = 512
12
- SYSTEM_INSTRUCTION_ANALYSIS = "You are an expert in image analysis and computer vision. Analyze any uploaded image in detail, providing specific descriptions of visual elements, composition, and content. Explain how this image can be effectively used in AI applications, including potential use cases for machine learning, computer vision tasks, and multimodal AI systems. Provide actionable insights for optimal image utilization."
 
4
  load_dotenv()
5
 
6
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
7
+ CHATBOT_NAME = os.getenv("CHATBOT_NAME", "Gemini Multimodal Chatbot")
8
  MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
9
+ MODEL_TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", "0.7"))
10
  MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
11
  IMAGE_WIDTH = 512
12
  IMAGE_HEIGHT = 512
13
+ SYSTEM_INSTRUCTION = "You are an expert in image analysis and computer vision. Analyze any uploaded image in detail, providing specific descriptions of visual elements, composition, and content. Explain how this image can be effectively used in AI applications, including potential use cases for machine learning, computer vision tasks, and multimodal AI systems. Provide actionable insights for optimal image utilization."
src/utils/__init__.py CHANGED
@@ -1,5 +0,0 @@
1
- from .preprocess import preprocess_stop_sequences, preprocess_image
2
- from .prompts import user
3
- from .generator import bot
4
-
5
- __all__ = ["preprocess_stop_sequences", "preprocess_image", "user", "bot"]
 
 
 
 
 
 
src/utils/helpers.py CHANGED
@@ -1,19 +1,9 @@
1
- from typing import List, Optional, Tuple, Tuple
 
2
  from PIL import Image
3
- import google.generativeai as genai
 
4
  import time
5
- from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION_ANALYSIS
6
-
7
- def initialize_model(api_key: Optional[str] = None):
8
- """Initialize the Gemini generative model."""
9
- if api_key:
10
- genai.configure(api_key=api_key)
11
- elif GEMINI_API_KEY:
12
- genai.configure(api_key=GEMINI_API_KEY)
13
-
14
- model = genai.GenerativeModel(CHATBOT_NAME, system_instruction=SYSTEM_INSTRUCTION_ANALYSIS)
15
- return model
16
-
17
 
18
  def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
19
  return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
@@ -22,65 +12,77 @@ def preprocess_image(image: Image.Image) -> Image.Image:
22
  image_height = int(image.height * IMAGE_WIDTH / image.width)
23
  return image.resize((IMAGE_WIDTH, image_height))
24
 
25
-
26
- def user(text_prompt: str, chatbot: List[Tuple[str, str]]):
27
- return "", chatbot + [[text_prompt, None]]
28
-
29
 
30
  def bot(
31
- gemini_key: str,
32
  image_prompt: Optional[Image.Image],
33
  temperature: float,
34
  max_output_tokens: int,
35
  stop_sequences: str,
36
  top_k: int,
37
  top_p: float,
38
- chatbot: List[Tuple[str, str]]
39
  ):
40
- gemini_key = gemini_key or GEMINI_API_KEY
41
- if not gemini_key:
42
- raise ValueError("GEMINI_API_KEY is not set. Please set it up.")
 
 
 
43
 
44
- text_prompt = chatbot[-1][0].strip() if chatbot[-1][0] else None
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Handle cases for text and/or image input
47
  if not text_prompt and not image_prompt:
48
- chatbot[-1][1] = "Prompt cannot be empty. Please provide input text or an image."
49
  yield chatbot
50
  return
51
  elif image_prompt and not text_prompt:
52
- # If only an image is provided
53
  text_prompt = "Describe the image"
54
  elif image_prompt and text_prompt:
55
- # If both text and image are provided, combine them
56
  text_prompt = f"{text_prompt}. Also, analyze the provided image."
57
 
58
- # Configure the model
59
- genai.configure(api_key=gemini_key)
60
- generation_config = genai.types.GenerationConfig(
 
61
  temperature=temperature,
62
  max_output_tokens=max_output_tokens,
63
  stop_sequences=preprocess_stop_sequences(stop_sequences),
64
  top_k=top_k,
65
  top_p=top_p,
 
 
 
 
 
 
66
  )
67
 
68
- # Prepare inputs
69
- inputs = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
70
-
71
- # Generate response
72
  try:
73
- response = model.generate_content(inputs, stream=True, generation_config=generation_config)
74
- response.resolve()
 
 
 
 
 
 
 
 
75
  except Exception as e:
76
- chatbot[-1][1] = f"Error occurred: {str(e)}"
77
  yield chatbot
78
  return
79
-
80
- # Stream the response back to the chatbot
81
- chatbot[-1][1] = ""
82
- for chunk in response:
83
- for i in range(0, len(chunk.text), 10):
84
- chatbot[-1][1] += chunk.text[i:i + 10]
85
- time.sleep(0.01)
86
- yield chatbot
 
1
+ from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_TEMPERATURE, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION
2
+ from typing import Dict, List, Optional
3
  from PIL import Image
4
+ from google import genai
5
+ from google.genai import types
6
  import time
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
9
  return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
 
12
  image_height = int(image.height * IMAGE_WIDTH / image.width)
13
  return image.resize((IMAGE_WIDTH, image_height))
14
 
15
def user(text_prompt: str, chatbot: List[Dict[str, str]]):
    """Queue the user's message in the chat history and clear the textbox.

    Args:
        text_prompt: Text currently typed into the prompt textbox.
        chatbot: Chat history as a list of ``{"role", "content"}`` dicts.
            ``None`` is treated as an empty history.

    Returns:
        A ``(textbox_value, history)`` tuple: an empty string for the textbox
        and a new history list with the user message appended. The list that
        was passed in is not modified.
    """
    history = list(chatbot) if chatbot else []
    history.append({"role": "user", "content": text_prompt})
    return "", history
 
 
17
 
18
  def bot(
19
+ model_name: str,
20
  image_prompt: Optional[Image.Image],
21
  temperature: float,
22
  max_output_tokens: int,
23
  stop_sequences: str,
24
  top_k: int,
25
  top_p: float,
26
+ chatbot: List[Dict[str, str]]
27
  ):
28
+ if not GEMINI_API_KEY:
29
+ chatbot.append({"role": "assistant", "content": "GEMINI_API_KEY is not set. Please add it to your .env file."})
30
+ yield chatbot
31
+ return
32
+
33
+ client = genai.Client(api_key=GEMINI_API_KEY)
34
 
35
+ # Gradio v6 may store content as a list of parts or a plain string
36
+ raw_content = chatbot[-1].get("content") if chatbot else None
37
+ if isinstance(raw_content, list):
38
+ text_prompt = " ".join(
39
+ part.get("text", "") if isinstance(part, dict) else str(part)
40
+ for part in raw_content
41
+ ).strip() or None
42
+ elif isinstance(raw_content, str):
43
+ text_prompt = raw_content.strip() or None
44
+ else:
45
+ text_prompt = None
46
 
 
47
  if not text_prompt and not image_prompt:
48
+ chatbot.append({"role": "assistant", "content": "Prompt cannot be empty. Please provide input text or an image."})
49
  yield chatbot
50
  return
51
  elif image_prompt and not text_prompt:
 
52
  text_prompt = "Describe the image"
53
  elif image_prompt and text_prompt:
 
54
  text_prompt = f"{text_prompt}. Also, analyze the provided image."
55
 
56
+ contents = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
57
+
58
+ config = types.GenerateContentConfig(
59
+ system_instruction=SYSTEM_INSTRUCTION,
60
  temperature=temperature,
61
  max_output_tokens=max_output_tokens,
62
  stop_sequences=preprocess_stop_sequences(stop_sequences),
63
  top_k=top_k,
64
  top_p=top_p,
65
+ safety_settings=[
66
+ types.SafetySetting(
67
+ category="HARM_CATEGORY_DANGEROUS_CONTENT",
68
+ threshold="BLOCK_ONLY_HIGH",
69
+ )
70
+ ],
71
  )
72
 
73
+ chatbot.append({"role": "assistant", "content": ""})
 
 
 
74
  try:
75
+ for chunk in client.models.generate_content_stream(
76
+ model=model_name,
77
+ contents=contents,
78
+ config=config,
79
+ ):
80
+ if chunk.text:
81
+ for i in range(0, len(chunk.text), 10):
82
+ chatbot[-1]["content"] += chunk.text[i:i + 10]
83
+ time.sleep(0.01)
84
+ yield chatbot
85
  except Exception as e:
86
+ chatbot[-1]["content"] = f"Error occurred: {str(e)}"
87
  yield chatbot
88
  return