Spaces:
Sleeping
Sleeping
Commit ·
a1660ff
1
Parent(s): d821914
upload project
Browse files- README.md +1 -1
- app.py +125 -120
- requirements.txt +2 -1
- research/notebook.ipynb +22 -0
- setup.py +0 -1
- src/config/settings.py +3 -2
- src/utils/__init__.py +0 -5
- src/utils/helpers.py +48 -46
README.md
CHANGED
|
@@ -67,7 +67,7 @@ conda activate multimodal-chatbot
|
|
| 67 |
conda install pip -y
|
| 68 |
pip install -r requirements.txt
|
| 69 |
```
|
| 70 |
-
|
| 71 |
|
| 72 |
### Run locally
|
| 73 |
```bash
|
|
|
|
| 67 |
conda install pip -y
|
| 68 |
pip install -r requirements.txt
|
| 69 |
```
|
| 70 |
+
Create a `.env` file at the project root containing the required environment variables (for example, `GEMINI_API_KEY`), or store them as secrets in the Hugging Face Space settings.
|
| 71 |
|
| 72 |
### Run locally
|
| 73 |
```bash
|
app.py
CHANGED
|
@@ -1,129 +1,134 @@
|
|
| 1 |
-
from src.config.settings import
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import time
|
| 5 |
-
from typing import List, Tuple, Optional
|
| 6 |
-
import google.generativeai as genai
|
| 7 |
import gradio as gr
|
| 8 |
-
from PIL import Image
|
| 9 |
-
import tempfile
|
| 10 |
-
import os
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
)
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
value=0.4,
|
| 33 |
-
step=0.05,
|
| 34 |
-
label="Creativity (Temperature)",
|
| 35 |
-
info="Controls the randomness of the response. Higher values result in more creative answers."
|
| 36 |
-
)
|
| 37 |
-
max_output_tokens_component = gr.Slider(
|
| 38 |
-
minimum=1,
|
| 39 |
-
maximum=2048,
|
| 40 |
-
value=1024,
|
| 41 |
-
step=1,
|
| 42 |
-
label="Response Length (Token Limit)",
|
| 43 |
-
info="Sets the maximum number of tokens in the output response."
|
| 44 |
-
)
|
| 45 |
-
stop_sequences_component = gr.Textbox(
|
| 46 |
-
label="Stop Sequences (Optional)",
|
| 47 |
-
placeholder="Enter stop sequences, e.g., STOP, END",
|
| 48 |
-
info="Specify sequences to stop the generation."
|
| 49 |
-
)
|
| 50 |
-
top_k_component = gr.Slider(
|
| 51 |
-
minimum=1,
|
| 52 |
-
maximum=40,
|
| 53 |
-
value=32,
|
| 54 |
-
step=1,
|
| 55 |
-
label="Top-K Sampling",
|
| 56 |
-
info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs."
|
| 57 |
-
)
|
| 58 |
-
top_p_component = gr.Slider(
|
| 59 |
-
minimum=0,
|
| 60 |
-
maximum=1,
|
| 61 |
-
value=1,
|
| 62 |
-
step=0.01,
|
| 63 |
-
label="Top-P Sampling",
|
| 64 |
-
info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs."
|
| 65 |
-
)
|
| 66 |
-
example_scenarios = [
|
| 67 |
-
"Describe Multimodal AI",
|
| 68 |
-
"What are the difference between muliagent llm and multiagent system",
|
| 69 |
-
"Why it's difficult to intgrate multimodality in prompt"]
|
| 70 |
-
example_images = [["ex1.png"],["ex2.png"]]
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini 2.0 Multimodal Chatbot</h1>")
|
| 89 |
-
with gr.Row():
|
| 90 |
-
gemini_key_component.render()
|
| 91 |
-
with gr.Row():
|
| 92 |
-
chatbot_component.render()
|
| 93 |
-
with gr.Row():
|
| 94 |
-
with gr.Column(scale=0.5):
|
| 95 |
-
text_prompt_component.render()
|
| 96 |
-
with gr.Column(scale=0.5):
|
| 97 |
-
image_prompt_component.render()
|
| 98 |
-
with gr.Column(scale=0.5):
|
| 99 |
-
run_button_component.render()
|
| 100 |
-
with gr.Accordion("🧪Example Text 💬", open=False):
|
| 101 |
-
example_radio = gr.Radio(
|
| 102 |
-
choices=example_scenarios,
|
| 103 |
-
label="Example Queries",
|
| 104 |
-
info="Select an example query.")
|
| 105 |
-
# Debug callback
|
| 106 |
-
example_radio.change(
|
| 107 |
-
fn=lambda query: query if query else "No query selected.",
|
| 108 |
-
inputs=[example_radio],
|
| 109 |
-
outputs=[text_prompt_component])
|
| 110 |
-
# Custom examples section with blue styling
|
| 111 |
-
with gr.Accordion("🧪Example Image 🩻", open=False):
|
| 112 |
-
gr.Examples(
|
| 113 |
-
examples=example_images,
|
| 114 |
-
inputs=[image_prompt_component],
|
| 115 |
-
label="Example Figures",
|
| 116 |
-
)
|
| 117 |
-
with gr.Accordion("🛠️Customize", open=False):
|
| 118 |
-
temperature_component.render()
|
| 119 |
-
max_output_tokens_component.render()
|
| 120 |
-
stop_sequences_component.render()
|
| 121 |
-
top_k_component.render()
|
| 122 |
-
top_p_component.render()
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
).then(
|
| 127 |
-
fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
|
| 128 |
-
)
|
| 129 |
-
demo.launch()
|
|
|
|
| 1 |
+
from src.config.settings import MODEL_ID, MODEL_OPTIONS
|
| 2 |
+
from src.utils.helpers import bot, user
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
def gradio_interface() -> gr.Blocks:
|
| 6 |
+
# Components
|
| 7 |
+
image_prompt_component = gr.Image(
|
| 8 |
+
type="pil",
|
| 9 |
+
label="Input Image (Optional: Figure/Graph)"
|
| 10 |
+
)
|
| 11 |
+
chatbot_component = gr.Chatbot(
|
| 12 |
+
label="Chatbot",
|
| 13 |
+
)
|
| 14 |
+
text_prompt_component = gr.Textbox(
|
| 15 |
+
placeholder="Type your question here...",
|
| 16 |
+
label="Ask",
|
| 17 |
+
lines=3
|
| 18 |
+
)
|
| 19 |
+
run_button_component = gr.Button("Submit")
|
| 20 |
+
|
| 21 |
+
temperature_component = gr.Slider(
|
| 22 |
+
minimum=0,
|
| 23 |
+
maximum=1.0,
|
| 24 |
+
value=0.4,
|
| 25 |
+
step=0.05,
|
| 26 |
+
label="Creativity (Temperature)",
|
| 27 |
+
info="Controls the randomness of the response. Higher values result in more creative answers."
|
| 28 |
+
)
|
| 29 |
+
max_output_tokens_component = gr.Slider(
|
| 30 |
+
minimum=1,
|
| 31 |
+
maximum=2048,
|
| 32 |
+
value=1024,
|
| 33 |
+
step=1,
|
| 34 |
+
label="Response Length (Token Limit)",
|
| 35 |
+
info="Sets the maximum number of tokens in the output response."
|
| 36 |
+
)
|
| 37 |
+
model_name_component = gr.Dropdown(
|
| 38 |
+
choices=MODEL_OPTIONS,
|
| 39 |
+
value=MODEL_ID,
|
| 40 |
+
label="Model Selection",
|
| 41 |
+
info="Choose the Gemini model to use for generation."
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
stop_sequences_component = gr.Textbox(
|
| 45 |
+
label="Stop Sequences (Optional)",
|
| 46 |
+
placeholder="Enter stop sequences, e.g., STOP, END",
|
| 47 |
+
info="Specify sequences to stop the generation."
|
| 48 |
+
)
|
| 49 |
+
top_k_component = gr.Slider(
|
| 50 |
+
minimum=1,
|
| 51 |
+
maximum=40,
|
| 52 |
+
value=32,
|
| 53 |
+
step=1,
|
| 54 |
+
label="Top-K Sampling",
|
| 55 |
+
info="Limits token selection to the top K most probable tokens. Lower values produce conservative outputs."
|
| 56 |
+
)
|
| 57 |
+
top_p_component = gr.Slider(
|
| 58 |
+
minimum=0,
|
| 59 |
+
maximum=1,
|
| 60 |
+
value=1,
|
| 61 |
+
step=0.01,
|
| 62 |
+
label="Top-P Sampling",
|
| 63 |
+
info="Limits token selection to tokens with a cumulative probability up to P. Lower values produce conservative outputs."
|
| 64 |
+
)
|
| 65 |
+
example_scenarios = [
|
| 66 |
+
"Describe Multimodal AI",
|
| 67 |
+
"What are the differences between multi-agent LLMs and multi-agent systems",
|
| 68 |
+
"Why is it difficult to integrate multimodality in a prompt",
|
| 69 |
+
]
|
| 70 |
+
example_images = [["research/ex1.png"], ["research/ex2.png"]]
|
| 71 |
|
| 72 |
+
# Gradio Interface
|
| 73 |
+
user_inputs = [text_prompt_component, chatbot_component]
|
| 74 |
+
bot_inputs = [
|
| 75 |
+
model_name_component,
|
| 76 |
+
image_prompt_component,
|
| 77 |
+
temperature_component,
|
| 78 |
+
max_output_tokens_component,
|
| 79 |
+
stop_sequences_component,
|
| 80 |
+
top_k_component,
|
| 81 |
+
top_p_component,
|
| 82 |
+
chatbot_component,
|
| 83 |
+
]
|
| 84 |
|
| 85 |
+
with gr.Blocks() as app:
|
| 86 |
+
gr.Markdown("<h1 style='font-size: 36px; font-weight: bold; font-family: Arial;'>Gemini Multimodal Chatbot</h1>")
|
| 87 |
+
with gr.Row():
|
| 88 |
+
chatbot_component.render()
|
| 89 |
+
with gr.Row():
|
| 90 |
+
with gr.Column(scale=1):
|
| 91 |
+
text_prompt_component.render()
|
| 92 |
+
with gr.Column(scale=1):
|
| 93 |
+
image_prompt_component.render()
|
| 94 |
+
with gr.Column(scale=1):
|
| 95 |
+
run_button_component.render()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
+
with gr.Accordion("🧪Example Text 💬", open=False):
|
| 98 |
+
example_radio = gr.Radio(
|
| 99 |
+
choices=example_scenarios,
|
| 100 |
+
label="Example Queries",
|
| 101 |
+
info="Select an example query."
|
| 102 |
+
)
|
| 103 |
+
# Debug callback
|
| 104 |
+
example_radio.change(
|
| 105 |
+
fn=lambda query: query if query else "No query selected.",
|
| 106 |
+
inputs=[example_radio],
|
| 107 |
+
outputs=[text_prompt_component]
|
| 108 |
+
)
|
| 109 |
+
# Custom examples section with blue styling
|
| 110 |
|
| 111 |
+
with gr.Accordion("🧪Example Image 🩻", open=False):
|
| 112 |
+
gr.Examples(
|
| 113 |
+
examples=example_images,
|
| 114 |
+
inputs=[image_prompt_component],
|
| 115 |
+
label="Example Figures",
|
| 116 |
+
)
|
| 117 |
+
with gr.Accordion("🛠️Customize", open=False):
|
| 118 |
+
model_name_component.render()
|
| 119 |
+
temperature_component.render()
|
| 120 |
+
max_output_tokens_component.render()
|
| 121 |
+
stop_sequences_component.render()
|
| 122 |
+
top_k_component.render()
|
| 123 |
+
top_p_component.render()
|
| 124 |
|
| 125 |
+
run_button_component.click(
|
| 126 |
+
fn=user, inputs=user_inputs, outputs=[text_prompt_component, chatbot_component]
|
| 127 |
+
).then(
|
| 128 |
+
fn=bot, inputs=bot_inputs, outputs=[chatbot_component]
|
| 129 |
+
)
|
| 130 |
|
| 131 |
+
return app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
gradio_interface().launch(share=True, theme="earneleh/paris")
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
-
google-
|
| 2 |
gradio==6.5.1
|
| 3 |
python-dotenv==1.2.1
|
| 4 |
imageio==2.37.2
|
| 5 |
requests==2.32.5
|
|
|
|
|
|
| 1 |
+
google-genai==1.62.0
|
| 2 |
gradio==6.5.1
|
| 3 |
python-dotenv==1.2.1
|
| 4 |
imageio==2.37.2
|
| 5 |
requests==2.32.5
|
| 6 |
+
pillow==12.1.1
|
research/notebook.ipynb
CHANGED
|
@@ -39,6 +39,28 @@
|
|
| 39 |
")\n",
|
| 40 |
"print(response.text)"
|
| 41 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
}
|
| 43 |
],
|
| 44 |
"metadata": {
|
|
|
|
| 39 |
")\n",
|
| 40 |
"print(response.text)"
|
| 41 |
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": 2,
|
| 46 |
+
"id": "00cfccb8",
|
| 47 |
+
"metadata": {},
|
| 48 |
+
"outputs": [],
|
| 49 |
+
"source": [
|
| 50 |
+
"from typing import Dict, List, Optional\n",
|
| 51 |
+
"from PIL import Image\n",
|
| 52 |
+
"from google import genai\n",
|
| 53 |
+
"from google.genai import types\n",
|
| 54 |
+
"import time"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": null,
|
| 60 |
+
"id": "9c01ae4a",
|
| 61 |
+
"metadata": {},
|
| 62 |
+
"outputs": [],
|
| 63 |
+
"source": []
|
| 64 |
}
|
| 65 |
],
|
| 66 |
"metadata": {
|
setup.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
|
|
|
|
|
|
src/config/settings.py
CHANGED
|
@@ -4,9 +4,10 @@ import os
|
|
| 4 |
load_dotenv()
|
| 5 |
|
| 6 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 7 |
-
CHATBOT_NAME = os.getenv("CHATBOT_NAME", "
|
| 8 |
MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
|
|
|
|
| 9 |
MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
|
| 10 |
IMAGE_WIDTH = 512
|
| 11 |
IMAGE_HEIGHT = 512
|
| 12 |
-
|
|
|
|
| 4 |
load_dotenv()
|
| 5 |
|
| 6 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 7 |
+
CHATBOT_NAME = os.getenv("CHATBOT_NAME", "Gemini Multimodal Chatbot")
|
| 8 |
MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")
|
| 9 |
+
MODEL_TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", "0.7"))
|
| 10 |
MODEL_OPTIONS = ["gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"]
|
| 11 |
IMAGE_WIDTH = 512
|
| 12 |
IMAGE_HEIGHT = 512
|
| 13 |
+
SYSTEM_INSTRUCTION = "You are an expert in image analysis and computer vision. Analyze any uploaded image in detail, providing specific descriptions of visual elements, composition, and content. Explain how this image can be effectively used in AI applications, including potential use cases for machine learning, computer vision tasks, and multimodal AI systems. Provide actionable insights for optimal image utilization."
|
src/utils/__init__.py
CHANGED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .preprocess import preprocess_stop_sequences, preprocess_image
|
| 2 |
-
from .prompts import user
|
| 3 |
-
from .generator import bot
|
| 4 |
-
|
| 5 |
-
__all__ = ["preprocess_stop_sequences", "preprocess_image", "user", "bot"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils/helpers.py
CHANGED
|
@@ -1,19 +1,9 @@
|
|
| 1 |
-
from
|
|
|
|
| 2 |
from PIL import Image
|
| 3 |
-
|
|
|
|
| 4 |
import time
|
| 5 |
-
from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION_ANALYSIS
|
| 6 |
-
|
| 7 |
-
def initialize_model(api_key: Optional[str] = None):
|
| 8 |
-
"""Initialize the Gemini generative model."""
|
| 9 |
-
if api_key:
|
| 10 |
-
genai.configure(api_key=api_key)
|
| 11 |
-
elif GEMINI_API_KEY:
|
| 12 |
-
genai.configure(api_key=GEMINI_API_KEY)
|
| 13 |
-
|
| 14 |
-
model = genai.GenerativeModel(CHATBOT_NAME, system_instruction=SYSTEM_INSTRUCTION_ANALYSIS)
|
| 15 |
-
return model
|
| 16 |
-
|
| 17 |
|
| 18 |
def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
|
| 19 |
return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
|
|
@@ -22,65 +12,77 @@ def preprocess_image(image: Image.Image) -> Image.Image:
|
|
| 22 |
image_height = int(image.height * IMAGE_WIDTH / image.width)
|
| 23 |
return image.resize((IMAGE_WIDTH, image_height))
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
return "", chatbot + [[text_prompt, None]]
|
| 28 |
-
|
| 29 |
|
| 30 |
def bot(
|
| 31 |
-
|
| 32 |
image_prompt: Optional[Image.Image],
|
| 33 |
temperature: float,
|
| 34 |
max_output_tokens: int,
|
| 35 |
stop_sequences: str,
|
| 36 |
top_k: int,
|
| 37 |
top_p: float,
|
| 38 |
-
chatbot: List[
|
| 39 |
):
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
# Handle cases for text and/or image input
|
| 47 |
if not text_prompt and not image_prompt:
|
| 48 |
-
chatbot
|
| 49 |
yield chatbot
|
| 50 |
return
|
| 51 |
elif image_prompt and not text_prompt:
|
| 52 |
-
# If only an image is provided
|
| 53 |
text_prompt = "Describe the image"
|
| 54 |
elif image_prompt and text_prompt:
|
| 55 |
-
# If both text and image are provided, combine them
|
| 56 |
text_prompt = f"{text_prompt}. Also, analyze the provided image."
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
temperature=temperature,
|
| 62 |
max_output_tokens=max_output_tokens,
|
| 63 |
stop_sequences=preprocess_stop_sequences(stop_sequences),
|
| 64 |
top_k=top_k,
|
| 65 |
top_p=top_p,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
-
|
| 69 |
-
inputs = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
|
| 70 |
-
|
| 71 |
-
# Generate response
|
| 72 |
try:
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
except Exception as e:
|
| 76 |
-
chatbot[-1][
|
| 77 |
yield chatbot
|
| 78 |
return
|
| 79 |
-
|
| 80 |
-
# Stream the response back to the chatbot
|
| 81 |
-
chatbot[-1][1] = ""
|
| 82 |
-
for chunk in response:
|
| 83 |
-
for i in range(0, len(chunk.text), 10):
|
| 84 |
-
chatbot[-1][1] += chunk.text[i:i + 10]
|
| 85 |
-
time.sleep(0.01)
|
| 86 |
-
yield chatbot
|
|
|
|
| 1 |
+
from src.config.settings import GEMINI_API_KEY, CHATBOT_NAME, MODEL_ID, MODEL_TEMPERATURE, MODEL_OPTIONS, IMAGE_WIDTH, IMAGE_HEIGHT, SYSTEM_INSTRUCTION
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
from PIL import Image
|
| 4 |
+
from google import genai
|
| 5 |
+
from google.genai import types
|
| 6 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
|
| 9 |
return [seq.strip() for seq in stop_sequences.split(",")] if stop_sequences else None
|
|
|
|
| 12 |
image_height = int(image.height * IMAGE_WIDTH / image.width)
|
| 13 |
return image.resize((IMAGE_WIDTH, image_height))
|
| 14 |
|
| 15 |
+
def user(text_prompt: str, chatbot: List[Dict[str, str]]):
|
| 16 |
+
return "", chatbot + [{"role": "user", "content": text_prompt}]
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def bot(
|
| 19 |
+
model_name: str,
|
| 20 |
image_prompt: Optional[Image.Image],
|
| 21 |
temperature: float,
|
| 22 |
max_output_tokens: int,
|
| 23 |
stop_sequences: str,
|
| 24 |
top_k: int,
|
| 25 |
top_p: float,
|
| 26 |
+
chatbot: List[Dict[str, str]]
|
| 27 |
):
|
| 28 |
+
if not GEMINI_API_KEY:
|
| 29 |
+
chatbot.append({"role": "assistant", "content": "GEMINI_API_KEY is not set. Please add it to your .env file."})
|
| 30 |
+
yield chatbot
|
| 31 |
+
return
|
| 32 |
+
|
| 33 |
+
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 34 |
|
| 35 |
+
# Gradio v6 may store content as a list of parts or a plain string
|
| 36 |
+
raw_content = chatbot[-1].get("content") if chatbot else None
|
| 37 |
+
if isinstance(raw_content, list):
|
| 38 |
+
text_prompt = " ".join(
|
| 39 |
+
part.get("text", "") if isinstance(part, dict) else str(part)
|
| 40 |
+
for part in raw_content
|
| 41 |
+
).strip() or None
|
| 42 |
+
elif isinstance(raw_content, str):
|
| 43 |
+
text_prompt = raw_content.strip() or None
|
| 44 |
+
else:
|
| 45 |
+
text_prompt = None
|
| 46 |
|
|
|
|
| 47 |
if not text_prompt and not image_prompt:
|
| 48 |
+
chatbot.append({"role": "assistant", "content": "Prompt cannot be empty. Please provide input text or an image."})
|
| 49 |
yield chatbot
|
| 50 |
return
|
| 51 |
elif image_prompt and not text_prompt:
|
|
|
|
| 52 |
text_prompt = "Describe the image"
|
| 53 |
elif image_prompt and text_prompt:
|
|
|
|
| 54 |
text_prompt = f"{text_prompt}. Also, analyze the provided image."
|
| 55 |
|
| 56 |
+
contents = [text_prompt] if image_prompt is None else [text_prompt, preprocess_image(image_prompt)]
|
| 57 |
+
|
| 58 |
+
config = types.GenerateContentConfig(
|
| 59 |
+
system_instruction=SYSTEM_INSTRUCTION,
|
| 60 |
temperature=temperature,
|
| 61 |
max_output_tokens=max_output_tokens,
|
| 62 |
stop_sequences=preprocess_stop_sequences(stop_sequences),
|
| 63 |
top_k=top_k,
|
| 64 |
top_p=top_p,
|
| 65 |
+
safety_settings=[
|
| 66 |
+
types.SafetySetting(
|
| 67 |
+
category="HARM_CATEGORY_DANGEROUS_CONTENT",
|
| 68 |
+
threshold="BLOCK_ONLY_HIGH",
|
| 69 |
+
)
|
| 70 |
+
],
|
| 71 |
)
|
| 72 |
|
| 73 |
+
chatbot.append({"role": "assistant", "content": ""})
|
|
|
|
|
|
|
|
|
|
| 74 |
try:
|
| 75 |
+
for chunk in client.models.generate_content_stream(
|
| 76 |
+
model=model_name,
|
| 77 |
+
contents=contents,
|
| 78 |
+
config=config,
|
| 79 |
+
):
|
| 80 |
+
if chunk.text:
|
| 81 |
+
for i in range(0, len(chunk.text), 10):
|
| 82 |
+
chatbot[-1]["content"] += chunk.text[i:i + 10]
|
| 83 |
+
time.sleep(0.01)
|
| 84 |
+
yield chatbot
|
| 85 |
except Exception as e:
|
| 86 |
+
chatbot[-1]["content"] = f"Error occurred: {str(e)}"
|
| 87 |
yield chatbot
|
| 88 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|