Dongxu Li committed · Commit 4ecd25d · 1 Parent: 0314a2f
change ui
app.py
CHANGED
@@ -125,12 +125,20 @@ def inference_caption(
     return output[0]
 
 
+def clear_fn(image_input, chatbot, chat_input, caption_output, state):
+    if image_input is None:
+        return (None, "", "", [])
+    else:
+        return chatbot, chat_input, caption_output, state
+
+
 title = """<h1 align="center">BLIP-2</h1>"""
-description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them
-<
+description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
+<br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
 article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
 <br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
 <br> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
+<br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.
 """
 
 endpoint = Endpoint()
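The clear_fn added above feeds the image_input.change event wired up later in this commit: if the uploaded image is cleared, the chat history, chat box, caption box, and conversation state are all reset; swapping in a new image leaves them untouched. Below is a minimal self-contained sketch of that reset pattern, assuming the Gradio 3.x-era Blocks API this Space uses; the component names mirror the diff, but the commit does not show how state is defined, so gr.State([]) here is an assumption.

import gradio as gr

def clear_fn(image_input, chatbot, chat_input, caption_output, state):
    # A removed image invalidates the running conversation:
    # wipe the chat history, both text boxes, and the state list.
    if image_input is None:
        return (None, "", "", [])
    else:
        return chatbot, chat_input, caption_output, state

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil")
    chatbot = gr.Chatbot(label="Chat Output")
    chat_input = gr.Textbox(lines=1, label="Chat Input")
    caption_output = gr.Textbox(lines=1, label="Caption Output")
    state = gr.State([])  # assumed: conversation history, as in typical chat demos

    # Five inputs in, four outputs back, exactly as wired in the commit.
    image_input.change(
        clear_fn,
        [image_input, chatbot, chat_input, caption_output, state],
        [chatbot, chat_input, caption_output, state],
    )

demo.launch()

Note the guard carried over from the diff: only image_input is None triggers the reset; uploading a different image echoes the current values back unchanged, so the conversation survives an image swap.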
@@ -147,6 +155,7 @@ with gr.Blocks() as iface:
     gr.Markdown(title)
     gr.Markdown(description)
     gr.Markdown(article)
+
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(type="pil")
@@ -189,54 +198,61 @@ with gr.Blocks() as iface:
         with gr.Column():
 
             with gr.Column():
-                with gr.
-                with gr.Row():
+                caption_output = gr.Textbox(lines=1, label="Caption Output")
+                caption_button = gr.Button(
+                    value="Caption it!", interactive=True, variant="primary"
+                )
+                caption_button.click(
+                    inference_caption,
+                    [
+                        image_input,
+                        sampling,
+                        temperature,
+                        len_penalty,
+                        rep_penalty,
+                    ],
+                    [caption_output],
+                )
+
+            gr.Markdown("""Trying prompting your input for chat; e.g. recommended prompt for QA, \"Question: {} Answer:\"""")
+            with gr.Row():
+                with gr.Column():
                     chatbot = gr.Chatbot(label="Chat Output (from FlanT5)")
-                with gr.
+
+                # with gr.Row():
+                with gr.Column():
+                    chat_input = gr.Textbox(lines=1, label="Chat Input")
+
+                    with gr.Row():
+                        clear_button = gr.Button(value="Clear", interactive=True)
+                        clear_button.click(
+                            lambda: ("", [], []),
+                            [],
+                            [chat_input, chatbot, state],
+                        )
+
+                        submit_button = gr.Button(
+                            value="Submit", interactive=True, variant="primary"
+                        )
+                        submit_button.click(
+                            inference_chat,
+                            [
+                                image_input,
+                                chat_input,
+                                sampling,
+                                temperature,
+                                len_penalty,
+                                rep_penalty,
+                                state,
+                            ],
+                            [chatbot, state],
+                        )
+
+                    image_input.change(
+                        clear_fn,
+                        [image_input, chatbot, chat_input, caption_output, state],
+                        [chatbot, chat_input, caption_output, state]
+                    )
 
     examples = gr.Examples(
         examples=examples,
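The rest of the hunk is standard Blocks event wiring: each Button.click(fn, inputs, outputs) call reads the current values of the input components, passes them positionally to fn, and routes fn's return values into the output components. The Clear button uses the same mechanism with no inputs at all, returning ("", [], []) to empty chat_input, chatbot, and state. Here is a compact sketch of the caption path; fake_inference_caption is a hypothetical stand-in for the Space's inference_caption endpoint call, and the widget types and ranges for sampling, temperature, len_penalty, and rep_penalty are assumptions, since the commit only references those components by name.

import gradio as gr

def fake_inference_caption(image, sampling, temperature, len_penalty, rep_penalty):
    # Hypothetical stand-in for the Space's inference_caption, which
    # forwards these arguments to a hosted BLIP-2 endpoint.
    if image is None:
        return "Please upload an image first."
    return f"(caption for a {image.size[0]}x{image.size[1]} image, T={temperature})"

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil")
    # Assumed widget types and ranges; not shown in this commit.
    sampling = gr.Checkbox(label="Sampling")
    temperature = gr.Slider(0.1, 2.0, value=1.0, label="Temperature")
    len_penalty = gr.Slider(-2.0, 2.0, value=1.0, label="Length Penalty")
    rep_penalty = gr.Slider(1.0, 5.0, value=1.5, label="Repetition Penalty")
    caption_output = gr.Textbox(lines=1, label="Caption Output")

    caption_button = gr.Button(value="Caption it!", interactive=True, variant="primary")
    caption_button.click(
        fake_inference_caption,
        [image_input, sampling, temperature, len_penalty, rep_penalty],
        [caption_output],
    )

demo.launch()

The Submit button works the same way but also threads state through both the input and output lists, which is how the FlanT5 chat keeps multi-turn context across clicks.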