update demo UX
Browse files
app.py
CHANGED
|
@@ -1202,29 +1202,48 @@ def answer_question(image, question):
|
|
| 1202 |
yield re.sub("<$", "", re.sub("END$", "", buffer))
|
| 1203 |
|
| 1204 |
|
| 1205 |
-
gr.
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
moondream1 is a tiny (1.6B parameter) vision language model trained by
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
|
| 1213 |
-
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
-
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1202 |
yield re.sub("<$", "", re.sub("END$", "", buffer))
|
| 1203 |
|
| 1204 |
|
| 1205 |
+
# Gradio Blocks UI for the moondream demo: a title and description on top,
# then a question box with a submit button, then the input image next to
# the answer text area.
with gr.Blocks() as demo:
    # Centring uses inline CSS instead of the obsolete <center> element.
    # A <center> start tag implicitly closes an open <p>, so wrapping it in
    # <p class='gradio-sub-heading'> left that paragraph empty and the text
    # outside of it.
    gr.HTML(
        "<h1 class='gradio-heading' style='text-align: center'>🌔 moondream</h1>"
    )
    gr.HTML(
        "<p class='gradio-sub-heading' style='text-align: center'>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</p>"
    )
    with gr.Group():
        with gr.Row():
            # The question box takes four times the width of the button.
            prompt = gr.Textbox(
                label="Question", placeholder="e.g. What is this?", scale=4
            )
            submit = gr.Button(
                "Submit",
                scale=1,
            )
        with gr.Row():
            # type="pil" asks Gradio for a PIL image; presumably that is
            # what answer_question expects (its signature is not fully
            # visible here).
            img = gr.Image(type="pil", label="Upload or Drag an Image")
            output = gr.TextArea(label="Answer")

    # Event handling: clicking the button and pressing Enter in the
    # question box both run answer_question on (image, question) and send
    # the result to the answer area.
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)

# answer_question yields partial results, so the queue is enabled before
# launching so that they can be streamed to the output component.
demo.queue().launch(debug=True)
|
| 1228 |
+
|
| 1229 |
+
# gr.Interface(
|
| 1230 |
+
# title="🌔 moondream1",
|
| 1231 |
+
# description="""
|
| 1232 |
+
# moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
|
| 1233 |
+
# """,
|
| 1234 |
+
# fn=answer_question,
|
| 1235 |
+
# inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
|
| 1236 |
+
# examples=[
|
| 1237 |
+
# [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
|
| 1238 |
+
# [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
|
| 1239 |
+
# [
|
| 1240 |
+
# Image.open("assets/demo-3.jpg"),
|
| 1241 |
+
# "What kind of public transportation is in the image?",
|
| 1242 |
+
# ],
|
| 1243 |
+
# [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
|
| 1244 |
+
# [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
|
| 1245 |
+
# ],
|
| 1246 |
+
# outputs=gr.TextArea(label="Answer"),
|
| 1247 |
+
# allow_flagging="never",
|
| 1248 |
+
# cache_examples=False,
|
| 1249 |
+
# ).launch()
|