vision

Sleeping

App Files Files Community

vikhyatk committed on Jan 25, 2024

Commit

77e99d6

1 Parent(s): 0f11c75

update demo UX

Browse files

Files changed (1) hide show

app.py +45 -26

app.py CHANGED Viewed

@@ -1202,29 +1202,48 @@ def answer_question(image, question):
             yield re.sub("<$", "", re.sub("END$", "", buffer))
-gr.Interface(  # previous UI: one gr.Interface call; every line here is on the removed side of the diff
-    title="🌔 moondream1",
-    description="""
-        moondream1 is a tiny (1.6B parameter) vision language model trained by
-        <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with
-        models twice its size. It is trained on the LLaVa training dataset, and
-        initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder.
-        Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace
-        model card</a> for more details.
-    """,
-    fn=answer_question,  # callback that produces the answer from the two inputs below
-    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],  # image first, then the question text
-    examples=[  # preset image/question pairs; each Image.open runs eagerly when this call is evaluated
-        [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
-        [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
-        [
-            Image.open("assets/demo-3.jpg"),
-            "What kind of public transportation is in the image?",
-        ],
-        [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
-        [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
-    ],
-    outputs=gr.TextArea(label="Answer"),  # the callback's result is shown in a text area labelled "Answer"
-    allow_flagging="never",  # NOTE(review): presumably hides Gradio's flag button - confirm for the installed version
-    cache_examples=False,  # NOTE(review): presumably means example outputs are not precomputed - confirm
-).launch()  # builds the interface and starts the app immediately

             yield re.sub("<$", "", re.sub("END$", "", buffer))
+with gr.Blocks() as demo:  # new UI: an explicit Blocks layout bound to `demo`, replacing the gr.Interface call
+    gr.HTML("<h1 class='gradio-heading'><center>🌔 moondream</center></h1>")  # centered page title
+    gr.HTML(  # centered intro paragraph linking to the author and the model card
+        "<p class='gradio-sub-heading'><center>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder.  Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</center></p>"
+    )
+    with gr.Group():  # wraps both rows below in one group
+        with gr.Row():  # first row: question box plus submit button
+            prompt = gr.Textbox(  # question input; scale=4 against the button's scale=1
+                label="Question", placeholder="e.g. What is this?", scale=4
+            )
+            submit = gr.Button(  # explicit trigger for answer_question (wired up below)
+                "Submit",
+                scale=1,
+            )
+        with gr.Row():  # second row: image input next to the answer area
+            img = gr.Image(type="pil", label="Upload or Drag an Image")  # same type="pil" setting as the old interface
+            output = gr.TextArea(label="Answer")  # destination component for answer_question's output
+    # handling events: both triggers call answer_question with [img, prompt] and write the result to output
+    submit.click(answer_question, [img, prompt], output)  # trigger 1: clicking the Submit button
+    prompt.submit(answer_question, [img, prompt], output)  # trigger 2: submitting the question textbox (Enter key, per Gradio docs)
+demo.queue().launch(debug=True)  # answer_question yields (generator); NOTE(review): queue() presumably needed for streamed output - confirm
+# gr.Interface(
+#     title="🌔 moondream1",
+#     description="""
+#         moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder.  Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()