update chat
Files changed:
- app.py (+25 -9)
- multimodal/open_flamingo/chat/conversation.py (+46 -10)
app.py CHANGED

@@ -237,30 +237,36 @@ def upload_img(gr_img, text_input, chat_state,chatbot):
             value="Start Chatting", interactive=False), chat_state, img_list,chatbot
 
 
-def gradio_ask(user_message, chatbot, chat_state):
+def gradio_ask(user_message, chatbot, chat_state,radio):
     if len(user_message) == 0:
         return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
 
 
-    chat.ask(user_message, chat_state)
+    chat.ask(user_message, chat_state,radio)
     chatbot = chatbot + [[user_message, None]]
     return '', chatbot, chat_state
 
 
-def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+def gradio_answer(chatbot, chat_state, img_list, radio, text,num_beams, temperature,radio):
     llm_message,image = \
         chat.answer(conv=chat_state, img_list=img_list, max_new_tokens=300, num_beams=1, temperature=temperature,
-                    max_length=2000)
+                    max_length=2000,radio = radio,text_input = text)
 
     chatbot[-1][1] = llm_message
     if image==None:
         return chatbot, chat_state, img_list
     else:
         path = build_image(image)
-        chatbot = chatbot + [[(path,)]]
+        chatbot = chatbot + [[None,(path,)]]
         return chatbot, chat_state, img_list
 
-
+task_template = {
+    "Cap": "Summarize the content of the photo <image>.",
+    "VQA": "For this image <image>, I want a simple and direct answer to my question: <question>",
+    "REC": "Can you point out <expr> in the image <image> and provide the coordinates of its location?",
+    "GC": "Can you give me a description of the region <boxes> in image <image>?",
+    "Advanced": "<question>",
+}
 
 with gr.Blocks() as demo:
     gr.Markdown(title)
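Note on this hunk: as committed, the new gradio_answer signature lists radio twice, which Python rejects as a duplicate argument, and image==None is more idiomatically image is None. A minimal corrected sketch of the two handlers, keeping the diff's names and assuming chat, build_image, and gr (gradio) from the surrounding app:

# Sketch only: the committed handlers with the duplicate `radio`
# parameter removed and the None check made idiomatic.
def gradio_ask(user_message, chatbot, chat_state, radio):
    if len(user_message) == 0:
        return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
    chat.ask(user_message, chat_state, radio)   # radio = selected task template
    chatbot = chatbot + [[user_message, None]]
    return '', chatbot, chat_state

def gradio_answer(chatbot, chat_state, img_list, radio, text, num_beams, temperature):
    llm_message, image = chat.answer(
        conv=chat_state, img_list=img_list, max_new_tokens=300,
        num_beams=1, temperature=temperature, max_length=2000,
        radio=radio, text_input=text,
    )
    chatbot[-1][1] = llm_message
    if image is None:
        return chatbot, chat_state, img_list
    path = build_image(image)                   # save the returned image, get its path
    chatbot = chatbot + [[None, (path,)]]       # append it as a bot-side image message
    return chatbot, chat_state, img_list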
@@ -273,6 +279,9 @@ with gr.Blocks() as demo:
         image = gr.Image(type="pil")
         upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
         clear = gr.Button("Restart")
+        radio = gr.Radio(
+            ["Cap", "VQA", "REC", "Advanced"], label="Task Template", value='Cap',
+        )
 
         num_beams = gr.Slider(
             minimum=1,
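The radio exposes four of the five task_template keys; "GC" is defined in the dict but not selectable here. Nothing else in this diff reads task_template, so if the templates were applied it would presumably be by placeholder substitution, e.g. (a sketch, not committed code; the expression is hypothetical):

# Hypothetical use of task_template: fill the <expr>/<question> slots.
prompt = task_template["REC"].replace("<expr>", "the red mug")
# -> "Can you point out the red mug in the image <image> and provide
#     the coordinates of its location?"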
@@ -296,13 +305,20 @@ with gr.Blocks() as demo:
         chat_state = gr.State()
         img_list = gr.State()
         chatbot = gr.Chatbot(label='Compositional-VLM')
-
+
+
+        # template = gr.Textbox(label='Template', show_label=True, lines=1, interactive=False,
+        #                       value='Provide a comprehensive description of the image <image> and specify the positions of any mentioned objects in square brackets.')
+        # text_input = gr.Textbox(label='<question>', show_label=True, placeholder="Please upload your image first, then input...", lines=3,
+        #                         value=None, visible=False, interactive=False)
+
+        text_input = gr.Textbox(label='User', placeholder='Please upload your image first, then input...', interactive=False)
 
     upload_button.click(upload_img, [image, text_input, chat_state,chatbot],
                         [image, text_input, upload_button, chat_state, img_list,chatbot])
 
-    text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
-        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
+    text_input.submit(gradio_ask, [text_input, chatbot, chat_state,radio], [text_input, chatbot, chat_state]).then(
+        gradio_answer, [chatbot, chat_state, img_list, radio, text_input,num_beams, temperature, radio], [chatbot, chat_state, img_list]
     )
     clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list],
                 queue=False)
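The submit(...).then(...) chain first runs gradio_ask (append the templated question) and then gradio_answer (generate the reply); the input list passes radio twice to match the duplicated parameter above, and text_input is forwarded so the REC branch can reuse the raw query. With the handlers corrected as sketched after the first hunk, the wiring would drop the second radio:

# Sketch: wiring that matches the corrected gradio_answer signature.
text_input.submit(
    gradio_ask, [text_input, chatbot, chat_state, radio],
    [text_input, chatbot, chat_state],
).then(
    gradio_answer,
    [chatbot, chat_state, img_list, radio, text_input, num_beams, temperature],
    [chatbot, chat_state, img_list],
)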
multimodal/open_flamingo/chat/conversation.py CHANGED

@@ -278,18 +278,34 @@ class Chat:
         # torch.tensor([2277, 29937]).to(self.device)] # '###' can be encoded in two different ways.
         # self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
 
-    def ask(self, text, conv):
-
-
-
-
+    def ask(self, text, conv,radio):
+        if radio in ["Cap"]:
+            conv.append({
+                "from": "human",
+                "value": "",
+            })
+        elif radio in ["VQA"]:
+            conv.append({
+                "from": "human",
+                "value": f"Answer the question using a single word or phrase.{text}",
+            })
+        elif radio in ["REC"]:
+            conv.append({
+                "from": "human",
+                "value": f"Please provide the bounding box coordinate of the region this sentence describes: {text}.",
+            })
+        else:
+            conv.append({
+                "from": "human",
+                "value": text,
+            })
         # if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
         #         and conv.messages[-1][1][-6:] == '</Img>': # last message is image.
         #     conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
         # else:
         #     conv.append_message(conv.roles[0], text)
 
-    def answer(self, conv, img_list, max_new_tokens=200, num_beams=5, min_length=1, top_p=0.9,
+    def answer(self, conv, img_list, radio, text_input, max_new_tokens=200, num_beams=5, min_length=1, top_p=0.9,
                repetition_penalty=1.0, length_penalty=1, temperature=1, max_length=2000):
         # conv.append_message(conv.roles[1], None)
         # embs = self.get_context_emb(conv, img_list)
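ask now appends a human turn whose wording depends on the selected task: "Cap" appends an empty value (the caption is elicited by the assistant-side seed added in a later hunk), "VQA" and "REC" wrap the user text in fixed instructions, and the fallback ("Advanced") passes the text through unchanged. Note the committed VQA f-string has no separator before {text}. A sketch of the resulting state, assuming conv is the plain list this method appends to:

conv = []
chat.ask("What color is the car?", conv, "VQA")
# conv[-1] == {
#     "from": "human",
#     "value": "Answer the question using a single word or phrase.What color is the car?",
# }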
@@ -315,7 +331,14 @@
         # output_text = output_text.split('###')[0] # remove the stop sign '###'
         # output_text = output_text.split('Assistant:')[-1].strip()
         # conv.messages[-1][1] = output_text
-
+        visual_token = "<|#visual#|>"
+        previsual_token = "<|#previsual#|>"
+        box_token = "<|#box#|>"
+        prebox_token = "<|#prebox#|>"
+        end_token = "<|#endofobject#|>"
+        object_token = "<|#object#|>"
+        end_of_attr_token = "<|#endofattr#|>"
+        preend_of_attr_token = "<|#preendofattr#|>"
         media_token_id = self.tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
         box_token_id = self.tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
         endofobject_token_id = self.tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
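Eight grounding-marker strings are defined alongside the existing special-token id lookups; in this diff only object_token, end_token, and visual_token are consumed (by the REC branch in the next hunk). If an id were needed for one of the new markers, the file's existing lookup pattern would apply:

# Sketch: same lookup pattern as the media/box/endofobject ids above.
visual_token_id = self.tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]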
@@ -336,10 +359,23 @@
 
         # conversation = []
         human_sentence = None
-
+        if radio in ["Cap","VQA"]:
+            conv.append({
+                "from": "gpt",
+                "value": "",
+            })
+        elif radio in ["REC"]:
+            conv.append(
+                {
                     "from": "gpt",
-            "value":
-        }
+                    "value": object_token + text_input + end_token + visual_token
+                }
+            )
+        else:
+            conv.append({
+                "from": "gpt",
+                "value": "",
+            })
         # while True:
         #     human_sentence = input("### Human: ")
         #     if human_sentence == "#end#":
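For "Cap", "VQA", and the fallback, the assistant turn is seeded empty and generation starts from nothing; for "REC" it is pre-filled with the grounding markers so the model only has to produce the box. With a hypothetical text_input of "the red mug", the seeded value is:

seed = object_token + "the red mug" + end_token + visual_token
# seed == "<|#object#|>the red mug<|#endofobject#|><|#visual#|>"
# Generation continues after <|#visual#|>, where the box prediction for
# the named object is expected.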