"""Gradio chat UI for the Navyabhat/Llava-Phi2 multimodal model.

Collects text, an image, and/or audio (microphone or file upload), echoes
the inputs into a chatbot widget, then runs the model and appends its reply.
"""
import os

import gradio as gr
from PIL import Image

from inference.main import MultiModalPhi2

# Kept for backward compatibility with any external importer; unused here.
messages = []

# Single shared model instance; CPU inference.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)


def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    """Append the user's text/image/audio inputs to the chat history.

    Args:
        chatbot: current chat history (list of (user, bot) tuples).
        text: prompt string, or ""/None when absent.
        image: image filepath, or None.
        audio_upload: uploaded audio filepath, or None.
        audio_mic: microphone recording filepath, or None.

    Returns:
        The updated chat history.

    Raises:
        gr.Error: when no input of any kind was supplied.
    """
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # Prefer the microphone recording; fall back to an uploaded file.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Raise an error if neither text nor a file is provided.
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot


def clear_data():
    """Reset every input widget and the chat history.

    Return order matches clear.click outputs:
    (prompt, image, audio_upload, audio_mic, chatbot).
    """
    return None, None, None, None, []


def run(history, text, image, audio_upload, audio_mic):
    """Run the model on the collected inputs and append its reply.

    Returns the updated history followed by None for each input widget,
    clearing them after a successful run.
    """
    if text in [None, ""]:
        text = None
    # Uploaded audio takes priority over the microphone recording here;
    # add_content prefers the microphone — NOTE(review): confirm intent.
    if audio_upload is not None:
        audio = audio_upload
    elif audio_mic is not None:
        audio = audio_mic
    else:
        audio = None
    print("text", text)
    print("image", image)
    print("audio", audio)
    if image is not None:
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)
    history.append((None, outputs.title()))
    return history, None, None, None, None


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [],
        elem_id="chatbot",
        bubble_full_width=False,
        avatar_images=(None, os.path.join(os.path.dirname(__file__), "avatar.png")),
    )
    with gr.Row():
        prompt = gr.Textbox(
            scale=4,
            show_label=False,
            placeholder="Enter text and press enter",
            container=False,
        )
        # type="filepath" so run() can pass the path to PIL.Image.open.
        image = gr.Image(type="filepath", label="Image")
        audio_upload = gr.Audio(type="filepath", label="Audio")
        # NOTE(review): gradio 4.x uses sources=[...]; pre-4.x expects
        # source="microphone" — adjust to the pinned gradio version.
        audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
    with gr.Row():
        submit = gr.Button()
        clear = gr.Button(value="Clear")

    # Stage the user's inputs into the chat, then run the model on success.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

demo.launch()