import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# Selectable models: dropdown display name -> Hugging Face model id.
AVAILABLE_MODELS = {
    "BLIP VQA Base": "Salesforce/blip-vqa-base",
    "BLIP VQA Large (CapFilt)": "Salesforce/blip-vqa-capfilt-large",
}

# Default model: the first entry of AVAILABLE_MODELS, loaded once at import
# time so the first question does not have to wait for the load.
current_model_name = next(iter(AVAILABLE_MODELS))
processor = BlipProcessor.from_pretrained(AVAILABLE_MODELS[current_model_name])
model = BlipForQuestionAnswering.from_pretrained(AVAILABLE_MODELS[current_model_name])


# Model switching
def change_model(model_choice):
    """Switch the active processor/model pair to ``model_choice``.

    Args:
        model_choice: A key of ``AVAILABLE_MODELS`` (the dropdown value).

    Returns:
        A Markdown status string describing the outcome. On an unknown
        choice or a failed load the previously active model stays in use.
    """
    global processor, model, current_model_name

    model_id = AVAILABLE_MODELS.get(model_choice)
    if model_id is None:
        # E.g. the dropdown was cleared; keep the current model.
        return f"⚠️ Unknown model: {model_choice!r}. Still using: {current_model_name}"

    if model_choice == current_model_name:
        # Already active; skip the expensive reload.
        return f"✅ Switched to: {model_choice}"

    try:
        # Load into locals first so a failure half-way through cannot leave
        # the globals pointing at a processor and model from different
        # checkpoints.
        new_processor = BlipProcessor.from_pretrained(model_id)
        new_model = BlipForQuestionAnswering.from_pretrained(model_id)
    except (OSError, ValueError) as err:
        return (
            f"⚠️ Could not load {model_choice}: {err}. "
            f"Still using: {current_model_name}"
        )

    processor, model, current_model_name = new_processor, new_model, model_choice
    return f"✅ Switched to: {model_choice}"


# Question answering
def answer_question(history, image, question):
    """Answer ``question`` about ``image`` and append the turn to the chat.

    Args:
        history: Current chat history as a list of (user, bot) pairs;
            ``None`` is treated as an empty history.
        image: The uploaded PIL image, or ``None`` if nothing was uploaded.
        question: The user's question text.

    Returns:
        A new history list with one extra (user, bot) pair. When the image
        or question is missing, the pair carries only a bot-side hint.
    """
    history = list(history or [])

    # Validation hints go in the bot slot so they are not rendered as if
    # the user had typed them.
    if image is None:
        return history + [(None, "Please upload an image first.")]
    if not question or not question.strip():
        return history + [(None, "Please enter a question.")]

    # Read the globals once so a model switch during this request cannot
    # mix the processor of one model with the weights of another.
    active_processor, active_model, active_name = processor, model, current_model_name

    inputs = active_processor(image, question, return_tensors="pt")
    out = active_model.generate(**inputs, max_new_tokens=50)
    answer = active_processor.decode(out[0], skip_special_tokens=True)

    reply = f"🤖({active_name}) Answer: {answer}"
    return history + [(question, reply)]


# Reset the chat when a new image is uploaded
def reset_chat(_):
    """Return an empty chat history; the triggering value is ignored."""
    return []


# Build the Gradio interface
def build_ui():
    """Assemble the Blocks UI, wire up its event handlers, and return it."""
    with gr.Blocks(title="Vision-Language Chatbot") as demo:
        gr.Markdown("## 🤖 Vision-Language Chatbot")
        gr.Markdown("Upload an image and ask multiple questions about it!")

        # Model selection
        model_selector = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value=current_model_name,
            label="Select Model",
        )
        model_status = gr.Markdown(f"✅ Current model: {current_model_name}")

        # Layout
        with gr.Row():
            with gr.Column(scale=1):
                image_input = gr.Image(type="pil", label="Upload Image")
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    placeholder="Ask something about the image...",
                    label="Question",
                )
                ask_btn = gr.Button("Ask", variant="primary")
                clear_btn = gr.Button("Clear Chat")

        chatbot = gr.Chatbot(height=400, label="Chat History")

        # Event wiring
        ask_btn.click(
            fn=answer_question,
            inputs=[chatbot, image_input, question_input],
            outputs=chatbot,
        )
        clear_btn.click(fn=lambda: [], outputs=chatbot)
        # A different image invalidates the earlier answers, so start over.
        image_input.change(fn=reset_chat, inputs=image_input, outputs=chatbot)
        model_selector.change(
            fn=change_model,
            inputs=model_selector,
            outputs=model_status,
        )

        # Examples
        gr.Examples(
            examples=[
                ["sample_images/app.jpg", "How many apples are in the picture?"],
                ["sample_images/cat_dog.jpg", "What animals are in the image?"],
                ["sample_images/city.jpg", "What is the man doing?"],
            ],
            inputs=[image_input, question_input],
            label="🏞️ Example Inputs",
        )

    return demo


if __name__ == "__main__":
    build_ui().launch()