File size: 3,493 Bytes
691bb52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# Selectable checkpoints: UI display name -> Hugging Face model id.
AVAILABLE_MODELS = {
    "BLIP VQA Base": "Salesforce/blip-vqa-base",
    "BLIP VQA Large (CapFilt)": "Salesforce/blip-vqa-capfilt-large",
}

# Default model: the first entry of AVAILABLE_MODELS. Its processor and
# weights are loaded once at import time and kept in module-level globals.
current_model_name = next(iter(AVAILABLE_MODELS))
processor = BlipProcessor.from_pretrained(
    AVAILABLE_MODELS[current_model_name]
)
model = BlipForQuestionAnswering.from_pretrained(
    AVAILABLE_MODELS[current_model_name]
)


# Model switching
def change_model(model_choice):
    """Load the selected checkpoint and make it the active model.

    The module-level ``processor``, ``model`` and ``current_model_name`` are
    rebound together, and only after both loads succeeded, so they always
    describe the same checkpoint.

    Args:
        model_choice: Display name of the model; must be a key of
            ``AVAILABLE_MODELS``.

    Returns:
        A status string naming the model that is now active.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``AVAILABLE_MODELS``.
            The active model is left untouched in that case.
    """
    global processor, model, current_model_name

    # Resolve the id before touching any global, so an unknown choice cannot
    # leave ``current_model_name`` naming a model that was never loaded.
    model_id = AVAILABLE_MODELS[model_choice]

    # Already active: skip the expensive reload.
    if model_choice == current_model_name:
        return f"✅ Switched to: {model_choice}"

    # Load into locals first; if either call raises, the previous
    # processor/model/name trio stays intact.
    new_processor = BlipProcessor.from_pretrained(model_id)
    new_model = BlipForQuestionAnswering.from_pretrained(model_id)

    processor, model, current_model_name = new_processor, new_model, model_choice
    return f"✅ Switched to: {model_choice}"


# Question-answering logic
def answer_question(history, image, question):
    """Answer ``question`` about ``image`` and append the turn to the chat.

    Args:
        history: Current chat history as a list of ``(user, bot)`` pairs.
            ``None`` is treated as an empty history.
        image: The uploaded image, or ``None`` when nothing is uploaded.
        question: The text typed by the user; ``None`` or whitespace-only
            text counts as "no question".

    Returns:
        A new history list with one extra ``(user, bot)`` pair. When the
        input is incomplete, the pair carries only a bot-side prompt and the
        model is not called.
    """
    history = list(history or [])

    # Validation prompts go in the bot slot (second element) so they are not
    # rendered as if the user had typed them.
    if image is None:
        return history + [(None, "Please upload an image first.")]
    if not question or not question.strip():
        return history + [(None, "Please enter a question.")]

    # Uses the module-level processor/model that are currently active.
    inputs = processor(image, question, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=50)
    answer = processor.decode(out[0], skip_special_tokens=True)
    reply = f"🤖({current_model_name}) Answer: {answer}"
    return history + [(question, reply)]


# Reset the chat when a new image is uploaded
def reset_chat(_):
    """Return a fresh, empty chat history; the argument is ignored."""
    empty_history = []
    return empty_history


# Build the Gradio interface
def build_ui():
    """Assemble the Blocks app and wire its events.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    model_names = list(AVAILABLE_MODELS)
    example_rows = [
        ["sample_images/app.jpg", "How many apples are in the picture?"],
        ["sample_images/cat_dog.jpg", "What animals are in the image?"],
        ["sample_images/city.jpg", "What is the man doing?"],
    ]

    with gr.Blocks(title="Vision-Language Chatbot") as demo:
        gr.Markdown("## 🤖 Vision-Language Chatbot")
        gr.Markdown("Upload an image and ask multiple questions about it!")

        # Model picker and the status line it updates
        picker = gr.Dropdown(
            choices=model_names,
            value=current_model_name,
            label="Select Model",
        )
        status_line = gr.Markdown(f"✅ Current model: {current_model_name}")

        # Layout: image on the left, question controls and chat on the right
        with gr.Row():
            with gr.Column(scale=1):
                picture = gr.Image(type="pil", label="Upload Image")

            with gr.Column(scale=2):
                question_box = gr.Textbox(
                    placeholder="Ask something about the image...",
                    label="Question",
                )
                ask_button = gr.Button("Ask", variant="primary")
                clear_button = gr.Button("Clear Chat")

                chat_view = gr.Chatbot(height=400, label="Chat History")

        # Event wiring
        ask_button.click(
            answer_question,
            [chat_view, picture, question_box],
            chat_view,
        )

        clear_button.click(lambda: [], None, chat_view)
        # A different image starts a new conversation
        picture.change(reset_chat, picture, chat_view)

        picker.change(change_model, picker, status_line)

        # Clickable examples that fill the image and question inputs
        gr.Examples(
            examples=example_rows,
            inputs=[picture, question_box],
            label="🏞️ Example Inputs",
        )

    return demo


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    app = build_ui()
    app.launch()