File size: 3,626 Bytes
9fe03bb
e0ade18
 
c7af37d
361522e
e0ade18
9fe03bb
e0ade18
13cece1
b812f5d
e0ade18
361522e
e0ade18
5bf0cd8
e0ade18
0712e7e
e0ade18
361522e
c7af37d
13cece1
361522e
9fe03bb
0712e7e
c7af37d
 
 
 
7e75cb8
5bf0cd8
7e75cb8
 
13cece1
0712e7e
361522e
e0ade18
0712e7e
e0ade18
 
361522e
0712e7e
c7af37d
 
0712e7e
c7af37d
 
 
 
0712e7e
 
 
5bf0cd8
0712e7e
 
 
 
 
 
 
 
 
5bf0cd8
0712e7e
c7af37d
 
 
0712e7e
 
 
 
 
 
 
e0ade18
9fe03bb
e0ade18
5bf0cd8
c7af37d
9fe03bb
0712e7e
9fe03bb
c7af37d
9fe03bb
c7af37d
0712e7e
c7af37d
0712e7e
 
 
 
c7af37d
5bf0cd8
c7af37d
 
5bf0cd8
0712e7e
 
 
5bf0cd8
c7af37d
0712e7e
c7af37d
 
0712e7e
 
 
 
 
 
c7af37d
5bf0cd8
0712e7e
c7af37d
 
9fe03bb
e0ade18
 
9fe03bb
 
5bf0cd8
 
 
 
 
 
361522e
0712e7e
ddb86c5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# =========================
# Model config
# =========================
MODEL_ID = "vikhyatk/moondream2"
# Forwarded unchanged as the `revision` argument of both downloads below.
# NOTE(review): with None no specific commit is pinned; since
# trust_remote_code is enabled, consider pinning a commit hash.
REVISION = None
DEVICE = "cpu"

# Hub options that the tokenizer and the model must agree on, kept in one
# place so both are always fetched from the same revision.
_HUB_OPTIONS = {"revision": REVISION, "trust_remote_code": True}

# =========================
# Load model
# =========================
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **_HUB_OPTIONS)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    **_HUB_OPTIONS,
)
model = model.to(DEVICE)

# The app only runs inference, so switch the module to evaluation mode once.
model.eval()
print("Model loaded successfully!")

# =========================
# Inference function
# =========================
def understand_image(image, prompt):
    """Answer a free-form question about an uploaded image with Moondream2.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
            It must provide ``convert("RGB")``; the UI passes a PIL image.
        prompt: The question to ask about the image.

    Returns:
        The model's answer on success. Otherwise a user-facing message
        prefixed with "❌": a hint when the image or question is missing,
        or the error text plus a short list of the model's public
        attribute names when inference fails.
    """
    if image is None:
        return "❌ Please upload an image."

    # Treat None, "" and whitespace-only input as "no question".
    if not prompt or not prompt.strip():
        return "❌ Please enter a question."

    try:
        image = image.convert("RGB")

        print(f"Processing question: {prompt}")

        # Moondream2's API: encode the image first, then answer the
        # question against the resulting embeddings. Gradients are not
        # needed for inference.
        with torch.no_grad():
            image_embeds = model.encode_image(image)
            answer = model.answer_question(
                image_embeds=image_embeds,
                question=prompt,
                tokenizer=tokenizer,
            )

        print(f"Answer generated: {answer}")
        return answer

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception escape the event handler.
        error_msg = str(e)
        print(f"Error occurred: {error_msg}")

        # Debug aid: list the first public attributes of the model object so
        # a mismatch with the expected Moondream2 API is easy to spot.
        available_methods = [name for name in dir(model) if not name.startswith("_")]
        return f"❌ Error: {error_msg}\n\n🔍 Available model methods:\n{', '.join(available_methods[:20])}"

# =========================
# Gradio UI
# =========================
# Two-column layout: image + question + button on the left, answer on the
# right. Emoji in the labels are written as real UTF-8 characters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌓 Moondream2 Image Understanding")
    gr.Markdown(
        "Upload an image and ask questions about it. ⚠️ CPU processing may take 20-40 seconds."
    )

    with gr.Row():
        with gr.Column():
            # type="pil" hands the handler a PIL image (or None if empty).
            image_input = gr.Image(type="pil", label="📸 Upload Image")
            text_input = gr.Textbox(
                label="❓ Your Question",
                placeholder="What do you see in this image?",
                value="Describe this image in detail.",
                lines=2,
            )
            btn = gr.Button("🔍 Analyze Image", variant="primary", size="lg")

        with gr.Column():
            output = gr.Textbox(
                label="💬 Answer",
                lines=10,
                placeholder="The AI's response will appear here...",
            )

    gr.Markdown("### 💡 Example Questions:")
    # Each example fills only the question box; the image is left as is.
    gr.Examples(
        examples=[
            ["Describe what you see in this image."],
            ["What objects are in this image?"],
            ["What is the main subject?"],
            ["What colors are most prominent?"],
            ["Is this indoors or outdoors?"],
            ["How many people are in the image?"],
        ],
        inputs=text_input,
        label="Click to use these questions",
    )

    # Clicking the button and submitting the textbox run the same handler
    # with the same inputs and output.
    btn.click(
        understand_image,
        inputs=[image_input, text_input],
        outputs=output,
    )

    text_input.submit(
        understand_image,
        inputs=[image_input, text_input],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()