Spaces:

stpete2
/

image_understand

Sleeping

File size: 3,626 Bytes

9fe03bb
e0ade18
 
c7af37d
361522e
e0ade18
9fe03bb
e0ade18
13cece1
b812f5d
e0ade18
361522e
e0ade18
5bf0cd8
e0ade18
0712e7e
e0ade18
361522e
c7af37d
13cece1
361522e
9fe03bb
0712e7e
c7af37d
 
 
 
7e75cb8
5bf0cd8
7e75cb8
 
13cece1
0712e7e
361522e
e0ade18
0712e7e
e0ade18
 
361522e
0712e7e
c7af37d
 
0712e7e
c7af37d
 
 
 
0712e7e
 
 
5bf0cd8
0712e7e
 
 
 
 
 
 
 
 
5bf0cd8
0712e7e
c7af37d
 
 
0712e7e
 
 
 
 
 
 
e0ade18
9fe03bb
e0ade18
5bf0cd8
c7af37d
9fe03bb
0712e7e
9fe03bb
c7af37d
9fe03bb
c7af37d
0712e7e
c7af37d
0712e7e
 
 
 
c7af37d
5bf0cd8
c7af37d
 
5bf0cd8
0712e7e
 
 
5bf0cd8
c7af37d
0712e7e
c7af37d
 
0712e7e
 
 
 
 
 
c7af37d
5bf0cd8
0712e7e
c7af37d
 
9fe03bb
e0ade18
 
9fe03bb
 
5bf0cd8
 
 
 
 
 
361522e
0712e7e
ddb86c5

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# -------------------------------------------------
# Checkpoint selection and one-time startup loading
# -------------------------------------------------
# Hugging Face Hub repository the tokenizer and model are fetched from.
MODEL_ID = "vikhyatk/moondream2"
# No revision is pinned; None is forwarded unchanged to from_pretrained().
REVISION = None
# Device string the loaded model is moved to.
DEVICE = "cpu"

# Keyword arguments shared by both from_pretrained() calls below.
# NOTE(review): trust_remote_code=True together with an unpinned revision
# means whatever modelling code the repository currently ships is what gets
# executed -- consider pinning REVISION to a known commit.
_hub_kwargs = {"revision": REVISION, "trust_remote_code": True}

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **_hub_kwargs)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    **_hub_kwargs,
)
model = model.to(DEVICE)

# Put the model in evaluation mode before it is used by the request handler.
model.eval()
print("Model loaded successfully!")

# =========================
# Inference function
# =========================
def understand_image(image, prompt):
    """Answer a free-form question about an image with the loaded model.

    Args:
        image: Image to analyse, or None when nothing was uploaded. It must
            provide ``convert("RGB")`` (the UI supplies a PIL image).
        prompt: Question text. None, empty and whitespace-only values are
            rejected; surrounding whitespace is removed before inference.

    Returns:
        The model's answer, or a message starting with "❌" when an input is
        missing or inference fails. Exceptions raised during inference are
        reported in the returned message rather than propagated.
    """
    if image is None:
        return "❌ Please upload an image."

    # Normalise once so validation, logging and the model all see the same
    # text (the multi-line textbox can leave stray newlines around it).
    question = prompt.strip() if prompt else ""
    if not question:
        return "❌ Please enter a question."

    try:
        rgb_image = image.convert("RGB")

        print(f"Processing question: {question}")

        # Moondream2 API: encode the image first, then ask the question
        # against the resulting embeddings. No gradients are needed.
        with torch.no_grad():
            image_embeds = model.encode_image(rgb_image)
            answer = model.answer_question(
                image_embeds=image_embeds,
                question=question,
                tokenizer=tokenizer,
            )

        print(f"Answer generated: {answer}")
        return answer

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # raising, but keep the full traceback in the server log.
        import traceback

        # Some exceptions carry no message; fall back to repr() so the
        # report is never blank.
        error_msg = str(e) or repr(e)
        print(f"Error occurred: {error_msg}")
        traceback.print_exc()

        # Debug aid: list the first public attributes of the model object so
        # an API mismatch with the remote model code is easy to spot.
        available_methods = [name for name in dir(model) if not name.startswith('_')]
        return f"❌ Error: {error_msg}\n\n🔍 Available model methods:\n{', '.join(available_methods[:20])}"

# -------------------------------------------------
# Gradio interface: layout, examples, event wiring
# -------------------------------------------------
# Sample questions shown under the main row. Each sample is wrapped in its
# own list because it fills exactly one input component (the question box).
_EXAMPLE_QUESTIONS = [
    [question]
    for question in (
        "Describe what you see in this image.",
        "What objects are in this image?",
        "What is the main subject?",
        "What colors are most prominent?",
        "Is this indoors or outdoors?",
        "How many people are in the image?",
    )
]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌓 Moondream2 Image Understanding")
    gr.Markdown(
        "Upload an image and ask questions about it. ⚠️ CPU processing may take 20-40 seconds."
    )

    with gr.Row():
        # First column: the image, the question and the trigger button.
        with gr.Column():
            image_input = gr.Image(label="📸 Upload Image", type="pil")
            text_input = gr.Textbox(
                lines=2,
                value="Describe this image in detail.",
                placeholder="What do you see in this image?",
                label="❓ Your Question",
            )
            btn = gr.Button("🔍 Analyze Image", size="lg", variant="primary")

        # Second column: the text returned by the handler.
        with gr.Column():
            output = gr.Textbox(
                placeholder="The AI's response will appear here...",
                lines=10,
                label="💬 Answer",
            )

    gr.Markdown("### 💡 Example Questions:")
    gr.Examples(
        examples=_EXAMPLE_QUESTIONS,
        inputs=text_input,
        label="Click to use these questions",
    )

    # Clicking the button and submitting the textbox run the same handler
    # with the same inputs and output, so the wiring is declared once.
    _handler_wiring = {
        "fn": understand_image,
        "inputs": [image_input, text_input],
        "outputs": output,
    }
    btn.click(**_handler_wiring)
    text_input.submit(**_handler_wiring)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()