stpete2 committed on
Commit
5bf0cd8
·
verified ·
1 Parent(s): 7e75cb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -19
app.py CHANGED
@@ -11,7 +11,7 @@ REVISION = "2024-08-26"
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
- # Load model (FIXED - device_mapを削除)
15
  # =========================
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  MODEL_ID,
@@ -24,13 +24,13 @@ model = AutoModelForCausalLM.from_pretrained(
24
  revision=REVISION,
25
  trust_remote_code=True,
26
  torch_dtype=torch.float32,
27
- low_cpu_mem_usage=True # device_mapの代わりにこれを使用
28
  ).to(DEVICE)
29
 
30
  model.eval()
31
 
32
  # =========================
33
- # Inference function
34
  # =========================
35
  def understand_image(image, prompt):
36
  if image is None:
@@ -40,27 +40,26 @@ def understand_image(image, prompt):
40
  return "Please enter a question."
41
 
42
  try:
 
43
  image = image.convert("RGB")
44
 
45
- # Moondream2の推論
46
- enc_image = model.encode_image(image)
47
- answer = model.answer_question(
48
- enc_image,
49
- prompt,
50
- tokenizer
51
- )
52
  return answer
53
 
54
  except Exception as e:
55
- return f"Error: {str(e)}"
56
 
57
  # =========================
58
  # Gradio UI
59
  # =========================
60
- with gr.Blocks() as demo:
61
  gr.Markdown("# 🌓 Moondream2 Image Understanding")
62
  gr.Markdown(
63
- "⚠️ This space runs on CPU. Processing may take a few seconds."
64
  )
65
 
66
  with gr.Row():
@@ -71,19 +70,27 @@ with gr.Blocks() as demo:
71
  placeholder="What is in this image?",
72
  value="Describe this image."
73
  )
74
- btn = gr.Button("Run", variant="primary")
75
 
76
  with gr.Column():
77
- output = gr.Textbox(label="Answer", lines=5)
 
 
 
 
78
 
79
  # Examples
 
80
  gr.Examples(
81
  examples=[
82
- ["What objects are in this image?"],
83
- ["Describe the scene in detail."],
84
- ["What colors do you see?"]
 
 
85
  ],
86
- inputs=text_input
 
87
  )
88
 
89
  btn.click(
@@ -91,5 +98,12 @@ with gr.Blocks() as demo:
91
  inputs=[image_input, text_input],
92
  outputs=output
93
  )
 
 
 
 
 
 
 
94
 
95
  demo.launch()
 
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
+ # Load model
15
  # =========================
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  MODEL_ID,
 
24
  revision=REVISION,
25
  trust_remote_code=True,
26
  torch_dtype=torch.float32,
27
+ low_cpu_mem_usage=True
28
  ).to(DEVICE)
29
 
30
  model.eval()
31
 
32
  # =========================
33
+ # Inference function (修正版)
34
  # =========================
35
  def understand_image(image, prompt):
36
  if image is None:
 
40
  return "Please enter a question."
41
 
42
  try:
43
+ # 画像をRGBに変換
44
  image = image.convert("RGB")
45
 
46
+ # Moondream2の正しい使用方法
47
+ # encode_imageではなく、直接queryメソッドを使用
48
+ with torch.no_grad():
49
+ answer = model.query(image, prompt, tokenizer)
50
+
 
 
51
  return answer
52
 
53
  except Exception as e:
54
+ return f"Error: {str(e)}\n\nPlease try a different question or image."
55
 
56
  # =========================
57
  # Gradio UI
58
  # =========================
59
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
60
  gr.Markdown("# 🌓 Moondream2 Image Understanding")
61
  gr.Markdown(
62
+ "Upload an image and ask questions about it. Processing runs on CPU and may take 10-30 seconds."
63
  )
64
 
65
  with gr.Row():
 
70
  placeholder="What is in this image?",
71
  value="Describe this image."
72
  )
73
+ btn = gr.Button("🔍 Analyze Image", variant="primary", size="lg")
74
 
75
  with gr.Column():
76
+ output = gr.Textbox(
77
+ label="Answer",
78
+ lines=8,
79
+ placeholder="The answer will appear here..."
80
+ )
81
 
82
  # Examples
83
+ gr.Markdown("### Example Questions:")
84
  gr.Examples(
85
  examples=[
86
+ ["Describe this image in detail."],
87
+ ["What objects are visible in this image?"],
88
+ ["What colors are prominent in this image?"],
89
+ ["What is the main subject of this image?"],
90
+ ["Are there any people in this image?"]
91
  ],
92
+ inputs=text_input,
93
+ label="Click to use"
94
  )
95
 
96
  btn.click(
 
98
  inputs=[image_input, text_input],
99
  outputs=output
100
  )
101
+
102
+ # Enter keyでも実行できるように
103
+ text_input.submit(
104
+ understand_image,
105
+ inputs=[image_input, text_input],
106
+ outputs=output
107
+ )
108
 
109
  demo.launch()