stpete2 committed on
Commit
c7af37d
·
verified ·
1 Parent(s): a536dc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -30
app.py CHANGED
@@ -1,29 +1,31 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import AutoTokenizer
5
 
6
  # =========================
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
 
10
  DEVICE = "cpu"
11
 
12
  # =========================
13
- # Load model (IMPORTANT)
14
  # =========================
15
  tokenizer = AutoTokenizer.from_pretrained(
16
  MODEL_ID,
 
17
  trust_remote_code=True
18
  )
19
 
20
- model = torch.load(
21
- torch.hub.load_state_dict_from_url(
22
- f"https://huggingface.co/{MODEL_ID}/resolve/main/moondream.pt",
23
- map_location=DEVICE
24
- )
 
25
  )
26
-
27
  model.eval()
28
 
29
  # =========================
@@ -32,43 +34,61 @@ model.eval()
32
  def understand_image(image, prompt):
33
  if image is None:
34
  return "Please upload an image."
35
-
36
- image = image.convert("RGB")
37
-
38
- with torch.no_grad():
 
 
 
 
 
39
  answer = model.answer_question(
40
- image,
41
  prompt,
42
  tokenizer
43
  )
44
-
45
- return answer
46
-
 
47
 
48
  # =========================
49
  # Gradio UI
50
  # =========================
51
  with gr.Blocks() as demo:
52
- gr.Markdown("# 🌓 Moondream2 Image Understanding (Free Tier)")
53
  gr.Markdown(
54
- "⚠️ Uploaded images are processed in memory and not stored permanently."
55
  )
56
-
57
  with gr.Row():
58
- image_input = gr.Image(type="pil", label="Upload Image")
59
- text_input = gr.Textbox(
60
- label="Question",
61
- placeholder="What is in this image?"
62
- )
63
-
64
- output = gr.Textbox(label="Answer")
65
-
66
- btn = gr.Button("Run")
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  btn.click(
68
  understand_image,
69
  inputs=[image_input, text_input],
70
  outputs=output
71
  )
72
 
73
- demo.launch()
74
-
 
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
 
6
  # =========================
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
10
+ REVISION = "2024-08-26" # 安定版のリビジョン
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
+ # Load model (FIXED)
15
  # =========================
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  MODEL_ID,
18
+ revision=REVISION,
19
  trust_remote_code=True
20
  )
21
 
22
+ model = AutoModelForCausalLM.from_pretrained(
23
+ MODEL_ID,
24
+ revision=REVISION,
25
+ trust_remote_code=True,
26
+ torch_dtype=torch.float32, # CPUの場合はfloat32
27
+ device_map={"": DEVICE}
28
  )
 
29
  model.eval()
30
 
31
  # =========================
 
34
def understand_image(image, prompt):
    """Answer a free-form question about an uploaded image with Moondream2.

    Args:
        image: PIL image from the Gradio input, or None if nothing was uploaded.
        prompt: The user's question about the image.

    Returns:
        The model's answer as a string, or a human-readable message for
        missing input / runtime failures (Gradio renders the return value).
    """
    if image is None:
        return "Please upload an image."

    if not prompt or prompt.strip() == "":
        return "Please enter a question."

    try:
        image = image.convert("RGB")

        # Inference only: no_grad avoids building an autograd graph, which
        # wastes memory and time on CPU. (The pre-refactor version had this;
        # it was dropped in the rewrite — restored here.)
        with torch.no_grad():
            enc_image = model.encode_image(image)
            answer = model.answer_question(
                enc_image,
                prompt,
                tokenizer
            )
        return answer

    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"Error: {str(e)}"
55
 
56
# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    # Title and CPU-latency notice shown above the controls.
    gr.Markdown("# 🌓 Moondream2 Image Understanding")
    gr.Markdown(
        "⚠️ This space runs on CPU. Processing may take a few seconds."
    )

    with gr.Row():
        # Left column: the image, the question, and the trigger button.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            text_input = gr.Textbox(
                label="Question",
                placeholder="What is in this image?",
                value="Describe this image.",
            )
            run_button = gr.Button("Run", variant="primary")

        # Right column: the model's answer.
        with gr.Column():
            answer_box = gr.Textbox(label="Answer", lines=5)

    # Canned questions a user can click to fill the question box.
    gr.Examples(
        examples=[
            ["What objects are in this image?"],
            ["Describe the scene in detail."],
            ["What colors do you see?"],
        ],
        inputs=text_input,
    )

    run_button.click(
        understand_image,
        inputs=[image_input, text_input],
        outputs=answer_box,
    )

demo.launch()