stpete2 commited on
Commit
0712e7e
·
verified ·
1 Parent(s): 3eece5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -35
app.py CHANGED
@@ -7,18 +7,20 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
10
- REVISION = "2024-08-26"
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
  # Load model
15
  # =========================
 
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  MODEL_ID,
18
  revision=REVISION,
19
  trust_remote_code=True
20
  )
21
 
 
22
  model = AutoModelForCausalLM.from_pretrained(
23
  MODEL_ID,
24
  revision=REVISION,
@@ -28,76 +30,85 @@ model = AutoModelForCausalLM.from_pretrained(
28
  ).to(DEVICE)
29
 
30
  model.eval()
 
31
 
32
  # =========================
33
- # Inference function (revised version)
34
  # =========================
35
  def understand_image(image, prompt):
36
  if image is None:
37
- return "Please upload an image."
38
 
39
  if not prompt or prompt.strip() == "":
40
- return "Please enter a question."
41
 
42
  try:
43
  image = image.convert("RGB")
44
 
45
- # Alternative 1: use answer_question directly
 
 
46
  with torch.no_grad():
47
- enc_image = model.encode_image(image)
48
- answer = model.answer_question(enc_image, prompt, tokenizer)
 
 
 
 
 
 
 
49
 
 
50
  return answer
51
 
52
- except AttributeError as e:
53
- # Alternative 2: try a different API
54
- try:
55
- with torch.no_grad():
56
- answer = model(image, prompt)
57
- return answer
58
- except:
59
- return f"Model API Error: {str(e)}\n\nThis model version may not be compatible. Please check the Moondream2 documentation."
60
-
61
  except Exception as e:
62
- return f"Error: {str(e)}"
 
 
 
 
 
 
63
  # =========================
64
  # Gradio UI
65
  # =========================
66
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
67
  gr.Markdown("# 🌓 Moondream2 Image Understanding")
68
  gr.Markdown(
69
- "Upload an image and ask questions about it. Processing runs on CPU and may take 10-30 seconds."
70
  )
71
 
72
  with gr.Row():
73
  with gr.Column():
74
- image_input = gr.Image(type="pil", label="Upload Image")
75
  text_input = gr.Textbox(
76
- label="Question",
77
- placeholder="What is in this image?",
78
- value="Describe this image."
 
79
  )
80
  btn = gr.Button("🔍 Analyze Image", variant="primary", size="lg")
81
 
82
  with gr.Column():
83
  output = gr.Textbox(
84
- label="Answer",
85
- lines=8,
86
- placeholder="The answer will appear here..."
87
  )
88
 
89
- # Examples
90
- gr.Markdown("### Example Questions:")
91
  gr.Examples(
92
  examples=[
93
- ["Describe this image in detail."],
94
- ["What objects are visible in this image?"],
95
- ["What colors are prominent in this image?"],
96
- ["What is the main subject of this image?"],
97
- ["Are there any people in this image?"]
 
98
  ],
99
  inputs=text_input,
100
- label="Click to use"
101
  )
102
 
103
  btn.click(
@@ -106,11 +117,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
106
  outputs=output
107
  )
108
 
109
- # Allow running via the Enter key as well
110
  text_input.submit(
111
  understand_image,
112
  inputs=[image_input, text_input],
113
  outputs=output
114
  )
115
 
116
- demo.launch()
 
 
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
10
+ REVISION = "2025-01-09" # newer revision
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
  # Load model
15
  # =========================
16
+ print("Loading tokenizer...")
17
  tokenizer = AutoTokenizer.from_pretrained(
18
  MODEL_ID,
19
  revision=REVISION,
20
  trust_remote_code=True
21
  )
22
 
23
+ print("Loading model...")
24
  model = AutoModelForCausalLM.from_pretrained(
25
  MODEL_ID,
26
  revision=REVISION,
 
30
  ).to(DEVICE)
31
 
32
  model.eval()
33
+ print("Model loaded successfully!")
34
 
35
  # =========================
36
+ # Inference function
37
  # =========================
38
  def understand_image(image, prompt):
39
  if image is None:
40
+ return "Please upload an image."
41
 
42
  if not prompt or prompt.strip() == "":
43
+ return "Please enter a question."
44
 
45
  try:
46
  image = image.convert("RGB")
47
 
48
+ print(f"Processing question: {prompt}")
49
+
50
+ # The correct Moondream2 API
51
  with torch.no_grad():
52
+ # Encode the image
53
+ image_embeds = model.encode_image(image)
54
+
55
+ # Answer the question
56
+ answer = model.answer_question(
57
+ image_embeds=image_embeds,
58
+ question=prompt,
59
+ tokenizer=tokenizer
60
+ )
61
 
62
+ print(f"Answer generated: {answer}")
63
  return answer
64
 
 
 
 
 
 
 
 
 
 
65
  except Exception as e:
66
+ error_msg = str(e)
67
+ print(f"Error occurred: {error_msg}")
68
+
69
+ # Add debugging information
70
+ available_methods = [method for method in dir(model) if not method.startswith('_')]
71
+ return f"❌ Error: {error_msg}\n\n🔍 Available model methods:\n{', '.join(available_methods[:20])}"
72
+
73
  # =========================
74
  # Gradio UI
75
  # =========================
76
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
77
  gr.Markdown("# 🌓 Moondream2 Image Understanding")
78
  gr.Markdown(
79
+ "Upload an image and ask questions about it. ⚠️ CPU processing may take 20-40 seconds."
80
  )
81
 
82
  with gr.Row():
83
  with gr.Column():
84
+ image_input = gr.Image(type="pil", label="📸 Upload Image")
85
  text_input = gr.Textbox(
86
+ label="❓ Your Question",
87
+ placeholder="What do you see in this image?",
88
+ value="Describe this image in detail.",
89
+ lines=2
90
  )
91
  btn = gr.Button("🔍 Analyze Image", variant="primary", size="lg")
92
 
93
  with gr.Column():
94
  output = gr.Textbox(
95
+ label="💬 Answer",
96
+ lines=10,
97
+ placeholder="The AI's response will appear here..."
98
  )
99
 
100
+ gr.Markdown("### 💡 Example Questions:")
 
101
  gr.Examples(
102
  examples=[
103
+ ["Describe what you see in this image."],
104
+ ["What objects are in this image?"],
105
+ ["What is the main subject?"],
106
+ ["What colors are most prominent?"],
107
+ ["Is this indoors or outdoors?"],
108
+ ["How many people are in the image?"]
109
  ],
110
  inputs=text_input,
111
+ label="Click to use these questions"
112
  )
113
 
114
  btn.click(
 
117
  outputs=output
118
  )
119
 
 
120
  text_input.submit(
121
  understand_image,
122
  inputs=[image_input, text_input],
123
  outputs=output
124
  )
125
 
126
+ if __name__ == "__main__":
127
+ demo.launch()