stpete2 committed on
Commit
e0ade18
·
verified ·
1 Parent(s): 9fe03bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -42
app.py CHANGED
@@ -1,81 +1,74 @@
1
- import torch
2
  import gradio as gr
3
- from transformers import AutoModelForVision2Seq, AutoProcessor
 
 
4
 
5
- # ===============================
6
  # Model config
7
- # ===============================
8
  MODEL_ID = "vikhyatk/moondream2"
 
9
 
10
- device = "cpu"
11
-
12
- # Processor & Model
13
- processor = AutoProcessor.from_pretrained(
14
  MODEL_ID,
15
  trust_remote_code=True
16
  )
17
 
18
- model = AutoModelForVision2Seq.from_pretrained(
19
- MODEL_ID,
20
- trust_remote_code=True,
21
- torch_dtype=torch.float32, # CPU安全
22
- ).to(device)
 
23
 
24
  model.eval()
25
 
26
- # ===============================
27
  # Inference function
28
- # ===============================
29
- def infer(image, prompt):
30
  if image is None:
31
  return "Please upload an image."
32
 
33
- if prompt is None or prompt.strip() == "":
34
- prompt = "Describe this image."
35
 
36
  with torch.no_grad():
37
- answer = model.answer(
38
- image=image,
39
- question=prompt
 
40
  )
41
 
42
  return answer
43
 
44
 
45
- # ===============================
46
  # Gradio UI
47
- # ===============================
48
  with gr.Blocks() as demo:
49
- gr.Markdown("# 🖼️ Vision Language Demo (moondream2 · CPU)")
50
  gr.Markdown(
51
  "⚠️ Uploaded images are processed in memory and not stored permanently."
52
  )
53
 
54
  with gr.Row():
55
- image = gr.Image(
56
- type="pil",
57
- label="Upload Image"
 
58
  )
59
 
60
- with gr.Column():
61
- textbox = gr.Textbox(
62
- label="Prompt",
63
- value="Describe this image."
64
- )
65
- btn = gr.Button("Run")
66
-
67
- output = gr.Textbox(
68
- label="Output",
69
- lines=6
70
- )
71
 
 
72
  btn.click(
73
- fn=infer,
74
- inputs=[image, textbox],
75
  outputs=output
76
  )
77
 
78
  demo.launch()
79
 
80
-
81
-
 
 
1
  import gradio as gr
2
+ import torch
3
+ from PIL import Image
4
+ from transformers import AutoTokenizer
5
 
6
+ # =========================
7
  # Model config
8
+ # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
10
+ DEVICE = "cpu"
11
 
12
+ # =========================
13
+ # Load model (IMPORTANT)
14
+ # =========================
15
+ tokenizer = AutoTokenizer.from_pretrained(
16
  MODEL_ID,
17
  trust_remote_code=True
18
  )
19
 
20
+ model = torch.load(
21
+ torch.hub.load_state_dict_from_url(
22
+ f"https://huggingface.co/{MODEL_ID}/resolve/main/moondream.pt",
23
+ map_location=DEVICE
24
+ )
25
+ )
26
 
27
  model.eval()
28
 
29
+ # =========================
30
  # Inference function
31
+ # =========================
32
def understand_image(image, prompt):
    """Answer a free-form question about an uploaded image.

    Args:
        image: PIL image from the Gradio component, or None when nothing
            was uploaded.
        prompt: question text; falls back to a generic description
            request when empty or whitespace-only.

    Returns:
        The model's answer string, or a hint message when no image
        was provided.
    """
    if image is None:
        return "Please upload an image."

    # Robustness: guard against an empty question (the UI does not
    # require one) by falling back to a generic prompt.
    if not prompt or not prompt.strip():
        prompt = "Describe this image."

    # Moondream expects an RGB image (uploads may be RGBA/grayscale).
    image = image.convert("RGB")

    with torch.no_grad():
        answer = model.answer_question(
            image,
            prompt,
            tokenizer,
        )

    return answer
46
 
47
 
48
+ # =========================
49
  # Gradio UI
50
+ # =========================
51
# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    # Title and privacy notice.
    gr.Markdown("# 🌓 Moondream2 Image Understanding (Free Tier)")
    gr.Markdown(
        "⚠️ Uploaded images are processed in memory and not stored permanently."
    )

    # Input row: picture next to the free-form question.
    with gr.Row():
        picture = gr.Image(type="pil", label="Upload Image")
        question = gr.Textbox(
            label="Question",
            placeholder="What is in this image?",
        )

    answer_box = gr.Textbox(label="Answer")

    # Wire the button to the inference function.
    run_button = gr.Button("Run")
    run_button.click(
        understand_image,
        inputs=[picture, question],
        outputs=answer_box,
    )

demo.launch()
74