Spaces:

stpete2
/

image_understand

Sleeping

stpete2 committed 27 days ago

Commit

13cece1

verified ·

1 Parent(s): 6a19471

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,48 +1,53 @@
 import gradio as gr
 import torch
 from PIL import Image
-from transformers import AutoProcessor, AutoModelForVision2Seq
-MODEL_ID = "llava-hf/llava-1.5-7b-hf"
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID,
-    dtype=torch.float32,
-    device_map="cpu"
 )
-def image_understand(image, text):
     if image is None:
-        return "Please upload an image."
     image = image.convert("RGB")
-    prompt = f"USER: <image>\n{text}\nASSISTANT:"
-    inputs = processor(
-        images=image,
-        text=prompt,
-        return_tensors="pt"
-    )
     with torch.no_grad():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=200
         )
-    return processor.decode(output[0], skip_special_tokens=True)
-demo = gr.Interface(
-    fn=image_understand,
-    inputs=[
-        gr.Image(type="pil", label="Image"),
-        gr.Textbox(label="Question")
-    ],
-    outputs=gr.Textbox(label="Answer"),
-    title="Free Vision LLM Demo (HF Spaces CPU)"
-)
 demo.launch()

 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
+MODEL_ID = "vikhyatk/moondream2"
+# ---- Load model (CPU) ----
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
+    torch_dtype=torch.float32,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True
 )
+model.eval()
+# ---- Inference ----
+def caption_image(image, prompt):
     if image is None:
+        return "No image provided."
     image = image.convert("RGB")
     with torch.no_grad():
+        answer = model.answer_question(
+            image,
+            prompt if prompt else "Describe the image.",
+            tokenizer
         )
+    return answer
+# ---- Gradio UI ----
+with gr.Blocks() as demo:
+    gr.Markdown("# 🖼️ Vision Chatbot (moondream2, CPU)")
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload Image")
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Describe the image / What is happening?"
+            )
+            output = gr.Textbox(label="Model Output")
+    btn = gr.Button("Run")
+    btn.click(
+        fn=caption_image,
+        inputs=[image_input, text_input],
+        outputs=output
+    )
 demo.launch()