stpete2 committed
Commit 361522e · verified · 1 Parent(s): a284910

Update app.py

Files changed (1)
  1. app.py +41 -40
app.py CHANGED
@@ -1,46 +1,47 @@
 import gradio as gr
-import base64
-import os
-from openai import OpenAI
-
-client = OpenAI(api_key=os.environ["openai_api_key"])
-
-def image_understand(image, prompt):
-    # PIL Image → base64
-    import io
-    buf = io.BytesIO()
-    image.save(buf, format="PNG")
-    img_b64 = base64.b64encode(buf.getvalue()).decode()
-
-    response = client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{img_b64}"
-                        },
-                    },
-                ],
-            }
-        ],
-        max_tokens=300,
+import torch
+from PIL import Image
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+MODEL_ID = "llava-hf/llava-1.5-7b-hf"
+
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+model = LlavaForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float32,
+    device_map="cpu"
+)
+
+def image_understand(image, text):
+    if image is None:
+        return "Please upload an image."
+
+    image = image.convert("RGB")
+
+    prompt = f"USER: <image>\n{text}\nASSISTANT:"
+
+    inputs = processor(
+        images=image,
+        text=prompt,
+        return_tensors="pt"
     )
-    return response.choices[0].message.content
 
-gr.Interface(
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=200
+        )
+
+    return processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+
+demo = gr.Interface(
     fn=image_understand,
     inputs=[
-        gr.Image(type="pil", label="Image"),
-        gr.Textbox(
-            value="Describe this image objectively.",
-            label="Prompt"
-        )
+        gr.Image(type="pil"),
+        gr.Textbox(label="Question")
     ],
-    outputs=gr.Textbox(label="Result"),
-    title="Image Understanding Demo (GPT-4o mini)"
-).launch()
+    outputs=gr.Textbox(label="Answer"),
+    title="Free Vision LLM Demo (HF Spaces, CPU)"
+)
+
+demo.launch()
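
To sanity-check the new handler outside the Space, a minimal smoke test could look like the sketch below. It assumes things the commit does not state: the model weights are already downloaded, accelerate is installed (from_pretrained with device_map requires it), the code runs in a session that has imported app.py, and "sample.jpg" is a placeholder path for any local image.

    # Local smoke test for image_understand (a sketch, not part of the commit).
    from PIL import Image

    img = Image.open("sample.jpg")  # placeholder path; any RGB-convertible image works
    print(image_understand(img, "What objects are in this picture?"))

On the free CPU tier a single call to a 7B model in float32 can take minutes, so a timeout here signals a slow box rather than a broken handler.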