File size: 3,736 Bytes
c7547a1
d35bac6
 
 
 
 
 
 
 
c7547a1
d35bac6
 
 
 
 
 
 
 
c7547a1
 
 
 
d35bac6
c7547a1
 
 
d35bac6
c7547a1
d35bac6
c7547a1
d35bac6
c7547a1
 
 
 
 
 
 
 
 
 
 
d35bac6
 
c7547a1
d35bac6
 
 
 
 
 
 
 
 
c7547a1
d35bac6
 
 
c7547a1
d35bac6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7547a1
d35bac6
c7547a1
d35bac6
 
 
 
 
 
 
c7547a1
d35bac6
 
c7547a1
 
d35bac6
 
 
c7547a1
 
 
d35bac6
 
 
c7547a1
d35bac6
c7547a1
d35bac6
c7547a1
 
d35bac6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# app.py — CPU-only image→text QA via Transformers pipeline + Gradio

from packaging import version
import transformers
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image

# ---- Governance: ensure pipeline task is supported ----
# The "image-text-to-text" pipeline task does not exist in older transformers
# releases, so fail fast with an actionable upgrade command rather than a
# confusing KeyError from pipeline() below.
MIN_TF = "4.46.0"  # minimum transformers version checked for the task
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )

# -------- Choose a CPU-friendly model here --------
# Alternatives are kept as comments; swap MODEL_ID to experiment.
# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MODEL_ID = "vikhyatk/moondream2"
# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"  # example tiny option

# ---- Force CPU posture ----
DEVICE = "cpu"
DTYPE = torch.float32  # CPU-safe

# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
try:
    import torchvision  # noqa: F401
except Exception:
    pass  # If your chosen model needs it, install torchvision

# ---- Bootstrap pipeline (CPU only) ----
# NOTE(review): trust_remote_code=True executes Python fetched from the model
# repo at load time — acceptable only because MODEL_ID is pinned above.
# NOTE(review): the plain `dtype=` kwarg is only honored by recent
# transformers releases (older ones expect `torch_dtype=`) — confirm it is
# accepted at MIN_TF and not silently forwarded as a model kwarg.
pipe = pipeline(
    task="image-text-to-text",
    model=MODEL_ID,
    device=DEVICE,      # <- forces CPU
    dtype=DTYPE,        # <- CPU dtype
    trust_remote_code=True,
    use_fast=True,      # if supported by the model’s processor
)

def _extract_text(obj):
    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
    if obj is None:
        return ""
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        gen = obj.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Prefer assistant turns if present
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    content = turn.get("content")
                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
            return _extract_text(gen[0])
        if "text" in obj and isinstance(obj["text"], str):
            return obj["text"]
        return str(obj)
    if isinstance(obj, (list, tuple)) and obj:
        return _extract_text(obj[0])
    return str(obj)

def infer(image: Image.Image, question: str) -> str:
    """Answer *question* about *image* using the module-level pipeline.

    Always returns a user-facing string: a short prompt when input is
    missing, the model's (normalized) answer otherwise.
    """
    if image is None:
        return "Please upload an image."
    query = (question or "").strip()
    if not query:
        return "Please enter a question."

    # Preferred: chat-style messages (auto-injects image tokens correctly)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query},
            ],
        }
    ]
    try:
        result = pipe(text=messages, max_new_tokens=96)
    except Exception:
        # Fallback: dict API — ensure a LIST for images
        result = pipe({"images": [image], "text": query}, max_new_tokens=96)

    answer = _extract_text(result).strip()
    if answer:
        return answer
    return "(empty response)"

# ---- Gradio UI ----
# Two-pane layout: image upload on the left, question/answer column on the
# right. Both the "Ask" button and pressing Enter in the textbox route
# through the same infer() handler.
with gr.Blocks(title="CPU-only Vision QA") as demo:
    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")  # delivered to infer() as a PIL image
        with gr.Column():
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
            submit = gr.Button("Ask")
            out = gr.TextArea(label="Answer", lines=6)

    submit.click(infer, [img, prompt], out)
    prompt.submit(infer, [img, prompt], out)  # Enter in the textbox also submits

if __name__ == "__main__":
    # queue() serializes concurrent requests — useful on CPU where each
    # inference is slow; debug=True surfaces handler tracebacks in the UI.
    demo.queue().launch(debug=True)