File size: 2,219 Bytes
3b7dd4e
 
 
 
138c4d4
7d00133
138c4d4
3b7dd4e
 
138c4d4
 
 
 
 
3b7dd4e
 
138c4d4
7d00133
 
 
3b7dd4e
 
7d00133
 
3b7dd4e
138c4d4
7d00133
138c4d4
3b7dd4e
 
138c4d4
 
 
7d00133
138c4d4
 
3b7dd4e
 
 
 
138c4d4
3b7dd4e
 
 
 
 
 
 
 
 
 
 
 
 
7d00133
 
3b7dd4e
 
138c4d4
 
7d00133
 
138c4d4
3b7dd4e
 
 
 
 
 
 
 
 
138c4d4
3b7dd4e
 
138c4d4
 
 
3b7dd4e
 
 
 
7d00133
3b7dd4e
 
7d00133
 
3b7dd4e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# =========================
# Load model (CPU optimized)
# =========================
# Hugging Face model id for the GUI-Actor verifier checkpoint.
model_id = "microsoft/GUI-Actor-Verifier-2B"

# trust_remote_code=True is required because this repo ships custom
# processor/model classes — NOTE(review): this executes code from the
# model repository; only safe for trusted model ids.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True
)

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,   # CPU needs float32
    device_map="cpu",            # force CPU
    low_cpu_mem_usage=True       # stream weights in to cut peak RAM during load
)

# Inference only: disable dropout/batch-norm training behavior.
model.eval()


# =========================
# Inference
# =========================
def run_model(image, prompt):
    """Run the vision-language model on an uploaded image with a text prompt.

    Args:
        image: PIL image from the Gradio image input, or None when the
            user submitted without uploading.
        prompt: user question; a generic caption request is substituted
            when it is empty or whitespace-only.

    Returns:
        The model's decoded text response, or a "❌"-prefixed error
        string so failures render inline in the UI textbox.
    """
    try:
        if image is None:
            return "❌ Please upload an image."

        # Treat whitespace-only prompts the same as empty ones.
        if not prompt or not prompt.strip():
            prompt = "Describe this image."

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Move only tensor values to CPU. Processor outputs can contain
        # non-tensor entries (e.g. lists of image sizes), which would
        # raise AttributeError on an unconditional .to("cpu").
        inputs = {
            k: (v.to("cpu") if hasattr(v, "to") else v)
            for k, v in inputs.items()
        }

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,   # IMPORTANT: keep small for CPU
                do_sample=False
            )

        # Slice off the echoed prompt tokens; decode only the newly
        # generated continuation.
        result = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        return result

    except Exception as e:
        # Broad catch is deliberate: surface any failure in the UI
        # instead of crashing the Gradio worker.
        return f"❌ Error: {str(e)}"


# =========================
# UI
# =========================
# Simple single-function UI: one image + one text input mapped straight
# onto run_model, output shown in a read-only textbox.
demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Your Question")
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="GUI Actor Verifier (CPU Mode)",
    description="⚠️ Running on CPU — responses may be slow."
)

# Blocks until the server is stopped; default host/port (127.0.0.1:7860).
demo.launch()