Update app.py
app.py
CHANGED
@@ -1,16 +1,24 @@
- import base64, os
-
-
  import torch
  import gradio as gr
- from typing import Optional
- from PIL import Image, ImageDraw
  import numpy as np
  import matplotlib.pyplot as plt
- from qwen_vl_utils import process_vision_info
- from datasets import load_dataset
  from transformers import AutoProcessor
- from gui_actor.constants import chat_template
  from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
  from gui_actor.inference import inference

@@ -24,7 +32,6 @@ def resize_image(image, resize_to_pixels=MAX_PIXELS):
      image = image.resize((image_width_resized, image_height_resized))
      return image

- # @spaces.GPU
  @torch.inference_mode()
  def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
      overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
@@ -33,79 +40,91 @@ def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)
      overlay_draw.ellipse(
          [(x - radius, y - radius), (x + radius, y + radius)],
          outline=color,
-         width=5
      )
      image = image.convert('RGBA')
      combined = Image.alpha_composite(image, overlay)
      combined = combined.convert('RGB')
      return combined

- # @spaces.GPU
  @torch.inference_mode()
  def get_attn_map(image, attn_scores, n_width, n_height):
      w, h = image.size
      scores = np.array(attn_scores[0]).reshape(n_height, n_width)
-     scores_norm = (scores - scores.min()) / (scores.max() - scores.min())
-
-     # Resize score map to match image size
-     score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST) # BILINEAR)
-     # Apply colormap
      colormap = plt.get_cmap('jet')
-     colored_score_map = colormap(np.array(score_map) / 255.0)
-
-     colored_overlay = Image.fromarray(colored_score_map)
-
-     # Blend with original image
      blended = Image.blend(image, colored_overlay, alpha=0.3)
      return blended

- # …
[… 12 more deleted lines (old 63-74) truncated in the diff view …]
  model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
  data_processor = AutoProcessor.from_pretrained(model_name_or_path)
  tokenizer = data_processor.tokenizer
- model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
      model_name_or_path,
-     torch_dtype=…
-     …
  ).eval()

- title = …
- header = """
- <div align="center">
- <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
- <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
- Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
- Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
- </div>
- <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
- <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
- </div>
- </div>
- """

[… 3 deleted lines (old 98-100) truncated in the diff view …]

- # @spaces.GPU
  @torch.inference_mode()
  def process(image, instruction):
-     # …
      w, h = image.size
      if w * h > MAX_PIXELS:
          image = resize_image(image)

      conversation = [
          {
@@ -113,32 +132,39 @@ def process(image, instruction):
              "content": [
                  {
                      "type": "text",
-                     "text": …
                  }
-             ]
          },
          {
              "role": "user",
              "content": [
-                 {
-                     "type": "image",
-                     "image": image, # PIL.Image.Image or str to path
-                     # "image_url": "https://xxxxx.png" or "https://xxxxx.jpg" or "file://xxxxx.png" or "data:image/png;base64,xxxxxxxx", will be split by "base64,"
-                 },
-                 {
-                     "type": "text",
-                     "text": instruction,
-                 },
              ],
          },
      ]

      try:
-         pred = inference(…
      except Exception as e:
-         print(e)
          return image, f"Error: {e}", None
-
      px, py = pred["topk_points"][0]
      output_coord = f"({px:.4f}, {py:.4f})"
      img_with_point = draw_point(image, (px * w, py * h))
@@ -146,20 +172,37 @@ def process(image, instruction):
      n_width, n_height = pred["n_width"], pred["n_height"]
      attn_scores = pred["attn_scores"]
      att_map = get_attn_map(image, attn_scores, n_width, n_height)
-
      return img_with_point, output_coord, att_map


- with gr.Blocks(title=title, css=css) as demo:
      gr.Markdown(header)
      with gr.Row():
          with gr.Column():
-             input_image = gr.Image(
[… 2 deleted lines (old 158-159) truncated in the diff view …]
-             input_instruction = gr.Textbox(label='Instruction', placeholder='Text your (low-level) instruction here')
-             submit_button = gr.Button(
-                 value='Submit', variant='primary')
          with gr.Column():
              image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
          with gr.Accordion('Detailed prediction'):
@@ -168,13 +211,11 @@ with gr.Blocks(title=title, css=css) as demo:

      submit_button.click(
          fn=process,
-         inputs=[
-             input_image,
-             input_instruction,
-         ],
-         outputs=[image_with_point, pred_xy, att_map]
      )

- # …
[… 1 deleted line (old 179) truncated in the diff view …]
- demo.queue().launch(share=False)
Updated app.py:

+ import base64, os, json
+ from typing import Optional
+
  import torch
  import gradio as gr
  import numpy as np
  import matplotlib.pyplot as plt
+ from PIL import Image, ImageDraw
+
+ # ---- Hugging Face Spaces GPU decorator (safe fallback when not on Spaces) ----
+ try:
+     import spaces
+     GPU_DECORATOR = spaces.GPU
+ except Exception:
+     def GPU_DECORATOR(fn):  # no-op locally
+         return fn
+
+ from qwen_vl_utils import process_vision_info  # noqa: F401 (kept for parity if used elsewhere)
+ from datasets import load_dataset  # noqa: F401
  from transformers import AutoProcessor
+ from gui_actor.constants import chat_template  # noqa: F401
  from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
  from gui_actor.inference import inference

[…]
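The try/except import above is what keeps the app runnable outside Hugging Face Spaces: when the spaces package is missing, GPU_DECORATOR degrades to an identity decorator. A minimal standalone sketch of the same pattern (the helper name is ours, not part of the commit):

    import importlib

    def gpu_decorator():
        """Return spaces.GPU on HF Spaces, else a no-op identity decorator."""
        try:
            return importlib.import_module("spaces").GPU
        except Exception:
            return lambda fn: fn

    GPU_DECORATOR = gpu_decorator()

    @GPU_DECORATOR
    def ping() -> str:
        return "ok"

    print(ping())  # "ok" locally; on Spaces the call runs on a GPU worker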
      image = image.resize((image_width_resized, image_height_resized))
      return image

  @torch.inference_mode()
  def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
      overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))

[…]
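Only the tail of resize_image appears in these hunks; its signature (from the hunk header) is resize_image(image, resize_to_pixels=MAX_PIXELS). A minimal aspect-preserving body consistent with the lines shown, assuming MAX_PIXELS is a pixel-count cap defined earlier in app.py (the value below is illustrative):

    import math
    from PIL import Image

    MAX_PIXELS = 3200 * 1800  # assumed value; the real constant lives elsewhere in app.py

    def resize_image(image: Image.Image, resize_to_pixels: int = MAX_PIXELS) -> Image.Image:
        w, h = image.size
        if w * h <= resize_to_pixels:
            return image
        scale = math.sqrt(resize_to_pixels / (w * h))  # shrink both sides by the same factor
        image_width_resized, image_height_resized = int(w * scale), int(h * scale)
        image = image.resize((image_width_resized, image_height_resized))
        return image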
      overlay_draw.ellipse(
          [(x - radius, y - radius), (x + radius, y + radius)],
          outline=color,
+         width=5
      )
      image = image.convert('RGBA')
      combined = Image.alpha_composite(image, overlay)
      combined = combined.convert('RGB')
      return combined

  @torch.inference_mode()
  def get_attn_map(image, attn_scores, n_width, n_height):
      w, h = image.size
      scores = np.array(attn_scores[0]).reshape(n_height, n_width)
+     scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
+     score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
      colormap = plt.get_cmap('jet')
+     colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3]
+     colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8))
      blended = Image.blend(image, colored_overlay, alpha=0.3)
      return blended
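Two fixes land in get_attn_map: the + 1e-8 keeps the normalization finite when an attention map is constant (max == min), and the colormap output is sliced to RGB and scaled to uint8 before Image.fromarray, which rejects 3-D float arrays. A quick standalone check of the same math:

    import numpy as np
    import matplotlib.pyplot as plt
    from PIL import Image

    scores = np.zeros((4, 4))  # worst case: constant attention map
    norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
    rgb = plt.get_cmap('jet')(norm)[:, :, :3]           # drop the alpha channel
    overlay = Image.fromarray((rgb * 255).astype(np.uint8))
    print(overlay.size, overlay.mode)                   # (4, 4) RGB, no NaNs, no exception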

+ # ----------------------------
+ # Model/device init for Spaces
+ # ----------------------------
+ def _pick_gpu_dtype() -> torch.dtype:
+     if not torch.cuda.is_available():
+         return torch.float32
+     major, minor = torch.cuda.get_device_capability()
+     # Ampere (8.x) / Hopper (9.x) support bf16 well
+     return torch.bfloat16 if major >= 8 else torch.float16
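The dtype choice keys off CUDA compute capability: bfloat16 needs an Ampere-or-newer card (capability 8.x and up), older GPUs fall back to float16, and CPU runs stay in float32. For example (illustrative devices, not from the commit):

    import torch

    # A100 -> capability (8, 0) -> torch.bfloat16
    # T4   -> capability (7, 5) -> torch.float16
    if torch.cuda.is_available():
        print(torch.cuda.get_device_capability())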
+
+ # Global holders initialized in load_model()
+ model = None
+ tokenizer = None
+ data_processor = None
+
+ @GPU_DECORATOR  # <-- This is what Spaces looks for at startup
+ def load_model():
+     """
+     Allocates the GPU on Spaces and loads the model on the right device/dtype.
+     Runs once at startup.
+     """
+     global model, tokenizer, data_processor
+
      model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
+
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     dtype = _pick_gpu_dtype()
+
+     # Enable some healthy defaults on GPU
+     if device.startswith("cuda"):
+         torch.backends.cuda.matmul.allow_tf32 = True
+         torch.set_grad_enabled(False)
+
      data_processor = AutoProcessor.from_pretrained(model_name_or_path)
      tokenizer = data_processor.tokenizer
+
+     # Use SDPA attention to avoid flash-attn dependency
+     attn_impl = "sdpa"
+
+     model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
          model_name_or_path,
+         torch_dtype=dtype,
+         attn_implementation=attn_impl,
      ).eval()

+     # Move to device explicitly (avoid accelerate unless you need sharding)
+     model_local.to(device)

+     model = model_local
+     return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})"
+
+ # Trigger model loading on import so Spaces allocates GPU immediately
+ _ = load_model()
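attn_implementation="sdpa" is the standard transformers switch for PyTorch's built-in scaled-dot-product attention, so the Space needs no flash-attn wheel, and the explicit .to(device) sidesteps accelerate's device_map machinery. Calling the loader once at import time warms the weights and, on Spaces, claims the GPU; locally the same call just returns the status string. A usage sketch against the globals this file defines:

    status = load_model()  # safe to call again; reassigns the module globals
    print(status)          # e.g. "Loaded microsoft/GUI-Actor-3B-Qwen2.5-VL on cpu with dtype=torch.float32 (attn=sdpa)"
    assert model is not None and tokenizer is not None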

+ @GPU_DECORATOR
  @torch.inference_mode()
  def process(image, instruction):
+     # Safety: ensure model is loaded
+     if model is None:
+         _ = load_model()
+
+     # Resize if needed
      w, h = image.size
      if w * h > MAX_PIXELS:
          image = resize_image(image)
+     w, h = image.size

      conversation = [
          {

[…]
              "content": [
                  {
                      "type": "text",
+                     "text": (
+                         "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
+                         "your task is to locate the screen element that corresponds to the instruction. "
+                         "Output a PyAutoGUI action with a special token that points to the correct location."
+                     ),
                  }
+             ],
          },
          {
              "role": "user",
              "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": instruction},
              ],
          },
      ]

+     device = next(model.parameters()).device
+
      try:
+         pred = inference(
+             conversation,
+             model,
+             tokenizer,
+             data_processor,
+             use_placeholder=True,
+             topk=3,
+             device=str(device),
+         )
      except Exception as e:
+         print("inference error:", e)
          return image, f"Error: {e}", None
+
      px, py = pred["topk_points"][0]
      output_coord = f"({px:.4f}, {py:.4f})"
      img_with_point = draw_point(image, (px * w, py * h))

[…]
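The message list follows the Qwen2.5-VL chat format: a system turn carrying the grounding prompt, then a user turn carrying the PIL image plus the instruction, which gui_actor's inference() renders and feeds to the model. To eyeball what the model actually receives, recent transformers processors expose a chat-template renderer (a debugging sketch; whether GUI-Actor's processor config ships such a template is an assumption here):

    prompt_text = data_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    print(prompt_text[:300])  # system prompt, image placeholder, instruction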
      n_width, n_height = pred["n_width"], pred["n_height"]
      attn_scores = pred["attn_scores"]
      att_map = get_attn_map(image, attn_scores, n_width, n_height)
+
      return img_with_point, output_coord, att_map


+ # ----------------------------
+ # Gradio UI
+ # ----------------------------
+ title = "GUI-Actor"
+ header = """
+ <div align="center">
+ <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
+ Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
+ Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
+ </div>
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
+ <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
+ </div>
+ </div>
+ """
+ theme = "soft"
+ css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
+ #anno-img .mask.active {opacity: 0.7}"""
+
+ with gr.Blocks(title=title, css=css, theme=theme) as demo:
      gr.Markdown(header)
      with gr.Row():
          with gr.Column():
+             input_image = gr.Image(type='pil', label='Upload image')
+             input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
+             submit_button = gr.Button(value='Submit', variant='primary')
          with gr.Column():
              image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
          with gr.Accordion('Detailed prediction'):

[…]

      submit_button.click(
          fn=process,
+         inputs=[input_image, input_instruction],
+         outputs=[image_with_point, pred_xy, att_map],
+         queue=True,
+         api_name="predict",
      )

+ # On Spaces, queue is required to get GPU scheduling; set a modest concurrency
+ demo.queue(concurrency_count=1, max_size=8).launch(share=False)
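One version caveat: concurrency_count is the Gradio 3.x queue argument; Gradio 4.x removed it in favor of default_concurrency_limit, so the final line as written needs a 3.x pin. A hedged sketch that works under either major version:

    import gradio as gr

    major = int(gr.__version__.split(".")[0])
    if major >= 4:
        demo.queue(default_concurrency_limit=1, max_size=8).launch(share=False)
    else:
        demo.queue(concurrency_count=1, max_size=8).launch(share=False)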
|