Abs6187 committed on
Commit
7860e5b
·
1 Parent(s): 0301cd4

Adding Prompt and UI Changes

Browse files
Files changed (1) hide show
  1. app.py +112 -84
app.py CHANGED
@@ -4,123 +4,151 @@ import ast
4
  import torch
5
  from PIL import Image, ImageDraw
6
  import gradio as gr
7
- import base64
8
- from io import BytesIO
9
-
10
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
11
- from qwen_vl_utils import process_vision_info # include this file in your repo if not pip-installable
12
 
 
 
13
  _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
14
  "ByteDance-Seed/UI-TARS-1.5-7B",
15
  device_map="auto",
16
  torch_dtype=torch.float16
17
  )
18
-
19
  _PROCESSOR = AutoProcessor.from_pretrained(
20
  "ByteDance-Seed/UI-TARS-1.5-7B",
21
- size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28}, # sane res
22
  use_fast=True,
23
  )
24
-
25
  model = _MODEL
26
  processor = _PROCESSOR
27
 
28
-
29
- def draw_point(image: Image.Image, point=None, radius: int = 5):
30
- """Overlay a red dot on the screenshot where the model clicked."""
31
  img = image.copy()
32
- if point:
33
  x, y = point[0] * img.width, point[1] * img.height
34
- ImageDraw.Draw(img).ellipse(
35
- (x - radius, y - radius, x + radius, y + radius), fill="red"
 
 
36
  )
37
  return img
38
 
39
-
40
  @spaces.GPU
41
  def navigate(screenshot, task: str):
42
- """Run one inference step on the GUI‑reasoning model.
43
-
44
- Args:
45
- screenshot (PIL.Image): Latest UI screenshot.
46
- task (str): Natural‑language task description
47
- history (list | str | None): Previous messages list. Accepts either an
48
- actual Python list (via gr.JSON) or a JSON/Python‑literal string.
49
- """
50
-
51
-
52
- # ───────────────────── normalise history input ──────────────────────────
53
 
54
- messages=[]
55
-
56
- prompt_header = (
57
- "You are a GUI agent. You are given a task and your action history, with screenshots."
58
- "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|<(x1, y1)>|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
59
- f"## User Instruction\n{task}"
60
- )
61
- current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
62
-
63
- messages.append(current)
64
 
65
- #New Comment 1
66
- # ─────────────────────────── model forward ─────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
 
 
 
68
  images, videos = process_vision_info(messages)
69
- text = processor.apply_chat_template(
70
- messages, tokenize=False, add_generation_prompt=True
71
- )
72
  inputs = processor(
73
- text=[text],
74
- images=images,
75
- videos=videos,
76
- padding=True,
77
- return_tensors="pt",
78
  ).to("cuda")
79
 
80
- generated = model.generate(**inputs, max_new_tokens=128)
81
- trimmed = [
82
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
83
- ]
84
- raw_out = processor.batch_decode(
85
- trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
86
- )[0]
87
 
88
- # ─────── draw predicted click for quick visual verification (optional) ──────
89
  try:
90
- actions = ast.literal_eval(raw_out)
91
- for act in actions if isinstance(actions, list) else [actions]:
92
- pos = act.get("position")
93
- if pos and isinstance(pos, list) and len(pos) == 2:
 
 
 
 
 
 
 
 
 
94
  screenshot = draw_point(screenshot, pos)
95
- except Exception:
96
- # decoding failed → just return original screenshot
97
  pass
98
 
99
  return screenshot, raw_out, messages
100
 
101
- # ────────────────────────── Gradio interface ───────────────────────────────
102
-
103
- demo = gr.Interface(
104
- fn=navigate,
105
- inputs=[
106
- gr.Image(type="pil", label="Screenshot"),
107
- gr.Textbox(
108
- lines=1,
109
- placeholder="e.g. Search the weather for New York",
110
- label="Task",
111
- )
112
- ],
113
- outputs=[
114
- gr.Image(label="With Click Point"),
115
- gr.Textbox(label="Raw Action JSON"),
116
- gr.JSON(label="Updated Conversation History")
117
- ],
118
- title="UI-Tars Navigation Demo",
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- demo.launch(
122
- server_name="0.0.0.0",
123
- server_port=7860,
124
- share=False, # or True if you need a public link
125
- ssr_mode=False, # turn off experimental SSR so the process blocks
126
- )
 
 
4
  import torch
5
  from PIL import Image, ImageDraw
6
  import gradio as gr
 
 
 
7
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
8
+ from qwen_vl_utils import process_vision_info # Make sure this file is in your repository
9
 
10
+ # --- Model and Processor Initialization ---
11
+ # This setup is standard and remains unchanged.
12
  _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
13
  "ByteDance-Seed/UI-TARS-1.5-7B",
14
  device_map="auto",
15
  torch_dtype=torch.float16
16
  )
 
17
  _PROCESSOR = AutoProcessor.from_pretrained(
18
  "ByteDance-Seed/UI-TARS-1.5-7B",
19
+ size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
20
  use_fast=True,
21
  )
 
22
  model = _MODEL
23
  processor = _PROCESSOR
24
 
25
+ def draw_point(image: Image.Image, point=None, radius: int = 15):
26
+ """Overlays a larger, more visible red dot on the screenshot."""
 
27
  img = image.copy()
28
+ if point and isinstance(point, list) and len(point) == 2:
29
  x, y = point[0] * img.width, point[1] * img.height
30
+ draw = ImageDraw.Draw(img)
31
+ # Draw a larger ellipse for better visibility on high-res screens
32
+ draw.ellipse(
33
+ (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
34
  )
35
  return img
36
 
 
37
  @spaces.GPU
38
  def navigate(screenshot, task: str):
39
+ """Runs a single inference step of the GUI reasoning model."""
40
+ if not screenshot or not task:
41
+ # Added basic validation to prevent errors with empty inputs
42
+ return None, "Please provide both a screenshot and a task.", []
 
 
 
 
 
 
 
43
 
44
+ messages = []
 
 
 
 
 
 
 
 
 
45
 
46
+ # --- KEY CHANGE: Refined Prompt for Concise Reasoning ---
47
+ # The 'Note' section is updated to guide the model towards a shorter, more direct "Thought" process.
48
+ prompt_header = (
49
+ "You are a GUI agent. You are given a task and a screenshot. Your goal is to determine the next action.\n\n"
50
+ "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
51
+ "## Action Space\n"
52
+ "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
53
+ "type(content='...')\n"
54
+ "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
55
+ "finished(content='...')\n\n"
56
+ "## Note\n"
57
+ "- In the `Thought` part, briefly state your reasoning in a single, direct sentence.\n"
58
+ "- Always use 'win' instead of 'meta' for hotkeys.\n\n"
59
+ f"## User Instruction\n{task}"
60
+ )
61
 
62
+ content = [
63
+ {"type": "text", "text": prompt_header},
64
+ {"type": "image_url", "image_url": screenshot}
65
+ ]
66
+ messages.append({"role": "user", "content": content})
67
+
68
  images, videos = process_vision_info(messages)
69
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
70
  inputs = processor(
71
+ text=[text], images=images, videos=videos, padding=True, return_tensors="pt"
 
 
 
 
72
  ).to("cuda")
73
 
74
+ generated = model.generate(**inputs, max_new_tokens=256)
75
+ trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)]
76
+ raw_out = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
 
 
 
77
 
 
78
  try:
79
+ if "Action:" in raw_out:
80
+ action_part = raw_out.split("Action:")[1].strip()
81
+ # The model sometimes wraps its output in ```, so we remove it.
82
+ if action_part.startswith("```") and action_part.endswith("```"):
83
+ action_part = action_part[3:-3].strip()
84
+
85
+ action_dict = ast.literal_eval(action_part)
86
+
87
+ box_str = action_dict.get("start_box")
88
+ if box_str and isinstance(box_str, str) and "( " in box_str:
89
+ coords_part = box_str.split('( ')[1].split(' )')[0]
90
+ x_str, y_str = coords_part.split(', ')
91
+ pos = [float(x_str), float(y_str)]
92
  screenshot = draw_point(screenshot, pos)
93
+ except (Exception, SyntaxError) as e:
94
+ print(f"Could not parse action or draw point: {e}")
95
  pass
96
 
97
  return screenshot, raw_out, messages
98
 
99
+ # --- KEY CHANGE: Enhanced Gradio UI ---
100
+ # The interface is rebuilt using gr.Blocks for a cleaner layout and better user guidance.
101
+ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
102
+ gr.Markdown(
103
+ """
104
+ # ✨ Enhanced UI-Tars Navigation Demo
105
+ **Upload a screenshot and provide a task to see how the AI plans its next action.**
106
+ The model will analyze the image and your instruction, then output its thought process and the specific action it would take. A red dot will indicate the target location for clicks or scrolls.
107
+ """
108
+ )
109
+ with gr.Row():
110
+ with gr.Column(scale=1):
111
+ screenshot_in = gr.Image(type="pil", label="Screenshot")
112
+ task_in = gr.Textbox(
113
+ lines=2,
114
+ placeholder="e.g., Click on the 'Sign In' button.",
115
+ label="Task Instruction",
116
+ )
117
+ submit_btn = gr.Button("Analyze Action", variant="primary")
118
+
119
+ gr.Examples(
120
+ examples=[
121
+ ["examples/google.png", "Search for 'latest AI news'"],
122
+ ["examples/github.png", "Find the search bar and type 'Qwen'"],
123
+ ["examples/figma.png", "Select the blue rectangle on the canvas"],
124
+ ],
125
+ inputs=[screenshot_in, task_in],
126
+ label="Example Use Cases"
127
+ )
128
+
129
+ with gr.Column(scale=2):
130
+ screenshot_out = gr.Image(label="Result: Screenshot with Click Point", interactive=False)
131
+ with gr.Accordion("Model Output Details", open=False):
132
+ raw_out = gr.Textbox(label="Full Model Output (Thought & Action)", interactive=False)
133
+ history_out = gr.JSON(label="Conversation History for Debugging", interactive=False)
134
+
135
+ submit_btn.click(
136
+ fn=navigate,
137
+ inputs=[screenshot_in, task_in],
138
+ outputs=[screenshot_out, raw_out, history_out],
139
+ )
140
+
141
+ gr.Markdown(
142
+ """
143
+ ---
144
+ *Model: [ByteDance-Seed/UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B)*
145
+ """
146
+ )
147
 
148
+ if __name__ == "__main__":
149
+ # To run this, you'll need to create an 'examples' directory with the sample images.
150
+ demo.launch(
151
+ server_name="0.0.0.0",
152
+ server_port=7860,
153
+ share=False,
154
+ )