Spaces:

dpv007
/

ui

Runtime error

App Files Files Community

dpv007 commited on Apr 2

Commit

2fbdc5f

verified ·

1 Parent(s): 3c7ea1e

Create app.py

Browse files

Files changed (1) hide show

app.py +119 -0

app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import torch
+import re
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from PIL import Image
+import gradio as gr
+MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"
+# Load model + processor
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+model = AutoModelForVision2Seq.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+# ----------------------------
+# Coordinate Extraction
+# ----------------------------
+def extract_coordinates(text, image_size):
+    """
+    Extracts coordinates from model output.
+    Supports:
+    - (x, y)
+    - [x1, y1, x2, y2]
+    - normalized (0.0–1.0)
+    """
+    width, height = image_size
+    # Match (x, y)
+    match = re.search(r"\(([\d\.]+),\s*([\d\.]+)\)", text)
+    if match:
+        x, y = float(match.group(1)), float(match.group(2))
+        # If normalized (0–1), convert to pixels
+        if x <= 1 and y <= 1:
+            x = int(x * width)
+            y = int(y * height)
+        else:
+            x = int(x)
+            y = int(y)
+        return (x, y)
+    # Match bounding box [x1, y1, x2, y2]
+    match_box = re.search(r"\[([\d\.,\s]+)\]", text)
+    if match_box:
+        nums = list(map(float, match_box.group(1).split(",")))
+        if len(nums) == 4:
+            x1, y1, x2, y2 = nums
+            # Normalize if needed
+            if max(nums) <= 1:
+                x1, x2 = int(x1 * width), int(x2 * width)
+                y1, y2 = int(y1 * height), int(y2 * height)
+            else:
+                x1, y1, x2, y2 = map(int, nums)
+            return (x1, y1, x2, y2)
+    return None
+# ----------------------------
+# Prediction Function
+# ----------------------------
+def predict(image, prompt):
+    if image is None:
+        return "Please upload an image.", "No coordinates"
+    image_pil = Image.fromarray(image).convert("RGB")
+    width, height = image_pil.size
+    inputs = processor(
+        images=image_pil,
+        text=prompt,
+        return_tensors="pt"
+    ).to(model.device)
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=200
+        )
+    result = processor.batch_decode(output, skip_special_tokens=True)[0]
+    coords = extract_coordinates(result, (width, height))
+    if coords:
+        coord_text = f"{coords}  (Origin = top-left, x→right, y↓)"
+    else:
+        coord_text = "No coordinates detected"
+    return result, coord_text
+# ----------------------------
+# Gradio UI
+# ----------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 UI-TARS-1.5 GUI Agent Demo")
+    with gr.Row():
+        image_input = gr.Image(type="numpy", label="Upload Image / Screenshot")
+        text_input = gr.Textbox(label="Instruction / Prompt", placeholder="e.g. Click the login button")
+    run_btn = gr.Button("Run")
+    output_text = gr.Textbox(label="Model Output")
+    coord_output = gr.Textbox(label="Detected Coordinates")
+    run_btn.click(
+        fn=predict,
+        inputs=[image_input, text_input],
+        outputs=[output_text, coord_output]
+    )
+demo.launch()