File size: 2,730 Bytes
2fbdc5f
 
68be95b
2fbdc5f
 
 
 
 
68be95b
 
 
2fbdc5f
68be95b
 
2fbdc5f
68be95b
 
2fbdc5f
 
68be95b
 
2fbdc5f
 
 
 
 
 
 
 
 
 
 
 
 
 
68be95b
2fbdc5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68be95b
2fbdc5f
 
 
68be95b
2fbdc5f
 
 
 
 
 
 
 
68be95b
2fbdc5f
 
 
 
68be95b
2fbdc5f
 
 
 
 
 
68be95b
 
 
 
2fbdc5f
 
 
 
 
68be95b
2fbdc5f
 
68be95b
2fbdc5f
 
68be95b
 
2fbdc5f
68be95b
2fbdc5f
 
68be95b
2fbdc5f
68be95b
2fbdc5f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import torch
import re
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import gradio as gr

# Hugging Face model id for the UI-TARS 1.5 7B GUI-agent checkpoint.
MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"

# ----------------------------
# Load model (CPU optimized)
# ----------------------------
# Processor bundles the tokenizer and image preprocessor; downloads on first run.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# NOTE(review): a 7B model in float32 needs ~28 GB RAM — confirm the host has it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,        # CPU safe
    low_cpu_mem_usage=True
)

# Inference only: switch off dropout / training-mode layers.
model.eval()

# ----------------------------
# Coordinate Extraction
# ----------------------------
# Matches a number like "12", "0.5", or ".5" — never a bare "." or "..",
# so float() on a match can never raise.
_NUM = r"\d*\.?\d+"


def extract_coordinates(text, image_size):
    """Parse the first point "(x, y)" or box "[x1, y1, x2, y2]" in *text*.

    Values that are all <= 1 are treated as normalized fractions of the
    image and scaled by *image_size* (width, height); otherwise they are
    taken as pixel coordinates.

    Returns:
        (x, y) for a point, (x1, y1, x2, y2) for a box, or None when
        nothing parseable is found.
    """
    width, height = image_size

    # Point form takes priority: "(x, y)".
    match = re.search(rf"\(({_NUM}),\s*({_NUM})\)", text)
    if match:
        x, y = float(match.group(1)), float(match.group(2))

        if x <= 1 and y <= 1:
            # Normalized -> pixel space. NOTE(review): a literal pixel
            # point (1, 1) is indistinguishable from normalized here.
            x, y = int(x * width), int(y * height)
        else:
            x, y = int(x), int(y)

        return (x, y)

    # Box form: "[x1, y1, x2, y2]". Extract number tokens with findall
    # instead of split(",") so a trailing comma or space-separated list
    # does not crash float() with ValueError.
    match_box = re.search(r"\[([\d.,\s]+)\]", text)
    if match_box:
        nums = [float(n) for n in re.findall(_NUM, match_box.group(1))]
        if len(nums) == 4:
            x1, y1, x2, y2 = nums

            if max(nums) <= 1:
                x1, x2 = int(x1 * width), int(x2 * width)
                y1, y2 = int(y1 * height), int(y2 * height)
            else:
                x1, y1, x2, y2 = map(int, nums)

            return (x1, y1, x2, y2)

    return None


# ----------------------------
# Prediction
# ----------------------------
def predict(image, prompt):
    """Run the model on *image* + *prompt*.

    Returns a pair: the raw decoded model text, and a human-readable
    coordinate summary (or a "not detected" message).
    """
    if image is None:
        return "Upload image", "No coordinates"

    pil_img = Image.fromarray(image).convert("RGB")

    model_inputs = processor(
        images=pil_img,
        text=prompt,
        return_tensors="pt"
    )

    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=150)

    decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]

    # pil_img.size is (width, height), matching extract_coordinates' contract.
    coords = extract_coordinates(decoded, pil_img.size)

    if coords:
        coord_text = f"{coords} (origin: top-left, x→right, y↓)"
    else:
        coord_text = "No coordinates detected"

    return decoded, coord_text


# ----------------------------
# UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# UI-TARS CPU Demo (Slow ⚠️)")

    # Inputs side by side; outputs stacked below the run button.
    with gr.Row():
        img_in = gr.Image(type="numpy", label="Image")
        prompt_in = gr.Textbox(label="Prompt")

    run_btn = gr.Button("Run")

    model_out = gr.Textbox(label="Model Output")
    coords_out = gr.Textbox(label="Coordinates")

    run_btn.click(
        fn=predict,
        inputs=[img_in, prompt_in],
        outputs=[model_out, coords_out]
    )

demo.launch()