"""Gradio demo for Local UI Locator — standalone HF Space version.
Upload a Windows screenshot → detect interactive elements → view overlay + JSON.
Self-contained: downloads model weights from HF Hub automatically.
"""
from __future__ import annotations
import json
from collections import Counter
import cv2
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from ultralytics import YOLO
# Class-id -> label mapping. Index i of the model's class output is looked up
# as CLASS_NAMES[i] in detect(), so the order is assumed to match the label
# order the weights were trained with — TODO confirm against the model repo.
CLASS_NAMES = [
    "button", "textbox", "checkbox", "dropdown", "icon", "tab", "menu_item",
]

# Per-class overlay colors, keyed by class name; unknown classes fall back to
# a neutral grey in _draw_overlay. NOTE(review): channel order (RGB vs BGR)
# depends on how the drawing code applies these — verify before reading the
# tuples as RGB.
CLASS_COLORS = {
    "button": (255, 127, 0),
    "textbox": ( 0, 200, 0),
    "checkbox": ( 0, 127, 255),
    "dropdown": (200, 0, 200),
    "icon": ( 0, 150, 255),
    "tab": (255, 0, 100),
    "menu_item": (100, 255, 255),
}
# Download model weights from HF Hub on startup.
# hf_hub_download caches to the local HF cache, so repeated launches reuse
# the already-downloaded file instead of re-fetching it.
_weights_path = hf_hub_download(
    repo_id="IndextDataLab/windows-ui-locator",
    filename="best.pt",
)
# Loaded once at import time; shared by every detect() call.
_model = YOLO(_weights_path)
def _draw_overlay(img_rgb: np.ndarray, results: list[dict]) -> np.ndarray:
    """Draw labeled bounding boxes for *results* onto a copy of *img_rgb*.

    Args:
        img_rgb: HxWx3 uint8 image in RGB channel order (as Gradio supplies).
        results: detection dicts with "bbox" ([x1, y1, x2, y2] ints),
            "type" (class name) and "score" (0-1 confidence).

    Returns:
        A new RGB image with a rectangle and a filled label banner per result.
    """
    # Draw directly on an RGB copy. The previous RGB->BGR->RGB round-trip
    # meant the name-keyed CLASS_COLORS tuples were applied with R and B
    # swapped in the returned image; staying in RGB renders them as written.
    # .copy() also guards against Gradio handing over a read-only array.
    overlay = img_rgb.copy()
    for det in results:
        x1, y1, x2, y2 = det["bbox"]
        color = CLASS_COLORS.get(det["type"], (200, 200, 200))
        label = f"{det['type']} {det['score']:.0%}"
        cv2.rectangle(overlay, (x1, y1), (x2, y2), color, 2)
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # Banner sits above the box top; if the box touches the image top the
        # banner would be clipped off-screen, so flip it just below y1.
        banner_bottom = y1 if y1 - th - 6 >= 0 else y1 + th + 6
        cv2.rectangle(overlay, (x1, banner_bottom - th - 6),
                      (x1 + tw + 4, banner_bottom), color, -1)
        cv2.putText(overlay, label, (x1 + 2, banner_bottom - 4),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1,
                    cv2.LINE_AA)
    return overlay
def detect(
    image: np.ndarray | None,
    conf: float,
    iou: float,
    class_filter: list[str],
) -> tuple[np.ndarray | None, str, str]:
    """Run the UI-element detector and package outputs for the Gradio UI.

    Args:
        image: RGB screenshot as a numpy array, or None if nothing uploaded.
        conf: confidence threshold forwarded to the model.
        iou: NMS IoU threshold forwarded to the model.
        class_filter: class names to keep; an empty list keeps everything.

    Returns:
        (overlay image, markdown summary, detections as a JSON string).
    """
    if image is None:
        return None, "Upload an image first.", "[]"

    predictions = _model.predict(
        source=image, conf=conf, iou=iou, verbose=False, max_det=300,
    )

    detections: list[dict] = []
    boxes = predictions[0].boxes if predictions and len(predictions) > 0 else None
    if boxes is not None:
        coords = boxes.xyxy.cpu().numpy()
        scores = boxes.conf.cpu().numpy()
        class_ids = boxes.cls.cpu().numpy().astype(int)
        for idx, (xyxy, score, cid) in enumerate(zip(coords, scores, class_ids)):
            name = CLASS_NAMES[cid] if cid < len(CLASS_NAMES) else f"class_{cid}"
            if class_filter and name not in class_filter:
                continue
            # "id" keeps the pre-filter index so ids stay stable across
            # different filter selections; center is truncated from the raw
            # float coordinates, matching the original behavior.
            detections.append({
                "id": idx,
                "type": name,
                "bbox": [int(xyxy[0]), int(xyxy[1]), int(xyxy[2]), int(xyxy[3])],
                "score": round(float(score), 4),
                "center": [int((xyxy[0] + xyxy[2]) / 2),
                           int((xyxy[1] + xyxy[3]) / 2)],
            })

    overlay = _draw_overlay(image, detections)
    histogram = Counter(d["type"] for d in detections)
    summary = [f"**{len(detections)} elements detected**"]
    summary.extend(f"- {name}: {histogram[name]}" for name in sorted(histogram))
    return overlay, "\n".join(summary), json.dumps(detections, indent=2)
# --- Gradio UI wiring (declarative layout; built at import time) ---------
with gr.Blocks(title="Windows UI Element Detector") as demo:
    gr.Markdown(
        "# Windows UI Element Detector\n"
        "Upload a Windows screenshot to detect interactive UI elements "
        "(buttons, textboxes, checkboxes, dropdowns, icons, tabs, menu items).\n\n"
        "**Model:** YOLO11s | **Classes:** 7 | **Dataset:** 3 000 synthetic images"
    )
    with gr.Row():
        # Left column: inputs and detection controls.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Screenshot", type="numpy")
            with gr.Row():
                conf_slider = gr.Slider(
                    minimum=0.05, maximum=0.95, value=0.3, step=0.05,
                    label="Confidence threshold",
                )
                iou_slider = gr.Slider(
                    minimum=0.1, maximum=0.9, value=0.5, step=0.05,
                    label="IoU threshold (NMS)",
                )
            # Empty selection means "show all classes" (see detect()).
            class_filter = gr.CheckboxGroup(
                choices=CLASS_NAMES,
                label="Filter classes (empty = all)",
            )
            detect_btn = gr.Button("Detect", variant="primary")
        # Right column: overlay image, summary, and raw JSON output.
        with gr.Column(scale=1):
            output_image = gr.Image(label="Detection overlay")
            summary_md = gr.Markdown(label="Summary")
            with gr.Accordion("JSON output", open=False):
                json_output = gr.Code(language="json", label="Detections JSON")
    # Wire the button to detect(); inputs/outputs order must match detect()'s
    # parameter order and its 3-tuple return.
    detect_btn.click(
        fn=detect,
        inputs=[input_image, conf_slider, iou_slider, class_filter],
        outputs=[output_image, summary_md, json_output],
    )
    gr.Markdown(
        "---\n"
        "MIT License | "
        "[GitHub](https://github.com/Indext-Data-Lab/windows-ui-synth) | "
        "YOLO11s + EasyOCR + rapidfuzz | "
        "Commission a similar tool or a fully integrated AI solution for your business -> "
        "[Visit indext.io](https://indext.io/) | "
        "[Connect on LinkedIn](https://www.linkedin.com/company/indext-data-lab/)"
    )

# Launch the app only when run as a script (HF Spaces imports `demo` directly).
if __name__ == "__main__":
    demo.launch()