Upload Qwen-Click-DiT model
Browse files
README.md
CHANGED
|
@@ -22,24 +22,69 @@ This model predicts click coordinates given a screenshot and natural language in
|
|
| 22 |
- **Qwen2.5-VL-3B** as a frozen vision-language backbone
|
| 23 |
- **DiT (Diffusion Transformer)** action head using flow matching
|
| 24 |
|
| 25 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
```python
|
|
|
|
|
|
|
| 28 |
from transformers import AutoProcessor, AutoConfig
|
|
|
|
|
|
|
|
|
|
| 29 |
from src.model import Qwen2_5_VLForClickPrediction
|
| 30 |
|
| 31 |
# Load model
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
```
|
| 41 |
|
| 42 |
-
See [GitHub repo](https://github.com/
|
| 43 |
|
| 44 |
## Metrics
|
| 45 |
|
|
@@ -69,11 +114,11 @@ See [GitHub repo](https://github.com/TESS-Computer/qwen-click-dit) for full infe
|
|
| 69 |
## Citation
|
| 70 |
|
| 71 |
```bibtex
|
| 72 |
-
@misc{
|
| 73 |
title = {Qwen-Click-DiT: Vision-Language Model with Diffusion Transformer for GUI Click Prediction},
|
| 74 |
author = {Lezzaik, Hussein},
|
| 75 |
-
year = {
|
| 76 |
-
howpublished = {\url{https://github.com/HusseinLezzaik/
|
| 77 |
}
|
| 78 |
```
|
| 79 |
|
|
|
|
| 22 |
- **Qwen2.5-VL-3B** as a frozen vision-language backbone
|
| 23 |
- **DiT (Diffusion Transformer)** action head using flow matching
|
| 24 |
|
| 25 |
+
## Quick Start
|
| 26 |
+
|
| 27 |
+
### Installation
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
pip install torch transformers accelerate qwen-vl-utils pillow
|
| 31 |
+
git clone https://github.com/HusseinLezzaik/Qwen-Click-DiT.git
|
| 32 |
+
cd Qwen-Click-DiT
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### Inference
|
| 36 |
|
| 37 |
```python
|
| 38 |
+
import torch
|
| 39 |
+
from PIL import Image
|
| 40 |
from transformers import AutoProcessor, AutoConfig
|
| 41 |
+
from qwen_vl_utils import process_vision_info
|
| 42 |
+
|
| 43 |
+
# Run this from the root of the cloned repo (see Installation) so that `src.model` is importable
|
| 44 |
from src.model import Qwen2_5_VLForClickPrediction
|
| 45 |
|
| 46 |
# Load model
|
| 47 |
+
model_id = "TESS-Computer/qwen-click-dit"
|
| 48 |
+
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
|
| 49 |
+
config.dit_hidden_size = 512
|
| 50 |
+
config.dit_num_layers = 6
|
| 51 |
+
config.dit_num_heads = 8
|
| 52 |
+
config.dit_dropout = 0.1
|
| 53 |
+
config.num_inference_steps = 16
|
| 54 |
+
|
| 55 |
+
model = Qwen2_5_VLForClickPrediction.from_pretrained(
|
| 56 |
+
model_id, config=config, torch_dtype=torch.bfloat16
|
| 57 |
+
)
|
| 58 |
+
model = model.to("cuda").eval()
|
| 59 |
+
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
|
| 60 |
+
|
| 61 |
+
# Prepare input
|
| 62 |
+
image = Image.open("screenshot.png").convert("RGB")
|
| 63 |
+
prompt = "Click on the search button"
|
| 64 |
+
|
| 65 |
+
messages = [{
|
| 66 |
+
"role": "user",
|
| 67 |
+
"content": [
|
| 68 |
+
{"type": "image", "image": image, "min_pixels": 200704, "max_pixels": 401408},
|
| 69 |
+
{"type": "text", "text": prompt},
|
| 70 |
+
],
|
| 71 |
+
}]
|
| 72 |
+
|
| 73 |
+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 74 |
+
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
|
| 75 |
+
inputs = processor(text=text, images=image_inputs, videos=video_inputs, return_tensors="pt", **video_kwargs)
|
| 76 |
+
inputs = {k: v.to("cuda") if torch.is_tensor(v) else v for k, v in inputs.items()}
|
| 77 |
+
|
| 78 |
+
# Predict click coordinates
|
| 79 |
+
with torch.no_grad():
|
| 80 |
+
click_xy = model.predict(**inputs)
|
| 81 |
+
|
| 82 |
+
x, y = click_xy[0].cpu().tolist()
|
| 83 |
+
print(f"Normalized: ({x:.4f}, {y:.4f})")
|
| 84 |
+
print(f"Pixels: ({int(x * image.width)}, {int(y * image.height)})")
|
| 85 |
```
|
| 86 |
|
| 87 |
+
See the [GitHub repo](https://github.com/HusseinLezzaik/Qwen-Click-DiT) for more examples.
|
| 88 |
|
| 89 |
## Metrics
|
| 90 |
|
|
|
|
| 114 |
## Citation
|
| 115 |
|
| 116 |
```bibtex
|
| 117 |
+
@misc{lezzaik2026qwenclickdit,
|
| 118 |
title = {Qwen-Click-DiT: Vision-Language Model with Diffusion Transformer for GUI Click Prediction},
|
| 119 |
author = {Lezzaik, Hussein},
|
| 120 |
+
year = {2026},
|
| 121 |
+
howpublished = {\url{https://github.com/HusseinLezzaik/Qwen-Click-DiT}},
|
| 122 |
}
|
| 123 |
```
|
| 124 |
|