| |
| """ |
| Example: How to use OmniParser WITH image captioning enabled |
| =========================================================== |
| """ |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| |
|
|
| import torch |
| from transformers import AutoProcessor, AutoModelForCausalLM |
| from PIL import Image |
| import cv2 |
|
|
| def florence_caption_example(): |
| """Demonstration of how Florence-2 captions UI elements""" |
| |
| |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| model = AutoModelForCausalLM.from_pretrained( |
| "microsoft/Florence-2-large", |
| trust_remote_code=True |
| ).to(device) |
| processor = AutoProcessor.from_pretrained( |
| "microsoft/Florence-2-large", |
| trust_remote_code=True |
| ) |
| |
| |
| detected_boxes = [ |
| (0.43, 0.51, 0.56, 0.58), |
| (0.22, 0.34, 0.32, 0.36), |
| (0.15, 0.61, 0.45, 0.68), |
| ] |
| |
| |
| screenshot = Image.open("/workspaces/omoi/Screenshot.png") |
| width, height = screenshot.size |
| |
| |
| captions = [] |
| for box in detected_boxes: |
| |
| x1_norm, y1_norm, x2_norm, y2_norm = box |
| x1 = int(x1_norm * width) |
| y1 = int(y1_norm * height) |
| x2 = int(x2_norm * width) |
| y2 = int(y2_norm * height) |
| |
| cropped = screenshot.crop((x1, y1, x2, y2)) |
| cropped = cropped.resize((64, 64)) |
| |
| |
| prompt = "<CAPTION>" |
| inputs = processor( |
| text=[prompt], |
| images=[cropped], |
| return_tensors="pt" |
| ).to(device) |
| |
| |
| with torch.no_grad(): |
| generated_ids = model.generate( |
| input_ids=inputs["input_ids"], |
| pixel_values=inputs["pixel_values"], |
| max_new_tokens=20, |
| num_beams=1, |
| ) |
| |
| |
| caption = processor.batch_decode( |
| generated_ids, |
| skip_special_tokens=True |
| )[0] |
| |
| captions.append(caption) |
| print(f"Box {box} -> Caption: '{caption}'") |
| |
| return captions |
|
|
| |
| |
| |
| |
|
|
|
|
| |
| |
|
|
| def ocr_text_fallback_example(): |
| """What I implemented instead - using OCR text""" |
| |
| |
| ocr_text = ["Select File", "JPG Converter", "Download link"] |
| ocr_bbox = [ |
| (0.43, 0.51, 0.56, 0.58), |
| (0.22, 0.34, 0.32, 0.36), |
| (0.10, 0.60, 0.40, 0.67), |
| ] |
| |
| |
| detected_boxes = [ |
| (0.43, 0.51, 0.56, 0.58), |
| (0.22, 0.34, 0.32, 0.36), |
| (0.15, 0.61, 0.45, 0.68), |
| ] |
| |
| |
| labels = [] |
| for ui_box in detected_boxes: |
| label = "Icon" |
| |
| |
| for ocr_t, ocr_b in zip(ocr_text, ocr_bbox): |
| ui_x1, ui_y1, ui_x2, ui_y2 = ui_box |
| ocr_x1, ocr_y1, ocr_x2, ocr_y2 = ocr_b |
| |
| |
| if (ui_x1 < ocr_x2 and ui_x2 > ocr_x1 and |
| ui_y1 < ocr_y2 and ui_y2 > ocr_y1): |
| label = ocr_t |
| break |
| |
| labels.append(label) |
| print(f"Box {ui_box} -> Label: '{label}'") |
| |
| return labels |
|
|
| |
| |
| |
| |
|
|
|
|
| |
| |
|
|
| comparison = """ |
| βββββββββββββββββββββββ¬βββββββββββββββββββββββ¬ββββββββββββββββββββββββ |
| β Method β OCR-only (Fast) β Florence (Semantic) β |
| βββββββββββββββββββββββΌβββββββββββββββββββββββΌββββββββββββββββββββββββ€ |
| β Speed β Instant (0.1s) β Slow (30s per batch) β |
| β Quality β Text-only labels β Semantic descriptions β |
| β Works on CPU? β YES β β NO (too slow) β β |
| β Icon without text β "Icon N" (fallback) β "Download button" β β |
| β Requires GPU? β NO β YES (recommended) β |
| β Model size β 0 (OCR built-in) β 14GB β |
| βββββββββββββββββββββββ΄βββββββββββββββββββββββ΄ββββββββββββββββββββββββ |
| |
| For this demo: |
| β’ Screenshot size: 1365x767 |
| β’ Detected elements: 120 |
| β’ OCR approach: Complete in ~20 seconds total |
| β’ Florence approach: Would take ~15 minutes on CPU |
| """ |
|
|
| print(comparison) |
|
|