omniparsev2-ui-detector / caption_examples.py
makeitfr's picture
Upload caption_examples.py with huggingface_hub
d1969e7 verified
Raw
History Blame Contribute Delete
6.39 kB
#!/usr/bin/env python3
"""
Example: How to use OmniParser WITH image captioning enabled
===========================================================
"""
# EXAMPLE 1: Start server WITH Florence captions
# ================================================
# In the original omniparserserver.py (before my changes):
# omniparser = Omniparser(config)
#
# This initializes Florence model for captioning:
# class Omniparser:
# def __init__(self, config):
# self.caption_model_processor = get_caption_model_processor(
# model_name='florence2',
# model_name_or_path='weights/icon_caption_florence',
# device='cuda' # or 'cpu'
# )
#
# Then parsing goes through:
# parse() → get_som_labeled_img() → get_parsed_content_icon()
# → Florence model generates captions for each UI element
# EXAMPLE 2: How Florence Captioning Works (Pseudocode)
# ========================================================
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import cv2
def _box_to_pixels(box, width, height):
    """Map a normalized (x1, y1, x2, y2) box to a non-empty pixel rectangle.

    Args:
        box: Four fractions of the image size, ordered x1, y1, x2, y2.
        width: Image width in pixels (must be >= 1).
        height: Image height in pixels (must be >= 1).

    Returns:
        tuple[int, int, int, int]: pixel (x1, y1, x2, y2) lying inside the
        image and covering at least one pixel in each direction.
    """
    x1_norm, y1_norm, x2_norm, y2_norm = box
    # Clamp the top-left corner into the image so a box that is slightly out
    # of range cannot start outside it.
    x1 = min(max(int(x1_norm * width), 0), width - 1)
    y1 = min(max(int(y1_norm * height), 0), height - 1)
    # Guarantee a crop of at least 1x1: truncation by int() can collapse a
    # thin box to zero width/height, which leaves nothing to caption.
    x2 = min(max(int(x2_norm * width), x1 + 1), width)
    y2 = min(max(int(y2_norm * height), y1 + 1), height)
    return x1, y1, x2, y2


def florence_caption_example(screenshot_path="/workspaces/omoi/Screenshot.png"):
    """Demonstration of how Florence-2 captions UI elements.

    Crops three simulated detection boxes out of a screenshot, captions each
    crop with Florence-2 and prints the result.

    Args:
        screenshot_path: Image file to caption. Defaults to the path used by
            the original demo.

    Returns:
        list[str]: One caption per simulated box, in box order.

    Raises:
        FileNotFoundError: If ``screenshot_path`` does not exist (raised by
            ``Image.open`` before any model weights are loaded).
    """
    # 1. Load screenshot first, so a wrong path fails before the (large)
    #    model download/initialization is attempted.
    with Image.open(screenshot_path) as raw_image:
        # The crops are handed to the processor as 3-channel images; PNG
        # screenshots may carry an alpha channel or a palette.
        screenshot = raw_image.convert("RGB")
    width, height = screenshot.size

    # 2. Initialize model
    # NOTE: trust_remote_code=True executes Python code shipped with the
    # model repository; only use it with repositories you trust.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large",
        trust_remote_code=True,
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large",
        trust_remote_code=True,
    )

    # 3. Simulate detected UI elements (boxes from YOLO)
    detected_boxes = [
        (0.43, 0.51, 0.56, 0.58),  # Select File button
        (0.22, 0.34, 0.32, 0.36),  # JPG Converter text
        (0.15, 0.61, 0.45, 0.68),  # Some icon/image element
    ]

    # 4. Process each element
    prompt = "<CAPTION>"  # Special Florence prompt (same for every crop)
    captions = []
    for box in detected_boxes:
        # Crop the box region and normalize its size
        cropped = screenshot.crop(_box_to_pixels(box, width, height))
        cropped = cropped.resize((64, 64))

        # Pass to Florence
        inputs = processor(
            text=[prompt],
            images=[cropped],
            return_tensors="pt",
        ).to(device)

        # Generate caption (no gradients needed for inference)
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=20,
                num_beams=1,
            )

        # Decode result; strip the padding whitespace left after the special
        # tokens are removed.
        caption = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0].strip()
        captions.append(caption)
        print(f"Box {box} -> Caption: '{caption}'")

    return captions
# Expected output (illustrative; the actual captions depend on the model and the screenshot):
# Box (0.43, 0.51, 0.56, 0.58) -> Caption: 'Select File button'
# Box (0.22, 0.34, 0.32, 0.36) -> Caption: 'JPG Converter text'
# Box (0.15, 0.61, 0.45, 0.68) -> Caption: 'Image or icon element'
# EXAMPLE 3: How my OCR-only approach works (faster alternative)
# ================================================================
def _boxes_overlap(box_a, box_b):
    """Return True when two (x1, y1, x2, y2) boxes strictly overlap.

    Boxes that merely touch along an edge or corner do not count.
    """
    a_x1, a_y1, a_x2, a_y2 = box_a
    b_x1, b_y1, b_x2, b_y2 = box_b
    return a_x1 < b_x2 and a_x2 > b_x1 and a_y1 < b_y2 and a_y2 > b_y1


def ocr_text_fallback_example():
    """What I implemented instead - using OCR text.

    Labels each simulated UI box with the text of the first OCR box that
    overlaps it, falling back to ``"Icon"`` when no OCR box overlaps.

    Returns:
        list[str]: One label per UI box, in box order. For the sample data
        this is ``["Select File", "JPG Converter", "Download link"]``.
    """
    # Already have from PaddleOCR phase:
    ocr_text = ["Select File", "JPG Converter", "Download link"]
    ocr_bbox = [
        (0.43, 0.51, 0.56, 0.58),  # Matches first box!
        (0.22, 0.34, 0.32, 0.36),  # Matches second box!
        (0.10, 0.60, 0.40, 0.67),  # Overlaps the third box
    ]

    # Detected UI elements
    detected_boxes = [
        (0.43, 0.51, 0.56, 0.58),  # Select File button
        (0.22, 0.34, 0.32, 0.36),  # JPG Converter text
        (0.15, 0.61, 0.45, 0.68),  # Some icon/image element
    ]

    # Simple bbox intersection: the first overlapping OCR entry wins, however
    # small the overlap is.
    labels = []
    for ui_box in detected_boxes:
        label = next(
            (
                text
                for text, text_box in zip(ocr_text, ocr_bbox)
                if _boxes_overlap(ui_box, text_box)
            ),
            "Icon",  # default when no OCR text overlaps this element
        )
        labels.append(label)
        print(f"Box {ui_box} -> Label: '{label}'")

    return labels
# Output:
# Box (0.43, 0.51, 0.56, 0.58) -> Label: 'Select File'
# Box (0.22, 0.34, 0.32, 0.36) -> Label: 'JPG Converter'
# Box (0.15, 0.61, 0.45, 0.68) -> Label: 'Download link' # Overlaps the third OCR box; 'Icon' is only the fallback when no OCR box overlaps
# EXAMPLE 4: Comparison
# =====================
# Side-by-side summary of the two labeling approaches; printed when the
# module is executed. The table uses Unicode box-drawing characters.
comparison = """
┌─────────────────────┬──────────────────────┬───────────────────────┐
│ Method              │ OCR-only (Fast)      │ Florence (Semantic)   │
├─────────────────────┼──────────────────────┼───────────────────────┤
│ Speed               │ Instant (0.1s)       │ Slow (30s per batch)  │
│ Quality             │ Text-only labels     │ Semantic descriptions │
│ Works on CPU?       │ YES ✓                │ NO (too slow) ✗       │
│ Icon without text   │ "Icon N" (fallback)  │ "Download button" ✓   │
│ Requires GPU?       │ NO                   │ YES (recommended)     │
│ Model size          │ 0 (OCR built-in)     │ 14GB                  │
└─────────────────────┴──────────────────────┴───────────────────────┘
For this demo:
• Screenshot size: 1365x767
• Detected elements: 120
• OCR approach: Complete in ~20 seconds total
• Florence approach: Would take ~15 minutes on CPU
"""
print(comparison)