import re
import json
import gc
from typing import Tuple, Optional

import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont

# Transformers & Qwen Utils
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# -----------------------------------------------------------------------------
# 1. CONSTANTS & SYSTEM PROMPT
# -----------------------------------------------------------------------------

# Available Models
MODELS = {
    "Fara-7B": "microsoft/Fara-7B",
    "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B",
}
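# Note: both entries are assumed here to be Qwen2.5-VL-compatible checkpoints,
# since ModelManager below loads every key with Qwen2_5_VLForConditionalGeneration.
# Any model added to this dict must ship weights in that architecture.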
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# System Prompt asking for JSON format
OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
You need to generate the next action to complete the task.
Output your action inside a <tool_call> block using JSON format.
Include "coordinate": [x, y] in pixels for interactions.
Examples:
<tool_call>
{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
</tool_call>
<tool_call>
{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}
</tool_call>
"""
# -----------------------------------------------------------------------------
# 2. MODEL MANAGEMENT
# -----------------------------------------------------------------------------
class ModelManager:
    def __init__(self):
        self.current_model_id = None
        self.model = None
        self.processor = None

    def load_model(self, model_key):
        model_id = MODELS.get(model_key)
        if not model_id:
            raise ValueError(f"Unknown model: {model_key}")

        # If already loaded, skip
        if self.current_model_id == model_id and self.model is not None:
            return

        print(f"--- Swapping model to {model_key} ({model_id}) ---")

        # Unload previous model to save VRAM
        if self.model is not None:
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("Previous model unloaded.")

        print(f"Loading {model_id}...")
        try:
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                device_map="auto" if DEVICE == "cuda" else None,
            )
            if DEVICE == "cpu":
                self.model.to("cpu")
            self.model.eval()
            self.current_model_id = model_id
            print(f"Successfully loaded {model_key}")
        except Exception as e:
            print(f"Error loading model {model_id}: {e}")
            raise  # Re-raise with the original traceback so the caller sees the failure
    def generate(self, model_key, messages, max_new_tokens=512):
        # Ensure correct model is loaded (swaps weights if a different one was requested)
        self.load_model(model_key)

        # Prepare inputs
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        # Generate, then strip the prompt tokens from each output sequence
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

# Global instance
model_manager = ModelManager()
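# Minimal usage sketch (illustrative only, not executed at import time; assumes a
# local "screenshot.png" exists and enough VRAM for a 7B checkpoint):
#
#   from PIL import Image
#   msgs = get_navigation_prompt("Click the search box", Image.open("screenshot.png"))
#   print(model_manager.generate("Fara-7B", msgs))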
# -----------------------------------------------------------------------------
# 3. PARSING & VISUALIZATION LOGIC
# -----------------------------------------------------------------------------
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image.")
    return Image.fromarray(np.uint8(image_array))


def get_navigation_prompt(task, image):
    return [
        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"Instruction: {task}"},
        ]},
    ]

def parse_tool_calls(response: str) -> list[dict]:
    """
    Parses <tool_call>{JSON}</tool_call> tags.
    Falls back to a plain-coordinate regex when no valid JSON call is found.
    """
    actions = []

    # 1. Try the specific JSON tool-call format
    json_matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
    for match in json_matches:
        try:
            data = json.loads(match.strip())
            args = data.get("arguments", {})
            coords = args.get("coordinate", [])
            action_type = args.get("action", "unknown")
            text_content = args.get("text", "")
            if coords and isinstance(coords, list) and len(coords) == 2:
                actions.append({
                    "type": action_type,
                    "x": float(coords[0]),
                    "y": float(coords[1]),
                    "text": text_content,
                    "source": "json",
                })
        except Exception:
            # Skip malformed tool calls rather than failing the whole parse
            continue

    # 2. Fallback: search for any [x, y] or (x, y) pattern if JSON parsing yielded nothing
    if not actions:
        # Regex for [123, 456] or (123, 456)
        coord_matches = re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response)
        for x, y in coord_matches:
            actions.append({
                "type": "click",  # Assume click for raw coords
                "x": float(x),
                "y": float(y),
                "text": "",
                "source": "regex",
            })
    return actions
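# Example (illustrative): given a raw response such as
#   <tool_call>
#   {"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
#   </tool_call>
# parse_tool_calls returns:
#   [{"type": "click", "x": 400.0, "y": 300.0, "text": "", "source": "json"}]
# A bare "(400, 300)" with no <tool_call> tags yields the same target via the
# regex fallback, with "source": "regex".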

def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
    """Draws markers on the image based on parsed pixel coordinates."""
    if not actions:
        return None

    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size

    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    colors = {
        'type': 'blue',
        'click': 'red',
        'left_click': 'red',
        'right_click': 'purple',
        'double_click': 'orange',
        'unknown': 'green',
    }

    for act in actions:
        x = act['x']
        y = act['y']

        # Coordinate normalization check: values in (0, 1] are treated as
        # fractions of the image size; anything larger as raw pixels
        if x <= 1.0 and y <= 1.0 and x > 0:
            pixel_x = int(x * width)
            pixel_y = int(y * height)
        else:
            pixel_x = int(x)
            pixel_y = int(y)

        action_type = act['type']
        color = colors.get(action_type, 'green')

        # Draw target: outer ring plus a filled center dot
        r = 12
        draw.ellipse(
            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
            outline=color,
            width=4,
        )
        draw.ellipse(
            [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
            fill=color,
        )

        # Label
        label_text = f"{action_type}"
        if act['text']:
            label_text += f": '{act['text']}'"
        text_pos = (pixel_x + 15, pixel_y - 10)
        # Bounding box for text background
        if font:
            bbox = draw.textbbox(text_pos, label_text, font=font)
            draw.rectangle(bbox, fill="black")
            draw.text(text_pos, label_text, fill="white", font=font)
        else:
            draw.text(text_pos, label_text, fill="black")  # fallback without a font

    return img_copy
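# Illustrative call (assumes a local "screenshot.png"; coordinates here are pixels):
#
#   img = Image.open("screenshot.png")
#   annotated = create_localized_image(
#       img, [{"type": "click", "x": 400, "y": 300, "text": ""}]
#   )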
# -----------------------------------------------------------------------------
# 4. GRADIO LOGIC
# -----------------------------------------------------------------------------
@spaces.GPU  # Required on ZeroGPU Spaces so this handler runs with GPU access
def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    if input_numpy_image is None:
        return "⚠️ Please upload an image first.", None

    # Convert to PIL
    input_pil_image = array_to_image(input_numpy_image)

    # Build prompt
    prompt = get_navigation_prompt(task, input_pil_image)

    # Generate response
    print(f"Generating response with {model_choice}...")
    try:
        raw_response = model_manager.generate(model_choice, prompt, max_new_tokens=500)
    except Exception as e:
        return f"Error generating response: {str(e)}", None
    print(f"Raw Output:\n{raw_response}")

    # Parse actions
    actions = parse_tool_calls(raw_response)

    # Visualize; fall back to the unannotated screenshot if nothing was parsed
    output_image = input_pil_image
    if actions:
        visualized = create_localized_image(input_pil_image, actions)
        if visualized:
            output_image = visualized

    return raw_response, output_image
# -----------------------------------------------------------------------------
# 5. GRADIO UI SETUP
# -----------------------------------------------------------------------------
title = "CUA GUI Operator 🖥️"
description = """
This demo uses **Vision Language Models** to understand GUI screenshots and generate actions.
Select a model, upload a screenshot, and define a task.
"""
custom_css = """
#out_img { height: 600px; object-fit: contain; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            # Model selector (choices come from the MODELS dict keys)
            model_selector = gr.Dropdown(
                label="Choose CUA Model",
                choices=list(MODELS),
                value="Fara-7B",
                interactive=True,
            )
            input_image = gr.Image(label="Upload Screenshot", height=500)
            task_input = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g. Input the server address readyforquantum.com...",
                lines=2,
            )
            submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")
        with gr.Column():
            output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
            output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)

    # Wire up the button
    submit_btn.click(
        fn=process_screenshot,
        inputs=[model_selector, input_image, task_input],
        outputs=[output_text, output_image],
    )

    # Example for quick testing
    gr.Examples(
        examples=[
            ["Fara-7B", "./assets/google.png", "Search for 'Hugging Face'"],
        ],
        inputs=[model_selector, input_image, task_input],
        label="Quick Examples",
    )

if __name__ == "__main__":
    demo.queue().launch()
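# Local run note (assumption: this file is the Space's app.py): `python app.py`
# starts the queued Gradio server locally; on a Hugging Face ZeroGPU Space the
# @spaces.GPU-decorated handler above acquires a GPU per request instead.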