import re
import json
from typing import Iterable, Tuple, Optional, List, Dict

import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont

# Transformers & Qwen utils
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForImageTextToText,
)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from qwen_vl_utils import process_vision_info

# Gradio theme
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

# -----------------------------------------------------------------------------
# 1. THEME CONFIGURATION
# -----------------------------------------------------------------------------
colors.steel_blue = colors.Color(
    name="steel_blue",
    c50="#EBF3F8",
    c100="#D3E5F0",
    c200="#A8CCE1",
    c300="#7DB3D2",
    c400="#529AC3",
    c500="#4682B4",
    c600="#3E72A0",
    c700="#36638C",
    c800="#2E5378",
    c900="#264364",
    c950="#1E3450",
)
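
# Registering the palette on the `colors` module lets the theme below take it
# as `secondary_hue`, so `*secondary_500`-style tokens in set() resolve to
# these hex values.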

class SteelBlueTheme(Soft):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        super().set(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_primary_shadow="*shadow_drop_lg",
            button_large_padding="11px",
        )

steel_blue_theme = SteelBlueTheme()
css = "#main-title h1 { font-size: 2.3em !important; } #out_img { height: 600px; object-fit: contain; }"

# -----------------------------------------------------------------------------
# 2. GLOBAL MODEL LOADING
# -----------------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 kernels are unreliable on CPU, so fall back to float32 there.
dtype = torch.float16 if device == "cuda" else torch.float32
print(f"Running on device: {device}")

# --- Load Fara-7B ---
print("🔄 Loading Fara-7B...")
MODEL_ID_V = "microsoft/Fara-7B"
try:
    processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
    model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID_V,
        trust_remote_code=True,
        torch_dtype=dtype,
    ).to(device).eval()
except Exception as e:
    print(f"Failed to load Fara: {e}")
    model_v = None
    processor_v = None

# --- Load UI-TARS-1.5-7B ---
print("🔄 Loading UI-TARS-1.5-7B...")
MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
try:
    processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
    model_x = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID_X,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    ).to(device).eval()
except Exception as e:
    print(f"Failed to load UI-TARS: {e}")
    model_x = None
    processor_x = None

# --- Load Holo2-8B ---
print("🔄 Loading Holo2-8B...")
MODEL_ID_H = "Hcompany/Holo2-8B"
try:
    processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
    model_h = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID_H,
        trust_remote_code=True,
        torch_dtype=dtype,
    ).to(device).eval()
except Exception as e:
    print(f"Failed to load Holo2: {e}")
    model_h = None
    processor_h = None

print("✅ Model loading sequence complete.")

# -----------------------------------------------------------------------------
# 3. UTILS & PROMPTS
# -----------------------------------------------------------------------------
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None: raise ValueError("No image provided.")
    return Image.fromarray(np.uint8(image_array))

# --- Fara Prompt ---
def get_fara_prompt(task, image):
    OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
You need to generate the next action to complete the task.
Output your action inside a <tool_call> block using JSON format.
Include "coordinate": [x, y] in pixels for interactions.
Examples:
<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>
<tool_call>{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}</tool_call>
"""
    return [
        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
        {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": f"Instruction: {task}"}]},
    ]

# --- UI-TARS Prompt ---
def get_uitars_prompt(task, image):
    guidelines = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position as Click(x, y) with x num pixels from the left edge "
        "and y num pixels from the top edge."
    )
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": f"{guidelines}\n{task}"}
            ],
        }
    ]

# --- Holo2 Prompt ---
def get_holo2_prompt(task, image):
    # Holo2 typically uses standard chat formatting
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": task}
            ]
        }
    ]

def get_image_proc_params(processor) -> Dict[str, int]:
    ip = getattr(processor, "image_processor", None)
    return {
        "patch_size": getattr(ip, "patch_size", 14),
        "merge_size": getattr(ip, "merge_size", 2),
        "min_pixels": getattr(ip, "min_pixels", 256 * 256),
        "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
    }
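
# The fallbacks above match Qwen2.5-VL's usual defaults (14-pixel vision
# patches merged 2x2). smart_resize() then rounds each side to a multiple of
# patch_size * merge_size (28) and rescales the area into
# [min_pixels, max_pixels], which is why UI-TARS coordinates must be mapped
# back to the original resolution after generation (see section 5).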

# -----------------------------------------------------------------------------
# 4. PARSING LOGIC
# -----------------------------------------------------------------------------
def parse_uitars_response(text: str) -> List[Dict]:
    """Parse UI-TARS specific output formats."""
    actions = []
    text = text.strip()
    # Groups 1 and 2 hold the coordinates; m[0] is the full match string.
    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
    if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
    m = re.findall(r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]", text, re.IGNORECASE)
    for p in m: actions.append({"type": "click", "x": int(p[0]), "y": int(p[1]), "text": ""})
    m = re.search(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
    if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
    return actions
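
# Illustrative example: parse_uitars_response("Click(340, 120)") yields
# [{"type": "click", "x": 340, "y": 120, "text": ""}].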

def parse_fara_response(response: str) -> List[Dict]:
    """Parse Fara's <tool_call> JSON format."""
    actions = []
    matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
    for match in matches:
        try:
            data = json.loads(match.strip())
            args = data.get("arguments", {})
            coords = args.get("coordinate", [])
            action_type = args.get("action", "unknown")
            text_content = args.get("text", "")
            if coords and len(coords) == 2:
                actions.append({
                    "type": action_type, "x": float(coords[0]), "y": float(coords[1]), "text": text_content
                })
        except (ValueError, TypeError, AttributeError):
            continue  # skip malformed tool calls (json.JSONDecodeError is a ValueError)
    return actions
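
# Illustrative round trip:
# parse_fara_response('<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>')
# -> [{"type": "click", "x": 400.0, "y": 300.0, "text": ""}]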

def parse_holo2_response(generated_ids, processor, input_len) -> Tuple[str, str, List[Dict]]:
    """Parse Holo2 reasoning tokens and JSON content."""
    all_ids = generated_ids[0].tolist()
    # Reasoning-delimiter token IDs (the <think> / </think> pair in the
    # Qwen-family tokenizer Holo2 builds on); adjust if the tokenizer changes.
    THOUGHT_START = 151667
    THOUGHT_END = 151668
    thinking_content = ""
    content = ""
    try:
        if THOUGHT_START in all_ids:
            start_idx = all_ids.index(THOUGHT_START)
            try:
                end_idx = all_ids.index(THOUGHT_END)
            except ValueError:
                end_idx = len(all_ids)
            thinking_ids = all_ids[start_idx + 1:end_idx]
            thinking_content = processor.decode(thinking_ids, skip_special_tokens=True).strip()
            # Content is everything after the closing reasoning token
            content_ids = all_ids[end_idx + 1:]
            content = processor.decode(content_ids, skip_special_tokens=True).strip()
        else:
            # Fallback if no reasoning tokens are found: slice off the input
            # tokens and decode the raw output.
            output_ids = all_ids[input_len:]
            content = processor.decode(output_ids, skip_special_tokens=True).strip()
    except Exception as e:
        print(f"Holo parsing error: {e}")
        content = processor.decode(all_ids[input_len:], skip_special_tokens=True).strip()

    # Parse the JSON content. Holo2 usually emits valid JSON such as
    # {"x": 500, "y": 300, "description": "search bar"} or
    # {"action": "click", "point": [100, 200]}; flatten both to a common format.
    actions = []
    try:
        if "{" in content and "}" in content:
            # Extract the JSON block in case it is surrounded by text
            json_str = re.search(r"(\{.*\})", content, re.DOTALL).group(1)
            data = json.loads(json_str)
            x, y = 0, 0
            if "x" in data and "y" in data:
                x, y = data["x"], data["y"]
            elif "point" in data:
                x, y = data["point"][0], data["point"][1]
            elif "coordinate" in data:
                x, y = data["coordinate"][0], data["coordinate"][1]
            if x or y:
                actions.append({
                    "type": "click",
                    "x": float(x),
                    "y": float(y),
                    "text": data.get("description", "") or data.get("text", ""),
                    "scale_base": 1000  # Holo2 coordinates are on a 0-1000 scale
                })
    except Exception as e:
        print(f"Holo JSON parse failed: {e}")
    return content, thinking_content, actions
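
# Illustrative example: a decoded content string of
# '{"action": "click", "point": [512, 288]}' yields
# [{"type": "click", "x": 512.0, "y": 288.0, "text": "", "scale_base": 1000}],
# which create_localized_image() below maps back to pixels via the 0-1000 scale.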

def create_localized_image(original_image: Image.Image, actions: List[Dict]) -> Optional[Image.Image]:
    if not actions: return None
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None
    for act in actions:
        x = act['x']
        y = act['y']
        # Holo2 special case (0-1000 scaling)
        if act.get('scale_base') == 1000:
            pixel_x = int((x / 1000) * width)
            pixel_y = int((y / 1000) * height)
        # Normalized (0-1)
        elif x <= 1.0 and y <= 1.0 and x > 0:
            pixel_x = int(x * width)
            pixel_y = int(y * height)
        # Absolute pixels
        else:
            pixel_x = int(x)
            pixel_y = int(y)
        color = 'red' if 'click' in act['type'].lower() else 'blue'
        # Draw target ring and center dot
        r = 15
        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
        draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)
        # Draw crosshair
        draw.line([pixel_x - 10, pixel_y, pixel_x + 10, pixel_y], fill=color, width=2)
        draw.line([pixel_x, pixel_y - 10, pixel_x, pixel_y + 10], fill=color, width=2)
        # Label
        label = f"{act['type']}"
        if act['text']: label += f": {act['text']}"
        text_pos = (pixel_x + 20, pixel_y - 10)
        bbox = draw.textbbox(text_pos, label, font=font)
        draw.rectangle((bbox[0] - 4, bbox[1] - 2, bbox[2] + 4, bbox[3] + 2), fill="black")
        draw.text(text_pos, label, fill="white", font=font)
    return img_copy
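
# Note: the coordinate-space detection above is heuristic. An explicit
# `scale_base` flag wins; otherwise values in (0, 1] are treated as normalized
# fractions and anything larger as absolute pixels, which only misfires for a
# literal click at pixel (1, 1) or smaller.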

# -----------------------------------------------------------------------------
# 5. CORE LOGIC
# -----------------------------------------------------------------------------
@spaces.GPU  # ZeroGPU: attach a GPU for the duration of this call
def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
    # Two return values throughout, matching outputs=[output_text, output_image]
    if input_numpy_image is None: return "⚠️ Please upload an image.", None
    input_pil_image = array_to_image(input_numpy_image)
    orig_w, orig_h = input_pil_image.size
    actions = []
    raw_response = ""
    reasoning_text = None

    # --- UI-TARS Logic ---
    if model_choice == "UI-TARS-1.5-7B":
        if model_x is None: return "Error: UI-TARS model failed to load.", None
        print("Running UI-TARS...")
        ip_params = get_image_proc_params(processor_x)
        resized_h, resized_w = smart_resize(
            input_pil_image.height, input_pil_image.width,
            factor=ip_params["patch_size"] * ip_params["merge_size"],
            min_pixels=ip_params["min_pixels"], max_pixels=ip_params["max_pixels"]
        )
        proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
        messages = get_uitars_prompt(task, proc_image)
        text_prompt = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            generated_ids = model_x.generate(**inputs, max_new_tokens=128)
        generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs["input_ids"], generated_ids)]
        raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
        actions = parse_uitars_response(raw_response)
        # Map coordinates from the resized input back to the original image
        scale_x = orig_w / resized_w
        scale_y = orig_h / resized_h
        for a in actions:
            a['x'] = int(a['x'] * scale_x)
            a['y'] = int(a['y'] * scale_y)

    # --- Holo2 Logic ---
    elif model_choice == "Holo2-8B":
        if model_h is None: return "Error: Holo2 model failed to load.", None
        print("Running Holo2...")
        messages = get_holo2_prompt(task, input_pil_image)
        text_prompt = processor_h.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor_h(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to(device)
        with torch.no_grad():
            generated_ids = model_h.generate(**inputs, max_new_tokens=512)
        # Split reasoning tokens from the JSON content
        input_len = len(inputs.input_ids[0])
        content, thinking, parsed_actions = parse_holo2_response(generated_ids, processor_h, input_len)
        raw_response = content
        reasoning_text = thinking
        actions = parsed_actions

    # --- Fara Logic ---
    else:
        if model_v is None: return "Error: Fara model failed to load.", None
        print("Running Fara...")
        messages = get_fara_prompt(task, input_pil_image)
        text_prompt = processor_v.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor_v(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to(device)
        with torch.no_grad():
            generated_ids = model_v.generate(**inputs, max_new_tokens=512)
        generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
        actions = parse_fara_response(raw_response)

    print(f"Raw: {raw_response}")
    if reasoning_text: print(f"Thinking: {reasoning_text}")

    # Visualize
    output_image = input_pil_image
    if actions:
        vis = create_localized_image(input_pil_image, actions)
        if vis: output_image = vis

    final_text_output = f"▶️ OUTPUT:\n{raw_response}"
    if reasoning_text:
        final_text_output = f"🧠 THINKING PROCESS:\n{reasoning_text}\n\n" + final_text_output
    return final_text_output, output_image

# -----------------------------------------------------------------------------
# 6. UI SETUP
# -----------------------------------------------------------------------------
with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
    gr.Markdown("# **CUA GUI Agent 🖥️**", elem_id="main-title")
    gr.Markdown("Upload a screenshot, select a model, and provide a task. The model will determine the precise UI coordinates and actions.")
    with gr.Row():
        with gr.Column(scale=2):
            input_image = gr.Image(label="Upload Screenshot", height=500)
            with gr.Row():
                model_choice = gr.Radio(
                    choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-8B"],
                    label="Select Model",
                    value="Fara-7B",
                    interactive=True
                )
            task_input = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g. Input the server address readyforquantum.com...",
                lines=2
            )
            submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")
        with gr.Column(scale=3):
            output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
            output_text = gr.Textbox(label="Model Output & Reasoning", lines=12, show_copy_button=True)

    submit_btn.click(
        fn=process_screenshot,
        inputs=[input_image, task_input, model_choice],
        outputs=[output_text, output_image]
    )

    gr.Examples(
        examples=[["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"]],
        inputs=[input_image, task_input, model_choice],
        label="Quick Examples"
    )

if __name__ == "__main__":
    demo.queue().launch()