import re
import json
import gc
from typing import Tuple, Optional

import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont

# Transformers & Qwen Utils
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# -----------------------------------------------------------------------------
# 1. CONSTANTS & SYSTEM PROMPT
# -----------------------------------------------------------------------------

# Available Models
MODELS = {
    "Fara-7B": "microsoft/Fara-7B",
    "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B",
}
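# Note: both entries are assumed here to be Qwen2.5-VL-compatible checkpoints,
# since ModelManager below loads every key with Qwen2_5_VLForConditionalGeneration.
# Any model added to this dict must ship weights in that architecture.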
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# System Prompt asking for JSON format
OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
You need to generate the next action to complete the task.
Output your action inside a <tool_call> block using JSON format.
Include "coordinate": [x, y] in pixels for interactions.
Examples:
<tool_call>
{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
</tool_call>
<tool_call>
{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}
</tool_call>
"""
# -----------------------------------------------------------------------------
# 2. MODEL MANAGEMENT
# -----------------------------------------------------------------------------
class ModelManager:
    def __init__(self):
        self.current_model_id = None
        self.model = None
        self.processor = None

    def load_model(self, model_key):
        model_id = MODELS.get(model_key)
        if not model_id:
            raise ValueError(f"Unknown model: {model_key}")

        # If already loaded, skip
        if self.current_model_id == model_id and self.model is not None:
            return

        print(f"--- Swapping model to {model_key} ({model_id}) ---")

        # Unload previous model to save VRAM
        if self.model is not None:
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("Previous model unloaded.")

        print(f"Loading {model_id}...")
        try:
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                device_map="auto" if DEVICE == "cuda" else None,
            )
            if DEVICE == "cpu":
                self.model.to("cpu")
            self.model.eval()
            self.current_model_id = model_id
            print(f"Successfully loaded {model_key}")
        except Exception as e:
            print(f"Error loading model {model_id}: {e}")
            raise  # Re-raise with the original traceback so the caller sees the failure
    def generate(self, model_key, messages, max_new_tokens=512):
        # Ensure correct model is loaded (swaps weights if a different one was requested)
        self.load_model(model_key)

        # Prepare inputs
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        # Generate, then strip the prompt tokens from each output sequence
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

# Global instance
model_manager = ModelManager()
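# Minimal usage sketch (illustrative only, not executed at import time; assumes a
# local "screenshot.png" exists and enough VRAM for a 7B checkpoint):
#
#   from PIL import Image
#   msgs = get_navigation_prompt("Click the search box", Image.open("screenshot.png"))
#   print(model_manager.generate("Fara-7B", msgs))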
# -----------------------------------------------------------------------------
# 3. PARSING & VISUALIZATION LOGIC
# -----------------------------------------------------------------------------
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image.")
    return Image.fromarray(np.uint8(image_array))


def get_navigation_prompt(task, image):
    return [
        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"Instruction: {task}"},
        ]},
    ]

def parse_tool_calls(response: str) -> list[dict]:
    """
    Parses <tool_call>{JSON}</tool_call> tags.
    Falls back to a plain-coordinate regex when no valid JSON call is found.
    """
    actions = []

    # 1. Try the specific JSON tool-call format
    json_matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
    for match in json_matches:
        try:
            data = json.loads(match.strip())
            args = data.get("arguments", {})
            coords = args.get("coordinate", [])
            action_type = args.get("action", "unknown")
            text_content = args.get("text", "")
            if coords and isinstance(coords, list) and len(coords) == 2:
                actions.append({
                    "type": action_type,
                    "x": float(coords[0]),
                    "y": float(coords[1]),
                    "text": text_content,
                    "source": "json",
                })
        except Exception:
            # Skip malformed tool calls rather than failing the whole parse
            continue

    # 2. Fallback: search for any [x, y] or (x, y) pattern if JSON parsing yielded nothing
    if not actions:
        # Regex for [123, 456] or (123, 456)
        coord_matches = re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response)
        for x, y in coord_matches:
            actions.append({
                "type": "click",  # Assume click for raw coords
                "x": float(x),
                "y": float(y),
                "text": "",
                "source": "regex",
            })
    return actions
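# Example (illustrative): given a raw response such as
#   <tool_call>
#   {"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
#   </tool_call>
# parse_tool_calls returns:
#   [{"type": "click", "x": 400.0, "y": 300.0, "text": "", "source": "json"}]
# A bare "(400, 300)" with no <tool_call> tags yields the same target via the
# regex fallback, with "source": "regex".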

def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
    """Draws markers on the image based on parsed pixel coordinates."""
    if not actions:
        return None

    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size

    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    colors = {
        'type': 'blue',
        'click': 'red',
        'left_click': 'red',
        'right_click': 'purple',
        'double_click': 'orange',
        'unknown': 'green',
    }

    for act in actions:
        x = act['x']
        y = act['y']

        # Coordinate normalization check: values in (0, 1] are treated as
        # fractions of the image size; anything larger as raw pixels
        if x <= 1.0 and y <= 1.0 and x > 0:
            pixel_x = int(x * width)
            pixel_y = int(y * height)
        else:
            pixel_x = int(x)
            pixel_y = int(y)

        action_type = act['type']
        color = colors.get(action_type, 'green')

        # Draw target: outer ring plus a filled center dot
        r = 12
        draw.ellipse(
            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
            outline=color,
            width=4,
        )
        draw.ellipse(
            [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
            fill=color,
        )

        # Label
        label_text = f"{action_type}"
        if act['text']:
            label_text += f": '{act['text']}'"
        text_pos = (pixel_x + 15, pixel_y - 10)
        # Bounding box for text background
        if font:
            bbox = draw.textbbox(text_pos, label_text, font=font)
            draw.rectangle(bbox, fill="black")
            draw.text(text_pos, label_text, fill="white", font=font)
        else:
            draw.text(text_pos, label_text, fill="black")  # fallback without a font

    return img_copy
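# Illustrative call (assumes a local "screenshot.png"; coordinates here are pixels):
#
#   img = Image.open("screenshot.png")
#   annotated = create_localized_image(
#       img, [{"type": "click", "x": 400, "y": 300, "text": ""}]
#   )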
# -----------------------------------------------------------------------------
# 4. GRADIO LOGIC
# -----------------------------------------------------------------------------
@spaces.GPU  # Required on ZeroGPU Spaces so this handler runs with GPU access
def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    if input_numpy_image is None:
        return "⚠️ Please upload an image first.", None

    # Convert to PIL
    input_pil_image = array_to_image(input_numpy_image)

    # Build prompt
    prompt = get_navigation_prompt(task, input_pil_image)

    # Generate response
    print(f"Generating response with {model_choice}...")
    try:
        raw_response = model_manager.generate(model_choice, prompt, max_new_tokens=500)
    except Exception as e:
        return f"Error generating response: {str(e)}", None
    print(f"Raw Output:\n{raw_response}")

    # Parse actions
    actions = parse_tool_calls(raw_response)

    # Visualize; fall back to the unannotated screenshot if nothing was parsed
    output_image = input_pil_image
    if actions:
        visualized = create_localized_image(input_pil_image, actions)
        if visualized:
            output_image = visualized

    return raw_response, output_image
# -----------------------------------------------------------------------------
# 5. GRADIO UI SETUP
# -----------------------------------------------------------------------------
title = "CUA GUI Operator 🖥️"
description = """
This demo uses **Vision Language Models** to understand GUI screenshots and generate actions.
Select a model, upload a screenshot, and define a task.
"""
custom_css = """
#out_img { height: 600px; object-fit: contain; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            # Model selector (choices come from the MODELS dict keys)
            model_selector = gr.Dropdown(
                label="Choose CUA Model",
                choices=list(MODELS),
                value="Fara-7B",
                interactive=True,
            )
            input_image = gr.Image(label="Upload Screenshot", height=500)
            task_input = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g. Input the server address readyforquantum.com...",
                lines=2,
            )
            submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")
        with gr.Column():
            output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
            output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)

    # Wire up the button
    submit_btn.click(
        fn=process_screenshot,
        inputs=[model_selector, input_image, task_input],
        outputs=[output_text, output_image],
    )

    # Example for quick testing
    gr.Examples(
        examples=[
            ["Fara-7B", "./assets/google.png", "Search for 'Hugging Face'"],
        ],
        inputs=[model_selector, input_image, task_input],
        label="Quick Examples",
    )

if __name__ == "__main__":
    demo.queue().launch()
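# Local run note (assumption: this file is the Space's app.py): `python app.py`
# starts the queued Gradio server locally; on a Hugging Face ZeroGPU Space the
# @spaces.GPU-decorated handler above acquires a GPU per request instead.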