import re
import json
import gc
from typing import Tuple, Optional
import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont
# Transformers & Qwen Utils
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# -----------------------------------------------------------------------------
# 1. CONSTANTS & SYSTEM PROMPT
# -----------------------------------------------------------------------------
# Available Models
MODELS = {
    "Fara-7B": "microsoft/Fara-7B",
    "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B",
}
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# System Prompt asking for JSON format
OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
You need to generate the next action to complete the task.
Output your action inside a <tool_call> block using JSON format.
Include "coordinate": [x, y] in pixels for interactions.
Examples:
<tool_call>
{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
</tool_call>
<tool_call>
{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}
</tool_call>
"""
# -----------------------------------------------------------------------------
# 2. MODEL MANAGEMENT
# -----------------------------------------------------------------------------
class ModelManager:
    def __init__(self):
        self.current_model_id = None
        self.model = None
        self.processor = None

    def load_model(self, model_key):
        model_id = MODELS.get(model_key)
        if not model_id:
            raise ValueError(f"Unknown model: {model_key}")
        # If the requested model is already loaded, skip the reload
        if self.current_model_id == model_id and self.model is not None:
            return
        print(f"--- Swapping model to {model_key} ({model_id}) ---")
        # Unload the previous model to free VRAM
        if self.model is not None:
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            torch.cuda.empty_cache()
            print("Previous model unloaded.")
        print(f"Loading {model_id}...")
        try:
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                device_map="auto" if DEVICE == "cuda" else None,
            )
            if DEVICE == "cpu":
                self.model.to("cpu")
            self.model.eval()
            self.current_model_id = model_id
            print(f"Successfully loaded {model_key}")
        except Exception as e:
            print(f"Error loading model {model_id}: {e}")
            raise

    def generate(self, model_key, messages, max_new_tokens=512):
        # Make sure the requested model is loaded
        self.load_model(model_key)
        # Prepare inputs: chat template plus vision preprocessing
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)
        # Generate, then strip the prompt tokens from the output
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
# Global instance
model_manager = ModelManager()
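# Usage sketch (illustrative only; the file name is hypothetical and the first
# call triggers a model download):
#
#   msgs = get_navigation_prompt("Click the search box", Image.open("shot.png"))
#   reply = model_manager.generate("Fara-7B", msgs)
#
# get_navigation_prompt is defined below; the manager lazily loads the chosen
# checkpoint and swaps it out when a different model key is requested.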
# -----------------------------------------------------------------------------
# 3. PARSING & VISUALIZATION LOGIC
# -----------------------------------------------------------------------------
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image.")
    return Image.fromarray(np.uint8(image_array))
def get_navigation_prompt(task, image):
    return [
        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"Instruction: {task}"},
        ]},
    ]
def parse_tool_calls(response: str) -> list[dict]:
    """
    Parses <tool_call>{JSON}</tool_call> tags.
    Falls back to a plain-coordinate regex if no valid JSON call is found.
    """
    actions = []
    # 1. Try the specific JSON tool-call format
    json_matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
    for match in json_matches:
        try:
            data = json.loads(match.strip())
            args = data.get("arguments", {})
            coords = args.get("coordinate", [])
            action_type = args.get("action", "unknown")
            text_content = args.get("text", "")
            if coords and isinstance(coords, list) and len(coords) == 2:
                actions.append({
                    "type": action_type,
                    "x": float(coords[0]),
                    "y": float(coords[1]),
                    "text": text_content,
                    "source": "json",
                })
        except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
            # Malformed JSON or non-numeric coordinates: skip this call
            continue
    # 2. Fallback: search for any [x, y] or (x, y) pattern if JSON parsing yielded nothing
    if not actions:
        coord_matches = re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response)
        for x, y in coord_matches:
            actions.append({
                "type": "click",  # Assume a click for raw coordinates
                "x": float(x),
                "y": float(y),
                "text": "",
                "source": "regex",
            })
    return actions
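# Round-trip sketch (illustrative literal, not real model output):
#
#   >>> parse_tool_calls('<tool_call>{"arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>')
#   [{'type': 'click', 'x': 400.0, 'y': 300.0, 'text': '', 'source': 'json'}]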
def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
    """Draws markers on the image based on parsed coordinates."""
    if not actions:
        return None
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None
    colors = {
        'type': 'blue',
        'click': 'red',
        'left_click': 'red',
        'right_click': 'purple',
        'double_click': 'orange',
        'unknown': 'green',
    }
    for act in actions:
        x = act['x']
        y = act['y']
        # Heuristic: values in (0, 1] are treated as normalized coordinates
        # and scaled up; anything larger is assumed to be pixels already.
        if 0 < x <= 1.0 and 0 < y <= 1.0:
            pixel_x = int(x * width)
            pixel_y = int(y * height)
        else:
            pixel_x = int(x)
            pixel_y = int(y)
        action_type = act['type']
        color = colors.get(action_type, 'green')
        # Draw the target: an outer ring plus a filled center dot
        r = 12
        draw.ellipse(
            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
            outline=color,
            width=4,
        )
        draw.ellipse(
            [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
            fill=color,
        )
        # Label the marker with the action type (and typed text, if any)
        label_text = f"{action_type}"
        if act['text']:
            label_text += f": '{act['text']}'"
        text_pos = (pixel_x + 15, pixel_y - 10)
        if font:
            # Black background box keeps the label readable on busy screenshots
            bbox = draw.textbbox(text_pos, label_text, font=font)
            draw.rectangle(bbox, fill="black")
            draw.text(text_pos, label_text, fill="white", font=font)
        else:
            draw.text(text_pos, label_text, fill="black")  # fallback without a font
    return img_copy
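# End-to-end sketch without a model (canned response, blank canvas):
#
#   canvas = Image.new("RGB", (800, 600), "white")
#   acts = parse_tool_calls('<tool_call>{"arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>')
#   annotated = create_localized_image(canvas, acts)  # red ring at (400, 300)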
# -----------------------------------------------------------------------------
# 4. GRADIO LOGIC
# -----------------------------------------------------------------------------
@spaces.GPU(duration=120)
def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    if input_numpy_image is None:
        return "⚠️ Please upload an image first.", None
    # Convert to PIL
    input_pil_image = array_to_image(input_numpy_image)
    # Build the prompt
    prompt = get_navigation_prompt(task, input_pil_image)
    # Generate the response
    print(f"Generating response with {model_choice}...")
    try:
        raw_response = model_manager.generate(model_choice, prompt, max_new_tokens=500)
    except Exception as e:
        return f"Error generating response: {str(e)}", None
    print(f"Raw Output:\n{raw_response}")
    # Parse actions
    actions = parse_tool_calls(raw_response)
    # Visualize: fall back to the unmodified screenshot if nothing was parsed
    output_image = input_pil_image
    if actions:
        visualized = create_localized_image(input_pil_image, actions)
        if visualized:
            output_image = visualized
    return raw_response, output_image
# -----------------------------------------------------------------------------
# 5. GRADIO UI SETUP
# -----------------------------------------------------------------------------
title = "CUA GUI Operator 🖥️"
description = """
This demo uses **Vision Language Models** to understand GUI screenshots and generate actions.
Select a model, upload a screenshot, and define a task.
"""
custom_css = """
#out_img { height: 600px; object-fit: contain; }
"""
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # Model selector (kept in sync with the MODELS registry)
            model_selector = gr.Dropdown(
                label="Choose CUA Model",
                choices=list(MODELS.keys()),
                value="Fara-7B",
                interactive=True,
            )
            input_image = gr.Image(label="Upload Screenshot", height=500)
            task_input = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g. Input the server address readyforquantum.com...",
                lines=2,
            )
            submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")
        with gr.Column():
            output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
            output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)
    # Wire up the button
    submit_btn.click(
        fn=process_screenshot,
        inputs=[model_selector, input_image, task_input],
        outputs=[output_text, output_image],
    )
    # Example for quick testing
    gr.Examples(
        examples=[
            ["Fara-7B", "./assets/google.png", "Search for 'Hugging Face'"],
        ],
        inputs=[model_selector, input_image, task_input],
        label="Quick Examples",
    )
if __name__ == "__main__":
    demo.queue().launch()