Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -5,8 +5,7 @@ import time
 import unicodedata
 import gc
 from io import BytesIO
-from typing import Iterable
-from typing import Tuple, Optional, List, Dict, Any
+from typing import Iterable, Tuple, Optional, List, Dict, Any
 
 import gradio as gr
 import numpy as np
@@ -94,7 +93,6 @@ class OrangeRedTheme(Soft):
 
 orange_red_theme = OrangeRedTheme()
 
-# --- Device Setup ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on device: {device}")
 
@@ -129,16 +127,19 @@ except Exception as e:
     processor_x = None
 
 print("🔄 Loading Holo2-4B...")
-MODEL_ID_H = "Hcompany/Holo2-4B"
+MODEL_ID_H = "Hcompany/Holo2-4B"
 try:
     processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
     model_h = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID_H,
         trust_remote_code=True,
         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-    ).to(device).eval()
+        device_map="auto" if device == "cuda" else None
+    ).eval()
+    if device == "cpu":
+        model_h = model_h.to(device)
 except Exception as e:
-    print(f"Failed to load Holo2: {e}")
+    print(f"Failed to load Holo2-4B: {e}")
     model_h = None
     processor_h = None
 
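Note on the loading change above: with device_map="auto", accelerate decides device placement, so downstream code should move inputs to model_h.device rather than to the global device string (the diff does exactly that later with .to(model_h.device)). A minimal sketch of the pattern, assuming model_h and processor_h loaded as above and a placeholder prompt:

    # Inputs must follow the model's device when device_map="auto" is in play.
    inputs = processor_h(text=["example prompt"], return_tensors="pt").to(model_h.device)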
@@ -177,17 +178,7 @@ def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str
         return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     if tok is not None and hasattr(tok, "apply_chat_template"):
         return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    texts = []
-    for m in messages:
-        content = m.get("content", "")
-        if isinstance(content, list):
-            for c in content:
-                if isinstance(c, dict) and c.get("type") == "text":
-                    texts.append(c.get("text", ""))
-        elif isinstance(content, str):
-            texts.append(content)
-    return "\n".join(texts)
+    return ""
 
 def batch_decode_compat(processor, token_id_batches, **kw):
     tok = getattr(processor, "tokenizer", None)
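Behavior change worth noting: the manual text-concatenation fallback is gone, so apply_chat_template_compat now returns an empty string when neither the processor nor its tokenizer exposes apply_chat_template. A caller that needs a non-empty prompt may want a guard; a hedged sketch (the error message is illustrative):

    prompt = apply_chat_template_compat(processor_h, messages)
    if not prompt:
        raise ValueError("processor has no apply_chat_template; cannot build a prompt")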
@@ -205,7 +196,7 @@ def trim_generated(generated_ids, inputs):
         return generated_ids
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
 
-# ---
+# --- Prompt Builders ---
 
 def get_fara_prompt(task, image):
     OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
@@ -237,15 +228,23 @@ def get_localization_prompt(task, image):
         }
     ]
 
-# ---
+def get_holo2_messages(task, image):
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": task}
+            ]
+        }
+    ]
+
+# --- Response Parsers ---
 
 def parse_click_response(text: str) -> List[Dict]:
-    """Parses standard (x,y) text responses from TARS/General VLMs"""
     actions = []
     text = text.strip()
 
-    print(f"Parsing click-style output: {text}")
-
     matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
     for m in matches_click:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
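For reference, the Click regex above tolerates arbitrary whitespace and is case-insensitive; a quick standalone check (sample string is illustrative):

    import re
    text = "click ( 512 , 384 )"
    print(re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE))
    # -> [('512', '384')]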
@@ -262,6 +261,7 @@ def parse_click_response(text: str) -> List[Dict]:
     for m in matches_tuple:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
 
+    # Deduplicate
     unique_actions = []
     seen = set()
     for a in actions:
@@ -273,7 +273,6 @@ def parse_click_response(text: str) -> List[Dict]:
     return unique_actions
 
 def parse_fara_response(response: str) -> List[Dict]:
-    """Parses Fara's specific tool_call JSON format"""
     actions = []
     matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
     for match in matches:
@@ -292,35 +291,44 @@ def parse_fara_response(response: str) -> List[Dict]:
             pass
     return actions
 
-def parse_holo_reasoning(generated_ids, processor):
-    """..."""
+def parse_holo2_reasoning(processor, generated_ids) -> tuple[str, str]:
+    """Parse content from generated_ids specifically for Holo2"""
     all_ids = generated_ids[0].tolist()
 
-    # ...
+    # Try to find thinking block indices
     try:
         think_start_index = all_ids.index(151667)
     except ValueError:
         think_start_index = -1
-
+
     try:
         think_end_index = all_ids.index(151668)
     except ValueError:
-        think_end_index = ...
-
-    thinking_content = ""
-    if think_start_index != -1:
-        thinking_content = processor.decode(
-            all_ids[think_start_index+1 : think_end_index],
-            skip_special_tokens=True
-        ).strip("\n")
+        think_end_index = -1
 
-    content = processor.decode(
-        all_ids[think_end_index+1:],
-        skip_special_tokens=True
-    ).strip("\n")
+    if think_start_index != -1 and think_end_index != -1:
+        thinking_content = processor.decode(all_ids[think_start_index+1:think_end_index], skip_special_tokens=True).strip("\n")
+        content = processor.decode(all_ids[think_end_index+1:], skip_special_tokens=True).strip("\n")
+    else:
+        # If no thinking tags or incomplete, decode everything
+        thinking_content = ""
+        content = processor.decode(all_ids, skip_special_tokens=True).strip("\n")
+
     return content, thinking_content
 
-# ---
+def parse_holo2_json(content: str) -> List[Dict]:
+    actions = []
+    try:
+        # Clean potential markdown
+        cleaned = content.replace("```json", "").replace("```", "").strip()
+        data = json.loads(cleaned)
+        if "x" in data and "y" in data:
+            actions.append({"type": "click", "x": data["x"], "y": data["y"], "text": ""})
+    except json.JSONDecodeError:
+        print(f"Failed to parse Holo2 JSON: {content}")
+    return actions
+
+# --- Visualizer ---
 
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions: return None
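The hard-coded ids 151667/151668 are presumably the <think>/</think> special tokens of the Qwen-family tokenizer that Holo2 builds on (an assumption; the diff does not say so). Resolving them by string is more robust against vocabulary changes; a hedged sketch:

    # Sketch: look the ids up instead of hard-coding them.
    tok = getattr(processor_h, "tokenizer", processor_h)
    think_start_id = tok.convert_tokens_to_ids("<think>")   # expected 151667 here
    think_end_id = tok.convert_tokens_to_ids("</think>")    # expected 151668 here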
@@ -337,15 +345,18 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         y = act['y']
 
         pixel_x, pixel_y = int(x), int(y)
-
         color = 'red' if 'click' in act['type'].lower() else 'blue'
 
-        ...
-        ...
+        # Draw Cross and Circle style (as requested by user preference)
+        cross_size = 20
+        # Horizontal line
+        draw.line([pixel_x - cross_size, pixel_y, pixel_x + cross_size, pixel_y], fill=color, width=4)
+        # Vertical line
+        draw.line([pixel_x, pixel_y - cross_size, pixel_x, pixel_y + cross_size], fill=color, width=4)
 
-        # ...
-        ...
-        draw.ellipse([pixel_x - ...
+        # Circle
+        r = 15
+        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
 
         label = f"{act['type'].capitalize()}"
         if act.get('text'): label += f": \"{act['text']}\""
@@ -355,10 +366,10 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
     try:
         bbox = draw.textbbox(text_pos, label, font=font)
         padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
-        draw.rectangle(padded_bbox, fill="...
-        draw.text(text_pos, label, fill="...
-    except Exception ...
-        draw.text(text_pos, label, fill="...
+        draw.rectangle(padded_bbox, fill="yellow", outline=color)
+        draw.text(text_pos, label, fill="black", font=font)
+    except Exception:
+        draw.text(text_pos, label, fill="black")
 
     return img_copy
 
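A quick self-contained exercise of the visualizer (dummy image and action, not part of the app):

    from PIL import Image
    img = Image.new("RGB", (400, 300), "white")
    acts = [{"type": "click", "x": 200, "y": 150, "text": "Submit"}]
    vis = create_localized_image(img, acts)
    if vis: vis.save("preview.png")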
@@ -372,9 +383,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
     input_pil_image = array_to_image(input_numpy_image)
     orig_w, orig_h = input_pil_image.size
     actions = []
-    final_text_response = ""
+    raw_response = ""
+    thinking_output = ""
 
-    # --- Fara-7B ---
     if model_choice == "Fara-7B":
         if model_v is None: return "Error: Fara model failed to load on startup.", None
         print("Using Fara Pipeline...")
@@ -397,76 +408,63 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
 
         generated_ids = trim_generated(generated_ids, inputs)
         raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        final_text_response = raw_response
         actions = parse_fara_response(raw_response)
 
-    # --- Holo2-4B ---
     elif model_choice == "Holo2-4B":
-        if model_h is None: return "Error: Holo2 model failed to load.", None
+        if model_h is None: return "Error: Holo2-4B model failed to load.", None
         print("Using Holo2-4B Pipeline...")
 
-        # Holo2 ...
-        ...
-        ...
-        ...
-        ...
-        ...
-        ...
+        # Specific Holo2 resizing logic
+        ip_config = processor_h.image_processor
+        resized_h, resized_w = smart_resize(
+            input_pil_image.height,
+            input_pil_image.width,
+            factor=ip_config.patch_size * ip_config.merge_size,
+            min_pixels=ip_config.size.get("shortest_edge", 256*256),
+            max_pixels=ip_config.size.get("longest_edge", 1280*1280),
+        )
+        processed_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
 
-        ...
-        ...
-        ...
+        messages = get_holo2_messages(task, processed_image)
+
+        # Apply template with thinking=False for localization as per documentation/snippet
+        text_prompt = processor_h.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            thinking=False
+        )
 
         inputs = processor_h(
-            text=[text_prompt],
-            images=...,
-            padding=True,
+            text=[text_prompt],
+            images=[processed_image],
+            padding=True,
             return_tensors="pt"
-        )
-
-
+        ).to(model_h.device)
+
         with torch.no_grad():
-            ...
-            ...
-            ...
-            ...
-            ...
-            content, thinking = parse_holo_reasoning(generated_ids_trimmed, processor_h)
+            generated_ids = model_h.generate(**inputs, max_new_tokens=128)
+
+        # Parse reasoning/content
+        content, thinking_output = parse_holo2_reasoning(processor_h, trim_generated(generated_ids, inputs))
+        raw_response = content
 
-        ...
+        if thinking_output:
+            raw_response = f"[Thinking Process]:\n{thinking_output}\n\n[Action]:\n{content}"
+
+        actions = parse_holo2_json(content)
 
-        # ...
-        ...
-        ...
-        ...
-        ...
-        ...
-                norm_x = data.get("x", 0)
-                norm_y = data.get("y", 0)
-
-                # Convert 0-1000 scale to original image pixels
-                pixel_x = (norm_x / 1000) * orig_w
-                pixel_y = (norm_y / 1000) * orig_h
-
-                actions.append({
-                    "type": "click",
-                    "x": int(pixel_x),
-                    "y": int(pixel_y),
-                    "text": "Target"
-                })
-
-        except json.JSONDecodeError:
-            print(f"Failed to parse Holo2 JSON: {content}")
-        except Exception as e:
-            print(f"Error processing Holo2 output: {e}")
+        # Handle Holo2 coordinate normalization (0-1000) relative to image
+        # Math: (x_norm / 1000) * orig_w
+        for a in actions:
+            a['x'] = (a['x'] / 1000) * orig_w
+            a['y'] = (a['y'] / 1000) * orig_h
 
-    # --- UI-TARS-1.5-7B ---
     elif model_choice == "UI-TARS-1.5-7B":
         if model_x is None: return "Error: UI-TARS model failed to load.", None
         print("Using UI-TARS Pipeline...")
 
         ip_params = get_image_proc_params(processor_x)
-
         resized_h, resized_w = smart_resize(
             input_pil_image.height, input_pil_image.width,
             factor=ip_params["patch_size"] * ip_params["merge_size"],
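The normalization loop above assumes Holo2 answers on a 0-1000 grid regardless of input resolution, so the rescale is a plain proportion. Worked example for a 1920x1080 screenshot:

    # (437, 512) on the 0-1000 grid -> original-image pixels
    x = (437 / 1000) * 1920   # 839.04
    y = (512 / 1000) * 1080   # 552.96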
@@ -486,11 +484,10 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
 
         generated_ids = trim_generated(generated_ids, inputs)
         raw_response = batch_decode_compat(processor_x, generated_ids, skip_special_tokens=True)[0]
-        final_text_response = raw_response
 
         actions = parse_click_response(raw_response)
 
-        # ...
+        # UI-TARS returns coordinates relative to resized image size
        if resized_w > 0 and resized_h > 0:
            scale_x = orig_w / resized_w
            scale_y = orig_h / resized_h
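UI-TARS, by contrast, answers in resized-image pixels, hence the scale-back by orig/resized. Worked example, assuming a 1000x800 original that smart_resize mapped to 980x784:

    scale_x = 1000 / 980          # ~1.0204
    scale_y = 800 / 784           # ~1.0204
    x_orig = int(490 * scale_x)   # 500
    y_orig = int(392 * scale_y)   # 400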
@@ -501,17 +498,17 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
     else:
         return f"Error: Unknown model '{model_choice}'", None
 
+    print(f"Raw Output: {raw_response}")
     print(f"Parsed Actions: {actions}")
 
-    # Generate visual output
     output_image = input_pil_image
     if actions:
         vis = create_localized_image(input_pil_image, actions)
         if vis: output_image = vis
 
-    return final_text_response, output_image
+    return raw_response, output_image
 
-# ---
+# --- UI Setup ---
 
 css="""
 #col-container {
@@ -545,7 +542,7 @@ with gr.Blocks() as demo:
 
         with gr.Column(scale=3):
             output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-            output_text = gr.Textbox(label="Agent Model Response...
+            output_text = gr.Textbox(label="Agent Model Response", lines=10)
 
     submit_btn.click(
         fn=process_screenshot,