wxlsty323 commited on
Commit
3508f42
·
verified ·
1 Parent(s): 20608f5

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,59 @@
1
  ---
2
- title: Ui Agent
3
- emoji: 🚀
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.32.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ui-agent
3
+ app_file: main.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.19.2
 
 
6
  ---
7
+ # Mac Vision Control Agent
8
 
9
+ 基于本地视觉大模型的 MacOS 自动化控制代理,支持自然语言控制和界面操作。
10
+
11
+ ## 功能特性
12
+
13
+ - 基于 LLaVA 的本地视觉理解
14
+ - 实时屏幕截图和界面分析
15
+ - 自然语言指令解析
16
+ - 自动化鼠标键盘控制
17
+ - 界面元素精确定位
18
+ - 支持常见 Mac 应用操作
19
+
20
+ ## 系统要求
21
+
22
+ - MacBook Pro with Apple Silicon (M 系列芯片)
23
+ - MacOS Sonoma 或更高版本
24
+ - Python 3.9+
25
+ - 48GB+ RAM
26
+
27
+ ## 安装步骤
28
+
29
+ 1. 安装 Ollama
30
+ ```bash
31
+ curl -fsSL https://ollama.com/install.sh | sh
32
+ ```
33
+
34
+ 2. 下载 LLaVA 模型
35
+ ```bash
36
+ ollama pull llava
37
+ ```
38
+
39
+ 3. 安装 Python 依赖
40
+ ```bash
41
+ pip install -r requirements.txt
42
+ ```
43
+
44
+ 4. 运行应用
45
+ ```bash
46
+ python main.py
47
+ ```
48
+
49
+ ## 使用方法
50
+
51
+ 1. 启动应用后,会打开 Gradio Web 界面
52
+ 2. 在输入框中输入自然语言指令,如"打开 Safari 浏览器并导航到 Google"
53
+ 3. 系统会自动分析当前屏幕,识别相关界面元素并执行操作
54
+
55
+ ## 注意事项
56
+
57
+ - 首次运行时需要授予屏幕录制和辅助功能权限
58
+ - 建议在稳定的系统环境下运行
59
+ - 确保足够的系统内存用于模型运行
__pycache__/controller.cpython-312.pyc ADDED
Binary file (3.27 kB). View file
 
controller.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.screen_utils import ScreenController
2
+ from models.vision_model import VisionModel
3
+ import time
4
+
5
class Controller:
    """Coordinates the vision model and the screen controller to turn
    natural-language commands into concrete mouse/keyboard actions."""

    def __init__(self):
        self.screen_controller = ScreenController()
        self.vision_model = VisionModel()

    def execute_command(self, command):
        """Execute a natural-language command.

        Pipeline: interpret the command -> capture the screen -> locate the
        target element -> perform the planned action.

        Returns a (success: bool, message: str) tuple; never raises —
        failures are reported through the message string.
        """
        try:
            # 1. Interpret the command into an action plan.
            action_plan = self.vision_model.interpret_command(command)
            if not action_plan:
                return False, "无法解析命令"

            # 2. Capture the current screen.
            screen = self.screen_controller.capture_screen()

            # 3. Locate the target element on screen.
            location = self.vision_model.get_element_location(screen, action_plan['target'])
            if not location:
                return False, f"无法找到目标元素:{action_plan['target']}"

            # 4. Perform the planned action.
            success = self._perform_action(action_plan['action'], location, action_plan.get('params', {}))

            return success, "操作执行成功" if success else "操作执行失败"

        except Exception as e:
            # Top-level boundary: convert any unexpected failure into a
            # (False, message) result instead of crashing the UI.
            return False, f"执行命令时出错:{str(e)}"

    def _perform_action(self, action, location, params):
        """Dispatch a single low-level action.

        location: dict with numeric 'x'/'y' screen coordinates.
        params: action-specific extras ('text', 'key', 'clicks',
                'target_x'/'target_y').
        Returns True on success, False for missing coordinates, unknown
        actions, or a failed screen operation.
        """
        x, y = location.get('x'), location.get('y')

        # Bug fix: (0, y) and (x, 0) are valid screen coordinates; the old
        # truthiness check `not (x and y)` wrongly rejected them. Only a
        # genuinely missing coordinate is invalid.
        if x is None or y is None:
            return False

        if action == 'click':
            return self.screen_controller.click_position(x, y)
        elif action == 'type':
            # Click first to focus the target, then type.
            self.screen_controller.click_position(x, y)
            return self.screen_controller.type_text(params.get('text', ''))
        elif action == 'press':
            return self.screen_controller.press_key(params.get('key', ''))
        elif action == 'move':
            return self.screen_controller.move_to(x, y)
        elif action == 'drag':
            target_x = params.get('target_x')
            target_y = params.get('target_y')
            # Same zero-coordinate fix as above, applied to the destination.
            if target_x is not None and target_y is not None:
                self.screen_controller.move_to(x, y)
                return self.screen_controller.drag_to(target_x, target_y)
        elif action == 'scroll':
            return self.screen_controller.scroll(params.get('clicks', 0))

        return False
main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from controller import Controller
3
+ import time
4
+ import os
5
+
6
def process_command(command):
    """Handle one user-entered natural-language command; return the result message."""
    # A fresh Controller per request keeps invocations independent of each other.
    _, message = Controller().execute_command(command)
    return message
11
+
12
# Build the Gradio web UI.
def create_interface():
    """Create and return the Gradio Blocks interface: an intro banner,
    a command textbox, an execute button, and a result textbox."""
    intro = """
    # Mac Vision Control Agent

    使用自然语言控制您的 Mac。输入您想要执行的操作,系统将自动分析屏幕并执行相应操作。

    示例命令:
    - "打开 Safari 并访问 google.com"
    - "在 Finder 中打开下载文件夹"
    - "打开系统偏好设置中的显示器选项"
    """

    with gr.Blocks(title="Mac Vision Control Agent") as demo:
        gr.Markdown(intro)

        with gr.Row():
            command_box = gr.Textbox(
                label="输入命令",
                placeholder="请输入自然语言命令...",
                lines=2,
            )

        with gr.Row():
            run_button = gr.Button("执行")

        result_box = gr.Textbox(label="执行结果")

        # Wire the button to the command handler.
        run_button.click(fn=process_command, inputs=command_box, outputs=result_box)

    return demo
45
+
46
if __name__ == "__main__":
    app = create_interface()
    app.queue()  # enable request queueing
    app.launch(
        server_name="0.0.0.0",  # bind to all interfaces
        server_port=7861,
        share=True,
        auth=None,
        inbrowser=True,
        show_error=True,  # surface detailed errors in the UI
        quiet=False,      # keep verbose logs
    )
models/__pycache__/vision_model.cpython-312.pyc ADDED
Binary file (7 kB). View file
 
models/vision_model.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import base64
4
+ from PIL import Image
5
+ import io
6
+ import time
7
+
8
class VisionModel:
    """Client for a local Ollama LLaVA endpoint.

    Provides screen-image analysis, UI-element localization, and
    natural-language command interpretation via the Ollama HTTP API.
    """

    def __init__(self, model_name="llava:34b"):
        """Initialize the client.

        model_name: Ollama model tag to query (default "llava:34b").
        """
        self.model_name = model_name
        self.api_url = "http://127.0.0.1:11434/api/generate"
        self.max_retries = 3
        self.retry_delay = 2  # seconds between retry attempts

    def _check_ollama_service(self):
        """Return True if the local Ollama service answers /api/tags."""
        try:
            # Bug fix: the original GET had no timeout and could hang
            # indefinitely when the service was down.
            response = requests.get("http://127.0.0.1:11434/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            # Narrowed from a bare `except:` — only network/HTTP errors
            # mean "service unavailable".
            return False

    def encode_image(self, image):
        """Encode an image as a base64 PNG string.

        Accepts a file path, a PIL.Image, or an array-like screenshot
        (what ScreenController.capture_screen returns).
        Raises ValueError for unsupported input types.
        """
        if isinstance(image, str):
            with open(image, 'rb') as img_file:
                return base64.b64encode(img_file.read()).decode('utf-8')
        if hasattr(image, '__array_interface__'):
            # Bug fix: screen captures arrive as numpy arrays, which the
            # original rejected with ValueError; convert to PIL first.
            image = Image.fromarray(image)
        if isinstance(image, Image.Image):
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        raise ValueError("Unsupported image type")

    def _format_json_response(self, text):
        """Parse model output into a dict, tolerating surrounding prose.

        Tries a direct parse, then the outermost '{...}' span; falls back
        to a default click-action dict when nothing parses.
        """
        try:
            # Try parsing the whole response directly.
            return json.loads(text)
        except json.JSONDecodeError:
            try:
                # Extract the first '{' .. last '}' span and parse that.
                start = text.find('{')
                end = text.rfind('}') + 1
                if start >= 0 and end > start:
                    return json.loads(text[start:end])
            except json.JSONDecodeError:
                # Narrowed from a bare `except:`; only a failed parse
                # should fall through to the default below.
                pass

        # Unparseable output: return a safe default structure.
        return {
            "action": "click",
            "target": "未能识别的目标",
            "params": {}
        }

    def analyze_screen(self, image, prompt):
        """Send a screenshot plus prompt to the model; return raw response text.

        Retries up to max_retries with retry_delay between attempts.
        Returns None when the service is down or all attempts fail.
        """
        if not self._check_ollama_service():
            print("Ollama service is not available. Please make sure 'ollama serve' is running.")
            return None

        for attempt in range(self.max_retries):
            try:
                base64_image = self.encode_image(image)

                payload = {
                    "model": self.model_name,
                    "prompt": f"{prompt}\n请以JSON格式返回结果,确保返回的是合法的JSON字符串。",
                    "stream": False,
                    "images": [base64_image]
                }

                response = requests.post(self.api_url, json=payload, timeout=30)
                if response.status_code == 200:
                    result = response.json()
                    return result.get('response', '')
                print(f"API request failed with status code: {response.status_code}")
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")

            # Back off before the next attempt; report when retries run out.
            if attempt < self.max_retries - 1:
                time.sleep(self.retry_delay)
            else:
                print("Max retries exceeded")

        return None

    def get_element_location(self, image, element_description):
        """Ask the model where a described UI element is on screen.

        Returns a dict (expected keys: x, y, confidence — not validated
        here) or None when analysis fails.
        """
        prompt = f"""
        请找到并描述以下界面元素的位置:{element_description}
        请以JSON格式返回,必须包含以下字段:
        {{
            "x": 数字类型的x坐标,
            "y": 数字类型的y坐标,
            "confidence": 0到1之间的置信度
        }}
        """
        result = self.analyze_screen(image, prompt)
        if not result:
            return None
        return self._format_json_response(result)

    def interpret_command(self, command):
        """Parse a natural-language command into an action-plan dict.

        Returns a dict with 'action', 'target', 'params' keys, or None
        when the request or parsing fails.
        """
        try:
            prompt = f"""
            请将以下自然语言命令解析为具体的操作步骤:
            命令:{command}

            请以JSON格式返回,必须包含以下字段:
            {{
                "action": "click"/"type"/"press"/"move"/"drag"/"scroll" 中的一个,
                "target": "目标元素的具体描述",
                "params": {{
                    // 根据action类型可能包含:
                    "text": "要输入的文本",
                    "key": "要按下的按键",
                    "clicks": 滚动的距离,
                    "target_x": 目标x坐标,
                    "target_y": 目标y坐标
                }}
            }}
            """

            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False
            }

            response = requests.post(self.api_url, json=payload, timeout=30)
            if response.status_code == 200:
                result = response.json()
                return self._format_json_response(result.get('response', '{}'))
            raise Exception(f"API request failed with status code: {response.status_code}")

        except Exception as e:
            print(f"Error interpreting command: {e}")
            return None
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pyautogui==0.9.54
2
+ pillow==10.2.0
3
+ gradio==4.19.2
4
+ numpy==1.26.4
5
+ python-dotenv==1.0.1
6
+ requests==2.31.0
7
+ pynput==1.7.6
8
+ opencv-python==4.9.0.80
utils/__pycache__/screen_utils.cpython-312.pyc ADDED
Binary file (3.93 kB). View file
 
utils/screen_utils.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyautogui
2
+ import numpy as np
3
+ import cv2
4
+ from PIL import Image
5
+ import time
6
+ from pynput import mouse, keyboard
7
+
8
class ScreenController:
    """Thin wrapper around pyautogui for screen capture and input automation."""

    def __init__(self):
        # PyAutoGUI safety settings: slamming the mouse into a screen
        # corner aborts, and every call pauses briefly.
        pyautogui.FAILSAFE = True
        pyautogui.PAUSE = 0.5

    def _guarded(self, label, operation):
        """Run `operation()`; return True on success, otherwise print an
        error tagged with `label` and return False."""
        try:
            operation()
            return True
        except Exception as e:
            print(f"Error {label}: {e}")
            return False

    def capture_screen(self):
        """Return the current screen contents as a numpy array."""
        return np.array(pyautogui.screenshot())

    def find_element_on_screen(self, template_image, confidence=0.8):
        """Locate a template image on screen; return its region or None."""
        try:
            location = pyautogui.locateOnScreen(template_image, confidence=confidence)
            return location if location else None
        except Exception as e:
            print(f"Error finding element: {e}")
            return None

    def click_position(self, x, y):
        """Click at screen position (x, y)."""
        return self._guarded("clicking position", lambda: pyautogui.click(x, y))

    def type_text(self, text):
        """Type the given text at the current focus."""
        return self._guarded("typing text", lambda: pyautogui.write(text))

    def press_key(self, key):
        """Press a single named key."""
        return self._guarded("pressing key", lambda: pyautogui.press(key))

    def move_to(self, x, y, duration=0.5):
        """Move the mouse to (x, y) over `duration` seconds."""
        return self._guarded("moving mouse", lambda: pyautogui.moveTo(x, y, duration=duration))

    def drag_to(self, x, y, duration=0.5):
        """Drag (button held) to (x, y) over `duration` seconds."""
        return self._guarded("dragging", lambda: pyautogui.dragTo(x, y, duration=duration))

    def scroll(self, clicks):
        """Scroll by the given number of clicks (positive scrolls up)."""
        return self._guarded("scrolling", lambda: pyautogui.scroll(clicks))