wxlsty323 commited on
Commit
3508f42
·
verified ·
1 Parent(s): 20608f5

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,59 @@
1
  ---
2
- title: Ui Agent
3
- emoji: 🚀
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.32.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ui-agent
3
+ app_file: main.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.19.2
 
 
6
  ---
7
+ # Mac Vision Control Agent
8
 
9
+ 基于本地视觉大模型的 MacOS 自动化控制代理,支持自然语言控制和界面操作。
10
+
11
+ ## 功能特性
12
+
13
+ - 基于 LLaVA 的本地视觉理解
14
+ - 实时屏幕截图和界面分析
15
+ - 自然语言指令解析
16
+ - 自动化鼠标键盘控制
17
+ - 界面元素精确定位
18
+ - 支持常见 Mac 应用操作
19
+
20
+ ## 系统要求
21
+
22
+ - MacBook Pro with Apple Silicon (M 系列芯片)
23
+ - MacOS Sonoma 或更高版本
24
+ - Python 3.9+
25
+ - 48GB+ RAM
26
+
27
+ ## 安装步骤
28
+
29
+ 1. 安装 Ollama
30
+ ```bash
31
+ curl -fsSL https://ollama.com/install.sh | sh
32
+ ```
33
+
34
+ 2. 下载 LLaVA 模型
35
+ ```bash
36
+ ollama pull llava
37
+ ```
38
+
39
+ 3. 安装 Python 依赖
40
+ ```bash
41
+ pip install -r requirements.txt
42
+ ```
43
+
44
+ 4. 运行应用
45
+ ```bash
46
+ python main.py
47
+ ```
48
+
49
+ ## 使用方法
50
+
51
+ 1. 启动应用后,会打开 Gradio Web 界面
52
+ 2. 在输入框中输入自然语言指令,如"打开 Safari 浏览器并导航到 Google"
53
+ 3. 系统会自动分析当前屏幕,识别相关界面元素并执行操作
54
+
55
+ ## 注意事项
56
+
57
+ - 首次运行时需要授予屏幕录制和辅助功能权限
58
+ - 建议在稳定的系统环境下运行
59
+ - 确保足够的系统内存用于模型运行
__pycache__/controller.cpython-312.pyc ADDED
Binary file (3.27 kB). View file
 
controller.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.screen_utils import ScreenController
2
+ from models.vision_model import VisionModel
3
+ import time
4
+
5
class Controller:
    """Coordinates the vision model and the screen controller to turn
    natural-language commands into concrete mouse/keyboard actions."""

    def __init__(self):
        self.screen_controller = ScreenController()
        self.vision_model = VisionModel()

    def execute_command(self, command):
        """Execute a natural-language command.

        Pipeline: interpret the command -> capture the screen -> locate the
        target element -> perform the planned action.

        Returns a (success: bool, message: str) tuple; never raises —
        failures are reported through the message string.
        """
        try:
            # 1. Interpret the command into an action plan.
            action_plan = self.vision_model.interpret_command(command)
            if not action_plan:
                return False, "无法解析命令"

            # 2. Capture the current screen.
            screen = self.screen_controller.capture_screen()

            # 3. Locate the target element on screen.
            location = self.vision_model.get_element_location(screen, action_plan['target'])
            if not location:
                return False, f"无法找到目标元素:{action_plan['target']}"

            # 4. Perform the planned action.
            success = self._perform_action(action_plan['action'], location, action_plan.get('params', {}))

            return success, "操作执行成功" if success else "操作执行失败"

        except Exception as e:
            # Top-level boundary: convert any unexpected failure into a
            # (False, message) result instead of crashing the UI.
            return False, f"执行命令时出错:{str(e)}"

    def _perform_action(self, action, location, params):
        """Dispatch a single low-level action.

        location: dict with numeric 'x'/'y' screen coordinates.
        params: action-specific extras ('text', 'key', 'clicks',
                'target_x'/'target_y').
        Returns True on success, False for missing coordinates, unknown
        actions, or a failed screen operation.
        """
        x, y = location.get('x'), location.get('y')

        # Bug fix: (0, y) and (x, 0) are valid screen coordinates; the old
        # truthiness check `not (x and y)` wrongly rejected them. Only a
        # genuinely missing coordinate is invalid.
        if x is None or y is None:
            return False

        if action == 'click':
            return self.screen_controller.click_position(x, y)
        elif action == 'type':
            # Click first to focus the target, then type.
            self.screen_controller.click_position(x, y)
            return self.screen_controller.type_text(params.get('text', ''))
        elif action == 'press':
            return self.screen_controller.press_key(params.get('key', ''))
        elif action == 'move':
            return self.screen_controller.move_to(x, y)
        elif action == 'drag':
            target_x = params.get('target_x')
            target_y = params.get('target_y')
            # Same zero-coordinate fix as above, applied to the destination.
            if target_x is not None and target_y is not None:
                self.screen_controller.move_to(x, y)
                return self.screen_controller.drag_to(target_x, target_y)
        elif action == 'scroll':
            return self.screen_controller.scroll(params.get('clicks', 0))

        return False
main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from controller import Controller
3
+ import time
4
+ import os
5
+
6
def process_command(command):
    """Handle one user-entered natural-language command; return the result message."""
    # A fresh Controller per request keeps invocations independent of each other.
    _, message = Controller().execute_command(command)
    return message
11
+
12
# Build the Gradio web UI.
def create_interface():
    """Create and return the Gradio Blocks interface: an intro banner,
    a command textbox, an execute button, and a result textbox."""
    intro = """
    # Mac Vision Control Agent

    使用自然语言控制您的 Mac。输入您想要执行的操作,系统将自动分析屏幕并执行相应操作。

    示例命令:
    - "打开 Safari 并访问 google.com"
    - "在 Finder 中打开下载文件夹"
    - "打开系统偏好设置中的显示器选项"
    """

    with gr.Blocks(title="Mac Vision Control Agent") as demo:
        gr.Markdown(intro)

        with gr.Row():
            command_box = gr.Textbox(
                label="输入命令",
                placeholder="请输入自然语言命令...",
                lines=2,
            )

        with gr.Row():
            run_button = gr.Button("执行")

        result_box = gr.Textbox(label="执行结果")

        # Wire the button to the command handler.
        run_button.click(fn=process_command, inputs=command_box, outputs=result_box)

    return demo
45
+
46
if __name__ == "__main__":
    app = create_interface()
    app.queue()  # enable request queueing
    app.launch(
        server_name="0.0.0.0",  # bind to all interfaces
        server_port=7861,
        share=True,
        auth=None,
        inbrowser=True,
        show_error=True,  # surface detailed errors in the UI
        quiet=False,      # keep verbose logs
    )
models/__pycache__/vision_model.cpython-312.pyc ADDED
Binary file (7 kB). View file
 
models/vision_model.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import base64
4
+ from PIL import Image
5
+ import io
6
+ import time
7
+
8
class VisionModel:
    """Client for a local Ollama LLaVA endpoint.

    Provides screen-image analysis, UI-element localization, and
    natural-language command interpretation via the Ollama HTTP API.
    """

    def __init__(self, model_name="llava:34b"):
        """Initialize the client.

        model_name: Ollama model tag to query (default "llava:34b").
        """
        self.model_name = model_name
        self.api_url = "http://127.0.0.1:11434/api/generate"
        self.max_retries = 3
        self.retry_delay = 2  # seconds between retry attempts

    def _check_ollama_service(self):
        """Return True if the local Ollama service answers /api/tags."""
        try:
            # Bug fix: the original GET had no timeout and could hang
            # indefinitely when the service was down.
            response = requests.get("http://127.0.0.1:11434/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            # Narrowed from a bare `except:` — only network/HTTP errors
            # mean "service unavailable".
            return False

    def encode_image(self, image):
        """Encode an image as a base64 PNG string.

        Accepts a file path, a PIL.Image, or an array-like screenshot
        (what ScreenController.capture_screen returns).
        Raises ValueError for unsupported input types.
        """
        if isinstance(image, str):
            with open(image, 'rb') as img_file:
                return base64.b64encode(img_file.read()).decode('utf-8')
        if hasattr(image, '__array_interface__'):
            # Bug fix: screen captures arrive as numpy arrays, which the
            # original rejected with ValueError; convert to PIL first.
            image = Image.fromarray(image)
        if isinstance(image, Image.Image):
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        raise ValueError("Unsupported image type")

    def _format_json_response(self, text):
        """Parse model output into a dict, tolerating surrounding prose.

        Tries a direct parse, then the outermost '{...}' span; falls back
        to a default click-action dict when nothing parses.
        """
        try:
            # Try parsing the whole response directly.
            return json.loads(text)
        except json.JSONDecodeError:
            try:
                # Extract the first '{' .. last '}' span and parse that.
                start = text.find('{')
                end = text.rfind('}') + 1
                if start >= 0 and end > start:
                    return json.loads(text[start:end])
            except json.JSONDecodeError:
                # Narrowed from a bare `except:`; only a failed parse
                # should fall through to the default below.
                pass

        # Unparseable output: return a safe default structure.
        return {
            "action": "click",
            "target": "未能识别的目标",
            "params": {}
        }

    def analyze_screen(self, image, prompt):
        """Send a screenshot plus prompt to the model; return raw response text.

        Retries up to max_retries with retry_delay between attempts.
        Returns None when the service is down or all attempts fail.
        """
        if not self._check_ollama_service():
            print("Ollama service is not available. Please make sure 'ollama serve' is running.")
            return None

        for attempt in range(self.max_retries):
            try:
                base64_image = self.encode_image(image)

                payload = {
                    "model": self.model_name,
                    "prompt": f"{prompt}\n请以JSON格式返回结果,确保返回的是合法的JSON字符串。",
                    "stream": False,
                    "images": [base64_image]
                }

                response = requests.post(self.api_url, json=payload, timeout=30)
                if response.status_code == 200:
                    result = response.json()
                    return result.get('response', '')
                print(f"API request failed with status code: {response.status_code}")
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")

            # Back off before the next attempt; report when retries run out.
            if attempt < self.max_retries - 1:
                time.sleep(self.retry_delay)
            else:
                print("Max retries exceeded")

        return None

    def get_element_location(self, image, element_description):
        """Ask the model where a described UI element is on screen.

        Returns a dict (expected keys: x, y, confidence — not validated
        here) or None when analysis fails.
        """
        prompt = f"""
        请找到并描述以下界面元素的位置:{element_description}
        请以JSON格式返回,必须包含以下字段:
        {{
            "x": 数字类型的x坐标,
            "y": 数字类型的y坐标,
            "confidence": 0到1之间的置信度
        }}
        """
        result = self.analyze_screen(image, prompt)
        if not result:
            return None
        return self._format_json_response(result)

    def interpret_command(self, command):
        """Parse a natural-language command into an action-plan dict.

        Returns a dict with 'action', 'target', 'params' keys, or None
        when the request or parsing fails.
        """
        try:
            prompt = f"""
            请将以下自然语言命令解析为具体的操作步骤:
            命令:{command}

            请以JSON格式返回,必须包含以下字段:
            {{
                "action": "click"/"type"/"press"/"move"/"drag"/"scroll" 中的一个,
                "target": "目标元素的具体描述",
                "params": {{
                    // 根据action类型可能包含:
                    "text": "要输入的文本",
                    "key": "要按下的按键",
                    "clicks": 滚动的距离,
                    "target_x": 目标x坐标,
                    "target_y": 目标y坐标
                }}
            }}
            """

            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False
            }

            response = requests.post(self.api_url, json=payload, timeout=30)
            if response.status_code == 200:
                result = response.json()
                return self._format_json_response(result.get('response', '{}'))
            raise Exception(f"API request failed with status code: {response.status_code}")

        except Exception as e:
            print(f"Error interpreting command: {e}")
            return None
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pyautogui==0.9.54
2
+ pillow==10.2.0
3
+ gradio==4.19.2
4
+ numpy==1.26.4
5
+ python-dotenv==1.0.1
6
+ requests==2.31.0
7
+ pynput==1.7.6
8
+ opencv-python==4.9.0.80
utils/__pycache__/screen_utils.cpython-312.pyc ADDED
Binary file (3.93 kB). View file
 
utils/screen_utils.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyautogui
2
+ import numpy as np
3
+ import cv2
4
+ from PIL import Image
5
+ import time
6
+ from pynput import mouse, keyboard
7
+
8
class ScreenController:
    """Thin wrapper around pyautogui for screen capture and input automation."""

    def __init__(self):
        # PyAutoGUI safety settings: slamming the mouse into a screen
        # corner aborts, and every call pauses briefly.
        pyautogui.FAILSAFE = True
        pyautogui.PAUSE = 0.5

    def _guarded(self, label, operation):
        """Run `operation()`; return True on success, otherwise print an
        error tagged with `label` and return False."""
        try:
            operation()
            return True
        except Exception as e:
            print(f"Error {label}: {e}")
            return False

    def capture_screen(self):
        """Return the current screen contents as a numpy array."""
        return np.array(pyautogui.screenshot())

    def find_element_on_screen(self, template_image, confidence=0.8):
        """Locate a template image on screen; return its region or None."""
        try:
            location = pyautogui.locateOnScreen(template_image, confidence=confidence)
            return location if location else None
        except Exception as e:
            print(f"Error finding element: {e}")
            return None

    def click_position(self, x, y):
        """Click at screen position (x, y)."""
        return self._guarded("clicking position", lambda: pyautogui.click(x, y))

    def type_text(self, text):
        """Type the given text at the current focus."""
        return self._guarded("typing text", lambda: pyautogui.write(text))

    def press_key(self, key):
        """Press a single named key."""
        return self._guarded("pressing key", lambda: pyautogui.press(key))

    def move_to(self, x, y, duration=0.5):
        """Move the mouse to (x, y) over `duration` seconds."""
        return self._guarded("moving mouse", lambda: pyautogui.moveTo(x, y, duration=duration))

    def drag_to(self, x, y, duration=0.5):
        """Drag (button held) to (x, y) over `duration` seconds."""
        return self._guarded("dragging", lambda: pyautogui.dragTo(x, y, duration=duration))

    def scroll(self, clicks):
        """Scroll by the given number of clicks (positive scrolls up)."""
        return self._guarded("scrolling", lambda: pyautogui.scroll(clicks))