ui-agent / controller.py
wxlsty323's picture
Upload folder using huggingface_hub
3508f42 verified
from utils.screen_utils import ScreenController
from models.vision_model import VisionModel
import time
class Controller:
    """Translate natural-language commands into concrete screen actions.

    Combines a ``VisionModel`` (used to interpret the command and to locate
    the target element in a screenshot) with a ``ScreenController`` (used to
    capture the screen and to perform click / type / press / move / drag /
    scroll operations).
    """

    def __init__(self):
        self.screen_controller = ScreenController()
        self.vision_model = VisionModel()

    def execute_command(self, command):
        """Execute a natural-language command.

        Args:
            command: The command text handed to
                ``VisionModel.interpret_command``.

        Returns:
            A ``(success, message)`` tuple. ``success`` is falsy when the
            command cannot be interpreted, the target element cannot be
            located, the action itself fails, or any exception is raised
            along the way (including a plan that lacks the ``'target'`` or
            ``'action'`` key). ``message`` is a user-facing status string.
        """
        try:
            # 1. Interpret the command into an action plan.
            action_plan = self.vision_model.interpret_command(command)
            if not action_plan:
                return False, "无法解析命令"

            # 2. Capture the current screen.
            screen = self.screen_controller.capture_screen()

            # 3. Locate the target element on that screenshot.
            location = self.vision_model.get_element_location(
                screen, action_plan['target']
            )
            if not location:
                return False, f"无法找到目标元素:{action_plan['target']}"

            # 4. Perform the requested action at the located position.
            success = self._perform_action(
                action_plan['action'],
                location,
                action_plan.get('params', {}),
            )
            # Parenthesised so it is obvious that the conditional selects
            # only the message, not the whole tuple.
            return success, ("操作执行成功" if success else "操作执行失败")
        except Exception as e:
            # Boundary handler: callers receive a failure tuple rather than
            # an exception.
            return False, f"执行命令时出错:{e}"

    def _perform_action(self, action, location, params):
        """Perform a single action via the screen controller.

        Args:
            action: One of ``'click'``, ``'type'``, ``'press'``, ``'move'``,
                ``'drag'`` or ``'scroll'``.
            location: Mapping providing the ``'x'`` and ``'y'`` coordinates
                of the target element.
            params: Mapping of action-specific options: ``'text'`` for
                ``type``, ``'key'`` for ``press``, ``'target_x'`` /
                ``'target_y'`` for ``drag``, ``'clicks'`` for ``scroll``.

        Returns:
            Whatever the underlying screen-controller call returns, or
            ``False`` when a coordinate is missing, the drag destination is
            incomplete, or the action is not recognised.
        """
        x, y = location.get('x'), location.get('y')
        # Compare against None instead of relying on truthiness: 0 is a
        # valid coordinate (top / left screen edge) and must not be treated
        # as "missing".
        if x is None or y is None:
            return False

        controller = self.screen_controller

        if action == 'click':
            return controller.click_position(x, y)

        if action == 'type':
            # Click the element first, then type; only the result of the
            # typing call is reported back.
            controller.click_position(x, y)
            return controller.type_text(params.get('text', ''))

        if action == 'press':
            return controller.press_key(params.get('key', ''))

        if action == 'move':
            return controller.move_to(x, y)

        if action == 'drag':
            target_x = params.get('target_x')
            target_y = params.get('target_y')
            # Same reasoning as above: a destination coordinate of 0 is
            # valid, so only reject values that are genuinely absent.
            if target_x is None or target_y is None:
                return False
            controller.move_to(x, y)
            return controller.drag_to(target_x, target_y)

        if action == 'scroll':
            return controller.scroll(params.get('clicks', 0))

        # Unknown action.
        return False