from utils.screen_utils import ScreenController
from models.vision_model import VisionModel
import time


class Controller:
    """Carry out natural-language commands on the screen.

    Combines a ``VisionModel`` (used to interpret the command and to locate
    the target element in a screenshot) with a ``ScreenController`` (used to
    capture the screen and to perform the mouse/keyboard operation).
    """

    def __init__(self):
        """Create the screen controller and vision model used by this object."""
        self.screen_controller = ScreenController()
        self.vision_model = VisionModel()

    def execute_command(self, command):
        """Execute a natural-language command.

        Args:
            command: The command text handed to
                ``VisionModel.interpret_command``.

        Returns:
            A ``(success, message)`` tuple. ``success`` is falsy when the
            command could not be interpreted, the target element could not be
            located, the operation itself reported failure, or any exception
            was raised along the way. ``message`` is a user-facing status
            string.
        """
        # Top-level boundary: any failure is reported to the caller as a
        # (False, message) result rather than propagated as an exception.
        try:
            # 1. Interpret the command into an action plan.
            action_plan = self.vision_model.interpret_command(command)
            if not action_plan:
                return False, "无法解析命令"

            # 2. Capture the current screen.
            screen = self.screen_controller.capture_screen()

            # 3. Locate the target element on the captured screen.
            location = self.vision_model.get_element_location(
                screen, action_plan['target']
            )
            if not location:
                return False, f"无法找到目标元素:{action_plan['target']}"

            # 4. Perform the requested operation at that location.
            success = self._perform_action(
                action_plan['action'],
                location,
                action_plan.get('params', {}),
            )
            return success, "操作执行成功" if success else "操作执行失败"
        except Exception as e:
            return False, f"执行命令时出错:{str(e)}"

    def _perform_action(self, action, location, params):
        """Perform one concrete operation at the given location.

        Args:
            action: One of ``'click'``, ``'type'``, ``'press'``, ``'move'``,
                ``'drag'`` or ``'scroll'``. Any other value yields ``False``.
            location: Mapping that provides the ``'x'`` and ``'y'``
                coordinates of the target element.
            params: Optional mapping of action-specific parameters:
                ``'text'`` for ``'type'``, ``'key'`` for ``'press'``,
                ``'target_x'``/``'target_y'`` for ``'drag'`` and ``'clicks'``
                for ``'scroll'``. ``None`` is treated as an empty mapping.

        Returns:
            Whatever the underlying ``ScreenController`` call returns, or
            ``False`` when coordinates are missing, the drag destination is
            missing, or the action is not recognised.
        """
        params = params or {}

        x, y = location.get('x'), location.get('y')
        # Compare against None explicitly: 0 is a valid screen coordinate and
        # must not be mistaken for "missing".
        # NOTE: coordinates are required for every action, including 'press'
        # and 'scroll', which do not use them; this matches prior behavior.
        if x is None or y is None:
            return False

        if action == 'click':
            return self.screen_controller.click_position(x, y)

        if action == 'type':
            # Click the element first, then type into it.
            self.screen_controller.click_position(x, y)
            return self.screen_controller.type_text(params.get('text', ''))

        if action == 'press':
            return self.screen_controller.press_key(params.get('key', ''))

        if action == 'move':
            return self.screen_controller.move_to(x, y)

        if action == 'drag':
            target_x = params.get('target_x')
            target_y = params.get('target_y')
            # As above, a destination coordinate of 0 is legitimate.
            if target_x is None or target_y is None:
                return False
            self.screen_controller.move_to(x, y)
            return self.screen_controller.drag_to(target_x, target_y)

        if action == 'scroll':
            return self.screen_controller.scroll(params.get('clicks', 0))

        # Unknown action.
        return False