Upload folder using huggingface_hub
Browse files- src/magentic_ui/backend/__init__.py +1 -5
- src/magentic_ui/backend/managers/__init__.py +1 -0
- src/magentic_ui/backend/managers/vllm_manager.py +111 -0
- src/magentic_ui/backend/web/app.py +20 -0
- src/magentic_ui/backend/web/initialization.py +2 -1
- src/magentic_ui/utils/__init__.py +8 -0
- src/magentic_ui/utils/midscene_adapter.py +69 -0
- src/magentic_ui/utils/utils.py +186 -0
- tests/test_midscene_adapter.py +71 -0
- tests/test_vllm_manager.py +56 -0
src/magentic_ui/backend/__init__.py
CHANGED
|
@@ -1,6 +1,2 @@
|
|
|
|
|
| 1 |
from .database.db_manager import DatabaseManager
|
| 2 |
-
from .datamodel import Team
|
| 3 |
-
from .teammanager import TeamManager
|
| 4 |
-
from ..version import __version__
|
| 5 |
-
|
| 6 |
-
__all__ = ["DatabaseManager", "Team", "TeamManager", "__version__"]
|
|
|
|
| 1 |
+
# src/magentic_ui/backend/__init__.py
|
| 2 |
from .database.db_manager import DatabaseManager
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/magentic_ui/backend/managers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# src/magentic_ui/backend/managers/__init__.py
|
src/magentic_ui/backend/managers/vllm_manager.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import signal
|
| 4 |
+
import subprocess
|
| 5 |
+
import time
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from loguru import logger
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class VLLMManager:
    """Manages the lifecycle of a VLLM server subprocess.

    The server is launched with ``vllm serve`` in its own process group
    (``os.setsid``) so it and any children can be terminated as a unit.
    """

    def __init__(
        self,
        model_name: str = "yujiepan/ui-tars-1.5-7B-GPTQ-W4A16g128",
        port: int = 5000,
        host: str = "0.0.0.0",
        gpu_memory_utilization: float = 0.9,
    ) -> None:
        """
        Args:
            model_name: Model identifier passed to ``vllm serve``.
            port: Port the server listens on.
            host: Bind address for the server.
            gpu_memory_utilization: Fraction of GPU memory vLLM may use.
        """
        self.model_name = model_name
        self.port = port
        self.host = host
        self.gpu_memory_utilization = gpu_memory_utilization
        self._process: Optional[subprocess.Popen] = None

    async def start(self) -> None:
        """Start the VLLM server process and wait until it is ready.

        Raises:
            RuntimeError: If the subprocess exits before becoming ready.
            TimeoutError: If the server does not become ready in time.
        """
        if self._process is not None:
            logger.warning("VLLM server is already running.")
            return

        cmd = [
            "vllm",
            "serve",
            self.model_name,
            "--port",
            str(self.port),
            "--host",
            self.host,
            "--gpu-memory-utilization",
            str(self.gpu_memory_utilization),
            "--dtype",
            "auto",
            "--trust-remote-code",
        ]

        logger.info(f"Starting VLLM server with command: {' '.join(cmd)}")
        try:
            self._process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,  # Create a new process group
            )
            # Wait for the server to be ready
            await self._wait_for_ready()
            logger.info("VLLM server started successfully.")
        except Exception as e:
            logger.error(f"Failed to start VLLM server: {e}")
            self.stop()
            raise

    def _health_url(self) -> str:
        """Return the URL of the server's health endpoint.

        ``0.0.0.0`` is a bind address, not a routable one, so probe via
        localhost in that case.
        """
        probe_host = "localhost" if self.host == "0.0.0.0" else self.host
        return f"http://{probe_host}:{self.port}/health"

    @staticmethod
    def _probe_health(url: str) -> int:
        """Issue one blocking GET against *url* and return the HTTP status code."""
        import urllib.request

        with urllib.request.urlopen(url, timeout=5) as resp:
            return resp.status

    async def _wait_for_ready(self, timeout: int = 300) -> None:
        """Poll the server's ``/health`` endpoint until it responds.

        Fix over the previous placeholder: instead of optimistically
        assuming readiness while the process is alive, actually probe the
        vLLM OpenAI server's health endpoint.

        Args:
            timeout: Maximum seconds to wait before giving up.

        Raises:
            RuntimeError: If the subprocess exits while waiting.
            TimeoutError: If the endpoint never becomes healthy in *timeout* seconds.
        """
        import urllib.error

        url = self._health_url()
        deadline = time.time() + timeout
        while time.time() < deadline:
            if self._process and self._process.poll() is not None:
                raise RuntimeError("VLLM process exited unexpectedly.")

            try:
                # Run the blocking HTTP probe in a worker thread so the
                # event loop stays responsive during startup.
                loop = asyncio.get_running_loop()
                status = await loop.run_in_executor(None, self._probe_health, url)
                if status == 200:
                    return
            except (urllib.error.URLError, OSError):
                # Server is not accepting connections yet; keep polling.
                pass
            await asyncio.sleep(2)

        raise TimeoutError("VLLM server failed to start within timeout.")

    def stop(self) -> None:
        """Stop the VLLM server process (SIGTERM to the group, kill on failure)."""
        if self._process:
            logger.info("Stopping VLLM server...")
            try:
                # Signal the whole process group created by os.setsid.
                os.killpg(os.getpgid(self._process.pid), signal.SIGTERM)
                self._process.wait(timeout=10)
            except Exception as e:
                logger.warning(f"Error stopping VLLM server: {e}")
                if self._process:
                    self._process.kill()
            finally:
                self._process = None
            logger.info("VLLM server stopped.")

    def is_running(self) -> bool:
        """Return True while the server subprocess is alive."""
        return self._process is not None and self._process.poll() is None
|
src/magentic_ui/backend/web/app.py
CHANGED
|
@@ -24,11 +24,14 @@ from .routes import (
|
|
| 24 |
ws,
|
| 25 |
mcp,
|
| 26 |
)
|
|
|
|
| 27 |
|
| 28 |
# Initialize application
|
| 29 |
app_file_path = os.path.dirname(os.path.abspath(__file__))
|
| 30 |
initializer = AppInitializer(settings, app_file_path)
|
| 31 |
|
|
|
|
|
|
|
| 32 |
|
| 33 |
@asynccontextmanager
|
| 34 |
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
@@ -36,6 +39,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
| 36 |
Lifecycle manager for the FastAPI application.
|
| 37 |
Handles initialization and cleanup of application resources.
|
| 38 |
"""
|
|
|
|
| 39 |
|
| 40 |
try:
|
| 41 |
# Load the config if provided
|
|
@@ -51,6 +55,20 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
| 51 |
if os.environ.get("FARA_AGENT") is not None:
|
| 52 |
config["use_fara_agent"] = os.environ["FARA_AGENT"] == "True"
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Initialize managers (DB, Connection, Team)
|
| 55 |
await init_managers(
|
| 56 |
initializer.database_uri,
|
|
@@ -78,6 +96,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
| 78 |
try:
|
| 79 |
logger.info("Cleaning up application resources...")
|
| 80 |
await cleanup_managers()
|
|
|
|
|
|
|
| 81 |
logger.info("Application shutdown complete")
|
| 82 |
except Exception as e:
|
| 83 |
logger.error(f"Error during shutdown: {str(e)}")
|
|
|
|
| 24 |
ws,
|
| 25 |
mcp,
|
| 26 |
)
|
| 27 |
+
from ..managers.vllm_manager import VLLMManager
|
| 28 |
|
| 29 |
# Initialize application
|
| 30 |
app_file_path = os.path.dirname(os.path.abspath(__file__))
|
| 31 |
initializer = AppInitializer(settings, app_file_path)
|
| 32 |
|
| 33 |
+
# Global VLLM Manager
|
| 34 |
+
vllm_manager = None
|
| 35 |
|
| 36 |
@asynccontextmanager
|
| 37 |
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
|
|
| 39 |
Lifecycle manager for the FastAPI application.
|
| 40 |
Handles initialization and cleanup of application resources.
|
| 41 |
"""
|
| 42 |
+
global vllm_manager
|
| 43 |
|
| 44 |
try:
|
| 45 |
# Load the config if provided
|
|
|
|
| 55 |
if os.environ.get("FARA_AGENT") is not None:
|
| 56 |
config["use_fara_agent"] = os.environ["FARA_AGENT"] == "True"
|
| 57 |
|
| 58 |
+
# Initialize VLLM if configured
|
| 59 |
+
if os.environ.get("USE_LOCAL_VLLM") == "True":
|
| 60 |
+
try:
|
| 61 |
+
vllm_port = int(os.environ.get("VLLM_PORT", 5000))
|
| 62 |
+
vllm_model = os.environ.get("VLLM_MODEL", "yujiepan/ui-tars-1.5-7B-GPTQ-W4A16g128")
|
| 63 |
+
vllm_manager = VLLMManager(model_name=vllm_model, port=vllm_port)
|
| 64 |
+
await vllm_manager.start()
|
| 65 |
+
# Inject VLLM URL into config for agents to use
|
| 66 |
+
config["vllm_base_url"] = f"http://localhost:{vllm_port}"
|
| 67 |
+
except Exception as e:
|
| 68 |
+
logger.error(f"Failed to start VLLM manager: {e}")
|
| 69 |
+
# decide if we should fail hard or continue without vision
|
| 70 |
+
# raise e
|
| 71 |
+
|
| 72 |
# Initialize managers (DB, Connection, Team)
|
| 73 |
await init_managers(
|
| 74 |
initializer.database_uri,
|
|
|
|
| 96 |
try:
|
| 97 |
logger.info("Cleaning up application resources...")
|
| 98 |
await cleanup_managers()
|
| 99 |
+
if vllm_manager:
|
| 100 |
+
vllm_manager.stop()
|
| 101 |
logger.info("Application shutdown complete")
|
| 102 |
except Exception as e:
|
| 103 |
logger.error(f"Error during shutdown: {str(e)}")
|
src/magentic_ui/backend/web/initialization.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
@@ -7,6 +7,7 @@ from loguru import logger
|
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
| 9 |
from .config import Settings
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class _AppPaths(BaseModel):
|
|
|
|
| 1 |
+
# src/magentic_ui/backend/web/initialization.py
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
| 9 |
from .config import Settings
|
| 10 |
+
from ..managers.vllm_manager import VLLMManager
|
| 11 |
|
| 12 |
|
| 13 |
class _AppPaths(BaseModel):
|
src/magentic_ui/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/magentic_ui/utils/__init__.py
|
| 2 |
+
from .utils import (
|
| 3 |
+
LLMCallFilter,
|
| 4 |
+
json_data_to_markdown,
|
| 5 |
+
dict_to_str,
|
| 6 |
+
thread_to_context,
|
| 7 |
+
get_internal_urls,
|
| 8 |
+
)
|
src/magentic_ui/utils/midscene_adapter.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Dict, Any, List
|
| 3 |
+
|
| 4 |
+
class MidsceneAdapter:
    """Adapts user prompts into VLLM/Midscene compatible payloads."""

    def __init__(self, model_url: str = "http://localhost:5000"):
        """
        Args:
            model_url: Base URL of the VLLM server; kept so callers know
                where to send the formatted payload.
        """
        self.model_url = model_url

    def format_prompt(self, user_instruction: str, screenshot_base64: str) -> Dict[str, Any]:
        """Format the input for the VLLM/Midscene model.

        Builds an OpenAI-style chat payload with one user message holding
        the screenshot (base64 data URL) followed by the instruction text.
        Adjust the format based on the actual model's requirement
        (e.g., UI-TARS uses specific prompting).

        Args:
            user_instruction: Natural-language action to perform.
            screenshot_base64: Base64-encoded JPEG of the current screen.

        Returns:
            Dict[str, Any]: Request body for a chat-completions endpoint.
        """
        return {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{screenshot_base64}"
                            },
                        },
                        {
                            "type": "text",
                            "text": f"Given the screenshot, perform the following action: {user_instruction}",
                        },
                    ],
                }
            ],
            "temperature": 0.0,
            "max_tokens": 1024,
        }

    def parse_response(self, response_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse the model's response into actionable commands.

        Fix: previously, when the content merely *mentioned* "click"/"type"
        but the arguments could not be extracted by regex, an EMPTY list was
        returned and the model output was silently dropped. Dispatch is now
        regex-first, with an explicit "stop" fallback carrying the raw
        content.

        Args:
            response_data: Chat-completions style response dict.

        Returns:
            List[Dict[str, Any]]: One action dict; "click", "type", "stop",
            or "error" on unexpected parsing failure.
        """
        import re

        content = response_data.get("choices", [{}])[0].get("message", {}).get("content", "")

        actions: List[Dict[str, Any]] = []
        try:
            # simple heuristic parsing, replace with robust logic based on model output
            click_match = re.search(r"click\((\d+),\s*(\d+)\)", content)
            type_match = re.search(r"type\(['\"](.*?)['\"]\)", content)
            if click_match:
                x, y = map(int, click_match.groups())
                actions.append({"type": "click", "x": x, "y": y})
            elif type_match:
                actions.append({"type": "type", "text": type_match.group(1)})
            else:
                # Fallback: return as a raw thought or stop action
                actions.append({"type": "stop", "reason": content})
        except Exception as e:
            actions.append({"type": "error", "message": str(e)})

        return actions
|
src/magentic_ui/utils/utils.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import psutil
|
| 5 |
+
from typing import List, Union, Dict
|
| 6 |
+
|
| 7 |
+
from autogen_core.models import (
|
| 8 |
+
LLMMessage,
|
| 9 |
+
UserMessage,
|
| 10 |
+
AssistantMessage,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from autogen_agentchat.utils import remove_images
|
| 14 |
+
from autogen_agentchat.messages import (
|
| 15 |
+
BaseChatMessage,
|
| 16 |
+
BaseTextChatMessage,
|
| 17 |
+
HandoffMessage,
|
| 18 |
+
MultiModalMessage,
|
| 19 |
+
StopMessage,
|
| 20 |
+
TextMessage,
|
| 21 |
+
ToolCallRequestEvent,
|
| 22 |
+
ToolCallExecutionEvent,
|
| 23 |
+
BaseAgentEvent,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
from ..types import HumanInputFormat, RunPaths
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class LLMCallFilter(logging.Filter):
    """Logging filter that passes only records carrying an "LLMCall" JSON payload.

    A record passes when its message parses as JSON and the resulting
    object has ``"type" == "LLMCall"``; anything else is filtered out.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        try:
            payload = json.loads(record.getMessage())
        except (json.JSONDecodeError, AttributeError):
            # Not JSON (or no usable message) — reject silently.
            return False
        return isinstance(payload, dict) and payload.get("type") == "LLMCall"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Define recursive types for JSON structures
JsonPrimitive = Union[str, int, float, bool, None]
JsonList = List[Union[JsonPrimitive, "JsonDict", "JsonList"]]
JsonDict = Dict[str, Union[JsonPrimitive, JsonList, "JsonDict"]]
JsonData = Union[JsonDict, JsonList, str]


def json_data_to_markdown(data: JsonData) -> str:
    """
    Convert a dictionary, list, or JSON string to a nicely formatted Markdown string.
    Handles nested structures of dictionaries and lists.

    Args:
        data (JsonData): The data to convert, can be:
            - A dictionary with string keys and JSON-compatible values
            - A list of JSON-compatible values
            - A JSON string representing either of the above

    Returns:
        str: The formatted Markdown string.

    Raises:
        ValueError: If the input cannot be parsed or converted to markdown format.
        json.JSONDecodeError: If the input string is not valid JSON.
    """

    def render_dict(mapping: JsonDict, depth: int = 0) -> str:
        lines: List[str] = []
        for key, value in mapping.items():
            entry = " " * depth + f"- {key}: "
            if isinstance(value, dict):
                entry += "\n" + render_dict(value, depth + 1)
            elif isinstance(value, list):
                entry += "\n" + render_list(value, depth + 1)
            else:
                entry += f"{value}\n"
            lines.append(entry)
        return "".join(lines)

    def render_list(seq: JsonList, depth: int = 0) -> str:
        lines: List[str] = []
        for item in seq:
            if isinstance(item, dict):
                lines.append(" " * depth + "- \n" + render_dict(item, depth + 1))
            elif isinstance(item, list):
                lines.append(" " * depth + "- \n" + render_list(item, depth + 1))
            else:
                lines.append(" " * depth + f"- {item}\n")
        return "".join(lines)

    try:
        # A string input is first decoded as JSON, then rendered like
        # the equivalent dict/list.
        parsed = json.loads(data) if isinstance(data, str) else data

        if isinstance(parsed, dict):
            return render_dict(parsed)
        if isinstance(parsed, list):
            return render_list(parsed)
        raise ValueError(f"Expected dict, list or JSON string, got {type(parsed)}")

    except json.JSONDecodeError as e:
        raise json.JSONDecodeError(f"Invalid JSON string: {str(e)}", e.doc, e.pos)
    except Exception as e:
        # Any other failure (including the type check above) is surfaced
        # uniformly as a ValueError.
        raise ValueError(f"Failed to convert to markdown: {str(e)}")
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def dict_to_str(data: Union[JsonDict, str]) -> str:
    """
    Convert a dictionary or JSON string to a JSON string.

    Strings are assumed to already be JSON and pass through unchanged;
    dictionaries are serialized with ``json.dumps``.

    Args:
        data (JsonDict | str): The dictionary or JSON string to convert.

    Returns:
        str: The input dictionary in JSON format.

    Raises:
        ValueError: If *data* is neither a dict nor a str.
    """
    if isinstance(data, str):
        return data
    if isinstance(data, dict):
        return json.dumps(data)
    raise ValueError("Unexpected input type")
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def thread_to_context(
    messages: List[BaseAgentEvent | BaseChatMessage],
    agent_name: str,
    is_multimodal: bool = False,
) -> List[LLMMessage]:
    """Convert the message thread to a context for the model.

    Args:
        messages: Chronological agent events / chat messages from the thread.
        agent_name: Name of the agent whose own turns become AssistantMessages.
        is_multimodal: When False, images are stripped from the returned
            context via ``remove_images``.

    Returns:
        List[LLMMessage]: Messages suitable as model context.
    """
    context: List[LLMMessage] = []
    for m in messages:
        if isinstance(m, ToolCallRequestEvent | ToolCallExecutionEvent):
            # Ignore tool call messages.
            continue
        elif isinstance(m, StopMessage | HandoffMessage):
            context.append(UserMessage(content=m.content, source=m.source))
        elif m.source == agent_name:
            # The agent's own earlier turns are replayed as assistant messages.
            assert isinstance(m, TextMessage), f"{type(m)}"
            context.append(AssistantMessage(content=m.content, source=m.source))
        elif m.source == "user_proxy" or m.source == "user":
            assert isinstance(m, TextMessage | MultiModalMessage), f"{type(m)}"
            if isinstance(m.content, str):
                # Plain-text user input may embed a plan (HumanInputFormat);
                # surface it explicitly in the context message.
                human_input = HumanInputFormat.from_str(m.content)
                content = f"{human_input.content}"
                if human_input.plan is not None:
                    content += f"\n\nI created the following plan: {human_input.plan}"
                context.append(UserMessage(content=content, source=m.source))
            else:
                # If content is a list, transform only the string part
                content_list = list(m.content)  # Create a copy of the list
                for i, item in enumerate(content_list):
                    if isinstance(item, str):
                        human_input = HumanInputFormat.from_str(item)
                        content_list[i] = f"{human_input.content}"
                        # isinstance re-check guards against from_str producing
                        # a non-string content — TODO confirm whether that can happen.
                        if human_input.plan is not None and isinstance(
                            content_list[i], str
                        ):
                            content_list[i] = (
                                f"{content_list[i]}\n\nI created the following plan: {human_input.plan}"
                            )
                context.append(UserMessage(content=content_list, source=m.source))  # type: ignore
        else:
            # Messages from other agents are presented to the model as user input.
            assert isinstance(m, BaseTextChatMessage) or isinstance(
                m, MultiModalMessage
            ), f"{type(m)}"
            context.append(UserMessage(content=m.content, source=m.source))
    if is_multimodal:
        return context
    else:
        # Target model is text-only: drop image parts from all messages.
        return remove_images(context)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def get_internal_urls(inside_docker: bool, paths: RunPaths) -> List[str] | None:
    """Collect host identifiers reachable from inside the container.

    Gathers every IPv4 interface address plus the HOSTNAME and
    CONTAINER_NAME environment variables (when set). Returns None when
    not running inside Docker.

    Args:
        inside_docker: Whether the app runs inside a Docker container.
        paths: Run paths (currently unused; kept for interface stability).

    Returns:
        List[str] | None: Internal addresses/hostnames, or None outside Docker.
    """
    if not inside_docker:
        return None

    collected: List[str] = []
    # All IPv4 addresses across every network interface.
    for addrs in psutil.net_if_addrs().values():
        collected.extend(a.address for a in addrs if a.family.name == "AF_INET")

    # Docker-provided names, when present (order: HOSTNAME, then CONTAINER_NAME).
    for env_var in ("HOSTNAME", "CONTAINER_NAME"):
        value = os.getenv(env_var)
        if value is not None:
            collected.append(value)
    return collected
|
tests/test_midscene_adapter.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
# Ensure src is in python path
|
| 7 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
|
| 8 |
+
|
| 9 |
+
from magentic_ui.utils.midscene_adapter import MidsceneAdapter
|
| 10 |
+
|
| 11 |
+
class TestMidsceneAdapter(unittest.TestCase):
    """Unit tests for MidsceneAdapter prompt formatting and response parsing."""

    def setUp(self):
        self.adapter = MidsceneAdapter()

    def test_format_prompt(self):
        instruction = "Click the login button"
        screenshot = "base64encodedimage"

        prompt = self.adapter.format_prompt(instruction, screenshot)

        self.assertIn("messages", prompt)
        self.assertEqual(len(prompt["messages"]), 1)
        parts = prompt["messages"][0]["content"]

        # Separate the multimodal parts by declared type.
        image_parts = [p for p in parts if p.get("type") == "image_url"]
        text_parts = [p for p in parts if p.get("type") == "text"]

        self.assertTrue(image_parts)
        self.assertTrue(text_parts)
        for part in image_parts:
            self.assertIn(screenshot, part["image_url"]["url"])
        for part in text_parts:
            self.assertIn(instruction, part["text"])

    def test_parse_response_click(self):
        response = {
            "choices": [
                {"message": {"content": "I see the button. click(150, 300)"}}
            ]
        }

        actions = self.adapter.parse_response(response)

        self.assertEqual(len(actions), 1)
        action = actions[0]
        self.assertEqual(action["type"], "click")
        self.assertEqual((action["x"], action["y"]), (150, 300))

    def test_parse_response_type(self):
        response = {
            "choices": [
                {"message": {"content": "type('hello world')"}}
            ]
        }

        actions = self.adapter.parse_response(response)

        self.assertEqual(len(actions), 1)
        action = actions[0]
        self.assertEqual(action["type"], "type")
        self.assertEqual(action["text"], "hello world")


if __name__ == '__main__':
    unittest.main()
|
tests/test_vllm_manager.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import unittest
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from unittest.mock import MagicMock, patch
|
| 6 |
+
|
| 7 |
+
# Ensure src is in python path
|
| 8 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
|
| 9 |
+
|
| 10 |
+
from magentic_ui.backend.managers.vllm_manager import VLLMManager
|
| 11 |
+
|
| 12 |
+
class TestVLLMManager(unittest.TestCase):
    """Unit tests for VLLMManager start/stop lifecycle (process launch mocked)."""

    @patch("magentic_ui.backend.managers.vllm_manager.subprocess.Popen")
    def test_vllm_manager_start(self, mock_popen):
        # stdlib AsyncMock (unittest.mock, Python 3.8+) replaces the
        # hand-rolled awaitable MagicMock subclass previously used here.
        from unittest.mock import AsyncMock

        manager = VLLMManager()

        # Mock a live server process.
        mock_process = MagicMock()
        mock_process.poll.return_value = None  # Process running
        mock_popen.return_value = mock_process

        async def run_test():
            # Mock _wait_for_ready to avoid the real readiness polling loop.
            with patch.object(manager, "_wait_for_ready", new_callable=AsyncMock):
                await manager.start()

        # asyncio.run creates AND closes the loop even if run_test raises,
        # unlike the previous manual new_event_loop/close sequence.
        asyncio.run(run_test())

        self.assertTrue(manager.is_running())
        mock_popen.assert_called_once()

    def test_vllm_manager_stop(self):
        manager = VLLMManager()
        manager._process = MagicMock()
        manager._process.pid = 12345

        # os.getpgid must be patched too: stop() resolves the process
        # group id before calling os.killpg.
        with patch("os.killpg") as mock_killpg, \
             patch("os.getpgid", return_value=12345):

            manager.stop()
            # The killpg call happens inside stop()
            mock_killpg.assert_called()

        self.assertIsNone(manager._process)


if __name__ == '__main__':
    unittest.main()
|