Spaces:

airsltd
/

agent_manager

Runtime error

App Files Files Community

airsltd committed on Sep 17, 2025

Commit

893dedc

1 Parent(s): a2a8029

update

Browse files

Files changed (4) hide show

app.py +203 -3
deployer.py +117 -0
models.py +33 -0
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,7 +1,207 @@
-from fastapi import FastAPI
 app = FastAPI()
 @app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+from fastapi import FastAPI, HTTPException, status, BackgroundTasks, Depends
+from typing import Dict, List, Optional
+from datetime import datetime
+import asyncio
+import json
+import uuid
+import os
+import redis
+from .models import AgentInfo, CreateAgentRequest, AgentUpdateRequest
+from .deployer import AgentDeployer, DockerAgentDeployer
 app = FastAPI()
+# Deployment backend used by the HTTP handlers to create and destroy agents.
+deployer: AgentDeployer = DockerAgentDeployer()
+# Redis connection settings, read from the environment with local defaults.
+REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
+REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
+REDIS_DB = int(os.getenv("REDIS_DB", 0))
+# Both handles start as None and are assigned in startup_event.
+redis_client: Optional[redis.Redis] = None
+redis_pubsub: Optional[redis.client.PubSub] = None
+# Pub/sub channel carrying agent events, and the key prefix of each agent's Redis hash.
+AGENT_CHANNEL = "agent_discovery_channel"
+AGENT_KEY_PREFIX = "agent:"
+# Information about active agents (in-memory cache; Redis is the source of truth).
+active_agents: Dict[str, AgentInfo] = {}
+async def _redis_listener():
+    """
+    监听 Redis 频道，处理 Agent 的上线、心跳和下线消息。
+    """
+    if not redis_pubsub:
+        return
+    while True:
+        message = redis_pubsub.get_message(ignore_subscribe_messages=True)
+        if message:
+            try:
+                data = json.loads(message['data'].decode('utf-8'))
+                agent_id = data.get("id")
+                event_type = data.get("event_type")
+                if agent_id:
+                    if event_type == "HEARTBEAT" or event_type == "ONLINE":
+                        # 从 Redis 获取最新的 Agent 信息
+                        agent_data = redis_client.hgetall(f"{AGENT_KEY_PREFIX}{agent_id}")
+                        if agent_data:
+                            agent_info = AgentInfo(**{k.decode('utf-8'): v.decode('utf-8') for k, v in agent_data.items()})
+                            active_agents[agent_id] = agent_info
+                            print(f"Agent {agent_id} {event_type} received. Status: {agent_info.status}")
+                        else:
+                            print(f"Agent {agent_id} {event_type} received, but no data in Redis.")
+                    elif event_type == "OFFLINE":
+                        if agent_id in active_agents:
+                            del active_agents[agent_id]
+                            redis_client.delete(f"{AGENT_KEY_PREFIX}{agent_id}")
+                            print(f"Agent {agent_id} OFFLINE received and removed.")
+            except json.JSONDecodeError:
+                print(f"Received invalid JSON message: {message['data']}")
+            except Exception as e:
+                print(f"Error processing Redis message: {e}")
+        await asyncio.sleep(0.1) # 避免忙等
+@app.on_event("startup")
+async def startup_event():
+    global redis_client, redis_pubsub
+    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
+    try:
+        redis_client.ping()
+        print("Connected to Redis successfully!")
+    except redis.exceptions.ConnectionError as e:
+        print(f"Could not connect to Redis: {e}")
+        # 可以在这里选择退出或以降级模式运行
+    redis_pubsub = redis_client.pubsub()
+    redis_pubsub.subscribe(AGENT_CHANNEL)
+    asyncio.create_task(_redis_listener())
+    # 从 Redis 加载所有已知的 Agent
+    for key in redis_client.keys(f"{AGENT_KEY_PREFIX}*"):
+        agent_data = redis_client.hgetall(key)
+        if agent_data:
+            agent_info = AgentInfo(**{k.decode('utf-8'): v.decode('utf-8') for k, v in agent_data.items()})
+            active_agents[agent_info.id] = agent_info
+            print(f"Loaded existing agent: {agent_info.id} ({agent_info.agent_type})")
+@app.on_event("shutdown")
+async def shutdown_event():
+    if redis_pubsub:
+        redis_pubsub.unsubscribe(AGENT_CHANNEL)
+    if redis_client:
+        redis_client.close()
+    print("Redis connection closed.")
 @app.get("/")
+async def read_root():
+    return {"message": "Agent Manager is running!"}
+@app.post("/agents", response_model=AgentInfo, status_code=status.HTTP_201_CREATED)
+async def create_agent(request: CreateAgentRequest):
+    """
+    部署一个新的 Agent 实例。
+    """
+    try:
+        agent_info = deployer.deploy_agent(request)
+        # 将 Agent 信息存储到 Redis
+        redis_client.hmset(f"{AGENT_KEY_PREFIX}{agent_info.id}", agent_info.dict())
+        active_agents[agent_info.id] = agent_info # 更新内存缓存
+        print(f"Agent {agent_info.id} ({agent_info.agent_type}) deployed and registered.")
+        return agent_info
+    except ValueError as e:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
+    except RuntimeError as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+@app.get("/agents/{agent_id}", response_model=AgentInfo)
+async def get_agent_info(agent_id: str):
+    """
+    获取指定 Agent 实例的详细信息。
+    """
+    agent_info = active_agents.get(agent_id)
+    if not agent_info:
+        # 尝试从 Redis 加载，以防内存缓存丢失
+        agent_data = redis_client.hgetall(f"{AGENT_KEY_PREFIX}{agent_id}")
+        if agent_data:
+            agent_info = AgentInfo(**{k.decode('utf-8'): v.decode('utf-8') for k, v in agent_data.items()})
+            active_agents[agent_id] = agent_info
+        else:
+            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Agent with ID {agent_id} not found.")
+    return agent_info
+@app.get("/agents", response_model=List[AgentInfo])
+async def list_agents(agent_type: Optional[str] = None):
+    """
+    列出所有活跃的 Agent 实例，可按 agent_type 过滤。
+    """
+    if agent_type:
+        return [agent for agent in active_agents.values() if agent.agent_type == agent_type]
+    return list(active_agents.values())
+@app.put("/agents/{agent_id}", response_model=AgentInfo)
+async def update_agent_status(agent_id: str, update_request: AgentUpdateRequest):
+    """
+    更新 Agent 实例的状态或信息。
+    主要用于 Agent 自身上报心跳或状态变更。
+    """
+    agent_info = active_agents.get(agent_id)
+    if not agent_info:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Agent with ID {agent_id} not found.")
+    update_data = update_request.dict(exclude_unset=True)
+    for key, value in update_data.items():
+        setattr(agent_info, key, value)
+    agent_info.last_heartbeat = datetime.now().isoformat() # 自动更新心跳时间
+    # 更新 Redis
+    redis_client.hmset(f"{AGENT_KEY_PREFIX}{agent_id}", agent_info.dict())
+    return agent_info
+@app.delete("/agents/{agent_id}", status_code=status.HTTP_204_NO_CONTENT)
+async def destroy_agent(agent_id: str):
+    """
+    停止并彻底销毁指定 Agent 实例及其底层资源。
+    """
+    agent_info = active_agents.get(agent_id)
+    if not agent_info:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Agent with ID {agent_id} not found.")
+    try:
+        if deployer.destroy_agent(agent_info.id): # deployer 销毁时使用 AgentInfo.id
+            if agent_id in active_agents:
+                del active_agents[agent_id]
+            redis_client.delete(f"{AGENT_KEY_PREFIX}{agent_id}")
+            print(f"Agent {agent_id} destroyed and removed from registry.")
+            return {"message": f"Agent {agent_id} destroyed successfully."}
+        else:
+            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to destroy agent {agent_id}.")
+    except RuntimeError as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+@app.post("/agents/{agent_id}/stop", status_code=status.HTTP_200_OK)
+async def stop_agent(agent_id: str):
+    """
+    优雅地停止指定 Agent 实例。
+    """
+    agent_info = active_agents.get(agent_id)
+    if not agent_info:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Agent with ID {agent_id} not found.")
+    try:
+        # 对于 DockerAgentDeployer，停止和销毁可能行为类似，这里假设停止是销毁的一部分
+        # 实际中，停止可能只是暂停容器，不删除
+        if deployer.destroy_agent(agent_info.id): # 暂时用 destroy_agent 模拟停止
+            agent_info.status = "stopped"
+            agent_info.last_heartbeat = datetime.now().isoformat()
+            redis_client.hmset(f"{AGENT_KEY_PREFIX}{agent_id}", agent_info.dict())
+            print(f"Agent {agent_id} stopped.")
+            return {"message": f"Agent {agent_id} stopped successfully."}
+        else:
+            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to stop agent {agent_id}.")
+    except RuntimeError as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

deployer.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import docker
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+from datetime import datetime
+import uuid
+from .models import AgentInfo, CreateAgentRequest
+class AgentDeployer(ABC):
+    """
+    抽象基类，定义 Agent 部署器的接口。
+    """
+    @abstractmethod
+    def deploy_agent(self, request: CreateAgentRequest) -> AgentInfo:
+        """
+        部署一个新的 Agent 实例。
+        """
+        pass
+    @abstractmethod
+    def destroy_agent(self, agent_id: str) -> bool:
+        """
+        销毁一个 Agent 实例。
+        """
+        pass
+class DockerAgentDeployer(AgentDeployer):
+    """
+    基于 Docker 的 Agent 部署器实现。
+    """
+    def __init__(self):
+        self.client = docker.from_env()
+    def deploy_agent(self, request: CreateAgentRequest) -> AgentInfo:
+        agent_id = str(uuid.uuid4())
+        container_name = f"agent-{request.agent_type}-{agent_id[:8]}"
+        # 准备环境变量
+        environment = {
+            "AGENT_ID": agent_id,
+            "AGENT_TYPE": request.agent_type,
+            **request.env_vars
+        }
+        # 启动 Docker 容器
+        try:
+            container = self.client.containers.run(
+                request.image_name,
+                name=container_name,
+                detach=True,
+                environment=environment,
+                ports={'8000/tcp': None}, # 假设 Agent MCP Server 运行在容器的 8000 端口
+                # resource_limits 可以在这里配置，但 docker-py 的 run 方法直接支持的参数有限
+                # 更复杂的资源限制可能需要通过 create_container 和 start 组合
+            )
+            # 获取容器的 IP 地址和端口
+            container.reload()
+            # Docker 容器的 IP 地址通常在 bridge 网络中，需要进一步获取
+            # 简化处理，假设 Agent Manager 和 Agent 在同一网络，或者通过服务发现获取
+            # 这里暂时使用一个占位符，实际部署中需要更复杂的网络配置或服务发现
+            # 例如，如果 Agent Manager 运行在 Docker 网络中，可以通过容器名解析
+            # 或者通过 Redis 注册时 Agent 自身上报其可访问的 IP:Port
+            # 暂时使用一个占位符，实际需要从容器网络配置中获取
+            # 或者等待 Agent 启动后自行注册到 Redis
+            mcp_endpoint = f"http://{container_name}:8000"
+            now = datetime.now().isoformat()
+            agent_info = AgentInfo(
+                id=agent_id,
+                agent_type=request.agent_type,
+                mcp_endpoint=mcp_endpoint,
+                status="running",
+                created_at=now,
+                last_heartbeat=now,
+                metadata={"container_id": container.id, "container_name": container_name}
+            )
+            return agent_info
+        except docker.errors.ImageNotFound:
+            raise ValueError(f"Docker image '{request.image_name}' not found.")
+        except docker.errors.APIError as e:
+            raise RuntimeError(f"Failed to deploy agent container: {e}")
+    def destroy_agent(self, agent_id: str) -> bool:
+        try:
+            # 根据 agent_id 查找容器
+            # 假设 agent_id 存储在 AgentInfo 的 metadata 中作为 container_id
+            # 或者 Agent Manager 内部维护 agent_id 到 container_id 的映射
+            # 这里简化处理，假设 agent_id 就是 container_id 或者可以通过某种方式直接找到容器
+            # 实际中，Agent Manager 应该维护 AgentInfo 列表，通过 AgentInfo.metadata['container_id'] 来查找
+            # 为了演示，我们尝试通过名称查找，但更健壮的方式是存储 container_id
+            # 假设 agent_id 实际上是 AgentInfo.metadata['container_id']
+            # 这里需要一个机制来从 agent_id 映射到 container_id 或 container_name
+            # 暂时通过遍历所有容器来查找，实际不推荐
+            container_to_destroy = None
+            for container in self.client.containers.list(all=True):
+                if f"agent-{agent_id[:8]}" in container.name: # 粗略匹配
+                    container_to_destroy = container
+                    break
+            if container_to_destroy:
+                container_to_destroy.stop()
+                container_to_destroy.remove()
+                return True
+            else:
+                # 如果 agent_id 是 AgentInfo.id，我们需要从 Agent Manager 的存储中获取 container_id
+                # 这里只是一个占位符，实际需要 Agent Manager 的状态管理
+                print(f"Container for agent_id {agent_id} not found.")
+                return False
+        except docker.errors.NotFound:
+            print(f"Container for agent_id {agent_id} not found.")
+            return False
+        except docker.errors.APIError as e:
+            raise RuntimeError(f"Failed to destroy agent container {agent_id}: {e}")

models.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from typing import Dict, Optional
+from pydantic import BaseModel
+class AgentInfo(BaseModel):
+    """
+    Information describing a single agent instance.
+    """
+    id: str
+    agent_type: str
+    mcp_endpoint: str  # Address of the agent's MCP service (e.g., "http://localhost:8001")
+    status: str = "running"  # Agent state (e.g., "running", "stopped", "error")
+    created_at: str
+    last_heartbeat: str
+    metadata: Dict = {} # Other metadata, such as resource usage or version
+class CreateAgentRequest(BaseModel):
+    """
+    Request model for creating an agent instance.
+    """
+    agent_type: str
+    image_name: str  # Docker image name (e.g., "my-echo-agent:latest")
+    env_vars: Dict[str, str] = {}  # Environment variables for the agent
+    resource_limits: Dict = {}  # Resource limits (e.g., {"cpu": "0.5", "memory": "512m"})
+    config: Dict = {} # The agent's own configuration
+class AgentUpdateRequest(BaseModel):
+    """
+    Request model for updating an agent instance; every field is optional.
+    """
+    status: Optional[str] = None
+    mcp_endpoint: Optional[str] = None
+    last_heartbeat: Optional[str] = None
+    metadata: Optional[Dict] = None

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 fastapi
 uvicorn[standard]

 fastapi
 uvicorn[standard]
+docker
+redis