Debito commited on
Commit
d38a70f
·
verified ·
1 Parent(s): 1420d02

Upload 3 files

Browse files
Files changed (3) hide show
  1. api/__init__.py +1 -0
  2. api/api_server.py +373 -0
  3. api/load_balancer.py +475 -0
api/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+
api/api_server.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Server for Mamba Swarm
3
+ FastAPI-based server for serving the distributed Mamba language model
4
+ """
5
+
6
import asyncio
import json
import logging
import platform
import time
from contextlib import asynccontextmanager
from typing import List, Optional, Dict, Any, AsyncGenerator

import torch
import uvicorn
from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field

# Import your swarm components
from system.mambaSwarm import SwarmEngine
from system.inference import InferenceEngine
from routing.router import Router
from training.trainer import setup_logging
24
+
25
# Pydantic models for API
class GenerationRequest(BaseModel):
    """Request body for a single text-generation call (also reused per item by /generate/batch)."""
    prompt: str = Field(..., description="Input text prompt")
    max_length: int = Field(default=100, ge=1, le=2048, description="Maximum generation length")
    temperature: float = Field(default=0.7, ge=0.1, le=2.0, description="Sampling temperature")
    top_p: float = Field(default=0.9, ge=0.1, le=1.0, description="Top-p sampling")
    top_k: int = Field(default=50, ge=1, le=100, description="Top-k sampling")
    repetition_penalty: float = Field(default=1.1, ge=1.0, le=2.0, description="Repetition penalty")
    # Must be True for /generate/stream; ignored by /generate.
    stream: bool = Field(default=False, description="Enable streaming response")
    domain: Optional[str] = Field(default=None, description="Specific domain for routing")
35
+
36
class GenerationResponse(BaseModel):
    """Response payload for a completed (non-streaming) generation."""
    generated_text: str  # model output text
    prompt: str  # echo of the input prompt
    generation_time: float  # wall-clock seconds spent generating
    tokens_generated: int
    model_info: Dict[str, Any]
42
+
43
class StreamingToken(BaseModel):
    """One token frame emitted by the /generate/stream SSE-style response."""
    token: str
    is_final: bool = False  # True on the last frame, including the error frame
    metadata: Optional[Dict[str, Any]] = None  # carries {"error": ...} on failure
47
+
48
class HealthResponse(BaseModel):
    """Payload returned by GET /health."""
    status: str
    swarm_status: Dict[str, Any]  # as reported by SwarmEngine.get_status()
    system_info: Dict[str, Any]  # CUDA availability / device count, Python version
    timestamp: float  # UNIX time the snapshot was taken
53
+
54
class ModelInfo(BaseModel):
    """Payload returned by GET /model/info."""
    total_parameters: int
    active_encoders: int
    total_encoders: int
    memory_usage: Dict[str, float]  # keys: system_memory_gb, gpu_memory_gb, cache_size_gb
    device_info: List[str]
60
+
61
# Global singletons, populated by lifespan() on startup; endpoints access them
# through the get_swarm_engine / get_inference_engine dependencies.
swarm_engine: Optional[SwarmEngine] = None
inference_engine: Optional[InferenceEngine] = None
64
+
65
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifespan: bring the swarm up on startup, tear it down on shutdown.

    Raises:
        Exception: re-raises any initialization failure so the server refuses to start.
    """
    global swarm_engine, inference_engine

    # Startup
    logging.info("Initializing Mamba Swarm API Server...")

    try:
        # Initialize swarm engine; initialization is blocking, so run it off
        # the event loop.  get_running_loop() is the supported accessor inside
        # a coroutine (asyncio.get_event_loop() is deprecated here since 3.10).
        swarm_engine = SwarmEngine()
        await asyncio.get_running_loop().run_in_executor(None, swarm_engine.initialize)

        # Initialize inference engine on top of the swarm
        inference_engine = InferenceEngine(swarm_engine)

        logging.info("Mamba Swarm API Server initialized successfully")

    except Exception as e:
        logging.error(f"Failed to initialize swarm: {e}")
        raise

    yield

    # Shutdown
    logging.info("Shutting down Mamba Swarm API Server...")
    if swarm_engine:
        swarm_engine.shutdown()
93
+
94
# Create FastAPI app
app = FastAPI(
    title="Mamba Swarm API",
    description="Distributed Mamba Language Model API with 100 encoder units",
    version="1.0.0",
    lifespan=lifespan
)

# Add CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# wide open to any origin — confirm this is intended before public exposure.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
110
+
111
# Dependency to get swarm engine
async def get_swarm_engine() -> SwarmEngine:
    """FastAPI dependency: return the global swarm engine, or fail with 503
    when startup has not completed yet."""
    engine = swarm_engine
    if engine is None:
        raise HTTPException(status_code=503, detail="Swarm engine not initialized")
    return engine
116
+
117
async def get_inference_engine() -> InferenceEngine:
    """FastAPI dependency: return the global inference engine, or fail with
    503 when startup has not completed yet."""
    engine = inference_engine
    if engine is None:
        raise HTTPException(status_code=503, detail="Inference engine not initialized")
    return engine
121
+
122
@app.get("/health", response_model=HealthResponse)
async def health_check(swarm: SwarmEngine = Depends(get_swarm_engine)):
    """Health check endpoint: swarm status plus basic host/runtime info.

    Raises:
        HTTPException: 500 when the swarm status query fails.
    """
    try:
        swarm_status = swarm.get_status()
        system_info = {
            "cuda_available": torch.cuda.is_available(),
            "cuda_device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
            # Report the actual interpreter version instead of the previous
            # hard-coded "3.8+" placeholder.
            "python_version": platform.python_version(),
        }

        return HealthResponse(
            status="healthy",
            swarm_status=swarm_status,
            system_info=system_info,
            timestamp=time.time()
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
141
+
142
@app.get("/model/info", response_model=ModelInfo)
async def get_model_info(swarm: SwarmEngine = Depends(get_swarm_engine)):
    """Get model information

    Builds a ModelInfo from SwarmEngine.get_model_info() plus memory stats,
    falling back to the design-time defaults (100 encoders) when keys are
    missing.  Any failure is surfaced as HTTP 500.
    """
    try:
        info = swarm.get_model_info()
        memory_stats = swarm.memory_manager.get_memory_stats()

        return ModelInfo(
            total_parameters=info.get("total_parameters", 7000000000),  # 100 * 70M
            active_encoders=info.get("active_encoders", 100),
            total_encoders=info.get("total_encoders", 100),
            memory_usage={
                # assumes memory_stats exposes these attributes in GB — TODO confirm units
                "system_memory_gb": memory_stats.used_memory,
                "gpu_memory_gb": memory_stats.gpu_memory,
                "cache_size_gb": memory_stats.cache_size
            },
            device_info=info.get("devices", ["cuda:0" if torch.cuda.is_available() else "cpu"])
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")
162
+
163
@app.post("/generate", response_model=GenerationResponse)
async def generate_text(
    request: GenerationRequest,
    inference: InferenceEngine = Depends(get_inference_engine)
):
    """Generate text from prompt

    Runs the blocking inference call on the default executor so the event
    loop stays responsive, and reports wall-clock generation time.

    Raises:
        HTTPException: 500 on any generation failure.
    """
    try:
        start_time = time.time()

        # get_running_loop() is the supported accessor inside a coroutine
        # (asyncio.get_event_loop() is deprecated here since Python 3.10).
        result = await asyncio.get_running_loop().run_in_executor(
            None,
            inference.generate,
            request.prompt,
            {
                "max_length": request.max_length,
                "temperature": request.temperature,
                "top_p": request.top_p,
                "top_k": request.top_k,
                "repetition_penalty": request.repetition_penalty,
                "domain": request.domain
            }
        )

        generation_time = time.time() - start_time

        return GenerationResponse(
            generated_text=result["generated_text"],
            prompt=request.prompt,
            generation_time=generation_time,
            tokens_generated=result.get("tokens_generated", 0),
            model_info=result.get("model_info", {})
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
199
+
200
@app.post("/generate/stream")
async def generate_text_stream(
    request: GenerationRequest,
    inference: InferenceEngine = Depends(get_inference_engine)
):
    """Generate text with streaming response

    Rejects requests that do not set `stream=True`.  Tokens are framed as
    "data: {...}\\n\\n" lines; generator failures are reported to the client
    as a final StreamingToken carrying {"error": ...} metadata.
    """
    if not request.stream:
        raise HTTPException(status_code=400, detail="Streaming not requested")

    async def generate_stream() -> AsyncGenerator[str, None]:
        try:
            # Create generator for streaming
            generator = inference.generate_stream(
                request.prompt,
                {
                    "max_length": request.max_length,
                    "temperature": request.temperature,
                    "top_p": request.top_p,
                    "top_k": request.top_k,
                    "repetition_penalty": request.repetition_penalty,
                    "domain": request.domain
                }
            )

            # NOTE(review): this iterates a synchronous generator directly on
            # the event loop; if generate_stream blocks per token it will
            # stall other requests — confirm, or hand tokens off via an executor.
            for token_data in generator:
                streaming_token = StreamingToken(
                    token=token_data.get("token", ""),
                    is_final=token_data.get("is_final", False),
                    metadata=token_data.get("metadata", {})
                )

                yield f"data: {streaming_token.json()}\n\n"

                # Stop as soon as the generator marks the final token.
                if streaming_token.is_final:
                    break

        except Exception as e:
            # Surface the failure in-band as a final error frame.
            error_token = StreamingToken(
                token="",
                is_final=True,
                metadata={"error": str(e)}
            )
            yield f"data: {error_token.json()}\n\n"

    return StreamingResponse(
        generate_stream(),
        # NOTE(review): frames use SSE-style "data:" framing; media_type
        # "text/event-stream" may be more appropriate than "text/plain" —
        # confirm what the clients expect.
        media_type="text/plain",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
    )
249
+
250
@app.post("/generate/batch")
async def generate_batch(
    requests: List[GenerationRequest],
    inference: InferenceEngine = Depends(get_inference_engine)
):
    """Generate text for multiple prompts

    Runs up to 10 requests concurrently on the default executor.  Per-request
    failures are reported inline (as {"error", "prompt", "index"} dicts)
    instead of failing the whole batch.

    Raises:
        HTTPException: 400 when the batch exceeds 10 items; 500 on batch-level failure.
    """
    if len(requests) > 10:
        raise HTTPException(status_code=400, detail="Batch size too large (max 10)")

    try:
        # Process requests in parallel.  get_running_loop() is the supported
        # accessor inside a coroutine (asyncio.get_event_loop() is deprecated
        # here since Python 3.10).
        loop = asyncio.get_running_loop()
        tasks = []
        for req in requests:
            task = loop.run_in_executor(
                None,
                inference.generate,
                req.prompt,
                {
                    "max_length": req.max_length,
                    "temperature": req.temperature,
                    "top_p": req.top_p,
                    "top_k": req.top_k,
                    "repetition_penalty": req.repetition_penalty,
                    "domain": req.domain
                }
            )
            tasks.append(task)

        results = await asyncio.gather(*tasks, return_exceptions=True)

        responses = []
        for i, (req, result) in enumerate(zip(requests, results)):
            if isinstance(result, Exception):
                # Report this prompt's failure without aborting the batch.
                responses.append({
                    "error": str(result),
                    "prompt": req.prompt,
                    "index": i
                })
            else:
                responses.append(GenerationResponse(
                    generated_text=result["generated_text"],
                    prompt=req.prompt,
                    generation_time=result.get("generation_time", 0),
                    tokens_generated=result.get("tokens_generated", 0),
                    model_info=result.get("model_info", {})
                ))

        return {"responses": responses}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Batch generation failed: {str(e)}")
301
+
302
@app.get("/metrics")
async def get_metrics(swarm: SwarmEngine = Depends(get_swarm_engine)):
    """Get system metrics

    Collects memory, swarm, and (if supported) inference statistics into a
    single timestamped payload; any failure becomes an HTTP 500.
    """
    try:
        inference_stats = (
            swarm.get_inference_stats()
            if hasattr(swarm, 'get_inference_stats')
            else {}
        )
        return {
            "memory_report": swarm.memory_manager.get_memory_report(),
            "swarm_metrics": swarm.get_metrics(),
            "inference_stats": inference_stats,
            "timestamp": time.time(),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get metrics: {str(e)}")
315
+
316
@app.post("/admin/reload")
async def reload_model(
    background_tasks: BackgroundTasks,
    swarm: SwarmEngine = Depends(get_swarm_engine)
):
    """Reload the model (admin endpoint)

    Schedules the reload as a background task and returns immediately.
    """
    try:
        background_tasks.add_task(swarm.reload_model)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to reload model: {str(e)}")
    return {"message": "Model reload initiated"}
327
+
328
@app.post("/admin/cleanup")
async def cleanup_memory(swarm: SwarmEngine = Depends(get_swarm_engine)):
    """Force memory cleanup (admin endpoint)

    Triggers an aggressive cleanup pass on the swarm's memory manager.
    """
    try:
        swarm.memory_manager.cleanup_memory(aggressive=True)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to cleanup memory: {str(e)}")
    return {"message": "Memory cleanup completed"}
336
+
337
# Error handlers
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Render HTTPException as a JSON body carrying the exception's status.

    FastAPI exception handlers must return a Response object; the previous
    version returned a bare dict, which does not propagate exc.status_code
    to the client.
    """
    return JSONResponse(
        status_code=exc.status_code,
        content={
            "error": exc.detail,
            "status_code": exc.status_code,
            "timestamp": time.time(),
        },
    )
345
+
346
@app.exception_handler(Exception)
async def general_exception_handler(request, exc):
    """Catch-all handler: log the error and return a JSON 500 response.

    Returns a JSONResponse (not a bare dict) so the 500 status actually
    reaches the client.
    """
    logging.error(f"Unhandled exception: {exc}")
    return JSONResponse(
        status_code=500,
        content={
            "error": "Internal server error",
            "status_code": 500,
            "timestamp": time.time(),
        },
    )
354
+
355
def run_server(host: str = "0.0.0.0", port: int = 8000, workers: int = 1):
    """Run the API server

    Configures logging via training.trainer.setup_logging() and serves the
    module-level `app` with uvicorn.
    """
    setup_logging()

    config = uvicorn.Config(
        app=app,
        host=host,
        port=port,
        # NOTE(review): uvicorn.Server.run() serves in-process; the workers
        # setting is presumably ignored without a process supervisor (e.g.
        # uvicorn.run with an import string) — confirm.
        workers=workers,
        log_level="info",
        access_log=True,
        reload=False  # Set to True for development
    )

    server = uvicorn.Server(config)
    server.run()
371
+
372
if __name__ == "__main__":
    # Script entry point: serve on 0.0.0.0:8000 with default settings.
    run_server()
api/load_balancer.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Load Balancer for Mamba Swarm API
3
+ Distributes requests across multiple API server instances
4
+ """
5
+
6
+ import asyncio
7
+ import aiohttp
8
+ import random
9
+ import time
10
+ import logging
11
+ from typing import List, Dict, Any, Optional, Tuple
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from collections import defaultdict, deque
15
+ import json
16
+ import hashlib
17
+
18
class LoadBalancingStrategy(Enum):
    """Server-selection strategies dispatched by LoadBalancer.select_server()."""
    ROUND_ROBIN = "round_robin"
    LEAST_CONNECTIONS = "least_connections"
    WEIGHTED_ROUND_ROBIN = "weighted_round_robin"
    LEAST_RESPONSE_TIME = "least_response_time"
    HASH_BASED = "hash_based"  # prompt-hash affinity
    RESOURCE_AWARE = "resource_aware"  # weighted by ServerInstance.load_score
25
+
26
@dataclass
class ServerInstance:
    """One backend API server plus its mutable runtime metrics."""
    host: str
    port: int
    weight: float = 1.0
    max_connections: int = 100
    timeout: float = 30.0
    current_connections: int = 0
    total_requests: int = 0
    failed_requests: int = 0
    response_times: deque = field(default_factory=lambda: deque(maxlen=100))
    last_health_check: float = 0.0
    is_healthy: bool = True
    health_check_failures: int = 0

    @property
    def url(self) -> str:
        """Base HTTP URL of this server."""
        return f"http://{self.host}:{self.port}"

    @property
    def avg_response_time(self) -> float:
        """Mean of the recorded response times; 0.0 when nothing is recorded yet."""
        samples = self.response_times
        if not samples:
            return 0.0
        return sum(samples) / len(samples)

    @property
    def success_rate(self) -> float:
        """Fraction of requests that succeeded; 1.0 before any traffic."""
        if self.total_requests == 0:
            return 1.0
        return (self.total_requests - self.failed_requests) / self.total_requests

    @property
    def load_score(self) -> float:
        """Calculate load score for resource-aware balancing (lower is better)."""
        connection_load = self.current_connections / self.max_connections
        response_time_load = min(self.avg_response_time / 1000.0, 1.0)  # Normalize to seconds
        failure_rate = self.failed_requests / max(self.total_requests, 1)
        return 0.4 * connection_load + 0.4 * response_time_load + 0.2 * failure_rate
64
+
65
class LoadBalancer:
    """Advanced load balancer for Mamba Swarm API servers

    Keeps a pool of ServerInstance records, health-checks them periodically,
    and forwards HTTP requests to a server chosen by the configured
    LoadBalancingStrategy, retrying on failure.
    """

    def __init__(self,
                 servers: List[Tuple[str, int]],
                 strategy: LoadBalancingStrategy = LoadBalancingStrategy.RESOURCE_AWARE,
                 health_check_interval: float = 30.0,
                 health_check_timeout: float = 5.0,
                 max_retries: int = 3):

        self.logger = logging.getLogger(__name__)
        self.strategy = strategy
        self.health_check_interval = health_check_interval
        self.health_check_timeout = health_check_timeout
        self.max_retries = max_retries

        # Initialize server instances (default weight/limits)
        self.servers = [
            ServerInstance(host=host, port=port)
            for host, port in servers
        ]

        # Strategy-specific state
        self.round_robin_index = 0
        # NOTE(review): request_counts is never written in this file — possibly vestigial.
        self.request_counts = defaultdict(int)

        # Session for HTTP requests (created in start())
        self.session: Optional[aiohttp.ClientSession] = None

        # Health check task (created in start())
        self.health_check_task: Optional[asyncio.Task] = None

        # Aggregate metrics
        self.total_requests = 0
        self.failed_requests = 0
        self.start_time = time.time()

    async def __aenter__(self):
        """Async context manager entry"""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        await self.stop()

    async def start(self):
        """Start the load balancer: open the HTTP session, launch the
        periodic health-check task, and run one immediate health sweep."""
        # Create HTTP session
        timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
        self.session = aiohttp.ClientSession(timeout=timeout)

        # Start health check task
        self.health_check_task = asyncio.create_task(self._health_check_loop())

        # Initial health check
        await self._check_all_servers_health()

        self.logger.info(f"Load balancer started with {len(self.servers)} servers using {self.strategy.value} strategy")

    async def stop(self):
        """Stop the load balancer: cancel the health-check task and close the session."""
        if self.health_check_task:
            self.health_check_task.cancel()
            try:
                await self.health_check_task
            except asyncio.CancelledError:
                pass

        if self.session:
            await self.session.close()

        self.logger.info("Load balancer stopped")

    def get_healthy_servers(self) -> List[ServerInstance]:
        """Get list of healthy servers"""
        return [server for server in self.servers if server.is_healthy]

    def select_server(self, request_data: Optional[Dict[str, Any]] = None) -> Optional[ServerInstance]:
        """Select server based on configured strategy

        Returns None when no healthy server is available; request_data is
        only consulted by the hash-based strategy.
        """
        healthy_servers = self.get_healthy_servers()

        if not healthy_servers:
            self.logger.warning("No healthy servers available")
            return None

        if self.strategy == LoadBalancingStrategy.ROUND_ROBIN:
            return self._round_robin_selection(healthy_servers)
        elif self.strategy == LoadBalancingStrategy.LEAST_CONNECTIONS:
            return self._least_connections_selection(healthy_servers)
        elif self.strategy == LoadBalancingStrategy.WEIGHTED_ROUND_ROBIN:
            return self._weighted_round_robin_selection(healthy_servers)
        elif self.strategy == LoadBalancingStrategy.LEAST_RESPONSE_TIME:
            return self._least_response_time_selection(healthy_servers)
        elif self.strategy == LoadBalancingStrategy.HASH_BASED:
            return self._hash_based_selection(healthy_servers, request_data)
        elif self.strategy == LoadBalancingStrategy.RESOURCE_AWARE:
            return self._resource_aware_selection(healthy_servers)
        else:
            # Unknown strategy: fall back to a uniform random choice.
            return random.choice(healthy_servers)

    def _round_robin_selection(self, servers: List[ServerInstance]) -> ServerInstance:
        """Round-robin server selection"""
        server = servers[self.round_robin_index % len(servers)]
        self.round_robin_index += 1
        return server

    def _least_connections_selection(self, servers: List[ServerInstance]) -> ServerInstance:
        """Select server with least connections"""
        return min(servers, key=lambda s: s.current_connections)

    def _weighted_round_robin_selection(self, servers: List[ServerInstance]) -> ServerInstance:
        """Weighted round-robin selection

        Despite the name this is a weighted *random* pick: probability of a
        server is proportional to its weight.
        """
        total_weight = sum(s.weight for s in servers)
        random_weight = random.uniform(0, total_weight)

        current_weight = 0
        for server in servers:
            current_weight += server.weight
            if random_weight <= current_weight:
                return server

        return servers[-1]  # Fallback

    def _least_response_time_selection(self, servers: List[ServerInstance]) -> ServerInstance:
        """Select server with least average response time"""
        # NOTE(review): an avg_response_time of 0.0 (no samples yet) is falsy
        # and treated as +inf here, so untried servers are chosen last — confirm intent.
        return min(servers, key=lambda s: s.avg_response_time or float('inf'))

    def _hash_based_selection(self, servers: List[ServerInstance], request_data: Optional[Dict[str, Any]]) -> ServerInstance:
        """Hash-based selection for session affinity"""
        if not request_data or 'prompt' not in request_data:
            return random.choice(servers)

        # Use prompt hash for consistent routing (md5 here is for placement,
        # not for security).
        prompt_hash = hashlib.md5(request_data['prompt'].encode()).hexdigest()
        server_index = int(prompt_hash, 16) % len(servers)
        return servers[server_index]

    def _resource_aware_selection(self, servers: List[ServerInstance]) -> ServerInstance:
        """Select server based on resource utilization"""
        # Sort by load score (lower is better)
        sorted_servers = sorted(servers, key=lambda s: s.load_score)

        # Use weighted random selection favoring lower load servers
        # (+0.1 keeps the weight finite when a load score is zero).
        weights = [1.0 / (s.load_score + 0.1) for s in sorted_servers]
        total_weight = sum(weights)

        random_value = random.uniform(0, total_weight)
        current_weight = 0

        for server, weight in zip(sorted_servers, weights):
            current_weight += weight
            if random_value <= current_weight:
                return server

        return sorted_servers[0]  # Fallback to best server

    async def forward_request(self,
                              path: str,
                              method: str = "POST",
                              data: Optional[Dict[str, Any]] = None,
                              headers: Optional[Dict[str, str]] = None,
                              **kwargs) -> Tuple[int, Dict[str, Any]]:
        """Forward request to selected server with retry logic

        Returns (status_code, response_json): 503 when no healthy server is
        available, 502 when every attempt failed; otherwise the backend's
        status and JSON body.  4xx/5xx backend responses are retried up to
        max_retries times before being passed through.
        """
        self.total_requests += 1

        for attempt in range(self.max_retries + 1):
            server = self.select_server(data)
            if not server:
                self.failed_requests += 1
                return 503, {"error": "No healthy servers available"}

            try:
                start_time = time.time()
                server.current_connections += 1

                url = f"{server.url}{path}"
                request_kwargs = {
                    "timeout": aiohttp.ClientTimeout(total=server.timeout),
                    **kwargs
                }

                if headers:
                    request_kwargs["headers"] = headers

                if data:
                    request_kwargs["json"] = data

                async with self.session.request(method, url, **request_kwargs) as response:
                    response_time = time.time() - start_time
                    response_data = await response.json()

                    # Update server metrics
                    server.current_connections -= 1
                    server.total_requests += 1
                    server.response_times.append(response_time * 1000)  # Store in ms

                    if response.status >= 400:
                        server.failed_requests += 1

                        if attempt < self.max_retries:
                            self.logger.warning(f"Request failed on {server.url} (attempt {attempt + 1}), retrying...")
                            continue

                    return response.status, response_data

            except Exception as e:
                # Undo the connection count for this failed attempt.
                server.current_connections = max(0, server.current_connections - 1)
                server.failed_requests += 1

                self.logger.error(f"Request failed on {server.url}: {e}")

                if attempt < self.max_retries:
                    # Linearly increasing delay between attempts.
                    await asyncio.sleep(0.1 * (attempt + 1))
                    continue

        self.failed_requests += 1
        return 502, {"error": "All servers failed after retries"}

    async def _check_server_health(self, server: ServerInstance) -> bool:
        """Check health of a single server via GET /health; returns True when
        it responds 200.  Failures bump health_check_failures."""
        try:
            url = f"{server.url}/health"
            timeout = aiohttp.ClientTimeout(total=self.health_check_timeout)

            async with self.session.get(url, timeout=timeout) as response:
                if response.status == 200:
                    health_data = await response.json()
                    server.last_health_check = time.time()
                    server.health_check_failures = 0

                    # Update server metrics from health data if available
                    if 'system_info' in health_data:
                        # Could extract additional metrics here
                        pass

                    return True
                else:
                    server.health_check_failures += 1
                    return False

        except Exception as e:
            server.health_check_failures += 1
            self.logger.debug(f"Health check failed for {server.url}: {e}")
            return False

    async def _check_all_servers_health(self):
        """Check health of all servers concurrently and update is_healthy flags."""
        tasks = [self._check_server_health(server) for server in self.servers]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        for server, result in zip(self.servers, results):
            if isinstance(result, Exception):
                server.is_healthy = False
                server.health_check_failures += 1
            else:
                was_healthy = server.is_healthy
                # Healthy only while under 3 consecutive check failures.
                server.is_healthy = result and server.health_check_failures < 3

                if not was_healthy and server.is_healthy:
                    self.logger.info(f"Server {server.url} is back online")
                elif was_healthy and not server.is_healthy:
                    self.logger.warning(f"Server {server.url} is unhealthy")

    async def _health_check_loop(self):
        """Periodic health check loop; exits only on task cancellation."""
        while True:
            try:
                await asyncio.sleep(self.health_check_interval)
                await self._check_all_servers_health()
            except asyncio.CancelledError:
                break
            except Exception as e:
                self.logger.error(f"Health check loop error: {e}")

    def add_server(self, host: str, port: int, weight: float = 1.0):
        """Add a new server to the pool"""
        server = ServerInstance(host=host, port=port, weight=weight)
        self.servers.append(server)
        self.logger.info(f"Added server {server.url}")

    def remove_server(self, host: str, port: int):
        """Remove a server from the pool"""
        self.servers = [s for s in self.servers if not (s.host == host and s.port == port)]
        self.logger.info(f"Removed server http://{host}:{port}")

    def get_stats(self) -> Dict[str, Any]:
        """Get load balancer statistics

        Returns aggregate counters plus a per-server breakdown of health and
        performance metrics.
        """
        uptime = time.time() - self.start_time

        server_stats = []
        for server in self.servers:
            server_stats.append({
                "url": server.url,
                "is_healthy": server.is_healthy,
                "current_connections": server.current_connections,
                "total_requests": server.total_requests,
                "failed_requests": server.failed_requests,
                "success_rate": server.success_rate,
                "avg_response_time_ms": server.avg_response_time,
                "load_score": server.load_score,
                "weight": server.weight
            })

        return {
            "strategy": self.strategy.value,
            "uptime_seconds": uptime,
            "total_requests": self.total_requests,
            "failed_requests": self.failed_requests,
            "success_rate": (self.total_requests - self.failed_requests) / max(self.total_requests, 1),
            "healthy_servers": len(self.get_healthy_servers()),
            "total_servers": len(self.servers),
            "servers": server_stats
        }
379
+
380
+ # FastAPI integration
381
+ from fastapi import FastAPI, Request, HTTPException
382
+ from fastapi.responses import JSONResponse
383
+ import uvicorn
384
+
385
def create_load_balancer_app(servers: List[Tuple[str, int]],
                             strategy: LoadBalancingStrategy = LoadBalancingStrategy.RESOURCE_AWARE) -> FastAPI:
    """Create FastAPI app with load balancer

    Args:
        servers: (host, port) pairs of backend API servers.
        strategy: Server-selection strategy for the balancer.

    Returns:
        A FastAPI app exposing /lb/health, /lb/stats and a catch-all proxy route.
    """

    app = FastAPI(title="Mamba Swarm Load Balancer", version="1.0.0")
    load_balancer = LoadBalancer(servers, strategy)

    # NOTE(review): on_event is deprecated in newer FastAPI versions in favor
    # of a lifespan handler (as used in api_server.py) — consider migrating.
    @app.on_event("startup")
    async def startup():
        await load_balancer.start()

    @app.on_event("shutdown")
    async def shutdown():
        await load_balancer.stop()

    @app.get("/lb/health")
    async def lb_health():
        """Load balancer health endpoint"""
        return {"status": "healthy", "stats": load_balancer.get_stats()}

    @app.get("/lb/stats")
    async def lb_stats():
        """Get load balancer statistics"""
        return load_balancer.get_stats()

    @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
    async def proxy_request(request: Request, path: str):
        """Proxy all requests to backend servers"""
        try:
            # Get request data
            body = await request.body()
            headers = dict(request.headers)

            # Remove hop-by-hop headers
            headers.pop("host", None)
            headers.pop("connection", None)

            # Parse body if it's JSON.  Non-JSON bodies are forwarded without
            # a parsed payload; catch only decode errors instead of the
            # previous bare `except:` that swallowed every exception.  The
            # module-level json import is used (no shadowing local import).
            data = None
            if body:
                try:
                    data = json.loads(body.decode())
                except (UnicodeDecodeError, ValueError):
                    data = None

            # Forward request
            status, response_data = await load_balancer.forward_request(
                f"/{path}",
                request.method,
                data=data,
                headers=headers,
                params=dict(request.query_params)
            )

            return JSONResponse(content=response_data, status_code=status)

        except Exception as e:
            return JSONResponse(
                content={"error": f"Load balancer error: {str(e)}"},
                status_code=500
            )

    return app
449
+
450
def run_load_balancer(servers: List[Tuple[str, int]],
                      host: str = "0.0.0.0",
                      port: int = 8080,
                      strategy: LoadBalancingStrategy = LoadBalancingStrategy.RESOURCE_AWARE):
    """Run the load balancer

    Builds the proxy app for the given backends and serves it with uvicorn.
    """
    lb_app = create_load_balancer_app(servers, strategy)

    uvicorn_config = uvicorn.Config(
        app=lb_app,
        host=host,
        port=port,
        log_level="info"
    )

    uvicorn.Server(uvicorn_config).run()
466
+
467
if __name__ == "__main__":
    # Example usage: balance across three local API server instances.
    servers = [
        ("localhost", 8000),
        ("localhost", 8001),
        ("localhost", 8002),
    ]

    run_load_balancer(servers, strategy=LoadBalancingStrategy.RESOURCE_AWARE)