#!/usr/bin/env bash
# Script to start vLLM services on GPUs 0-7 and a load balancer
# Usage: ./scripts/start_vllm_with_balancer.sh

set -e

# Get script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"

# Configuration
# MODEL_NAME="ZhuofengLi/Qwen3-4B-Instruct-2507-DeepReview-lora-sft"
MODEL_NAME="openai/gpt-oss-120b"
# GPU_CONFIG="2:8001,3:8002,4:8003,5:8004"                             # GPU:PORT pairs
# GPU_CONFIG="1:8001,2:8002,3:8003,4:8004,5:8005,6:8006,7:8007,0:7999" # GPU:PORT pairs
GPU_CONFIG="0:7001,1:7002,2:7003,3:7004,4:7005,5:7006,6:7007,7:7008"   # GPU:PORT pairs
TP_SIZE=1 # Tensor parallelism size per instance
GPU_MEMORY_UTILIZATION=0.9
MAX_MODEL_LEN=131072

# Load balancer configuration
LB_PORT=7000 # Load balancer port
LB_STRATEGY="round_robin" # or "least_conn"
LB_HEALTH_CHECK_INTERVAL=10.0

# Log directory
LOG_DIR="./logs/vllm"
mkdir -p "$LOG_DIR"

# Endpoint pool file
ENDPOINT_POOL_FILE="shared/configs/vllm_endpoint_pool.txt"
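# One endpoint URL per line. Note: the balancer below receives its backends
# via the --backends flag, so this file is presumably read by other tooling
# (an assumption; verify what actually consumes it).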
mkdir -p "$(dirname "$ENDPOINT_POOL_FILE")"

echo "=========================================="
echo "Starting vLLM Services + Load Balancer"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "GPU Configuration: $GPU_CONFIG"
echo "Load Balancer Port: $LB_PORT"
echo "Log Directory: $LOG_DIR"
echo ""

# Step 1: Start vLLM services
echo "=== Step 1: Starting vLLM services ==="
echo ""

# Clear existing endpoints
> "$ENDPOINT_POOL_FILE"

# Parse GPU configuration
IFS=',' read -ra GPU_CONFIGS <<< "$GPU_CONFIG"

# Array to store PIDs
VLLM_PIDS=()
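# Optional sketch (left disabled): clean up any already-launched instances if
# this script is interrupted mid-launch. Single quotes defer expansion of the
# PID array until a signal actually fires.
# trap 'kill "${VLLM_PIDS[@]}" 2>/dev/null' INT TERM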
for gpu_config in "${GPU_CONFIGS[@]}"; do
    IFS=':' read -r gpu_id port <<< "$gpu_config"
    echo "Starting vLLM on GPU $gpu_id, port $port..."

    # Set CUDA_VISIBLE_DEVICES for this specific GPU
    export CUDA_VISIBLE_DEVICES=$gpu_id
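    # Note: the backgrounded subshell below inherits the environment as it is
    # at fork time, so each instance stays pinned to its own GPU even though
    # this variable is re-exported on the next loop iteration.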
    # Log file
    LOG_FILE="$LOG_DIR/vllm_gpu${gpu_id}_port${port}.log"

    # Start vLLM service in background
    (
        echo "=== GPU $gpu_id, Port $port ===" >> "$LOG_FILE"
        echo "Starting at $(date)" >> "$LOG_FILE"
        echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" >> "$LOG_FILE"
        echo "" >> "$LOG_FILE"
        vllm serve "$MODEL_NAME" \
            --port "$port" \
            --tensor-parallel-size "$TP_SIZE" \
            --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
            --max-model-len "$MAX_MODEL_LEN" \
            --trust-remote-code \
            --dtype bfloat16 \
            >> "$LOG_FILE" 2>&1
    ) &
    PID=$!
    VLLM_PIDS+=("$PID")

    # Add endpoint to pool file (for load balancer)
    echo "http://localhost:$port/v1" >> "$ENDPOINT_POOL_FILE"

    echo " -> Started with PID $PID"
    echo " -> Endpoint: http://localhost:$port/v1"
    echo " -> Log: $LOG_FILE"

    # Wait a bit before starting the next service
    sleep 3
done
# Save PIDs (one per line for easier parsing)
printf "%s\n" "${VLLM_PIDS[@]}" > "$LOG_DIR/vllm_pids.txt"
echo ""
echo "vLLM service PIDs saved to: $LOG_DIR/vllm_pids.txt"
echo ""

# Step 2: Wait for services to be ready
echo "=== Step 2: Waiting for vLLM services to be ready ==="
echo "Waiting 90 seconds for services to initialize..."
sleep 90
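# Alternative sketch (untested here): poll each endpoint until it answers
# rather than sleeping a fixed 90 seconds, since large models can take well
# over 90 seconds to load. Gives up after ~10 minutes per service.
# for gpu_config in "${GPU_CONFIGS[@]}"; do
#     IFS=':' read -r gpu_id port <<< "$gpu_config"
#     for _ in $(seq 1 120); do
#         curl -sf "http://localhost:$port/v1/models" > /dev/null && break
#         sleep 5
#     done
# done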
# Check service health
echo ""
echo "Checking service health..."
HEALTHY_COUNT=0
for gpu_config in "${GPU_CONFIGS[@]}"; do
    IFS=':' read -r gpu_id port <<< "$gpu_config"
    if curl -s "http://localhost:$port/v1/models" > /dev/null 2>&1; then
        echo " GPU $gpu_id (port $port): HEALTHY"
        HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
    else
        echo " GPU $gpu_id (port $port): NOT READY (may still be initializing)"
    fi
done

if [ "$HEALTHY_COUNT" -eq 0 ]; then
    echo ""
    echo "WARNING: No services are healthy yet. They may still be loading the model."
    echo "You can check logs in $LOG_DIR/ for progress."
fi
| echo "" | |
| # Step 3: Start load balancer | |
| echo "=== Step 3: Starting Load Balancer ===" | |
| echo "" | |
| # Build backend URLs | |
| BACKEND_URLS=() | |
| for gpu_config in "${GPU_CONFIGS[@]}"; do | |
| IFS=':' read -r gpu_id port <<< "$gpu_config" | |
| BACKEND_URLS+=("http://localhost:$port/v1") | |
| done | |
| echo "Load Balancer Configuration:" | |
| echo " Port: $LB_PORT" | |
| echo " Strategy: $LB_STRATEGY" | |
| echo " Backends: ${BACKEND_URLS[*]}" | |
| echo "" | |
| # Activate virtual environment if it exists | |
| if [ -d ".venv" ]; then | |
| source .venv/bin/activate | |
| fi | |
| # Check if FastAPI is installed | |
# Check that the load balancer's Python dependencies are installed
python3 -c "import fastapi, uvicorn, httpx" 2>/dev/null || {
    echo "Error: missing dependencies. Install with: pip install fastapi uvicorn httpx"
    exit 1
}
# Start load balancer in background
echo "Starting load balancer..."
nohup python3 -m shared.utils.load_balancer \
    --backends "${BACKEND_URLS[@]}" \
    --host 0.0.0.0 \
    --port "$LB_PORT" \
    --strategy "$LB_STRATEGY" \
    --health-check-interval "$LB_HEALTH_CHECK_INTERVAL" \
    > "$LOG_DIR/load_balancer_port${LB_PORT}.log" 2>&1 &
LB_PID=$!

# Save load balancer PID
echo "$LB_PID" > "$LOG_DIR/vllm_lb_pid.txt"
echo " -> Load balancer started with PID $LB_PID"
echo " -> Endpoint: http://localhost:$LB_PORT"
echo " -> Log: $LOG_DIR/load_balancer_port${LB_PORT}.log"
echo " -> PID saved to: $LOG_DIR/vllm_lb_pid.txt"

# Wait a bit for the load balancer to start
sleep 5

# Check load balancer health
echo ""
echo "Checking load balancer health..."
if curl -s "http://localhost:$LB_PORT/health" > /dev/null 2>&1; then
    echo " Load balancer: HEALTHY"
    curl -s "http://localhost:$LB_PORT/health" | python3 -m json.tool 2>/dev/null || curl -s "http://localhost:$LB_PORT/health"
else
    echo " Load balancer: NOT READY (check log: $LOG_DIR/load_balancer_port${LB_PORT}.log)"
fi
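# Example sketch: verify end-to-end routing with an OpenAI-compatible request.
# Assumes the balancer forwards /v1 paths, as the base_url note below implies.
# curl -s "http://localhost:$LB_PORT/v1/models"
# curl -s "http://localhost:$LB_PORT/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"ping\"}], \"max_tokens\": 8}"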
| echo "" | |
| echo "==========================================" | |
| echo "Deployment Complete!" | |
| echo "==========================================" | |
| echo "" | |
| echo "vLLM Services:" | |
| for i in "${!GPU_CONFIGS[@]}"; do | |
| gpu_config="${GPU_CONFIGS[$i]}" | |
| IFS=':' read -r gpu_id port <<< "$gpu_config" | |
| PID="${VLLM_PIDS[$i]}" | |
| echo " GPU $gpu_id: http://localhost:$port/v1 (PID: $PID)" | |
| done | |
| echo "" | |
| echo "Load Balancer:" | |
| echo " http://localhost:$LB_PORT (PID: $LB_PID)" | |
| echo "" | |
| echo "Configuration:" | |
| echo " Update llm_service_config.yaml: base_url: \"http://localhost:$LB_PORT/v1\"" | |
| echo "" | |
| echo "To stop these specific services, run:" | |
| echo " ./scripts/stop_vllm_services.sh" | |
| echo "" | |
| echo "This will only kill the processes listed above, not other vLLM services." | |
| echo "" | |