#!/bin/bash
export CUDA_DEVICE_ORDER=PCI_BUS_ID
export CUDA_VISIBLE_DEVICES=5

# Start NVIDIA MPS for efficient GPU sharing
nvidia-cuda-mps-control -d
echo "✅ MPS started on GPU 5"

# ──────────────────────────────────────────────
# Service 1: vLLM — Llama-3.1-8B-Instruct
# ──────────────────────────────────────────────
vllm serve meta-llama/Llama-3.1-8B-Instruct \
    --port 8031 \
    --served-model-name dspy \
    --dtype bfloat16 \
    --tensor-parallel-size 1 \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.40 \
    --enable-prefix-caching \
    --max-num-seqs 256 &
VLLM_PID=$!
echo "⏳ Loading vLLM (Llama-3.1-8B)... PID: $VLLM_PID"
sleep 40  # crude fixed wait for vLLM to finish loading; see the readiness-poll sketch at the end of this script

# ──────────────────────────────────────────────
# Service 2: FastAPI — HHEM Support Claim API
# ──────────────────────────────────────────────
export SUPPORT_API_PORT=8030
export SUPPORT_API_HOST=0.0.0.0
export HHEM_MODEL_NAME=vectara/hallucination_evaluation_model

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo "⏳ Starting Support Claim Checking API..."
cd "$SCRIPT_DIR"
python support_claim_api.py &
HHEM_PID=$!
echo "✅ HHEM API started... PID: $HHEM_PID"

# ──────────────────────────────────────────────
echo ""
echo "========================================="
echo " Both services running on GPU 5"
echo " vLLM (dspy):    http://0.0.0.0:8031"
echo " HHEM (support): http://0.0.0.0:8030"
echo "========================================="
echo ""

# Wait for both background processes
wait $VLLM_PID $HHEM_PID
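
# ──────────────────────────────────────────────
# Cleanup: once both services have exited, stop the MPS daemon.
# Piping "quit" into nvidia-cuda-mps-control is the documented
# way to shut it down.
# ──────────────────────────────────────────────
echo quit | nvidia-cuda-mps-control
echo "🛑 MPS stopped"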
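
# ──────────────────────────────────────────────
# Readiness poll (sketch): a more robust alternative to the fixed
# `sleep 40` above. vLLM's OpenAI-compatible server exposes
# GET /health, which returns 200 once the engine is up; the
# 120-attempt cap (~10 min) below is an arbitrary choice, not
# anything this script requires.
# ──────────────────────────────────────────────
# wait_for_vllm() {
#   for _ in $(seq 1 120); do
#     if curl -sf "http://0.0.0.0:8031/health" > /dev/null; then
#       echo "✅ vLLM is ready"
#       return 0
#     fi
#     sleep 5
#   done
#   echo "❌ vLLM did not become ready in time" >&2
#   return 1
# }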
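
# Smoke test (sketch): run from another shell once both services are
# up. These are vLLM's standard OpenAI-compatible routes; the model
# name "dspy" comes from --served-model-name above.
#   curl http://0.0.0.0:8031/v1/models
#   curl -s http://0.0.0.0:8031/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "dspy", "messages": [{"role": "user", "content": "ping"}]}'
# support_claim_api.py is not shown here, so the HHEM routes are
# unknown; check that file for the endpoint paths on port 8030.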