#!/bin/bash
# Launch two co-located GPU services (vLLM + HHEM support-claim API) on a
# single physical GPU, shared via NVIDIA MPS.
set -euo pipefail

# Single source of truth for the GPU we pin to (PCI bus order, device 5).
readonly GPU_ID=5
export CUDA_DEVICE_ORDER=PCI_BUS_ID
export CUDA_VISIBLE_DEVICES="$GPU_ID"

# Start NVIDIA MPS for efficient GPU sharing between the two processes.
# Tolerate a daemon that is already running instead of aborting.
nvidia-cuda-mps-control -d || echo "[WARN] MPS daemon may already be running" >&2
echo "[OK] MPS started on GPU $GPU_ID"

# ----------------------------------------------
# Service 1: vLLM - Llama-3.1-8B-Instruct
# ----------------------------------------------
vllm serve meta-llama/Llama-3.1-8B-Instruct \
  --port 8031 \
  --served-model-name dspy \
  --dtype bfloat16 \
  --tensor-parallel-size 1 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.40 \
  --enable-prefix-caching \
  --max-num-seqs 256 &
VLLM_PID=$!
echo "[..] Loading vLLM (Llama-3.1-8B)... PID: $VLLM_PID"
sleep 40  # wait for vLLM to fully load (TODO: poll its /health endpoint instead of a fixed sleep)

# ----------------------------------------------
# Service 2: FastAPI - HHEM Support Claim API
# ----------------------------------------------
export SUPPORT_API_PORT=8030
export SUPPORT_API_HOST=0.0.0.0
export HHEM_MODEL_NAME=vectara/hallucination_evaluation_model

# Resolve the directory containing this script so it works from any cwd.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo "[..] Starting Support Claim Checking API..."
cd "$SCRIPT_DIR"
python support_claim_api.py &
HHEM_PID=$!
echo "[OK] HHEM API started... PID: $HHEM_PID"

# ----------------------------------------------
echo ""
echo "========================================="
echo " Both services running on GPU $GPU_ID"
echo " vLLM (dspy): http://0.0.0.0:8031"
echo " HHEM (support): http://0.0.0.0:8030"
echo "========================================="
echo ""

# Block until both background services exit; propagates a non-zero status
# (under set -e) if either one fails.
wait "$VLLM_PID" "$HHEM_PID"