export CUDA_DEVICE_ORDER=PCI_BUS_ID
export CUDA_VISIBLE_DEVICES=5

# Start NVIDIA MPS for efficient GPU sharing
nvidia-cuda-mps-control -d
| echo "β MPS started on GPU 2" | |
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Service 1: vLLM → Llama-3.1-8B-Instruct
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
vllm serve meta-llama/Llama-3.1-8B-Instruct \
    --port 8031 \
    --served-model-name dspy \
    --dtype bfloat16 \
    --tensor-parallel-size 1 \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.40 \
    --enable-prefix-caching \
    --max-num-seqs 256 &
VLLM_PID=$!
| echo "β³ Loading vLLM (Llama-3.1-8B)... PID: $VLLM_PID" | |
| sleep 40 # wait for vLLM to fully load | |
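
# A fixed sleep can race against slow model loads; as a hedged extra check,
# poll the /health endpoint of vLLM's OpenAI-compatible server until it
# answers. Assumes curl is installed and the server binds localhost:8031.
until curl -sf "http://localhost:8031/health" > /dev/null; do
    echo "⏳ Still waiting for vLLM health check..."
    sleep 5
done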
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Service 2: FastAPI → HHEM Support Claim API
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
export SUPPORT_API_PORT=8030
export SUPPORT_API_HOST=0.0.0.0
export HHEM_MODEL_NAME=vectara/hallucination_evaluation_model
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
| echo "β³ Starting Support Claim Checking API..." | |
| cd "$SCRIPT_DIR" | |
| python support_claim_api.py & | |
| HHEM_PID=$! | |
| echo "β HHEM API started... PID: $HHEM_PID" | |
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
| echo "" | |
| echo "=========================================" | |
| echo " Both services running on GPU 2" | |
| echo " vLLM (dspy): http://0.0.0.0:8031" | |
| echo " HHEM (support): http://0.0.0.0:8030" | |
| echo "=========================================" | |
| echo "" | |
# Wait for both processes
wait $VLLM_PID $HHEM_PID