#!/bin/bash

################################################################################
# Shell script that starts a vLLM server with a base model plus all of the
# available aLoRA adapters in this repository.
#
# To run this script:
# 1. Create and activate a Python virtual environment using a tool such as
#    miniforge, uv, or venv.
# 2. Install the fork of vLLM that supports aLoRA on your machine
#    (`VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/tdoublep/vllm.git@alora`)
# 3. Install the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`)
# 4. Download the intrinsics library by running:
#    hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
# 5. Edit the constants BASE_MODEL_NAME, BASE_MODEL_ORG, and PORT below as needed.
# 6. Run this script from the root of your local copy of rag-intrinsics-lib.
################################################################################
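
# The adapter discovery loop below assumes the directory layout that its test
# condition implies, sketched here for reference (the <intrinsic_name> entries
# are whatever adapter directories actually exist in your checkout):
#
#   rag-intrinsics-lib/
#       <intrinsic_name>/
#           alora/
#               <base_model_name>/    <- adapter weights registered below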

BASE_MODEL_NAME=granite-3.3-8b-instruct
BASE_MODEL_ORG=ibm-granite
PORT=55555

export VLLM_API_KEY=rag_intrinsics_1234

# Find all aLoRA adapters for the target base model. Note that this loop can
# be edited to serve both aLoRA and LoRA adapters simultaneously.
ALORAS=""
for item in ./*; do
    # Strip the path down to the bare directory name
    name=$(basename -- "${item}")
    # Keep only directories that contain adapter weights for the base model
    if [ -d "./${name}/alora/${BASE_MODEL_NAME}" ]; then
        ALORAS+="${name}=./${name}/alora/${BASE_MODEL_NAME} "
    fi
done
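
# At this point ALORAS holds space-separated name=path pairs, one per adapter,
# e.g. (adapter names are illustrative and depend on the directories found):
#   ALORAS="answerability=./answerability/alora/granite-3.3-8b-instruct ..."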


CMD="vllm serve ${BASE_MODEL_ORG}/${BASE_MODEL_NAME} \
    --port ${PORT} \
    --gpu-memory-utilization 0.45 \
    --max-model-len 8192 \
    --enable-lora \
    --enable-activated-lora \
    --enable-prefix-caching \
    --max_lora_rank 64 \
    --lora-modules $ALORAS"

echo $CMD
$CMD
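
# Once the server reports that it is ready, each adapter is exposed as a served
# model under its directory name via vLLM's OpenAI-compatible API. A minimal
# smoke test (the adapter name "answerability" is illustrative; substitute any
# name that the discovery loop above included in the printed command):
#
#   curl http://localhost:55555/v1/completions \
#       -H "Content-Type: application/json" \
#       -H "Authorization: Bearer rag_intrinsics_1234" \
#       -d '{"model": "answerability", "prompt": "Hello", "max_tokens": 16}'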