Spaces:
Paused
Paused
feat(download-model): add download model at runtime
Browse files- Dockerfile +8 -0
- README.md +2 -1
- download_model.py +13 -0
- run.sh +0 -2
Dockerfile
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
FROM python:3.12
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
RUN useradd -m -u 1000 user
|
| 4 |
USER user
|
| 5 |
ENV PATH="/home/user/.local/bin:$PATH"
|
|
@@ -11,6 +14,11 @@ RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://dow
|
|
| 11 |
|
| 12 |
COPY --chown=user . /app
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
EXPOSE 7860
|
| 15 |
|
| 16 |
#CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 1 |
FROM python:3.12
|
| 2 |
|
| 3 |
+
# Declare your environment variables with the ARG directive
|
| 4 |
+
ARG HF_TOKEN
|
| 5 |
+
|
| 6 |
RUN useradd -m -u 1000 user
|
| 7 |
USER user
|
| 8 |
ENV PATH="/home/user/.local/bin:$PATH"
|
|
|
|
| 14 |
|
| 15 |
COPY --chown=user . /app
|
| 16 |
|
| 17 |
+
|
| 18 |
+
# Download at build time,
|
| 19 |
+
# to ensure that after a restart we won't have to wait for the download from HF (only for the docker pull).
|
| 20 |
+
RUN python /app/download_model.py
|
| 21 |
+
|
| 22 |
EXPOSE 7860
|
| 23 |
|
| 24 |
#CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -15,6 +15,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
|
|
| 15 |
poetry export -f requirements.txt --output requirements.txt --without-hashes
|
| 16 |
```
|
| 17 |
|
|
|
|
| 18 |
|
| 19 |
## VLLM OpenAI Compatible API Server
|
| 20 |
|
|
@@ -27,7 +28,7 @@ Fixes:
|
|
| 27 |
|
| 28 |
This `api_server.py` file is an exact copy of the version from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
## Documentation about config
|
| 33 |
|
|
|
|
| 15 |
poetry export -f requirements.txt --output requirements.txt --without-hashes
|
| 16 |
```
|
| 17 |
|
| 18 |
+
* The `HUGGING_FACE_HUB_TOKEN` and `HF_TOKEN` must exist during runtime (use the same value for both; it must have read permission to the model).
|
| 19 |
|
| 20 |
## VLLM OpenAI Compatible API Server
|
| 21 |
|
|
|
|
| 28 |
|
| 29 |
This `api_server.py` file is an exact copy of the version from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
|
| 30 |
|
| 31 |
+
|
| 32 |
|
| 33 |
## Documentation about config
|
| 34 |
|
download_model.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from huggingface_hub import snapshot_download
|
| 3 |
+
|
| 4 |
+
hf_token: str = os.getenv("HF_TOKEN")
|
| 5 |
+
hf_token = hf_token.strip()
|
| 6 |
+
if hf_token == "":
|
| 7 |
+
raise ValueError("HF_TOKEN is empty")
|
| 8 |
+
|
| 9 |
+
snapshot_download(
|
| 10 |
+
repo_id="sail/Sailor-4B-Chat",
|
| 11 |
+
revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
|
| 12 |
+
token=hf_token,
|
| 13 |
+
)
|
run.sh
CHANGED
|
@@ -20,8 +20,6 @@ python -u /app/openai_compatible_api_server.py \
|
|
| 20 |
--revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
|
| 21 |
--host 0.0.0.0 \
|
| 22 |
--port 7860 \
|
| 23 |
-
--max-num-batched-tokens 32768 \
|
| 24 |
-
--max-model-len 32768 \
|
| 25 |
--dtype half \
|
| 26 |
--enforce-eager \
|
| 27 |
--gpu-memory-utilization 0.85
|
|
|
|
| 20 |
--revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
|
| 21 |
--host 0.0.0.0 \
|
| 22 |
--port 7860 \
|
|
|
|
|
|
|
| 23 |
--dtype half \
|
| 24 |
--enforce-eager \
|
| 25 |
--gpu-memory-utilization 0.85
|