Trouter-Library committed on
Commit
51402bc
·
verified ·
1 Parent(s): b5a4f02

Create Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +151 -0
Dockerfile ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# syntax=docker/dockerfile:1
# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM

# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Build/runtime environment. DEBIAN_FRONTEND is set per-RUN (not via ENV)
# so it does not leak into the final runtime environment.
ENV PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8

# Install system dependencies — update+install in one layer (avoids stale apt
# cache), alphabetized, no recommends, apt lists cleaned in the same layer.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        libffi-dev \
        libgomp1 \
        libjpeg-dev \
        libpng-dev \
        libssl-dev \
        ninja-build \
        python3.10 \
        python3.10-dev \
        python3-pip \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Update pip and install build tooling (pinning left to the layers below)
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# Stage 2: Build dependencies (heavy; discarded except for site-packages)
FROM base AS builder

WORKDIR /build

# Install PyTorch with CUDA 12.1 support (pinned versions, cu121 wheel index)
RUN pip install --no-cache-dir \
        torch==2.2.0 \
        torchvision==0.17.0 \
        torchaudio==2.2.0 \
        --index-url https://download.pytorch.org/whl/cu121

# Install vLLM and core inference dependencies (all pinned)
RUN pip install --no-cache-dir \
        vllm==0.3.3 \
        transformers==4.40.0 \
        tokenizers==0.15.2 \
        sentencepiece==0.2.0 \
        accelerate==0.28.0 \
        bitsandbytes==0.43.0 \
        safetensors==0.4.2 \
        huggingface-hub==0.21.4

# Install additional ML / serving libraries
RUN pip install --no-cache-dir \
        numpy==1.26.4 \
        scipy==1.12.0 \
        pandas==2.2.1 \
        scikit-learn==1.4.1 \
        pydantic==2.6.4 \
        fastapi==0.110.0 \
        "uvicorn[standard]==0.29.0" \
        aiohttp==3.9.3 \
        "ray[default]==2.10.0"

# Install monitoring and GPU introspection tools
RUN pip install --no-cache-dir \
        prometheus-client==0.20.0 \
        gputil==1.4.0 \
        psutil==5.9.8 \
        py-cpuinfo==9.0.0 \
        pynvml==11.5.0

# Stage 3: Final runtime image (CUDA runtime only, no compilers/headers)
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

# Runtime configuration — every value overridable with `docker run -e ...`
ENV PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1

# Runtime dependencies only. curl is kept because the HEALTHCHECK needs it;
# debugging tools (vim) are deliberately omitted from the production image.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
        curl \
        libgomp1 \
        python3.10 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user and writable directories up front, in one layer,
# so later COPY --chown steps don't need a follow-up chown -R (which would
# duplicate every copied file into a second layer).
RUN useradd -m -u 1000 helion \
    && mkdir -p /models/helion /app/inference /app/logs /app/cache \
    && chown -R helion:helion /app /models

# Copy Python packages and console scripts from the builder stage
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Application directory
WORKDIR /app

# Copy inference code and configs; --chown/--chmod set ownership and the
# executable bit at copy time instead of extra RUN chmod/chown layers.
COPY --chown=helion:helion --chmod=755 ./inference /app/inference
COPY --chown=helion:helion ./model_config.yaml ./config.json /app/

# Drop root for everything from here on
USER helion

# Health check against the server's /health endpoint (shell form, so ${PORT}
# expands at container runtime)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -fsS "http://localhost:${PORT}/health" || exit 1

# Expose ports (documentation only; publish with -p / service config)
EXPOSE 8000 8001 8002

# Default command. NOTE: exec-form CMD does NOT expand ${VAR}, so the env-var
# arguments must go through `sh -c`; `exec` replaces the shell so the server
# is PID 1 and receives SIGTERM from `docker stop`.
CMD ["/bin/sh", "-c", "exec python3 -m inference.server \
    --model \"$MODEL_PATH\" \
    --host \"$HOST\" \
    --port \"$PORT\" \
    --tensor-parallel-size \"$TENSOR_PARALLEL_SIZE\" \
    --max-model-len \"$MAX_MODEL_LEN\" \
    --gpu-memory-utilization \"$GPU_MEMORY_UTILIZATION\""]

# Labels (MAINTAINER instruction is deprecated; LABEL form used throughout)
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"