# =============================================================================
# ML Pipeline — Training Job + Inference Service
# =============================================================================
---
# Inference Deployment: single GPU-backed replica serving models from a PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  namespace: ml-pipeline
  labels:
    app: ml-inference
    version: v1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
        version: v1
      annotations:
        # Opt this pod into the Istio sidecar mesh.
        sidecar.istio.io/inject: "true"
    spec:
      serviceAccountName: ml-inference
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        # Gives the pod's volumes group ownership 1000 so the non-root
        # process can write to the mounted PVC.
        fsGroup: 1000
      containers:
        - name: inference
          image: "ecr.aws/devsecops/ml-inference:v1.0.0"
          ports:
            - name: http
              containerPort: 8000
              protocol: TCP
          env:
            - name: MODEL_PATH
              value: "/models/latest"
            - name: HF_HOME
              value: "/cache/huggingface"
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
              # Extended resources (GPUs) must have requests == limits.
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: 8Gi
              nvidia.com/gpu: "1"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            # Model load can be slow; give the server time before the first
            # liveness check.
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: huggingface-cache
              mountPath: /cache/huggingface
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
        - name: huggingface-cache
          # NOTE: Memory-backed emptyDir usage counts against the container's
          # memory limit (8Gi); the 1Gi cap keeps the cache bounded.
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      # Allow scheduling onto GPU nodes that carry the standard GPU taint...
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # ...and pin to the ML node pool.
      nodeSelector:
        workload: ml
---
# Claim backing the inference Deployment's /models mount.
# ReadWriteOnce is sufficient while replicas stay at 1 on a single node.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-pvc
  namespace: ml-pipeline
spec:
  accessModes:
    - ReadWriteOnce
  # Encrypted-at-rest EBS gp3 storage class (cluster-provided).
  storageClassName: gp3-encrypted
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Service
metadata:
name: ml-inference
namespace: ml-pipeline
spec:
selector:
app: ml-inference
ports:
- port: 8000
targetPort: 8000
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: ml-inference
namespace: ml-pipeline
---
# ML Training Job Template — rendered by the pipeline; {{ .JobID }} is
# substituted before apply.
apiVersion: batch/v1
kind: Job
metadata:
  # Quoted: templated scalars should always be quoted so an unexpected
  # expansion cannot change the parsed YAML type or break the document.
  name: "ml-train-{{ .JobID }}"
  namespace: ml-pipeline
spec:
  # Retry a failed training run at most twice.
  backoffLimit: 2
  ttlSecondsAfterFinished: 86400  # Clean up after 24h
  template:
    spec:
      serviceAccountName: ml-train
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: trainer
          image: "ecr.aws/devsecops/ml-train:v1.0.0"
          command: ["python", "train.py"]
          env:
            # Hugging Face token is injected from a Secret, never inlined.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-credentials
                  key: token
            - name: TRACKIO_URL
              value: "https://trackio.platform.internal"
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              # Extended resources (GPUs) must have requests == limits.
              nvidia.com/gpu: "1"
            limits:
              cpu: "8"
              memory: 32Gi
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: training-data
              mountPath: /data
            - name: model-output
              mountPath: /output
      volumes:
        - name: training-data
          persistentVolumeClaim:
            claimName: training-data-pvc
        - name: model-output
          persistentVolumeClaim:
            claimName: model-output-pvc
      # Jobs must not restart in place; backoffLimit governs retries.
      restartPolicy: Never
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # NOTE(review): unlike the inference Deployment, no nodeSelector
      # (workload: ml) is set here — confirm whether the GPU taint alone is
      # intended to steer placement.