File size: 4,112 Bytes
7c19d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# =============================================================================
# ML Pipeline — Training Job + Inference Service
# =============================================================================

apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  namespace: ml-pipeline
  labels:
    app: ml-inference
    version: v1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
        version: v1
      annotations:
        sidecar.istio.io/inject: "true"
    spec:
      serviceAccountName: ml-inference
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        # Opt in to the container runtime's default syscall filter.
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: inference
          image: "ecr.aws/devsecops/ml-inference:v1.0.0"
          # Container-level hardening: the inference server should not gain
          # privileges or retain any Linux capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          ports:
            # Named port so Istio protocol selection treats this as HTTP
            # rather than opaque TCP (sidecar injection is enabled above).
            - name: http
              containerPort: 8000
              protocol: TCP
          env:
            - name: MODEL_PATH
              value: "/models/latest"
            - name: HF_HOME
              value: "/cache/huggingface"
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: 8Gi
              nvidia.com/gpu: "1"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: huggingface-cache
              mountPath: /cache/huggingface
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
        - name: huggingface-cache
          # RAM-backed cache: the 1Gi sizeLimit counts against the pod's
          # memory limit on most runtimes.
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      nodeSelector:
        workload: ml
---
# Backing store for served model artifacts, mounted read/write at /models
# by the ml-inference Deployment. ReadWriteOnce: a single node attaches it,
# which matches replicas: 1 on the Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-pvc
  namespace: ml-pipeline
spec:
  accessModes:
    - ReadWriteOnce
  # Encrypted gp3 storage class — presumably EBS-backed; confirm the class
  # exists in this cluster.
  storageClassName: gp3-encrypted
  resources:
    requests:
      storage: 50Gi
---
# Cluster-internal endpoint for the inference pods (port 8000 → container
# port 8000). Defaults to type ClusterIP.
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
  namespace: ml-pipeline
spec:
  selector:
    app: ml-inference
  ports:
    # Port is named "http" and protocol is explicit: the pods run with an
    # Istio sidecar, and Istio's protocol selection uses the port name to
    # treat traffic as HTTP instead of opaque TCP.
    - name: http
      port: 8000
      targetPort: 8000
      protocol: TCP
---
# Identity for the inference pods (referenced by serviceAccountName in the
# Deployment). No RBAC bindings are visible in this file — presumably they
# are defined elsewhere; verify before granting API access.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ml-inference
  namespace: ml-pipeline
---
# ML Training Job Template
#
# NOTE: this document is a Go template — {{ .JobID }} must render to a
# DNS-1123-safe suffix (lowercase alphanumerics and '-') or the apply fails.
apiVersion: batch/v1
kind: Job
metadata:
  name: ml-train-{{ .JobID }}
  namespace: ml-pipeline
spec:
  backoffLimit: 2
  ttlSecondsAfterFinished: 86400  # Clean up after 24h
  template:
    spec:
      serviceAccountName: ml-train
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        # Opt in to the container runtime's default syscall filter,
        # matching the hardening on the inference Deployment.
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: trainer
          image: "ecr.aws/devsecops/ml-train:v1.0.0"
          command: ["python", "train.py"]
          # Container-level hardening, consistent with the inference pod:
          # no privilege escalation, no Linux capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          env:
            # Hugging Face token is sourced from a Secret, never inlined.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-credentials
                  key: token
            - name: TRACKIO_URL
              value: "https://trackio.platform.internal"
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "8"
              memory: 32Gi
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: training-data
              mountPath: /data
            - name: model-output
              mountPath: /output
      volumes:
        - name: training-data
          persistentVolumeClaim:
            claimName: training-data-pvc
        - name: model-output
          persistentVolumeClaim:
            claimName: model-output-pvc
      # Never restart in place; failed attempts are retried as new pods
      # up to backoffLimit.
      restartPolicy: Never
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule