# =============================================================================
# ML Pipeline — Training Job + Inference Service
# =============================================================================
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  namespace: ml-pipeline
  labels:
    app: ml-inference
    version: v1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
        version: v1
      annotations:
        sidecar.istio.io/inject: "true"
    spec:
      serviceAccountName: ml-inference
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      containers:
        - name: inference
          image: "ecr.aws/devsecops/ml-inference:v1.0.0"
          ports:
            - containerPort: 8000
              protocol: TCP
          env:
            - name: MODEL_PATH
              value: "/models/latest"
            - name: HF_HOME
              value: "/cache/huggingface"
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: 8Gi
              nvidia.com/gpu: "1"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: huggingface-cache
              mountPath: /cache/huggingface
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
        # In-memory scratch space for the Hugging Face cache; capped at 1Gi.
        - name: huggingface-cache
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      nodeSelector:
        workload: ml
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-pvc
  namespace: ml-pipeline
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3-encrypted
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
  namespace: ml-pipeline
spec:
  selector:
    app: ml-inference
  ports:
    - port: 8000
      targetPort: 8000
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ml-inference
  namespace: ml-pipeline
---
# ML Training Job Template
apiVersion: batch/v1
kind: Job
metadata:
  # Quoted: a templated value starting with "{{" would otherwise be parsed
  # as a YAML flow mapping before the template engine runs.
  name: "ml-train-{{ .JobID }}"
  namespace: ml-pipeline
spec:
  backoffLimit: 2
  ttlSecondsAfterFinished: 86400  # Clean up after 24h
  template:
    spec:
      serviceAccountName: ml-train
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: trainer
          image: "ecr.aws/devsecops/ml-train:v1.0.0"
          command: ["python", "train.py"]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-credentials
                  key: token
            - name: TRACKIO_URL
              value: "https://trackio.platform.internal"
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "8"
              memory: 32Gi
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: training-data
              mountPath: /data
            - name: model-output
              mountPath: /output
      volumes:
        - name: training-data
          persistentVolumeClaim:
            claimName: training-data-pvc
        - name: model-output
          persistentVolumeClaim:
            claimName: model-output-pvc
      restartPolicy: Never
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule