shaikhsalman
/

devsecops-platform

+# =============================================================================
+# FinOps Engine — Cloud Cost Governance
+# =============================================================================
+# Addresses: cost waste, rightsizing, scheduling, unit economics
+# =============================================================================
+# --- Spot Instance Strategy ---
+# Use SPOT for ML training workloads (70-90% cost savings)
+# Use ON_DEMAND for production services (no interruption risk)
+# Deployment for the GPU-backed ML trainer, pinned to nodes labelled
+# `workload: ml-spot`.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: ml-pipeline
+  name: ml-training-spot
+  labels:
+    finops: spot-instance
+    app: ml-training-spot
+spec:
+  # Idle by default: no pods run until something raises the replica count.
+  # NOTE(review): the KEDA ScaledJob in this file creates Jobs and does not
+  # reference this Deployment — confirm what is expected to scale it.
+  replicas: 0
+  selector:
+    matchLabels:
+      app: ml-training-spot
+  template:
+    metadata:
+      labels:
+        finops: spot-instance
+        app: ml-training-spot
+    spec:
+      # Only schedule onto nodes carrying the `workload: ml-spot` label.
+      nodeSelector:
+        workload: ml-spot
+      # Tolerate any NoSchedule taint keyed `nvidia.com/gpu`, whatever its value.
+      tolerations:
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      # Give the trainer two minutes to shut down once termination is
+      # requested (e.g. when the node is reclaimed).
+      terminationGracePeriodSeconds: 120
+      containers:
+        - name: trainer
+          image: 'ecr.aws/devsecops/ml-train:v1.0.0'
+          resources:
+            # CPU and memory may burst to twice the requested amount; the GPU
+            # count is the same in requests and limits.
+            limits:
+              nvidia.com/gpu: '1'
+              memory: 32Gi
+              cpu: '8'
+            requests:
+              nvidia.com/gpu: '1'
+              memory: 16Gi
+              cpu: '4'
+---
+# --- KEDA Scaler — Scale ML training on queue depth ---
+# KEDA ScaledJob: polls the SQS training queue every 30 seconds and launches
+# batch Jobs (between 0 and 4 at a time) to work through the backlog.
+apiVersion: keda.sh/v1alpha1
+kind: ScaledJob
+metadata:
+  name: ml-training-scaler
+  namespace: ml-pipeline
+spec:
+  minReplicaCount: 0
+  maxReplicaCount: 4
+  pollingInterval: 30
+  triggers:
+    # The KEDA scaler for SQS is named `aws-sqs-queue`.
+    - type: aws-sqs-queue
+      metadata:
+        queueURL: https://sqs.us-east-1.amazonaws.com/123456789012/ml-training-queue
+        # Target number of messages per Job.
+        queueLength: "1"
+        # Required by the scaler; matches the region in queueURL.
+        awsRegion: us-east-1
+      # NOTE(review): no authenticationRef / identityOwner is set — confirm
+      # how KEDA is expected to obtain AWS credentials for this queue.
+  # ScaledJob takes a batch/v1 JobSpec under `jobTargetRef`; a CronJob-style
+  # `jobTemplate.spec.template` wrapper is not part of the ScaledJob schema.
+  jobTargetRef:
+    template:
+      spec:
+        # Failed trainer containers are not restarted in place.
+        restartPolicy: Never
+        # NOTE(review): unlike the ml-training-spot Deployment, this pod has
+        # no GPU request, toleration, or `workload: ml-spot` nodeSelector —
+        # confirm whether training Jobs should carry the same constraints.
+        containers:
+          - name: trainer
+            image: "ecr.aws/devsecops/ml-train:v1.0.0"