shaikhsalman committed on
Commit
f082a01
·
verified ·
1 Parent(s): 345e0af

Upload finops/cost-optimization.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. finops/cost-optimization.yaml +73 -0
finops/cost-optimization.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # FinOps Engine — Cloud Cost Governance
3
+ # =============================================================================
4
+ # Addresses: cost waste, rightsizing, scheduling, unit economics
5
+ # =============================================================================
6
+
7
+ # --- Spot Instance Strategy ---
8
+ # Use SPOT for ML training workloads (70-90% cost savings)
9
+ # Use ON_DEMAND for production services (no interruption risk)
10
+
11
+ apiVersion: apps/v1
12
+ kind: Deployment
13
+ metadata:
14
+   labels:
15
+     finops: spot-instance
16
+     app: ml-training-spot
17
+   name: ml-training-spot
18
+   namespace: ml-pipeline
19
+ spec:
20
+   replicas: 0 # Parked at zero; meant to be scaled by KEDA. NOTE(review): no ScaledObject targeting this Deployment is visible here — confirm
21
+   selector:
22
+     matchLabels:
23
+       app: ml-training-spot
24
+   template:
25
+     metadata:
26
+       labels:
27
+         finops: spot-instance
28
+         app: ml-training-spot
29
+     spec:
30
+       nodeSelector:
31
+         workload: ml-spot
32
+       tolerations:
33
+         - effect: NoSchedule
34
+           key: nvidia.com/gpu
35
+           operator: Exists
36
+       # Grace window given to the pod before it is killed, e.g. on spot reclamation
37
+       terminationGracePeriodSeconds: 120
38
+       containers:
39
+         - name: trainer
40
+           image: "ecr.aws/devsecops/ml-train:v1.0.0"
41
+           resources:
42
+             limits:
43
+               cpu: "8"
44
+               memory: 32Gi
45
+               nvidia.com/gpu: "1"
46
+             requests:
47
+               cpu: "4"
48
+               memory: 16Gi
49
+               nvidia.com/gpu: "1"
50
+ ---
51
+ # --- KEDA ScaledJob — launch ML training Jobs based on SQS queue depth ---
52
+ apiVersion: keda.sh/v1alpha1
53
+ kind: ScaledJob
54
+ metadata:
55
+   name: ml-training-scaler
56
+   namespace: ml-pipeline
57
+ spec:
58
+   minReplicaCount: 0 # no Jobs are kept running while the queue is empty
59
+   maxReplicaCount: 4 # upper bound on training Jobs running at once
60
+   pollingInterval: 30 # seconds between queue-depth checks
61
+   triggers:
62
+     - type: aws-sqs-queue # KEDA's SQS scaler name; "aws-sqs" is not a valid trigger type
63
+       metadata: # NOTE(review): no authenticationRef / identityOwner set — confirm how KEDA obtains AWS credentials
64
+         queueURL: https://sqs.us-east-1.amazonaws.com/123456789012/ml-training-queue
65
+         queueLength: "1" # target number of queued messages per Job
66
+         awsRegion: us-east-1 # required by the scaler; matches the region in queueURL
67
+   jobTargetRef: # ScaledJob embeds a batch/v1 JobSpec here (CronJob-style "jobTemplate" is not a ScaledJob field)
68
+     template:
69
+       spec:
70
+         restartPolicy: Never
71
+         containers: # NOTE(review): unlike ml-training-spot, no resources/nodeSelector/tolerations — confirm Jobs need not land on spot GPU nodes
72
+           - name: trainer
73
+             image: "ecr.aws/devsecops/ml-train:v1.0.0"