shaikhsalman committed on
Commit 9d4d5c7 · verified · Parent: 36df1e5

refactor: merged structure - model at center, DevSecOps wrapped around it

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.

Files changed (50)
  1. README.md +60 -129
  2. ai-ml/hf-finetuning/TRAINING_RECIPE.md +0 -58
  3. ai-ml/hf-finetuning/run_finetune.py +0 -67
  4. ci-cd/gitlab-ci/.gitlab-ci.yml +0 -113
  5. ci-cd/jenkins/Jenkinsfile +0 -136
  6. compliance/{cis-benchmarks/cis-eks-k8s.yaml → cis-eks-k8s.yaml} +0 -0
  7. compliance/{soc2/controls-mapping.yaml → controls-mapping.yaml} +0 -0
  8. compliance/{nist/nist-800-53-mapping.yaml → nist-800-53-mapping.yaml} +0 -0
  9. {docker/base-images → deployment}/Dockerfile.ml-inference +0 -0
  10. {k8s/workloads/ml-pipeline → deployment}/deployment.yaml +0 -0
  11. {ai-ml/mlflow → deployment}/mlflow-deployment.yaml +0 -0
  12. docker/base-images/Dockerfile.backend +0 -51
  13. docker/base-images/Dockerfile.frontend +0 -33
  14. finops/cost-optimization.yaml +0 -73
  15. finops/finops-cronjob.yaml +0 -23
  16. finops/finops-policy.yaml +0 -73
  17. incident-response/auto-remediation/auto-remediate.sh +0 -50
  18. platform/adr/template.md → infrastructure/adr-template.md +0 -0
  19. {ci-cd/github-actions → infrastructure/ci-cd}/devsecops-pipeline.yml +0 -0
  20. {finops → infrastructure}/finops_scanner.py +0 -0
  21. {scripts/bash → infrastructure}/incident-response.sh +0 -0
  22. {monitoring → infrastructure/monitoring}/alertmanager/alertmanager-config.yaml +0 -0
  23. monitoring/grafana/dashboards/platform-overview.json → infrastructure/monitoring/grafana-platform-overview.json +0 -0
  24. {monitoring → infrastructure/monitoring}/otel/otel-collector.yaml +0 -0
  25. {monitoring → infrastructure/monitoring}/prometheus/alerts.yaml +0 -0
  26. incident-response/postmortem/template.md → infrastructure/postmortem-template.md +0 -0
  27. {terraform → infrastructure/terraform}/environments/prod/main.tf +0 -0
  28. {terraform → infrastructure/terraform}/modules/eks/main.tf +0 -0
  29. {terraform → infrastructure/terraform}/modules/eks/outputs.tf +0 -0
  30. {terraform → infrastructure/terraform}/modules/eks/variables.tf +0 -0
  31. {terraform → infrastructure/terraform}/modules/guardduty/main.tf +0 -0
  32. {terraform → infrastructure/terraform}/modules/guardduty/variables.tf +0 -0
  33. {terraform → infrastructure/terraform}/modules/iam/main.tf +0 -0
  34. {terraform → infrastructure/terraform}/modules/kms/main.tf +0 -0
  35. {terraform → infrastructure/terraform}/modules/macie/main.tf +0 -0
  36. {terraform → infrastructure/terraform}/modules/rds/main.tf +0 -0
  37. {terraform → infrastructure/terraform}/modules/rds/variables.tf +0 -0
  38. {terraform → infrastructure/terraform}/modules/s3/main.tf +0 -0
  39. {terraform → infrastructure/terraform}/modules/s3/variables.tf +0 -0
  40. {terraform → infrastructure/terraform}/modules/vpc/main.tf +0 -0
  41. {terraform → infrastructure/terraform}/modules/vpc/outputs.tf +0 -0
  42. {terraform → infrastructure/terraform}/modules/vpc/variables.tf +0 -0
  43. k8s/base/limit-ranges/limit-ranges.yaml +0 -74
  44. k8s/base/namespaces/namespaces.yaml +0 -69
  45. k8s/base/network-policies/network-policies.yaml +0 -124
  46. k8s/base/pdbs/pdbs.yaml +0 -62
  47. k8s/base/rbac/rbac.yaml +0 -78
  48. k8s/base/resource-quotas/resource-quotas.yaml +0 -50
  49. k8s/base/slos/slos.yaml +0 -68
  50. k8s/kustomize/base/kustomization.yaml +0 -18
README.md CHANGED
@@ -1,144 +1,75 @@
- # DevSecOps Platform OMEGA — Enterprise AI Operating System
-
- > Production-grade, security-first, automation-first platform covering the full DevOps, Cloud, Kubernetes, Security, AI/ML, FinOps, and Governance lifecycle.
-
- **156 files | 182KB | 13 domains | All production-ready**
-
- ## Architecture
-
- ```
-                 ENGINEERING COMMAND CENTER
-                            |
-      +----------+----------+----------+----------+
-      |          |          |          |          |
- RELIABILITY  SECURITY    FINOPS    PLATFORM    AI/ML
-  (SLO/PDB)  (GuardDuty)  (Cost)    (Golden   (RAG/SFT)
-      |          |          |        Path)       |
-      +----------+----+-----+----------+---------+
-                      |                |
-                 KUBERNETES        TERRAFORM
-                (Kustomize)      (IaC Modules)
-                      |                |
-               AWS CLOUD INFRASTRUCTURE
- ```
-
- ## OMEGA 10-Dimension Scorecard
-
- | # | Dimension | Score | Assets |
- |---|-----------|-------|--------|
- | 1 | **Reliability** | 8/10 | PDBs, SLOs, HPA, multi-AZ, Istio |
- | 2 | **Security** | 9/10 | GuardDuty, Macie, Falco, Kyverno, Trivy, mTLS |
- | 3 | **Dev Velocity** | 7/10 | Golden paths, self-service envs, Kustomize |
- | 4 | **Cost Efficiency** | 7/10 | FinOps scanner, spot instances, scheduling policy |
- | 5 | **Governance** | 8/10 | SOC2, NIST 800-53, CIS, OPA, ADR template |
- | 6 | **Automation** | 7/10 | Bootstrap, auto-remediation, GitOps (ArgoCD) |
- | 7 | **Incident Recovery** | 8/10 | Runbook, postmortem template, war-room |
- | 8 | **Standardization** | 8/10 | Kustomize overlays, golden path templates |
- | 9 | **AI Enablement** | 8/10 | RAG, LoRA v2, MLflow, Trackio, GPU scheduling |
- | 10 | **Engineering Excellence** | 7/10 | ADR template, checklists, SRE standards |
-
- ## Platform Modules
-
- ### Infrastructure (Terraform)
- | Module | Purpose | Key Feature |
- |--------|---------|-------------|
- | VPC | Network isolation | Flow logs, default deny SG/NACL |
- | EKS | Kubernetes cluster | Private API, KMS encryption, IRSA |
- | RDS | Database | Multi-AZ, encrypted, performance insights |
- | S3 | Storage | SSE-KMS, versioning, lifecycle |
- | IAM | Access control | MFA, least privilege, access analyzer |
- | KMS | Key management | Auto-rotation, multi-key |
- | GuardDuty | Threat detection | EBS malware scan, K8s audit, S3 |
- | Macie | PII detection | Automated data classification |
-
- ### Kubernetes
- | Layer | Components |
- |-------|-----------|
- | **Base** | Namespaces, RBAC, NetPols, Quotas, Limits, PDBs, SLOs |
- | **Platform** | ArgoCD, Istio (mTLS), ExternalSecrets, CertManager |
- | **Security** | Trivy Operator, Falco (eBPF), Kyverno (7 policies), OPA |
- | **Observability** | Prometheus, Grafana, Loki, Alertmanager, OTEL |
- | **Workloads** | Frontend, Backend (HPA), ML Pipeline (GPU) |
-
- ### FinOps Engine
- | Asset | Purpose |
- |-------|---------|
- | finops-policy.yaml | 11 cost optimization rules |
- | finops_scanner.py | Automated waste detection |
- | cost-optimization.yaml | Spot instance strategy + KEDA |
- | finops-cronjob.yaml | Daily cost scan CronJob |
-
- ### Platform Engineering
- | Asset | Purpose |
- |-------|---------|
- | golden-paths/microservice/ | Production-ready service template + checklist |
- | self-service/ | Ephemeral environment provisioning config |
- | adr/template.md | Architecture Decision Record template |
- | kustomize/ | Base + dev/staging/prod overlays |
-
- ### Incident Response
- | Asset | Purpose |
- |-------|---------|
- | auto-remediate.sh | OOM fix, pod restart, security escalation |
- | postmortem/template.md | Full postmortem with 5 Whys + action items |
- | incident-response.sh | Diagnostic runbook (5 incident types) |
-
- ### AI/ML Hub
- | Asset | Purpose |
- |-------|---------|
- | finetune.py | LoRA Without Regret (r=256, all-linear) |
- | run_finetune.py | CLI entry point with dataset selection |
- | TRAINING_RECIPE.md | v1→v2 upgrade documentation |
- | rag_pipeline.py | LangChain + HF + ChromaDB RAG |
- | mlflow/ | MLflow tracking deployment |
-
- ### Compliance
- | Framework | Coverage |
- |-----------|---------|
- | SOC2 Type II | CC6-CC9 controls mapped |
- | NIST 800-53 | 12 controls mapped |
- | CIS Benchmarks | EKS + K8s automated |
- | OPA Gatekeeper | Admission policies |
-
- ### CI/CD Pipelines
- | System | Features |
- |--------|----------|
- | GitHub Actions | 6-stage DevSecOps (SAST→Build→Scan→Test→Sign→Deploy) |
- | Jenkins | Parallel SAST + production deployment |
- | GitLab CI | Full scan + sign + deploy pipeline |
-
- ## Quick Start
-
- ```bash
- # Bootstrap full platform
- ./scripts/bash/bootstrap.sh prod
-
- # Security audit
- python3 scripts/python/security_audit.py
-
- # FinOps cost scan
- python3 finops/finops_scanner.py
-
- # Incident response
- ./scripts/bash/incident-response.sh security
-
- # Auto-remediate
- ./incident-response/auto-remediation/auto-remediate.sh PodCrashLooping backend <pod-name>
- ```
-
- ## Self-Improvement Checklist
-
- After every deployment, ask:
-
- - [ ] Can this be automated?
- - [ ] Can this be templated?
- - [ ] Can this be secured further?
- - [ ] Can this be cheaper?
- - [ ] Can this scale better?
- - [ ] Can this reduce human toil?
-
- If yes, enhance and push.
-
- ## Hub
-
- **[huggingface.co/shaikhsalman/devsecops-platform](https://huggingface.co/shaikhsalman/devsecops-platform)**
+ ---
+ license: apache-2.0
+ tags:
+ - devsecops
+ - llm
+ - sft
+ - lora
+ - tulu-3
+ - kubernetes
+ - terraform
+ ---
+
+ # DevSecOps Model Platform
+
+ > Train a secure model on the best data, then deploy it securely.
+
+ ## Start Here: Train Your Model
+
+ | Dataset | Size | What It Gives You | Command |
+ |---------|------|-------------------|---------|
+ | **tulu-3-sft-mixture** | 940K | Math, code, safety, chat (BEST) | `python model/train_tulu3.py` |
+ | **OpenThoughts-114k** | 114K | Reasoning, chain-of-thought | `python model/train_openthoughts.py` |
+
+ **allenai/tulu-3-sft-mixture** is the SFT mixture behind Allen AI's Tulu 3, the current SOTA open instruction-tuned model family. Proven on Llama-3.1-8B: MMLU 53.5, GSM8K 79.9, HumanEval 76.8.
+
+ The LoRA config follows LoRA Without Regret (Schulman, 2025): r=256, alpha=16, all-linear targets. It matches full fine-tuning at 67% of the compute.
+
+ ## Repository Structure
+
+ ```
+ model/                       THE MODEL - train, serve, enhance
+   train_tulu3.py             Primary: 940K best data (zero preprocessing)
+   train_openthoughts.py      Reasoning: 114K CoT traces
+   finetune_configurable.py   Multi-dataset configurable trainer
+   rag_pipeline.py            RAG for DevSecOps knowledge
+   DATASETS.md                Why these datasets, proven recipes
+
+ deployment/                  SERVE IT - Kubernetes + Docker + vLLM
+   deployment.yaml            ML inference K8s manifest
+   mlflow-deployment.yaml     Experiment tracking
+   Dockerfile.ml-inference    Hardened multi-stage image
+
+ security/                    PROTECT IT - scanning + policies
+   scanning/                  Trivy, Semgrep, Checkov, SBOM
+   policies/                  Kyverno, OPA Gatekeeper
+
+ infrastructure/              RUN IT - Terraform + monitoring + CI/CD
+   terraform/                 VPC, EKS, RDS, S3, IAM, KMS, GuardDuty, Macie
+   monitoring/                Prometheus, Alertmanager, OTEL, Grafana
+   ci-cd/                     GitHub Actions DevSecOps pipeline
+
+ compliance/                  CERTIFY IT - SOC2, NIST, CIS
+   controls-mapping.yaml      SOC2 Type II
+   nist-800-53-mapping.yaml   NIST 800-53 Rev5
+   cis-eks-k8s.yaml           CIS Benchmarks
+ ```
+
+ ## Quick Commands
+
+ ```bash
+ # Train on best data (A100, ~6h)
+ python model/train_tulu3.py
+
+ # Quick test (any GPU)
+ python model/train_tulu3.py --max_steps 100 --no_push
+
+ # Security scan
+ python security/scanning/security_audit.py
+
+ # Deploy model to K8s
+ kubectl apply -f deployment/deployment.yaml
+
+ # Infrastructure (Terraform)
+ cd infrastructure/terraform/environments/prod && terraform apply
+ ```
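
As context for the LoRA claim in the new README, here is a minimal sketch of the stated r=256 / alpha=16 / all-linear setup using Hugging Face `peft`. Those three values come from the README; the model id and dropout are illustrative assumptions, not the repo's actual `train_tulu3.py`.

```python
# Sketch only: the LoRA setup described in the README above, expressed as a
# peft LoraConfig. Values other than r, alpha, and target_modules are assumed.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=256,                        # high rank: SFT-scale data needs capacity
    lora_alpha=16,                # stable scaling at high rank
    target_modules="all-linear",  # adapt every linear layer, not just attention
    lora_dropout=0.0,             # assumption; not specified in the README
    task_type="CAUSAL_LM",
)

# Gated model: assumes Hugging Face access has been granted.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # adapters only, base weights frozen
```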
ai-ml/hf-finetuning/TRAINING_RECIPE.md DELETED
@@ -1,58 +0,0 @@
- # Model Enhancement — Dataset & Training Recipe vNext
-
- ## What Changed (v1 → v2)
-
- | Parameter | v1 (Old) | v2 (LoRA Without Regret) | Why |
- |-----------|----------|--------------------------|-----|
- | **Dataset** | ultrachat_200k (5K subset) | **tulu-3-sft-mixture** (940K) | 19 curated sources > single source |
- | **LoRA r** | 16 | **256** | SFT-scale datasets need r=256 to match full FT |
- | **LoRA alpha** | 32 | **16** | Stable scaling with high rank |
- | **Target modules** | q/k/v/o_proj only | **all-linear** | Attention-only underperforms even at higher rank |
- | **Effective batch** | 32 | **16** | LoRA less tolerant of large batches |
- | **Learning rate** | 2e-4 | **2e-4** (same) | 10x full FT rate — correct in v1 |
- | **Packing** | False | **True (bfd_split)** | Preserves all tokens, 2-3x throughput |
- | **assistant_only_loss** | False | **True** | Loss only on assistant tokens |
- | **EOS token** | Not set | **<\|eot_id\|>** | Llama 3.1 chat template |
- | **LR scheduler** | linear | **cosine** | Better convergence for LoRA |
- | **Epochs** | 3 | **1** | 940K examples = 1 epoch sufficient |
-
- ## Dataset Comparison
-
- | Dataset | Size | Format | Best For | Quality |
- |---------|------|--------|----------|---------|
- | **tulu-3-sft-mixture** | 940K | messages ✅ | General SFT (code, math, IF, safety, science) | ⭐⭐⭐⭐⭐ |
- | **OpenThoughts-114k** | 114K | conversations (needs conversion) | Reasoning, CoT traces | ⭐⭐⭐⭐ |
- | ultrachat_200k | 200K | messages ✅ | Multi-turn chat baseline | ⭐⭐⭐ |
-
- ## Key Research: "LoRA Without Regret" (Schulman et al., 2025)
-
- Four findings that change how we fine-tune:
-
- 1. **Target ALL linear layers** — not just attention. Increasing rank does NOT compensate for skipping layers.
- 2. **Use r=256 for SFT** — sufficient capacity for post-training scale datasets.
- 3. **Use 10x higher LR** (2e-4 vs 2e-5 for full FT) — 1/r scaling makes optimal LR rank-independent.
- 4. **Keep batch size < 32** — LoRA is less tolerant of large batches. Cannot be mitigated by increasing rank.
-
- ## Recommended Training Matrix
-
- ### SFT (Supervised Fine-Tuning)
-
- | Model | Dataset | Hardware | Time | Cost |
- |-------|---------|----------|------|------|
- | Llama-3.1-8B-Instruct | tulu-3-sft (940K) | A100 (80GB) | ~6h | ~$24 |
- | Llama-3.1-8B-Instruct | OpenThoughts-114k | A100 (80GB) | ~2h | ~$8 |
- | Llama-3.1-8B-Instruct | tulu-3-sft (940K) | A10G (24GB) + QLoRA | ~12h | ~$24 |
-
- ### GRPO (Reinforcement Learning)
-
- | Model | Dataset | LoRA r | Hardware |
- |-------|---------|--------|----------|
- | Qwen3-0.6B | OpenR1-Math-220k | 1 | A100 |
- | Llama-3.1-8B-Base | GSM8k | 1-32 | A100 |
-
- ## Source Attribution
-
- - LoRA Without Regret: Schulman et al., 2025, Thinking Machines Lab
- - tulu-3-sft-mixture: Allen AI, used by Tulu 3 (SOTA open instruction-tuned)
- - OpenThoughts-114k: open community, reasoning-heavy CoT data
- - LoRA Land: Predibase 2024, 224/310 LoRA models surpassed GPT-4
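
For context on the deleted recipe above, a minimal sketch of how the v2 hyperparameters map onto TRL + PEFT, assuming recent versions of both libraries (`assistant_only_loss` in particular requires a recent TRL). The hyperparameter values are from the v1→v2 table; the output path, batch split, and other wiring are assumptions, not the repo's `finetune.py`:

```python
# Sketch of the v2 recipe (LoRA Without Regret) in TRL + PEFT.
# Hyperparameters come from the recipe table; everything else is illustrative.
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("allenai/tulu-3-sft-mixture", split="train")

peft_config = LoraConfig(
    r=256, lora_alpha=16, target_modules="all-linear", task_type="CAUSAL_LM",
)

args = SFTConfig(
    output_dir="llama31-8b-tulu3-lora",  # assumption
    learning_rate=2e-4,                  # 10x the full-FT rate
    lr_scheduler_type="cosine",
    num_train_epochs=1,                  # 940K examples: one pass
    per_device_train_batch_size=4,       # assumption; effective batch 16
    gradient_accumulation_steps=4,       #   via 4 x 4 accumulation
    packing=True,                        # recipe's BFD-style packing
    assistant_only_loss=True,            # loss on assistant tokens only
)

trainer = SFTTrainer(
    model="meta-llama/Llama-3.1-8B-Instruct",  # gated; assumes HF access
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```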
ai-ml/hf-finetuning/run_finetune.py DELETED
@@ -1,67 +0,0 @@
- # =============================================================================
- # SFT Fine-Tuning — CLI Entry Point (LoRA Without Regret config)
- # =============================================================================
- # Usage:
- #   # Default: tulu-3-sft + Llama-3.1-8B
- #   python run_finetune.py
- #
- #   # OpenThoughts reasoning dataset
- #   python run_finetune.py --dataset_key openthoughts-114k
- #
- #   # Ultrachat fallback
- #   python run_finetune.py --dataset_key ultrachat-200k
- #
- #   # Custom hub model ID
- #   python run_finetune.py --hub_model_id my-org/my-model-v2
- # =============================================================================
-
- import argparse
- import sys
-
- from finetune import FinetuneConfig, finetune, DATASET_REGISTRY
-
-
- def main():
-     parser = argparse.ArgumentParser(description="SFT Fine-Tuning (LoRA Without Regret)")
-     parser.add_argument("--dataset_key", default="tulu-3-sft",
-                         choices=list(DATASET_REGISTRY.keys()),
-                         help="Dataset to train on")
-     parser.add_argument("--hub_model_id", default=None,
-                         help="HuggingFace Hub model ID for push")
-     parser.add_argument("--num_train_epochs", type=int, default=None)
-     parser.add_argument("--learning_rate", type=float, default=None)
-     parser.add_argument("--lora_r", type=int, default=None)
-     parser.add_argument("--per_device_train_batch_size", type=int, default=None)
-     parser.add_argument("--max_seq_length", type=int, default=None)
-
-     args = parser.parse_args()
-
-     config = FinetuneConfig()
-     if args.dataset_key:
-         config.dataset_key = args.dataset_key
-     if args.hub_model_id:
-         config.hub_model_id = args.hub_model_id
-     if args.num_train_epochs:
-         config.num_train_epochs = args.num_train_epochs
-     if args.learning_rate:
-         config.learning_rate = args.learning_rate
-     if args.lora_r:
-         config.lora_r = args.lora_r
-     if args.per_device_train_batch_size:
-         config.per_device_train_batch_size = args.per_device_train_batch_size
-     if args.max_seq_length:
-         config.max_seq_length = args.max_seq_length
-
-     print(f"Config: model={config.model_name}")
-     print(f"  dataset={config.dataset_key}")
-     print(f"  lora_r={config.lora_r}, lora_alpha={config.lora_alpha}")
-     print(f"  target_modules={config.target_modules}")
-     print(f"  lr={config.learning_rate}, epochs={config.num_train_epochs}")
-     print(f"  effective_batch={config.per_device_train_batch_size * config.gradient_accumulation_steps}")
-     print(f"  packing={config.packing}, strategy={config.packing_strategy}")
-     print(f"  assistant_only_loss={config.assistant_only_loss}")
-
-     finetune(config)
-
-
- if __name__ == "__main__":
-     main()
ci-cd/gitlab-ci/.gitlab-ci.yml DELETED
@@ -1,113 +0,0 @@
- # =============================================================================
- # GitLab CI — DevSecOps Pipeline
- # =============================================================================
-
- stages:
-   - sast
-   - build
-   - scan
-   - test
-   - sign
-   - deploy
-
- variables:
-   REGISTRY: ecr.aws/devsecops
-   TRIVY_SEVERITY: "CRITICAL,HIGH"
-
- # --- SAST Stage ---
- semgrep:
-   stage: sast
-   image: semgrep/semgrep:latest
-   script:
-     - semgrep --config auto --json --output semgrep.json .
-   artifacts:
-     paths:
-       - semgrep.json
-
- secret-scan:
-   stage: sast
-   image: aquasec/trivy:latest
-   script:
-     - trivy fs --scanners secret --exit-code 1 .
-
- checkov:
-   stage: sast
-   image: bridgecrew/checkov:latest
-   script:
-     - checkov -d terraform/ --output cli
-
- # --- Build Stage ---
- build:
-   stage: build
-   image: docker:24
-   services:
-     - docker:24-dind
-   before_script:
-     - aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY
-   script:
-     - |
-       docker build \
-         --build-arg BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ) \
-         -t $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA \
-         -t $REGISTRY/$CI_PROJECT_NAME:latest .
-     - docker push $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
-
- # --- Scan Stage ---
- trivy-scan:
-   stage: scan
-   image: aquasec/trivy:latest
-   needs: [build]
-   script:
-     - trivy image --severity $TRIVY_SEVERITY --exit-code 1 --ignore-unfixed $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
-
- generate-sbom:
-   stage: scan
-   image: anchore/syft:latest
-   needs: [build]
-   script:
-     - syft $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -o spdx-json > sbom.spdx.json
-   artifacts:
-     paths:
-       - sbom.spdx.json
-
- # --- Test Stage ---
- integration-test:
-   stage: test
-   image: docker:24
-   services:
-     - docker:24-dind
-   script:
-     - docker compose -f docker-compose.test.yml up --abort-on-container-exit
-
- # --- Sign Stage ---
- sign:
-   stage: sign
-   image: bitnami/cosign:latest
-   needs: [build, trivy-scan, generate-sbom]
-   variables:
-     COSIGN_EXPERIMENTAL: "1"
-   script:
-     - cosign sign --yes $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
-     - cosign attest --yes --predicate sbom.spdx.json --type spdxjson $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
-
- # --- Deploy Stage ---
- deploy-staging:
-   stage: deploy
-   image: bitnami/kubectl:latest
-   needs: [sign, integration-test]
-   environment:
-     name: staging
-   script:
-     - kubectl set image deployment/$CI_PROJECT_NAME $CI_PROJECT_NAME=$REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -n staging
-     - kubectl rollout status deployment/$CI_PROJECT_NAME -n staging --timeout=300s
-
- deploy-prod:
-   stage: deploy
-   image: bitnami/kubectl:latest
-   needs: [deploy-staging]
-   environment:
-     name: production
-   when: manual
-   script:
-     - kubectl set image deployment/$CI_PROJECT_NAME $CI_PROJECT_NAME=$REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -n production
-     - kubectl rollout status deployment/$CI_PROJECT_NAME -n production --timeout=300s
ci-cd/jenkins/Jenkinsfile DELETED
@@ -1,136 +0,0 @@
- // =============================================================================
- // Jenkinsfile — Shared DevSecOps Pipeline
- // =============================================================================
-
- pipeline {
-     agent { label 'docker' }
-
-     environment {
-         REGISTRY = 'ecr.aws/devsecops'
-         IMAGE_NAME = "${env.JOB_NAME.split('/').last()}"
-         IMAGE_TAG = "${env.GIT_COMMIT.take(12)}"
-         TRIVY_SEVERITY = 'CRITICAL,HIGH'
-     }
-
-     stages {
-         // ----- Stage 1: SAST -----
-         stage('SAST') {
-             parallel {
-                 stage('Semgrep') {
-                     steps {
-                         sh 'semgrep --config auto --json --output semgrep.json .'
-                     }
-                 }
-                 stage('Secret Scan') {
-                     steps {
-                         sh 'trivy fs --scanners secret --exit-code 1 .'
-                     }
-                 }
-                 stage('IaC Scan') {
-                     steps {
-                         sh 'checkov -d terraform/ --output cli --soft-fail false'
-                     }
-                 }
-             }
-         }
-
-         // ----- Stage 2: Build -----
-         stage('Build') {
-             steps {
-                 script {
-                     docker.withRegistry("https://${REGISTRY}", 'ecr:us-east-1') {
-                         def app = docker.build(
-                             "${IMAGE_NAME}:${IMAGE_TAG}",
-                             '--build-arg BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ) .'
-                         )
-                         app.push()
-                         app.push('latest')
-                     }
-                 }
-             }
-         }
-
-         // ----- Stage 3: Container Scan -----
-         stage('Security Scan') {
-             steps {
-                 sh """
-                     trivy image \
-                         --severity ${TRIVY_SEVERITY} \
-                         --exit-code 1 \
-                         --ignore-unfixed \
-                         ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
-                 """
-                 // Generate SBOM
-                 sh """
-                     syft ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
-                         -o cyclonedx-json > sbom.cyclonedx.json
-                 """
-             }
-         }
-
-         // ----- Stage 4: Test -----
-         stage('Integration Test') {
-             steps {
-                 sh 'docker compose -f docker-compose.test.yml up --abort-on-container-exit'
-             }
-         }
-
-         // ----- Stage 5: Sign -----
-         stage('Sign & Attest') {
-             steps {
-                 sh """
-                     cosign sign --yes \
-                         ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
-                     cosign attest --yes \
-                         --predicate sbom.cyclonedx.json \
-                         --type cyclonedx \
-                         ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
-                 """
-             }
-         }
-
-         // ----- Stage 6: Deploy -----
-         stage('Deploy Staging') {
-             steps {
-                 sh """
-                     kubectl set image deployment/${IMAGE_NAME} \
-                         ${IMAGE_NAME}=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
-                         -n staging
-                 """
-                 // Wait for rollout
-                 sh 'kubectl rollout status deployment/${IMAGE_NAME} -n staging --timeout=300s'
-             }
-         }
-
-         stage('Deploy Production') {
-             when {
-                 branch 'main'
-             }
-             input {
-                 message "Deploy ${IMAGE_NAME}:${IMAGE_TAG} to production?"
-             }
-             steps {
-                 sh """
-                     kubectl set image deployment/${IMAGE_NAME} \
-                         ${IMAGE_NAME}=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
-                         -n production
-                 """
-                 sh 'kubectl rollout status deployment/${IMAGE_NAME} -n production --timeout=300s'
-             }
-         }
-     }
-
-     post {
-         always {
-             archiveArtifacts artifacts: 'semgrep.json, sbom.cyclonedx.json', allowEmptyArchive: true
-             recordIssues(tools: [semgrep(pattern: 'semgrep.json')])
-         }
-         failure {
-             slackSend(
-                 channel: '#platform-alerts',
-                 color: 'danger',
-                 message: "FAILED: ${env.JOB_NAME} #${env.BUILD_NUMBER}"
-             )
-         }
-     }
- }
compliance/{cis-benchmarks/cis-eks-k8s.yaml → cis-eks-k8s.yaml} RENAMED
File without changes
compliance/{soc2/controls-mapping.yaml → controls-mapping.yaml} RENAMED
File without changes
compliance/{nist/nist-800-53-mapping.yaml → nist-800-53-mapping.yaml} RENAMED
File without changes
{docker/base-images → deployment}/Dockerfile.ml-inference RENAMED
File without changes
{k8s/workloads/ml-pipeline → deployment}/deployment.yaml RENAMED
File without changes
{ai-ml/mlflow → deployment}/mlflow-deployment.yaml RENAMED
File without changes
docker/base-images/Dockerfile.backend DELETED
@@ -1,51 +0,0 @@
- # =============================================================================
- # Multi-Stage Hardened Dockerfile — Python Backend
- # =============================================================================
- # Security Features:
- #   - Multi-stage build (build → runtime)
- #   - Non-root user
- #   - Minimal base (distroless)
- #   - Pinned versions
- #   - No shell in runtime image
- #   - Health check
- # =============================================================================
-
- # --- Build Stage ---
- FROM python:3.12-slim AS builder
-
- WORKDIR /build
-
- # Pin pip and install dependencies
- COPY requirements.txt .
- RUN pip install --no-cache-dir --require-hashes -r requirements.txt
-
- # Copy application
- COPY src/ /build/src/
- COPY pyproject.toml /build/
-
- # Build wheel
- RUN pip wheel --no-cache-dir --no-deps -w /build/wheels .
-
- # --- Runtime Stage ---
- FROM gcr.io/distroless/python3-debian12:nonroot AS runtime
-
- # Copy wheels from builder
- COPY --from=builder /build/wheels /app/wheels/
- COPY --from=builder /build/src/ /app/src/
-
- # Set environment
- ENV PYTHONUNBUFFERED=1 \
-     PYTHONDONTWRITEBYTECODE=1 \
-     PATH="/app/.local/bin:${PATH}"
-
- WORKDIR /app
-
- # Run as non-root (distroless nonroot image UID 65532)
- USER 65532:65532
-
- EXPOSE 8080
-
- HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
-     CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/healthz')"]
-
- ENTRYPOINT ["python", "-m", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8080"]
docker/base-images/Dockerfile.frontend DELETED
@@ -1,33 +0,0 @@
- # =============================================================================
- # Multi-Stage Hardened Dockerfile — React Frontend
- # =============================================================================
-
- # --- Build Stage ---
- FROM node:20-alpine AS builder
-
- WORKDIR /app
-
- # Pin package versions with lockfile
- COPY package.json package-lock.json ./
- RUN npm ci --ignore-scripts
-
- COPY . .
- RUN npm run build
-
- # --- Runtime Stage ---
- FROM nginxinc/nginx-unprivileged:1.25-alpine AS runtime
-
- # Remove default nginx configs
- RUN rm -f /etc/nginx/conf.d/default.conf
-
- # Copy custom nginx config (security headers)
- COPY docker/nginx.conf /etc/nginx/conf.d/
- COPY --from=builder /app/dist /usr/share/nginx/html
-
- # Security headers are in nginx.conf
- EXPOSE 8080
-
- USER 101:101
-
- HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
-     CMD ["curl", "-f", "http://localhost:8080/healthz"]
finops/cost-optimization.yaml DELETED
@@ -1,73 +0,0 @@
- # =============================================================================
- # FinOps Engine — Cloud Cost Governance
- # =============================================================================
- # Addresses: cost waste, rightsizing, scheduling, unit economics
- # =============================================================================
-
- # --- Spot Instance Strategy ---
- # Use SPOT for ML training workloads (70-90% cost savings)
- # Use ON_DEMAND for production services (no interruption risk)
-
- apiVersion: apps/v1
- kind: Deployment
- metadata:
-   name: ml-training-spot
-   namespace: ml-pipeline
-   labels:
-     app: ml-training-spot
-     finops: spot-instance
- spec:
-   replicas: 0  # Scale up on demand via KEDA
-   selector:
-     matchLabels:
-       app: ml-training-spot
-   template:
-     metadata:
-       labels:
-         app: ml-training-spot
-         finops: spot-instance
-     spec:
-       containers:
-         - name: trainer
-           image: "ecr.aws/devsecops/ml-train:v1.0.0"
-           resources:
-             requests:
-               cpu: "4"
-               memory: 16Gi
-               nvidia.com/gpu: "1"
-             limits:
-               cpu: "8"
-               memory: 32Gi
-               nvidia.com/gpu: "1"
-       tolerations:
-         - key: nvidia.com/gpu
-           operator: Exists
-           effect: NoSchedule
-       nodeSelector:
-         workload: ml-spot
-       # Allow eviction for spot reclamation
-       terminationGracePeriodSeconds: 120
- ---
- # --- KEDA Scaler — Scale ML training on queue depth ---
- apiVersion: keda.sh/v1alpha1
- kind: ScaledJob
- metadata:
-   name: ml-training-scaler
-   namespace: ml-pipeline
- spec:
-   minReplicaCount: 0
-   maxReplicaCount: 4
-   pollingInterval: 30
-   triggers:
-     - type: aws-sqs
-       metadata:
-         queueURL: https://sqs.us-east-1.amazonaws.com/123456789012/ml-training-queue
-         queueLength: "1"
-   jobTemplate:
-     spec:
-       template:
-         spec:
-           restartPolicy: Never
-           containers:
-             - name: trainer
-               image: "ecr.aws/devsecops/ml-train:v1.0.0"
finops/finops-cronjob.yaml DELETED
@@ -1,23 +0,0 @@
- # FinOps Daily Cost Scanner
- apiVersion: batch/v1
- kind: CronJob
- metadata:
-   name: finops-scanner
-   namespace: platform-system
- spec:
-   schedule: "0 6 * * 1-5"  # 6am weekdays
-   jobTemplate:
-     spec:
-       template:
-         spec:
-           serviceAccountName: finops-scanner
-           containers:
-             - name: scanner
-               image: "ecr.aws/devsecops/finops-scanner:latest"
-               command: ["python3", "finops_scanner.py"]
-               env:
-                 - name: AWS_REGION
-                   value: "us-east-1"
-           restartPolicy: Never
-   concurrencyPolicy: Forbid
-   successfulJobsHistoryLimit: 7
finops/finops-policy.yaml DELETED
@@ -1,73 +0,0 @@
- # =============================================================================
- # FinOps Policy — Cloud Waste Detection & Rightsizing Rules
- # =============================================================================
-
- policies:
-   # --- Unused Resources ---
-   - id: FINOPS-001
-     name: "Detect unused EBS volumes"
-     severity: medium
-     check: "aws ec2 describe-volumes --filters Name=status,Values=available"
-     action: "Create snapshot, delete volume after 7 days"
-     estimated_savings: "$50-200/month per volume"
-
-   - id: FINOPS-002
-     name: "Detect idle RDS instances"
-     severity: medium
-     check: "Connection count < 5 for 7 days"
-     action: "Downsize instance class or stop non-prod"
-     estimated_savings: "30-50% of RDS cost"
-
-   - id: FINOPS-003
-     name: "Detect unattached EIPs"
-     severity: low
-     check: "aws ec2 describe-addresses --filter Name=association-id,Values=''"
-     action: "Release EIP"
-     estimated_savings: "$3.60/month per EIP"
-
-   # --- Rightsizing ---
-   - id: FINOPS-010
-     name: "EC2 rightsizing recommendations"
-     severity: medium
-     check: "CPU < 15% for 14 days OR Memory < 25% for 14 days"
-     action: "Downsize to next tier (e.g., m6i.xlarge -> m6i.large)"
-     estimated_savings: "20-40% per instance"
-
-   - id: FINOPS-011
-     name: "Over-provisioned K8s workloads"
-     severity: medium
-     check: "Container CPU request > 2x actual P95 usage"
-     action: "Reduce requests to P95 + 20% headroom"
-     estimated_savings: "30-50% of cluster cost"
-
-   # --- Scheduling ---
-   - id: FINOPS-020
-     name: "Non-prod environment scheduling"
-     severity: high
-     check: "Dev/staging workloads running 24/7"
-     action: "Scale to 0 outside business hours (Mon-Fri 8am-8pm)"
-     estimated_savings: "65% of non-prod compute"
-
-   # --- Reserved Instances Coverage ---
-   - id: FINOPS-030
-     name: "RI coverage below 70%"
-     severity: high
-     check: "RI coverage < 70% for consistent workloads"
-     action: "Purchase RIs for EKS node groups + RDS"
-     estimated_savings: "30-40% vs on-demand"
-
-   # --- Storage Tiering ---
-   - id: FINOPS-040
-     name: "S3 intelligent tiering"
-     severity: medium
-     check: "S3 objects > 90 days in STANDARD"
-     action: "Enable S3 Intelligent-Tiering on all buckets"
-     estimated_savings: "40-60% on infrequent access data"
-
-   # --- GPU Utilization ---
-   - id: FINOPS-050
-     name: "Underutilized GPU nodes"
-     severity: critical
-     check: "GPU utilization < 30% for 4 hours"
-     action: "Scale down GPU node group or use KEDA for demand-based scaling"
-     estimated_savings: "$2-6/hour per GPU"
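
FINOPS-011's rule ("reduce requests to P95 + 20% headroom") is simple arithmetic. A minimal sketch of how a scanner like finops_scanner.py might apply it; the function and field names here are illustrative, not the repo's actual scanner API:

```python
# Hypothetical helper: rightsize a container CPU request from observed usage.
# Implements the FINOPS-011 rule above: flag when request > 2x P95 usage,
# and recommend P95 + 20% headroom.
def rightsize_cpu(request_millicores: float, p95_usage_millicores: float) -> dict:
    overprovisioned = request_millicores > 2 * p95_usage_millicores
    recommended = round(p95_usage_millicores * 1.2)  # P95 + 20% headroom
    return {
        "overprovisioned": overprovisioned,
        "recommended_request_m": recommended,
        "savings_m": max(0, request_millicores - recommended),
    }

# Example: a container requesting 1000m but using 300m at P95 is flagged,
# with a recommended request of 360m (saving 640m per replica).
print(rightsize_cpu(1000, 300))
```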
incident-response/auto-remediation/auto-remediate.sh DELETED
@@ -1,50 +0,0 @@
- #!/usr/bin/env bash
- # =============================================================================
- # Autonomous Incident Remediation Engine
- # =============================================================================
- # Triggered by Alertmanager webhook. Auto-remediates known patterns.
- # =============================================================================
-
- set -euo pipefail
-
- ALERT_NAME="${1:-unknown}"
- NAMESPACE="${2:-default}"
- POD_NAME="${3:-}"
-
- log() { echo "[$(date +%H:%M:%S)] [REMEDIATE] $*"; }
-
- case "${ALERT_NAME}" in
-   PodCrashLooping)
-     log "Remediating crash-looping pod: ${NAMESPACE}/${POD_NAME}"
-     # Check if OOM killed
-     OOM_COUNT=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o json | jq -r '.status.containerStatuses[0].lastState.terminated.reason // empty' | grep -c OOMKilled || true)
-     if [[ "${OOM_COUNT}" -gt 0 ]]; then
-       log "OOM detected - increasing memory limit"
-       kubectl patch deployment "${POD_NAME%-*}" -n "${NAMESPACE}" -p '{"spec":{"template":{"spec":{"containers":[{"name":"app","resources":{"limits":{"memory":"2Gi"}}}]}}}}'
-       log "Memory limit increased to 2Gi"
-     else
-       log "Non-OOM crash - restarting pod"
-       kubectl delete pod "${POD_NAME}" -n "${NAMESPACE}" --grace-period=30
-     fi
-     ;;
-
-   HighMemoryUsage)
-     log "Node memory pressure detected"
-     # Evict lowest-priority pods
-     kubectl get pods -A --sort-by=.spec.priority --field-selector=status.phase=Running | tail -5 | while read ns pod rest; do
-       log "Considering eviction: ${ns}/${pod}"
-     done
-     ;;
-
-   FalcoRuntimeAlert)
-     log "Runtime security alert - do NOT auto-remediate"
-     log "Escalate to security team: #security-alerts"
-     # Only notify - never auto-remediate security alerts
-     ;;
-
-   *)
-     log "Unknown alert pattern: ${ALERT_NAME}"
-     log "Manual investigation required"
-     exit 1
-     ;;
- esac
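
The script's header says it is triggered by an Alertmanager webhook. For context, a minimal stdlib-only sketch of such a receiver: it parses Alertmanager's standard webhook payload and shells out to auto-remediate.sh with the alertname, namespace, and pod labels. The port, script path, and label keys are assumptions; the repo's actual webhook glue is not shown in this diff.

```python
# Hypothetical Alertmanager webhook receiver that invokes auto-remediate.sh.
# Alertmanager POSTs JSON shaped like:
#   {"alerts": [{"labels": {"alertname": ..., "namespace": ..., "pod": ...}}]}
import json
import subprocess
from http.server import BaseHTTPRequestHandler, HTTPServer

class RemediationHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        body = self.rfile.read(int(self.headers.get("Content-Length", 0)))
        payload = json.loads(body)
        for alert in payload.get("alerts", []):
            labels = alert.get("labels", {})
            subprocess.run([
                "./auto-remediate.sh",           # path is an assumption
                labels.get("alertname", "unknown"),
                labels.get("namespace", "default"),
                labels.get("pod", ""),
            ], check=False)  # the script itself decides what is remediable
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 9099), RemediationHandler).serve_forever()
```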
platform/adr/template.md → infrastructure/adr-template.md RENAMED
File without changes
{ci-cd/github-actions → infrastructure/ci-cd}/devsecops-pipeline.yml RENAMED
File without changes
{finops → infrastructure}/finops_scanner.py RENAMED
File without changes
{scripts/bash → infrastructure}/incident-response.sh RENAMED
File without changes
{monitoring → infrastructure/monitoring}/alertmanager/alertmanager-config.yaml RENAMED
File without changes
monitoring/grafana/dashboards/platform-overview.json → infrastructure/monitoring/grafana-platform-overview.json RENAMED
File without changes
{monitoring → infrastructure/monitoring}/otel/otel-collector.yaml RENAMED
File without changes
{monitoring → infrastructure/monitoring}/prometheus/alerts.yaml RENAMED
File without changes
incident-response/postmortem/template.md → infrastructure/postmortem-template.md RENAMED
File without changes
{terraform → infrastructure/terraform}/environments/prod/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/outputs.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/guardduty/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/guardduty/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/iam/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/kms/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/macie/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/rds/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/rds/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/s3/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/s3/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/outputs.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/variables.tf RENAMED
File without changes
k8s/base/limit-ranges/limit-ranges.yaml DELETED
@@ -1,74 +0,0 @@
- # =============================================================================
- # Limit Ranges — Default Resource Requests/Limits Per Container
- # =============================================================================
-
- apiVersion: v1
- kind: LimitRange
- metadata:
-   name: default-limits
-   namespace: frontend
- spec:
-   limits:
-     - type: Container
-       default:
-         cpu: 500m
-         memory: 256Mi
-       defaultRequest:
-         cpu: 100m
-         memory: 128Mi
-       max:
-         cpu: "2"
-         memory: 2Gi
-       min:
-         cpu: 50m
-         memory: 64Mi
-       maxLimitRequestRatio:
-         cpu: "4"
-         memory: "4"
- ---
- apiVersion: v1
- kind: LimitRange
- metadata:
-   name: default-limits
-   namespace: backend
- spec:
-   limits:
-     - type: Container
-       default:
-         cpu: "1"
-         memory: 512Mi
-       defaultRequest:
-         cpu: 200m
-         memory: 256Mi
-       max:
-         cpu: "4"
-         memory: 4Gi
-       min:
-         cpu: 100m
-         memory: 128Mi
-       maxLimitRequestRatio:
-         cpu: "4"
-         memory: "4"
- ---
- apiVersion: v1
- kind: LimitRange
- metadata:
-   name: default-limits
-   namespace: ml-pipeline
- spec:
-   limits:
-     - type: Container
-       default:
-         cpu: "2"
-         memory: 4Gi
-         nvidia.com/gpu: "1"
-       defaultRequest:
-         cpu: 500m
-         memory: 1Gi
-       max:
-         cpu: "8"
-         memory: 16Gi
-         nvidia.com/gpu: "2"
-       min:
-         cpu: 200m
-         memory: 512Mi
k8s/base/namespaces/namespaces.yaml DELETED
@@ -1,69 +0,0 @@
- # =============================================================================
- # Namespace Definitions — Security-First Multi-Tenant Layout
- # =============================================================================
- # Each namespace gets:
- #   - Labels for network policy targeting
- #   - Resource quotas
- #   - Limit ranges
- #   - Pod security standards via labels (Kyverno enforces)
-
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: platform-system
-   labels:
-     pod-security.kubernetes.io/enforce: "privileged"
-     pod-security.kubernetes.io/audit: "privileged"
-     pod-security.kubernetes.io/warn: "privileged"
-     platform: "true"
- ---
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: monitoring
-   labels:
-     pod-security.kubernetes.io/enforce: "restricted"
-     pod-security.kubernetes.io/audit: "restricted"
-     pod-security.kubernetes.io/warn: "restricted"
-     platform: "true"
- ---
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: security
-   labels:
-     pod-security.kubernetes.io/enforce: "restricted"
-     pod-security.kubernetes.io/audit: "restricted"
-     pod-security.kubernetes.io/warn: "restricted"
-     platform: "true"
- ---
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: frontend
-   labels:
-     pod-security.kubernetes.io/enforce: "restricted"
-     pod-security.kubernetes.io/audit: "restricted"
-     pod-security.kubernetes.io/warn: "restricted"
-     app-team: "frontend"
- ---
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: backend
-   labels:
-     pod-security.kubernetes.io/enforce: "restricted"
-     pod-security.kubernetes.io/audit: "restricted"
-     pod-security.kubernetes.io/warn: "restricted"
-     app-team: "backend"
- ---
- apiVersion: v1
- kind: Namespace
- metadata:
-   name: ml-pipeline
-   labels:
-     pod-security.kubernetes.io/enforce: "baseline"
-     pod-security.kubernetes.io/audit: "restricted"
-     pod-security.kubernetes.io/warn: "restricted"
-     app-team: "ml"
-     nvidia.com/gpu: "true"
k8s/base/network-policies/network-policies.yaml DELETED
@@ -1,124 +0,0 @@
- # =============================================================================
- # Network Policies — Zero Trust Default Deny + Selective Allow
- # =============================================================================
- # Strategy: Default deny all ingress/egress, then allow only known traffic
-
- # --- Default Deny All Ingress in Every Namespace ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: default-deny-ingress
-   namespace: frontend
- spec:
-   podSelector: {}  # Matches all pods
-   policyTypes:
-     - Ingress
- ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: default-deny-ingress
-   namespace: backend
- spec:
-   podSelector: {}
-   policyTypes:
-     - Ingress
- ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: default-deny-ingress
-   namespace: ml-pipeline
- spec:
-   podSelector: {}
-   policyTypes:
-     - Ingress
- ---
- # --- Frontend: Allow ingress from Istio ingress gateway only ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: allow-istio-ingress
-   namespace: frontend
- spec:
-   podSelector:
-     matchLabels:
-       app: frontend
-   policyTypes:
-     - Ingress
-   ingress:
-     - from:
-         - namespaceSelector:
-             matchLabels:
-               name: istio-system
-         - podSelector:
-             matchLabels:
-               istio: ingressgateway
-       ports:
-         - port: 8080
-           protocol: TCP
- ---
- # --- Backend: Allow ingress from frontend namespace only ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: allow-from-frontend
-   namespace: backend
- spec:
-   podSelector:
-     matchLabels:
-       app: backend
-   policyTypes:
-     - Ingress
-     - Egress
-   ingress:
-     - from:
-         - namespaceSelector:
-             matchLabels:
-               app-team: frontend
-       ports:
-         - port: 8080
-           protocol: TCP
-   egress:
-     # Allow DNS
-     - to: []
-       ports:
-         - port: 53
-           protocol: UDP
-         - port: 53
-           protocol: TCP
-     # Allow RDS
-     - to: []
-       ports:
-         - port: 5432
-           protocol: TCP
- ---
- # --- ML Pipeline: Allow from backend + Istio ---
- apiVersion: networking.k8s.io/v1
- kind: NetworkPolicy
- metadata:
-   name: allow-ml-traffic
-   namespace: ml-pipeline
- spec:
-   podSelector: {}
-   policyTypes:
-     - Ingress
-     - Egress
-   ingress:
-     - from:
-         - namespaceSelector:
-             matchLabels:
-               app-team: backend
-     - from:
-         - namespaceSelector:
-             matchLabels:
-               name: istio-system
-   egress:
-     - to: []
-       ports:
-         - port: 53
-           protocol: UDP
-     - to: []
-       ports:
-         - port: 443
-           protocol: TCP  # HuggingFace Hub, S3, etc.
k8s/base/pdbs/pdbs.yaml DELETED
@@ -1,62 +0,0 @@
- # =============================================================================
- # Pod Disruption Budgets — Availability Guarantees Per Workload
- # =============================================================================
- # PDBs prevent voluntary evictions (upgrades, drain) from taking down too many pods.
- # Without PDBs: kubectl drain or cluster-autoscaler can cause unplanned outages.
- # =============================================================================
-
- apiVersion: policy/v1
- kind: PodDisruptionBudget
- metadata:
-   name: frontend-pdb
-   namespace: frontend
- spec:
-   minAvailable: 2  # At least 2 pods always running (3 replicas total)
-   selector:
-     matchLabels:
-       app: frontend
- ---
- apiVersion: policy/v1
- kind: PodDisruptionBudget
- metadata:
-   name: backend-pdb
-   namespace: backend
- spec:
-   minAvailable: 2
-   selector:
-     matchLabels:
-       app: backend
- ---
- apiVersion: policy/v1
- kind: PodDisruptionBudget
- metadata:
-   name: ml-inference-pdb
-   namespace: ml-pipeline
- spec:
-   maxUnavailable: 1  # At most 1 pod disrupted at a time
-   selector:
-     matchLabels:
-       app: ml-inference
- ---
- # Platform services — always keep 1 available
- apiVersion: policy/v1
- kind: PodDisruptionBudget
- metadata:
-   name: prometheus-pdb
-   namespace: monitoring
- spec:
-   minAvailable: 1
-   selector:
-     matchLabels:
-       app: kube-prometheus-stack-prometheus
- ---
- apiVersion: policy/v1
- kind: PodDisruptionBudget
- metadata:
-   name: argocd-pdb
-   namespace: platform-system
- spec:
-   minAvailable: 1
-   selector:
-     matchLabels:
-       app.kubernetes.io/name: argocd
k8s/base/rbac/rbac.yaml DELETED
@@ -1,78 +0,0 @@
- # =============================================================================
- # RBAC — Least-Privilege Access Control
- # =============================================================================
-
- # Platform Admins — Full cluster access
- apiVersion: rbac.authorization.k8s.io/v1
- kind: ClusterRole
- metadata:
-   name: platform-admin
- rules:
-   - apiGroups: ["*"]
-     resources: ["*"]
-     verbs: ["*"]
-   # Exclude secrets CRUD for audit trail — use ExternalSecrets instead
-   - apiGroups: [""]
-     resources: ["secrets"]
-     verbs: ["get", "list", "watch"]  # No create/update/delete
- ---
- apiVersion: rbac.authorization.k8s.io/v1
- kind: ClusterRoleBinding
- metadata:
-   name: platform-admin
- roleRef:
-   apiGroup: rbac.authorization.k8s.io
-   kind: ClusterRole
-   name: platform-admin
- subjects:
-   - kind: Group
-     name: platform-admins
-     apiGroup: rbac.authorization.k8s.io
- ---
- # Developer — Read + Pod Exec + Logs within their namespaces
- apiVersion: rbac.authorization.k8s.io/v1
- kind: ClusterRole
- metadata:
-   name: developer
- rules:
-   - apiGroups: ["", "apps", "batch", "extensions"]
-     resources: ["pods", "pods/log", "pods/exec", "deployments", "statefulsets", "jobs", "cronjobs"]
-     verbs: ["get", "list", "watch"]
-   - apiGroups: [""]
-     resources: ["pods/exec"]
-     verbs: ["create"]
-   - apiGroups: ["", "apps"]
-     resources: ["deployments", "statefulsets"]
-     verbs: ["patch"]  # For restart rollout only
-   - apiGroups: ["metrics.k8s.io"]
-     resources: ["pods", "nodes"]
-     verbs: ["get", "list"]
- ---
- # Viewer — Read-only cluster-wide
- apiVersion: rbac.authorization.k8s.io/v1
- kind: ClusterRole
- metadata:
-   name: viewer
- rules:
-   - apiGroups: ["", "apps", "batch", "extensions", "networking.k8s.io"]
-     resources: ["*"]
-     verbs: ["get", "list", "watch"]
-   - nonResourceURLs: ["*"]
-     verbs: ["get"]
- ---
- # ML Engineer — Access to ml-pipeline namespace only
- apiVersion: rbac.authorization.k8s.io/v1
- kind: Role
- metadata:
-   name: ml-engineer
-   namespace: ml-pipeline
- rules:
-   - apiGroups: ["", "apps", "batch", "kubeflow.org", "serving.kubeflow.org"]
-     resources: ["pods", "pods/log", "pods/exec", "deployments", "jobs", "notebooks", "inferenceservices"]
-     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
-   - apiGroups: [""]
-     resources: ["secrets"]
-     verbs: ["get", "list"]  # No create/update
-   - apiGroups: [""]
-     resources: ["configmaps"]
-     verbs: ["get", "list", "create", "update"]
k8s/base/resource-quotas/resource-quotas.yaml DELETED
@@ -1,50 +0,0 @@
- # =============================================================================
- # Resource Quotas — Prevent Resource Exhaustion Per Namespace
- # =============================================================================
-
- apiVersion: v1
- kind: ResourceQuota
- metadata:
-   name: frontend-quota
-   namespace: frontend
- spec:
-   hard:
-     requests.cpu: "4"
-     requests.memory: 8Gi
-     limits.cpu: "8"
-     limits.memory: 16Gi
-     pods: "20"
-     services: "5"
-     persistentvolumeclaims: "10"
-     requests.nvidia.com/gpu: "0"  # No GPUs for frontend
- ---
- apiVersion: v1
- kind: ResourceQuota
- metadata:
-   name: backend-quota
-   namespace: backend
- spec:
-   hard:
-     requests.cpu: "8"
-     requests.memory: 16Gi
-     limits.cpu: "16"
-     limits.memory: 32Gi
-     pods: "30"
-     services: "10"
-     persistentvolumeclaims: "20"
- ---
- apiVersion: v1
- kind: ResourceQuota
- metadata:
-   name: ml-quota
-   namespace: ml-pipeline
- spec:
-   hard:
-     requests.cpu: "16"
-     requests.memory: 64Gi
-     limits.cpu: "32"
-     limits.memory: 128Gi
-     pods: "15"
-     services: "5"
-     persistentvolumeclaims: "30"
-     requests.nvidia.com/gpu: "4"
k8s/base/slos/slos.yaml DELETED
@@ -1,68 +0,0 @@
- # =============================================================================
- # Service Level Objectives — Platform SLOs
- # =============================================================================
- # SLOs define reliability targets. Error budgets = 100% - SLO.
- # Burn rate alerts fire when error budget is consumed too fast.
- # =============================================================================
-
- # --- API Availability SLO: 99.95% (21.9 min/month error budget) ---
- apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
-   name: slo-api-availability
-   namespace: monitoring
-   labels:
-     release: kube-prometheus-stack
-     slo: "true"
- spec:
-   groups:
-     - name: slo.api.availability
-       rules:
-         # SLO metric: 5m success rate
-         - record: slo:api_availability:rate5m
-           expr: |
-             sum(rate(http_requests_total{code!~"5.."}[5m]))
-             /
-             sum(rate(http_requests_total[5m]))
-
-         # 1h error budget burn rate (14.4x = consume 30d budget in 2d)
-         - alert: SLOAPIAvailabilityBurnRateHigh
-           expr: |
-             (
-               (1 - slo:api_availability:rate5m) > (14.4 * 0.001)
-             )
-           for: 5m
-           labels:
-             severity: critical
-             slo: api-availability
-           annotations:
-             summary: "API availability SLO budget burning too fast"
-             runbook: "https://runbook.platform.internal/slo-api-burn"
-
-     - name: slo.api.latency
-       rules:
-         # Latency SLO: P99 < 2s, 99.9% of requests
-         - record: slo:api_latency_p99:rate5m
-           expr: |
-             histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
-
-         - alert: SLOAPILatencyBurnRateHigh
-           expr: |
-             slo:api_latency_p99:rate5m > 2
-           for: 10m
-           labels:
-             severity: warning
-             slo: api-latency
-           annotations:
-             summary: "API P99 latency exceeding 2s SLO"
-
-     - name: slo.error_budget
-       rules:
-         # Remaining error budget (percentage)
-         - record: slo:error_budget_remaining:ratio
-           expr: |
-             1 - (
-               (1 - slo:api_availability:rate5m)
-               /
-               0.0005
-             )
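
The 14.4x figure in the burn-rate alert above follows the standard burn-rate arithmetic: over a 30-day window, a burn rate B exhausts the entire error budget in 30/B days, so 14.4x burns it in about 2.08 days. A small sketch of that arithmetic, with values taken from the deleted rule (the helper itself is illustrative):

```python
# Error-budget burn-rate arithmetic behind SLOAPIAvailabilityBurnRateHigh.
# budget = 1 - SLO; a burn rate B consumes a 30-day budget in 30/B days.
def days_to_budget_exhaustion(burn_rate: float, window_days: float = 30.0) -> float:
    return window_days / burn_rate

slo = 0.9995                # 99.95% availability target
error_budget = 1 - slo      # 0.0005 of all requests may fail

# ~21.6 minutes per 30-day month; the file's "21.9 min/month" uses an
# average-length month of ~30.44 days.
print(error_budget * 30 * 24 * 60)
print(days_to_budget_exhaustion(14.4))   # ~2.08 days at 14.4x burn
```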
k8s/kustomize/base/kustomization.yaml DELETED
@@ -1,18 +0,0 @@
- apiVersion: kustomize.config.k8s.io/v1beta1
- kind: Kustomization
- resources:
-   - ../../base/namespaces/
-   - ../../base/rbac/
-   - ../../base/network-policies/
-   - ../../base/resource-quotas/
-   - ../../base/limit-ranges/
-   - ../../base/pdbs/
-   - ../../base/slos/
-   - ../../manifests/cert-manager/
-   - ../../manifests/external-secrets/
-   - ../../manifests/istio/
-   - ../../manifests/argo-cd/
-   - ../../manifests/trivy-operator/
-   - ../../manifests/falco/
-   - ../../manifests/kyverno/
-   - ../../manifests/prometheus-stack/