# Source: zenith-backend/app/services/ai/ops/ml-deployment-optimizer.yaml
# (uploaded via huggingface_hub by teoat, commit 4ae946d)
---
# Namespace holding all AIOps components defined in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: aiops-system
  labels:
    name: aiops-system
    purpose: "intelligent-operations"
---
# Deployment: ML deployment optimizer (HTTP + gRPC) with a co-located
# model-trainer sidecar sharing the model volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: zenith-ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
    component: aiops
spec:
  # NOTE(review): replicas: 2 with a ReadWriteOnce PVC (ml-models-pvc) means both
  # pods can only run if scheduled on the same node — confirm this is acceptable
  # or switch the PVC to ReadWriteMany / use one replica.
  replicas: 2
  selector:
    matchLabels:
      app: ml-deployment-optimizer
  template:
    metadata:
      labels:
        app: ml-deployment-optimizer
        component: aiops
    spec:
      containers:
        - name: ml-optimizer
          # NOTE(review): pin a version tag or digest instead of :latest for
          # reproducible rollouts.
          image: zenith/ml-deployment-optimizer:latest
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: grpc
          env:
            - name: MODEL_PATH
              value: "/models/deployment-optimization"
            - name: PROMETHEUS_URL
              value: "http://prometheus.monitoring.svc.cluster.local:9090"
            # NOTE(review): this overrides the auto-injected in-cluster API host
            # with the node IP (status.hostIP), redirecting API traffic to the
            # node — confirm this is intentional (e.g. a node-local API proxy).
            - name: KUBERNETES_SERVICE_HOST
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: KAFKA_BOOTSTRAP_SERVERS
              value: "kafka.kafka.svc.cluster.local:9092"
            - name: REDIS_URL
              value: "redis://redis-cluster.zenith-production.svc.cluster.local:6379"
            - name: LOG_LEVEL
              value: "INFO"
            - name: OPTIMIZATION_INTERVAL
              value: "300s"
            - name: CONFIDENCE_THRESHOLD
              value: "0.85"
          resources:
            requests:
              cpu: 1000m
              memory: 2Gi
            limits:
              cpu: 4000m
              memory: 8Gi
          volumeMounts:
            # Models are read-only here; the ml-trainer sidecar writes them.
            - name: model-storage
              mountPath: /models
              readOnly: true
            - name: config-volume
              mountPath: /config
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
        # Sidecar: periodically retrains models and writes them to the shared
        # model volume plus a backup volume.
        - name: ml-trainer
          image: zenith/ml-trainer:latest
          env:
            - name: TRAINING_INTERVAL
              value: "24h"
            - name: DATA_RETENTION_DAYS
              value: "30"
            - name: MODEL_BACKUP_PATH
              value: "/backup/models"
          resources:
            requests:
              cpu: 2000m
              memory: 4Gi
            limits:
              cpu: 6000m
              memory: 16Gi
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: backup-storage
              mountPath: /backup
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: ml-models-pvc
        - name: backup-storage
          persistentVolumeClaim:
            claimName: ml-backup-pvc
        - name: config-volume
          configMap:
            name: ml-optimizer-config
---
# Service exposing the optimizer's HTTP (8080) and gRPC (8081) ports in-cluster.
apiVersion: v1
kind: Service
metadata:
  name: ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
spec:
  selector:
    app: ml-deployment-optimizer
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      protocol: TCP
    - name: grpc
      port: 8081
      targetPort: 8081
      protocol: TCP
  type: ClusterIP
---
# PVC for trained model artifacts, shared between optimizer and trainer
# containers in the same pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-models-pvc
  namespace: aiops-system
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3
  resources:
    requests:
      storage: 100Gi
---
# PVC for model backups written by the ml-trainer sidecar.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-backup-pvc
  namespace: aiops-system
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3
  resources:
    requests:
      storage: 500Gi
---
# ConfigMap carrying the optimizer's model/strategy/monitoring configuration,
# mounted at /config by the Deployment and CronJob.
apiVersion: v1
kind: ConfigMap
metadata:
  name: ml-optimizer-config
  namespace: aiops-system
data:
  optimization-config.yaml: |
    # ML Deployment Optimization Configuration
    models:
      deployment_success:
        type: "gradient_boosting"
        features:
          - cpu_utilization
          - memory_usage
          - request_rate
          - error_rate
          - deployment_size
          - complexity_score
          - test_coverage
        target: "deployment_success_rate"
        retrain_interval: "24h"
      performance_prediction:
        type: "lstm"
        features:
          - historical_performance
          - traffic_patterns
          - resource_usage
          - error_trends
          - deployment_frequency
        target: "response_time_p95"
        sequence_length: 100
        retrain_interval: "168h"
      resource_optimization:
        type: "neural_network"
        features:
          - pod_cpu_utilization
          - pod_memory_usage
          - request_latency
          - throughput
          - error_rate
          - cost_metrics
        targets:
          - optimal_cpu_limit
          - optimal_memory_limit
          - optimal_replica_count
        retrain_interval: "72h"
    optimization_strategies:
      rollout_strategy:
        enabled: true
        weights:
          success_rate: 0.4
          performance: 0.3
          cost: 0.2
          risk: 0.1
        max_rollout_percentage: 20
        min_promotion_delay: "300s"
      resource_tuning:
        enabled: true
        cpu_buffer_percentage: 10
        memory_buffer_percentage: 15
        auto_scale_range:
          min_replicas: 2
          max_replicas: 50
        optimization_window: "3600s"
      failure_prevention:
        enabled: true
        prediction_horizon: "1800s"
        confidence_threshold: 0.85
        auto_rollback_threshold: 0.95
        alternative_strategies:
          - canary
          - blue_green
          - rolling
    monitoring:
      metrics_collection:
        interval: "30s"
        retention: "30d"
        prometheus_query_timeout: "10s"
      model_performance:
        accuracy_threshold: 0.90
        drift_detection_interval: "3600s"
        alert_on_degradation: true
      business_metrics:
        - deployment_success_rate
        - mean_recovery_time
        - cost_savings
        - performance_improvement
        - user_satisfaction
---
# CronJob: nightly retraining + validation of all three models.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ml-model-retraining
  namespace: aiops-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: model-trainer
              image: zenith/ml-trainer:latest
              command:
                - /bin/sh
                - -c
                # Fix: without `set -e` a failed train/validate step was masked
                # by the trailing echo (exit 0) and the Job reported success.
                - |
                  set -e
                  echo "Starting ML model retraining..."
                  # Retrain deployment success model
                  python3 -m zenith.ml.train \
                    --model deployment_success \
                    --data-source prometheus \
                    --lookback-days 30 \
                    --output-path /models/deployment_success_latest.pkl
                  # Retrain performance prediction model
                  python3 -m zenith.ml.train \
                    --model performance_prediction \
                    --data-source prometheus \
                    --lookback-days 90 \
                    --sequence_length 100 \
                    --output-path /models/performance_prediction_latest.pkl
                  # Retrain resource optimization model
                  python3 -m zenith.ml.train \
                    --model resource_optimization \
                    --data-source prometheus \
                    --lookback-days 60 \
                    --output-path /models/resource_optimization_latest.pkl
                  echo "Model retraining completed"
                  # Validate models
                  python3 -m zenith.ml.validate \
                    --models deployment_success,performance_prediction,resource_optimization \
                    --threshold 0.90
                  echo "Model validation completed"
              env:
                - name: PROMETHEUS_URL
                  value: "http://prometheus.monitoring.svc.cluster.local:9090"
                - name: KUBERNETES_CONFIG
                  value: "/config/kubeconfig"
              volumeMounts:
                - name: model-storage
                  mountPath: /models
                - name: config-volume
                  mountPath: /config
              resources:
                requests:
                  cpu: 4000m
                  memory: 8Gi
                limits:
                  cpu: 8000m
                  memory: 16Gi
          volumes:
            - name: model-storage
              persistentVolumeClaim:
                claimName: ml-models-pvc
            - name: config-volume
              configMap:
                name: ml-optimizer-config
          restartPolicy: OnFailure
---
# Prometheus Operator ServiceMonitor scraping the optimizer's /metrics endpoint.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: ml-deployment-optimizer-metrics
  namespace: aiops-system
spec:
  selector:
    matchLabels:
      app: ml-deployment-optimizer
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s