# Source: zenith-backend/app/services/ai/ops/ml-deployment-optimizer.yaml
# (uploaded via huggingface_hub by teoat, commit 4ae946d)
---
# Namespace holding all AIOps components defined in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: aiops-system
  labels:
    name: aiops-system
    purpose: "intelligent-operations"
---
# Deployment: ML deployment optimizer (HTTP + gRPC) with a co-located
# model-trainer sidecar sharing the model volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: zenith-ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
    component: aiops
spec:
  # NOTE(review): replicas: 2 with a ReadWriteOnce PVC (ml-models-pvc) means both
  # pods can only run if scheduled on the same node — confirm this is acceptable
  # or switch the PVC to ReadWriteMany / use one replica.
  replicas: 2
  selector:
    matchLabels:
      app: ml-deployment-optimizer
  template:
    metadata:
      labels:
        app: ml-deployment-optimizer
        component: aiops
    spec:
      containers:
        - name: ml-optimizer
          # NOTE(review): pin a version tag or digest instead of :latest for
          # reproducible rollouts.
          image: zenith/ml-deployment-optimizer:latest
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: grpc
          env:
            - name: MODEL_PATH
              value: "/models/deployment-optimization"
            - name: PROMETHEUS_URL
              value: "http://prometheus.monitoring.svc.cluster.local:9090"
            # NOTE(review): this overrides the auto-injected in-cluster API host
            # with the node IP (status.hostIP), redirecting API traffic to the
            # node — confirm this is intentional (e.g. a node-local API proxy).
            - name: KUBERNETES_SERVICE_HOST
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: KAFKA_BOOTSTRAP_SERVERS
              value: "kafka.kafka.svc.cluster.local:9092"
            - name: REDIS_URL
              value: "redis://redis-cluster.zenith-production.svc.cluster.local:6379"
            - name: LOG_LEVEL
              value: "INFO"
            - name: OPTIMIZATION_INTERVAL
              value: "300s"
            - name: CONFIDENCE_THRESHOLD
              value: "0.85"
          resources:
            requests:
              cpu: 1000m
              memory: 2Gi
            limits:
              cpu: 4000m
              memory: 8Gi
          volumeMounts:
            # Models are read-only here; the ml-trainer sidecar writes them.
            - name: model-storage
              mountPath: /models
              readOnly: true
            - name: config-volume
              mountPath: /config
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
        # Sidecar: periodically retrains models and writes them to the shared
        # model volume plus a backup volume.
        - name: ml-trainer
          image: zenith/ml-trainer:latest
          env:
            - name: TRAINING_INTERVAL
              value: "24h"
            - name: DATA_RETENTION_DAYS
              value: "30"
            - name: MODEL_BACKUP_PATH
              value: "/backup/models"
          resources:
            requests:
              cpu: 2000m
              memory: 4Gi
            limits:
              cpu: 6000m
              memory: 16Gi
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: backup-storage
              mountPath: /backup
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: ml-models-pvc
        - name: backup-storage
          persistentVolumeClaim:
            claimName: ml-backup-pvc
        - name: config-volume
          configMap:
            name: ml-optimizer-config
---
# Service exposing the optimizer's HTTP (8080) and gRPC (8081) ports in-cluster.
apiVersion: v1
kind: Service
metadata:
  name: ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
spec:
  selector:
    app: ml-deployment-optimizer
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      protocol: TCP
    - name: grpc
      port: 8081
      targetPort: 8081
      protocol: TCP
  type: ClusterIP
---
# PVC for trained model artifacts, shared between optimizer and trainer
# containers in the same pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-models-pvc
  namespace: aiops-system
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3
  resources:
    requests:
      storage: 100Gi
---
# PVC for model backups written by the ml-trainer sidecar.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-backup-pvc
  namespace: aiops-system
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3
  resources:
    requests:
      storage: 500Gi
---
# ConfigMap carrying the optimizer's model/strategy/monitoring configuration,
# mounted at /config by the Deployment and CronJob.
apiVersion: v1
kind: ConfigMap
metadata:
  name: ml-optimizer-config
  namespace: aiops-system
data:
  optimization-config.yaml: |
    # ML Deployment Optimization Configuration
    models:
      deployment_success:
        type: "gradient_boosting"
        features:
          - cpu_utilization
          - memory_usage
          - request_rate
          - error_rate
          - deployment_size
          - complexity_score
          - test_coverage
        target: "deployment_success_rate"
        retrain_interval: "24h"
      performance_prediction:
        type: "lstm"
        features:
          - historical_performance
          - traffic_patterns
          - resource_usage
          - error_trends
          - deployment_frequency
        target: "response_time_p95"
        sequence_length: 100
        retrain_interval: "168h"
      resource_optimization:
        type: "neural_network"
        features:
          - pod_cpu_utilization
          - pod_memory_usage
          - request_latency
          - throughput
          - error_rate
          - cost_metrics
        targets:
          - optimal_cpu_limit
          - optimal_memory_limit
          - optimal_replica_count
        retrain_interval: "72h"
    optimization_strategies:
      rollout_strategy:
        enabled: true
        weights:
          success_rate: 0.4
          performance: 0.3
          cost: 0.2
          risk: 0.1
        max_rollout_percentage: 20
        min_promotion_delay: "300s"
      resource_tuning:
        enabled: true
        cpu_buffer_percentage: 10
        memory_buffer_percentage: 15
        auto_scale_range:
          min_replicas: 2
          max_replicas: 50
        optimization_window: "3600s"
      failure_prevention:
        enabled: true
        prediction_horizon: "1800s"
        confidence_threshold: 0.85
        auto_rollback_threshold: 0.95
        alternative_strategies:
          - canary
          - blue_green
          - rolling
    monitoring:
      metrics_collection:
        interval: "30s"
        retention: "30d"
        prometheus_query_timeout: "10s"
      model_performance:
        accuracy_threshold: 0.90
        drift_detection_interval: "3600s"
        alert_on_degradation: true
      business_metrics:
        - deployment_success_rate
        - mean_recovery_time
        - cost_savings
        - performance_improvement
        - user_satisfaction
---
# CronJob: nightly retraining + validation of all three models.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ml-model-retraining
  namespace: aiops-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: model-trainer
              image: zenith/ml-trainer:latest
              command:
                - /bin/sh
                - -c
                # Fix: without `set -e` a failed train/validate step was masked
                # by the trailing echo (exit 0) and the Job reported success.
                - |
                  set -e
                  echo "Starting ML model retraining..."
                  # Retrain deployment success model
                  python3 -m zenith.ml.train \
                    --model deployment_success \
                    --data-source prometheus \
                    --lookback-days 30 \
                    --output-path /models/deployment_success_latest.pkl
                  # Retrain performance prediction model
                  python3 -m zenith.ml.train \
                    --model performance_prediction \
                    --data-source prometheus \
                    --lookback-days 90 \
                    --sequence_length 100 \
                    --output-path /models/performance_prediction_latest.pkl
                  # Retrain resource optimization model
                  python3 -m zenith.ml.train \
                    --model resource_optimization \
                    --data-source prometheus \
                    --lookback-days 60 \
                    --output-path /models/resource_optimization_latest.pkl
                  echo "Model retraining completed"
                  # Validate models
                  python3 -m zenith.ml.validate \
                    --models deployment_success,performance_prediction,resource_optimization \
                    --threshold 0.90
                  echo "Model validation completed"
              env:
                - name: PROMETHEUS_URL
                  value: "http://prometheus.monitoring.svc.cluster.local:9090"
                - name: KUBERNETES_CONFIG
                  value: "/config/kubeconfig"
              volumeMounts:
                - name: model-storage
                  mountPath: /models
                - name: config-volume
                  mountPath: /config
              resources:
                requests:
                  cpu: 4000m
                  memory: 8Gi
                limits:
                  cpu: 8000m
                  memory: 16Gi
          volumes:
            - name: model-storage
              persistentVolumeClaim:
                claimName: ml-models-pvc
            - name: config-volume
              configMap:
                name: ml-optimizer-config
          restartPolicy: OnFailure
---
# Prometheus Operator ServiceMonitor scraping the optimizer's /metrics endpoint.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: ml-deployment-optimizer-metrics
  namespace: aiops-system
spec:
  selector:
    matchLabels:
      app: ml-deployment-optimizer
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s