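# NOTE(review): the text "Spaces: Paused / Paused" appeared here; it looks like
# web-page residue rather than manifest content. Remove once confirmed.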
---
# Namespace referenced by the resources in this file that set
# `namespace: aiops-system`.
apiVersion: v1
kind: Namespace
metadata:
  name: aiops-system
  labels:
    purpose: "intelligent-operations"
    name: aiops-system
---
# Deployment for the ML deployment optimizer. Each pod runs two containers:
#   * ml-optimizer - exposes http (8080) and grpc (8081), with liveness and
#     readiness probes on the http port; mounts /models and /config read-only.
#   * ml-trainer   - mounts /models and /backup read-write.
#
# NOTE(review): both replicas mount ml-models-pvc and ml-backup-pvc, which are
# requested as ReadWriteOnce. An RWO volume can only be attached to one node,
# so the replicas (and the ml-model-retraining CronJob pod, which mounts
# ml-models-pvc as well) have to land on the same node - confirm this is
# intended or move to storage that supports shared access.
# NOTE(review): both images use the mutable `:latest` tag; consider pinning a
# version or digest so rollouts are reproducible.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: zenith-ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
    component: aiops
spec:
  replicas: 2
  selector:
    matchLabels:
      app: ml-deployment-optimizer
  template:
    metadata:
      labels:
        app: ml-deployment-optimizer
        component: aiops
    spec:
      containers:
        - name: ml-optimizer
          image: zenith/ml-deployment-optimizer:latest
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: grpc
          env:
            - name: MODEL_PATH
              value: "/models/deployment-optimization"
            - name: PROMETHEUS_URL
              value: "http://prometheus.monitoring.svc.cluster.local:9090"
            # The node IP was previously injected as KUBERNETES_SERVICE_HOST.
            # That name is set by Kubernetes itself to the API server address
            # and is what in-cluster client configuration reads; overriding it
            # with status.hostIP points API clients at the node instead.
            # The value is kept available under a non-reserved name.
            # TODO confirm whether the optimizer needs the node IP at all.
            - name: HOST_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: KAFKA_BOOTSTRAP_SERVERS
              value: "kafka.kafka.svc.cluster.local:9092"
            - name: REDIS_URL
              value: "redis://redis-cluster.zenith-production.svc.cluster.local:6379"
            - name: LOG_LEVEL
              value: "INFO"
            - name: OPTIMIZATION_INTERVAL
              value: "300s"
            # Quoted: env values must be strings, not YAML floats.
            - name: CONFIDENCE_THRESHOLD
              value: "0.85"
          resources:
            requests:
              cpu: 1000m
              memory: 2Gi
            limits:
              cpu: 4000m
              memory: 8Gi
          volumeMounts:
            - name: model-storage
              mountPath: /models
              readOnly: true
            - name: config-volume
              mountPath: /config
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
        - name: ml-trainer
          image: zenith/ml-trainer:latest
          env:
            - name: TRAINING_INTERVAL
              value: "24h"
            - name: DATA_RETENTION_DAYS
              value: "30"
            - name: MODEL_BACKUP_PATH
              value: "/backup/models"
          resources:
            requests:
              cpu: 2000m
              memory: 4Gi
            limits:
              cpu: 6000m
              memory: 16Gi
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: backup-storage
              mountPath: /backup
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: ml-models-pvc
        - name: backup-storage
          persistentVolumeClaim:
            claimName: ml-backup-pvc
        - name: config-volume
          configMap:
            name: ml-optimizer-config
---
# ClusterIP Service selecting pods labelled app=ml-deployment-optimizer.
# Publishes two named TCP ports, http (8080) and grpc (8081), each forwarded
# to the same-numbered target port.
apiVersion: v1
kind: Service
metadata:
  name: ml-deployment-optimizer
  namespace: aiops-system
  labels:
    app: ml-deployment-optimizer
spec:
  type: ClusterIP
  selector:
    app: ml-deployment-optimizer
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: 8080
    - name: grpc
      protocol: TCP
      port: 8081
      targetPort: 8081
---
# 100Gi claim on the gp3 storage class; mounted at /models by the workloads
# in this namespace that reference claimName ml-models-pvc.
# NOTE(review): ReadWriteOnce permits attachment to a single node - verify
# this matches how many pods are expected to mount the claim at once.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-models-pvc
  namespace: aiops-system
spec:
  storageClassName: gp3
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
---
# 500Gi claim on the gp3 storage class, requested as ReadWriteOnce; referenced
# by workloads in this namespace through claimName ml-backup-pvc.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ml-backup-pvc
  namespace: aiops-system
spec:
  storageClassName: gp3
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
---
# ConfigMap carrying a single file, optimization-config.yaml, with three
# top-level sections: models, optimization_strategies and monitoring.
# NOTE(review): the nesting of the embedded document (for example that
# max_rollout_percentage / min_promotion_delay sit beside `weights`, and
# optimization_window beside `auto_scale_range`) should be verified against
# the optimizer's config loader.
apiVersion: v1
kind: ConfigMap
metadata:
  name: ml-optimizer-config
  namespace: aiops-system
data:
  optimization-config.yaml: |
    # ML Deployment Optimization Configuration
    models:
      deployment_success:
        type: "gradient_boosting"
        features:
          - cpu_utilization
          - memory_usage
          - request_rate
          - error_rate
          - deployment_size
          - complexity_score
          - test_coverage
        target: "deployment_success_rate"
        retrain_interval: "24h"
      performance_prediction:
        type: "lstm"
        features:
          - historical_performance
          - traffic_patterns
          - resource_usage
          - error_trends
          - deployment_frequency
        target: "response_time_p95"
        sequence_length: 100
        retrain_interval: "168h"
      resource_optimization:
        type: "neural_network"
        features:
          - pod_cpu_utilization
          - pod_memory_usage
          - request_latency
          - throughput
          - error_rate
          - cost_metrics
        targets:
          - optimal_cpu_limit
          - optimal_memory_limit
          - optimal_replica_count
        retrain_interval: "72h"
    optimization_strategies:
      rollout_strategy:
        enabled: true
        weights:
          success_rate: 0.4
          performance: 0.3
          cost: 0.2
          risk: 0.1
        max_rollout_percentage: 20
        min_promotion_delay: "300s"
      resource_tuning:
        enabled: true
        cpu_buffer_percentage: 10
        memory_buffer_percentage: 15
        auto_scale_range:
          min_replicas: 2
          max_replicas: 50
        optimization_window: "3600s"
      failure_prevention:
        enabled: true
        prediction_horizon: "1800s"
        confidence_threshold: 0.85
        auto_rollback_threshold: 0.95
        alternative_strategies:
          - canary
          - blue_green
          - rolling
    monitoring:
      metrics_collection:
        interval: "30s"
        retention: "30d"
        prometheus_query_timeout: "10s"
      model_performance:
        accuracy_threshold: 0.90
        drift_detection_interval: "3600s"
        alert_on_degradation: true
      business_metrics:
        - deployment_success_rate
        - mean_recovery_time
        - cost_savings
        - performance_improvement
        - user_satisfaction
---
# CronJob that retrains the deployment_success, performance_prediction and
# resource_optimization models in sequence, writes each to /models, and then
# runs the validation module over all three with a 0.90 threshold.
#
# The script runs under `set -e` so that a failing training or validation
# command fails the container (and, with restartPolicy OnFailure, triggers a
# retry). Without it the shell carried on after errors and exited 0 from the
# final `echo`, so a broken retrain was reported as a successful Job.
#
# NOTE(review): KUBERNETES_CONFIG points at /config/kubeconfig, but /config is
# backed by the ml-optimizer-config ConfigMap, whose only key is
# optimization-config.yaml - that path will not exist. Confirm whether the
# trainer needs a kubeconfig at all.
# NOTE(review): `--sequence_length` uses an underscore while the other flags
# use hyphens; verify the spelling against the trainer's CLI.
# NOTE(review): the image uses the mutable `:latest` tag; consider pinning.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ml-model-retraining
  namespace: aiops-system
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: model-trainer
              image: zenith/ml-trainer:latest
              command:
                - /bin/sh
                - -c
                - |
                  # Abort on the first failing command so the Job reports failure.
                  set -e
                  echo "Starting ML model retraining..."
                  # Retrain deployment success model
                  python3 -m zenith.ml.train \
                    --model deployment_success \
                    --data-source prometheus \
                    --lookback-days 30 \
                    --output-path /models/deployment_success_latest.pkl
                  # Retrain performance prediction model
                  python3 -m zenith.ml.train \
                    --model performance_prediction \
                    --data-source prometheus \
                    --lookback-days 90 \
                    --sequence_length 100 \
                    --output-path /models/performance_prediction_latest.pkl
                  # Retrain resource optimization model
                  python3 -m zenith.ml.train \
                    --model resource_optimization \
                    --data-source prometheus \
                    --lookback-days 60 \
                    --output-path /models/resource_optimization_latest.pkl
                  echo "Model retraining completed"
                  # Validate models
                  python3 -m zenith.ml.validate \
                    --models deployment_success,performance_prediction,resource_optimization \
                    --threshold 0.90
                  echo "Model validation completed"
              env:
                - name: PROMETHEUS_URL
                  value: "http://prometheus.monitoring.svc.cluster.local:9090"
                - name: KUBERNETES_CONFIG
                  value: "/config/kubeconfig"
              volumeMounts:
                - name: model-storage
                  mountPath: /models
                - name: config-volume
                  mountPath: /config
              resources:
                requests:
                  cpu: 4000m
                  memory: 8Gi
                limits:
                  cpu: 8000m
                  memory: 16Gi
          volumes:
            - name: model-storage
              persistentVolumeClaim:
                claimName: ml-models-pvc
            - name: config-volume
              configMap:
                name: ml-optimizer-config
          restartPolicy: OnFailure
---
# Prometheus Operator ServiceMonitor: scrapes /metrics on the port named
# "http" of Services labelled app=ml-deployment-optimizer, every 30s with a
# 10s scrape timeout.
# NOTE(review): no namespaceSelector is set, so presumably only Services in
# aiops-system are matched; whether Prometheus discovers this object depends
# on its own serviceMonitorSelector - confirm.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: ml-deployment-optimizer-metrics
  namespace: aiops-system
spec:
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s
  selector:
    matchLabels:
      app: ml-deployment-optimizer