todo-api / phase-5 /monitoring /prometheus.yaml
Nanny7's picture
feat: Phase 5 Complete - Production-Ready AI Todo Application 🎉
edcd2ef
# Prometheus Monitoring Stack - Phase 5
# Production monitoring configuration
---
# Prometheus ConfigMap
# Carries the complete prometheus.yml: global settings, the Alertmanager
# target, the rule-file glob and every scrape job. The Prometheus Deployment
# mounts this ConfigMap at /etc/prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        cluster: 'phase-5-production'
        environment: 'production'

    # Alertmanager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    # Load rules once and periodically evaluate them
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    # Scrape configurations
    #
    # Pod-role discovery emits one target per declared container port of
    # every container in a pod. The application jobs below therefore drop
    # the Dapr sidecar container, and the dapr job keeps only the sidecar's
    # metrics port, so each endpoint is scraped by exactly one job.
    # NOTE(review): this assumes the injected sidecar container is named
    # `daprd` and serves metrics on Dapr's default port 9090 - confirm
    # against the running pod specs.
    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Kubernetes API servers
      # Keeps only the `kubernetes` service endpoints in the `default`
      # namespace on the port named `https`.
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
            action: keep
            regex: default;kubernetes;https

      # Backend service metrics
      - job_name: 'backend-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - default
                - phase-5
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: backend
          # Sidecar ports belong to the dapr job, not to this one.
          - source_labels: [__meta_kubernetes_pod_container_name]
            action: drop
            regex: daprd
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
        metrics_path: /metrics
        scrape_interval: 10s

      # Notification service metrics
      - job_name: 'notification-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - default
                - phase-5
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: notification
          # Sidecar ports belong to the dapr job, not to this one.
          - source_labels: [__meta_kubernetes_pod_container_name]
            action: drop
            regex: daprd
        metrics_path: /metrics
        scrape_interval: 10s

      # Chatbot service metrics
      - job_name: 'chatbot-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - default
                - phase-5
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: chatbot
          # Sidecar ports belong to the dapr job, not to this one.
          - source_labels: [__meta_kubernetes_pod_container_name]
            action: drop
            regex: daprd
        metrics_path: /metrics
        scrape_interval: 10s

      # Kafka metrics (via JMX exporter)
      # NOTE(review): only broker pod 0 is listed, so other brokers are not
      # scraped. Strimzi's JMX Prometheus exporter normally listens on 9404,
      # not 9304, and `my-cluster-kafka-0.kafka` must resolve from the
      # monitoring namespace - confirm both port and hostname.
      - job_name: 'kafka'
        static_configs:
          - targets: ['my-cluster-kafka-0.kafka:9304']
        scrape_interval: 30s

      # Dapr metrics
      # Scrapes only the sidecar's metrics port of the application pods and
      # tags each series with a dapr_sidecar label derived from the pod name
      # (everything before the last `-`, suffixed with `-dapr`).
      - job_name: 'dapr'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - default
                - phase-5
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: backend|notification|chatbot
          - source_labels: [__meta_kubernetes_pod_container_name]
            action: keep
            regex: daprd
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: '9090'
          - source_labels: [__meta_kubernetes_pod_name]
            regex: '(.*)-(.*)'
            target_label: dapr_sidecar
            replacement: '${1}-dapr'
        metrics_path: /metrics
        scrape_interval: 15s

      # Node exporter (system metrics)
      # Node discovery returns the kubelet address (port 10250); the relabel
      # rule rewrites it to port 9100 on the same host.
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
        scrape_interval: 30s
---
# Prometheus Deployment
# Runs a single Prometheus server pod. prometheus.yml comes from the
# prometheus-config ConfigMap and rule files from the prometheus-rules
# ConfigMap; the TSDB lives on an emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Pin an explicit non-root identity so runAsNonRoot can be enforced
      # and the mounted volumes are group-owned by the server process.
      # NOTE(review): 65534 is `nobody`, the user the upstream
      # prom/prometheus image runs as - adjust if a custom image is used.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        runAsGroup: 65534
        fsGroup: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v2.48.0
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=30d'
            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
            - '--web.console.templates=/usr/share/prometheus/consoles'
            # Enables the HTTP reload/quit endpoints (/-/reload, /-/quit).
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: http
          securityContext:
            allowPrivilegeEscalation: false
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            # Nested under the config mount so the rule_files glob
            # /etc/prometheus/rules/*.yml resolves.
            - name: rules
              mountPath: /etc/prometheus/rules
            - name: storage
              mountPath: /prometheus
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        # NOTE(review): the prometheus-rules ConfigMap is not defined in the
        # visible part of this file; the pod cannot start until it exists in
        # the monitoring namespace.
        - name: rules
          configMap:
            name: prometheus-rules
        # NOTE(review): emptyDir is deleted with the pod, so the 30d
        # retention above only holds for the lifetime of a single pod.
        # Consider a PersistentVolumeClaim for production use.
        - name: storage
          emptyDir: {}
---
# Prometheus Service
# Cluster-internal endpoint that forwards port 9090 to the pods labelled
# app=prometheus.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
---
# Prometheus ServiceAccount
# Identity the Prometheus pod runs under (referenced by the Deployment's
# serviceAccountName and by the ClusterRoleBinding subject).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: prometheus
---
# Prometheus ClusterRole & ClusterRoleBinding
# Cluster-wide, read-only permissions for Prometheus.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core-API objects read by the kubernetes_sd_configs roles
  # (node, endpoints, pod).
  # NOTE(review): no scrape job in the visible prometheus.yml goes through
  # the API-server node proxy, so `nodes/proxy` looks unused - confirm
  # before tightening.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # Allows GET on the non-resource /metrics URL.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Grants the `prometheus` ClusterRole to the `prometheus` ServiceAccount in
# the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus