Configure an alerting backend
GitOps Configuration
Available Metrics
Name
Prometheus Metric
Source
Sample Alert Configuration
# Sample Prometheus alerting rule file: one rule group holding two CPU
# utilisation alerts for the turing-hello-turing-router pods. Both rules use
# the same expression and differ only in threshold and severity:
#   - warning  when utilisation is above 90 for 5m
#   - critical when utilisation is above 95 for 5m
#
# NOTE(review): the group and alert names say "development" while the
# selectors below filter on environment="staging" - confirm which environment
# this sample is meant to illustrate.
groups:
  - name: development_foo_turing-hello-turing-router_cpu_util
    rules:
      # Warning-level rule. It shares its alert name with the critical rule
      # below; the two are told apart by the `severity` label.
      - alert: turing-hello-turing-router_cpu_util_violation_development
        # CPU usage rate over a 1m window divided by the pods' CPU requests,
        # both summed per cluster, scaled by 100 to give a percentage.
        # `|-` keeps the expression literal and drops the trailing newline.
        expr: |-
          sum by(cluster) (rate(container_cpu_usage_seconds_total{environment="staging",pod=~"turing-hello-turing-router-[0-9]*.*"}[1m])) / sum by(cluster) (kube_pod_container_resource_requests{resource="cpu",environment="staging",pod=~"turing-hello-turing-router-[0-9]*.*"}) * 100 > 90
        # The condition has to hold for this long before the alert fires.
        for: 5m
        labels:
          owner: foo
          service_name: turing-hello-turing-router
          severity: warning
        annotations:
          dashboard: http://monitoring.com/turing-dashboard?var-cluster=test-kube-cluster&var-project=test-project&var-experiment=turing-hello
          # Quoted so the `{{ $value }}` template and the ": " stay a string.
          description: 'cpu_util for the past 5m: {{ $value }}%'
          playbook: http://docs.com/Alert+Troubleshooting+Playbook
          summary: 'cpu_util is higher than the threshold: 90%'
      # Critical-level rule: same expression with the threshold raised to 95.
      - alert: turing-hello-turing-router_cpu_util_violation_development
        expr: |-
          sum by(cluster) (rate(container_cpu_usage_seconds_total{environment="staging",pod=~"turing-hello-turing-router-[0-9]*.*"}[1m])) / sum by(cluster) (kube_pod_container_resource_requests{resource="cpu",environment="staging",pod=~"turing-hello-turing-router-[0-9]*.*"}) * 100 > 95
        for: 5m
        labels:
          owner: foo
          service_name: turing-hello-turing-router
          severity: critical
        annotations:
          dashboard: http://monitoring.com/turing-dashboard?var-cluster=test-kube-cluster&var-project=test-project&var-experiment=turing-hello
          description: 'cpu_util for the past 5m: {{ $value }}%'
          playbook: http://docs.com/Alert+Troubleshooting+Playbook
          summary: 'cpu_util is higher than the threshold: 95%'
# NOTE(review): the text "Last updated" was fused onto the end of the final
# `summary` line above; it looks like page-footer text rather than part of the
# rule file, so it is kept here as a comment:
# Last updated