Set up Thanos Ruler to create a unified alerting layer across distributed Prometheus instances. This tutorial covers installation, global rule configuration, and cross-cluster alert federation for enterprise monitoring.
Prerequisites
- Existing Prometheus clusters with Thanos sidecars
- Alertmanager instance
- Basic understanding of PromQL
- Network connectivity between clusters
What this solves
Thanos Ruler provides a centralized alerting layer for distributed Prometheus deployments, allowing you to create global alert rules that evaluate metrics across multiple clusters. This is essential for enterprise environments where you need consistent alerting policies across different data centers, regions, or clusters without duplicating rule configuration on each Prometheus instance.
Step-by-step installation
Update system packages
Start by updating your package manager to ensure you have the latest security patches and dependencies.
sudo apt update && sudo apt upgrade -y
Create Thanos user and directories
Create a dedicated system user for Thanos Ruler so the service runs unprivileged and owns only its own configuration and data directories.
sudo useradd --no-create-home --shell /bin/false thanos
sudo mkdir -p /etc/thanos/rules /var/lib/thanos/ruler
sudo chown -R thanos:thanos /etc/thanos /var/lib/thanos
Download and install Thanos
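Download a pinned stable release of Thanos (v0.32.4 in this tutorial; check the project's releases page for newer versions) and install the thanos binary, which provides the rule subcommand, to the system path.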
cd /tmp
wget https://github.com/thanos-io/thanos/releases/download/v0.32.4/thanos-0.32.4.linux-amd64.tar.gz
tar -xzf thanos-0.32.4.linux-amd64.tar.gz
sudo cp thanos-0.32.4.linux-amd64/thanos /usr/local/bin/
sudo chown root:root /usr/local/bin/thanos
sudo chmod 755 /usr/local/bin/thanos
Configure Thanos Ruler
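Create the main configuration file for Thanos Ruler (/etc/thanos/ruler.yml) with query endpoints and Alertmanager integration. Note that the thanos rule command takes its settings from command-line flags, so the systemd unit in the next step passes these same values as flags; keep this file as a reference of the intended settings.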
# Thanos Ruler Configuration
type: RULER
config:
query_frontend:
- "http://thanos-query-frontend:9090"
query:
- "http://thanos-query:9090"
alertmanagers:
- "http://alertmanager:9093"
rule_files:
- "/etc/thanos/rules/*.yml"
evaluation_interval: 30s
external_labels:
cluster: "global"
replica: "ruler-1"
tsdb:
path: "/var/lib/thanos/ruler"
retention: "30d"
log:
level: info
format: json
Create systemd service file
Configure Thanos Ruler as a systemd service for automatic startup and process management.
[Unit]
Description=Thanos Ruler
After=network.target
Wants=network.target
[Service]
User=thanos
Group=thanos
Type=simple
ExecStart=/usr/local/bin/thanos rule \
--data-dir=/var/lib/thanos/ruler \
--eval-interval=30s \
--rule-file=/etc/thanos/rules/*.yml \
--alertmanagers.url=http://alertmanager:9093 \
--query=http://thanos-query:9090 \
--http-address=0.0.0.0:10902 \
--grpc-address=0.0.0.0:10901 \
--label=replica="ruler-1" \
--label=cluster="global" \
--log.level=info \
--log.format=json
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=thanos-ruler
KillMode=mixed
KillSignal=SIGTERM
[Install]
WantedBy=multi-user.target
Configure global alerting rules
Create global alert rules
Define global alerting rules that evaluate metrics across all connected Prometheus clusters.
groups:
- name: global.infrastructure
interval: 30s
rules:
- alert: HighMemoryUsage
expr: |
(
node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "High memory usage detected on {{ $labels.instance }}"
description: "Memory usage is {{ $value | humanize }}% on {{ $labels.instance }} in cluster {{ $labels.cluster }}"
- alert: PrometheusTargetDown
expr: up == 0
for: 2m
labels:
severity: critical
team: monitoring
annotations:
summary: "Prometheus target is down"
description: "Target {{ $labels.instance }} of job {{ $labels.job }} in cluster {{ $labels.cluster }} has been down for more than 2 minutes"
- alert: DiskSpaceLow
expr: |
(
node_filesystem_avail_bytes{fstype!="tmpfs"} /
node_filesystem_size_bytes{fstype!="tmpfs"} * 100
) < 15
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: "Disk usage is above 85% on {{ $labels.instance }} at {{ $labels.mountpoint }} in cluster {{ $labels.cluster }}"
- name: global.application
interval: 30s
rules:
- alert: HighErrorRate
expr: |
(
sum by (cluster, job) (rate(http_requests_total{status=~"5.."}[5m])) /
sum by (cluster, job) (rate(http_requests_total[5m]))
) * 100 > 5
for: 2m
labels:
severity: critical
team: application
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanize }}% for {{ $labels.job }} in cluster {{ $labels.cluster }}"
- alert: ResponseTimeHigh
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
team: application
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}s for {{ $labels.job }} in cluster {{ $labels.cluster }}"
Create cross-cluster federation rules
Configure recording rules that aggregate metrics across multiple clusters for global visibility.
groups:
- name: global.aggregations
interval: 60s
rules:
- record: global:node_cpu_usage:avg
expr: |
avg by (cluster) (
100 - (avg by (instance, cluster) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
)
labels:
aggregation: "cluster_average"
- record: global:memory_usage:avg
expr: |
avg by (cluster) (
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) /
node_memory_MemTotal_bytes * 100
)
labels:
aggregation: "cluster_average"
- record: global:disk_usage:avg
expr: |
avg by (cluster) (
100 - (node_filesystem_avail_bytes{fstype!="tmpfs"} /
node_filesystem_size_bytes{fstype!="tmpfs"} * 100)
)
labels:
aggregation: "cluster_average"
- record: global:http_requests:rate5m
expr: |
sum by (cluster, job) (
rate(http_requests_total[5m])
)
labels:
aggregation: "cluster_sum"
- record: global:error_rate:rate5m
expr: |
sum by (cluster, job) (
rate(http_requests_total{status=~"5.."}[5m])
) /
sum by (cluster, job) (
rate(http_requests_total[5m])
)
labels:
aggregation: "cluster_ratio"
Set proper file permissions
Ensure Thanos Ruler can read the configuration files while maintaining security.
sudo chown -R thanos:thanos /etc/thanos
sudo chmod 755 /etc/thanos /etc/thanos/rules
sudo chmod 644 /etc/thanos/rules/*.yml /etc/thanos/ruler.yml
sudo chmod 755 /var/lib/thanos/ruler
Configure cross-cluster alert federation
Create Alertmanager configuration
Configure Alertmanager to handle alerts from Thanos Ruler with proper routing and grouping.
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'your-smtp-password'
route:
group_by: ['cluster', 'alertname', 'severity']
group_wait: 10s
group_interval: 30s
repeat_interval: 4h
receiver: 'default-receiver'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 5s
repeat_interval: 1h
- match:
team: infrastructure
receiver: 'infrastructure-team'
- match:
team: application
receiver: 'application-team'
receivers:
- name: 'default-receiver'
email_configs:
- to: 'ops-team@example.com'
subject: '[{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
body: |
Cluster: {{ range .GroupLabels }}{{ . }}{{ end }}
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
- name: 'critical-alerts'
email_configs:
- to: 'oncall@example.com'
subject: '[CRITICAL] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
body: |
CRITICAL ALERT TRIGGERED
Cluster: {{ .GroupLabels.cluster }}
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Instance: {{ .Labels.instance }}
{{ end }}
- name: 'infrastructure-team'
email_configs:
- to: 'infrastructure@example.com'
subject: '[INFRA] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
- name: 'application-team'
email_configs:
- to: 'developers@example.com'
subject: '[APP] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['cluster', 'instance']
Configure query endpoints
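Set up Thanos Query to federate metrics from multiple Prometheus clusters for global querying. If Thanos Query runs on the same host as Thanos Ruler, give it a different gRPC port (for example --grpc-address=0.0.0.0:10903), because the Ruler unit above already binds 10901.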
[Unit]
Description=Thanos Query
After=network.target
Wants=network.target
[Service]
User=thanos
Group=thanos
Type=simple
ExecStart=/usr/local/bin/thanos query \
--http-address=0.0.0.0:9090 \
--grpc-address=0.0.0.0:10901 \
--store=prometheus-cluster-1:10901 \
--store=prometheus-cluster-2:10901 \
--store=prometheus-cluster-3:10901 \
--store=thanos-sidecar-1:10901 \
--store=thanos-sidecar-2:10901 \
--query.timeout=5m \
--query.replica-label=replica \
--log.level=info
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=thanos-query
[Install]
WantedBy=multi-user.target
Enable and start services
Reload systemd, then enable and start both Thanos Ruler and Thanos Query so that they also start automatically on system boot.
sudo systemctl daemon-reload
sudo systemctl enable --now thanos-ruler
sudo systemctl enable --now thanos-query
sudo systemctl status thanos-ruler
Monitor and troubleshoot Thanos Ruler
Configure monitoring for Thanos Ruler
Set up Prometheus scraping configuration to monitor Thanos Ruler metrics and performance.
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'monitoring'
replica: 'prometheus-1'
rule_files:
- "/etc/prometheus/rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
- job_name: 'thanos-ruler'
static_configs:
- targets: ['localhost:10902']
scrape_interval: 15s
metrics_path: /metrics
- job_name: 'thanos-query'
static_configs:
- targets: ['localhost:9090']
scrape_interval: 15s
metrics_path: /metrics
- job_name: 'thanos-sidecar'
static_configs:
- targets:
- 'prometheus-cluster-1:10902'
- 'prometheus-cluster-2:10902'
- 'prometheus-cluster-3:10902'
scrape_interval: 30s
Create Thanos Ruler monitoring rules
Add specific alerting rules to monitor the health and performance of Thanos Ruler itself.
groups:
- name: thanos.ruler
interval: 30s
rules:
- alert: ThanosRulerDown
expr: up{job="thanos-ruler"} == 0
for: 1m
labels:
severity: critical
component: thanos-ruler
annotations:
summary: "Thanos Ruler is down"
description: "Thanos Ruler has been down for more than 1 minute"
- alert: ThanosRulerEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m]) > 0
for: 1m
labels:
severity: warning
component: thanos-ruler
annotations:
summary: "Thanos Ruler evaluation failures"
description: "Thanos Ruler has {{ $value }} rule evaluation failures in the last 5 minutes"
- alert: ThanosRulerHighLatency
expr: histogram_quantile(0.99, rate(thanos_rule_evaluation_duration_seconds_bucket[5m])) > 10
for: 5m
labels:
severity: warning
component: thanos-ruler
annotations:
summary: "High rule evaluation latency"
description: "99th percentile rule evaluation latency is {{ $value }}s"
- alert: ThanosQueryUnavailable
expr: thanos_rule_query_apis_down > 0
for: 2m
labels:
severity: critical
component: thanos-ruler
annotations:
summary: "Thanos Query API unavailable"
description: "{{ $value }} Thanos Query APIs are unavailable for rule evaluation"
- alert: ThanosRulerConfigReloadFailure
expr: thanos_rule_config_last_reload_successful == 0
for: 5m
labels:
severity: warning
component: thanos-ruler
annotations:
summary: "Thanos Ruler config reload failure"
description: "Thanos Ruler configuration reload has failed"
Verify your setup
sudo systemctl status thanos-ruler
curl http://localhost:10902/metrics | grep thanos_rule
curl http://localhost:10902/api/v1/rules
curl http://localhost:10902/api/v1/alerts
/usr/local/bin/thanos rule --help
Check the Thanos Ruler web interface at http://your-server:10902 to view active rules and alerts.
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Service fails to start | Permission denied on data directory | sudo chown -R thanos:thanos /var/lib/thanos |
| Rules not loading | Invalid YAML syntax in rule files | sudo -u thanos /usr/local/bin/thanos tools rules-check --rules=/etc/thanos/rules/*.yml |
| No metrics from clusters | Query endpoints unreachable | Check network connectivity and firewall rules for ports 10901/9090 |
| Alerts not firing | Evaluation interval too long | Reduce eval-interval in systemd service or rule group intervals |
| High memory usage | Too many active series | Lower --query.max-concurrent on Thanos Query and add resource limits |
| Alertmanager not receiving alerts | Wrong alertmanager URL | Verify --alertmanagers.url parameter and network connectivity |
Next steps
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
#
# Automated installer for Thanos Ruler.
#
# Usage: install.sh [alertmanager_url] [query_url] [cluster_name]
# All three arguments are optional; their defaults are listed below.
set -euo pipefail

# ANSI colour escape sequences used by the logging helpers (NC resets colour).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# Installation constants. The rules and data directories are derived from
# their parents so the paths cannot drift apart.
readonly THANOS_VERSION="v0.32.4"
readonly THANOS_USER="thanos"
readonly THANOS_HOME="/var/lib/thanos"
readonly THANOS_CONFIG_DIR="/etc/thanos"
readonly THANOS_RULES_DIR="${THANOS_CONFIG_DIR}/rules"
readonly THANOS_DATA_DIR="${THANOS_HOME}/ruler"

# Positional arguments, each falling back to its default when omitted or empty.
ALERTMANAGER_URL=${1:-"http://alertmanager:9093"}
QUERY_URL=${2:-"http://thanos-query:9090"}
CLUSTER_NAME=${3:-"global"}
# Print the command-line synopsis, one line per optional argument with its
# default value, to stdout and terminate the script with status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [alertmanager_url] [query_url] [cluster_name]" \
    " alertmanager_url: URL to Alertmanager (default: http://alertmanager:9093)" \
    " query_url: URL to Thanos Query (default: http://thanos-query:9090)" \
    " cluster_name: Cluster identifier (default: global)"
  exit 1
}
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read) - colour escape sequences, expanded via %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
log() {
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
# Print a warning message to stdout, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read) - colour escape sequences, expanded via %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read) - colour escape sequences, expanded via %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
# ERR-trap handler: report the failure, roll back what the installer creates
# (service unit, service account, config/data directories, binary) and exit 1.
# Globals: THANOS_USER, THANOS_CONFIG_DIR, THANOS_HOME (read)
# NOTE(review): the directories and binary are removed unconditionally, so a
# pre-existing installation under the same paths is wiped too - confirm that
# this is intended.
cleanup() {
  # Drop the trap first so a failing rollback step cannot re-enter cleanup.
  trap - ERR
  error "Installation failed. Cleaning up..."
  # Best effort: the unit and the user may not exist yet at the failure point.
  systemctl stop thanos-ruler 2>/dev/null || true
  systemctl disable thanos-ruler 2>/dev/null || true
  rm -f /etc/systemd/system/thanos-ruler.service
  userdel -r "${THANOS_USER}" 2>/dev/null || true
  # ':?' aborts the expansion when a path variable is empty, so an unset or
  # blank value can never turn this into 'rm -rf' on an unintended path.
  rm -rf -- "${THANOS_CONFIG_DIR:?}" "${THANOS_HOME:?}"
  rm -f /usr/local/bin/thanos
  exit 1
}

# NOTE(review): without 'set -E' (errtrace) bash does not inherit an ERR trap
# into shell functions, so failures inside the install steps may not reach
# cleanup - confirm whether errtrace should be enabled.
trap cleanup ERR
# Abort with exit status 1 unless the script runs as root, at least one of
# wget/curl is available, and tar is available.
check_prerequisites() {
  (( EUID == 0 )) || {
    error "This script must be run as root or with sudo"
    exit 1
  }

  # Either downloader is sufficient.
  command -v wget &> /dev/null || command -v curl &> /dev/null || {
    error "Either wget or curl is required"
    exit 1
  }

  command -v tar &> /dev/null || {
    error "tar is required"
    exit 1
  }
}
# Select the package manager commands for the running distribution.
# Globals:   PKG_MGR, PKG_INSTALL, PKG_UPDATE (written)
# Arguments: $1 - optional path to an os-release file
#                 (default: /etc/os-release)
# Exits with status 1 when the file is missing or the distribution ID is not
# one of the supported ones.
detect_distro() {
  local os_release=${1:-/etc/os-release}
  local distro_id

  if [[ ! -f "$os_release" ]]; then
    error "Cannot detect distribution"
    exit 1
  fi

  # Source the file in a subshell so its other variables (NAME, VERSION, ...)
  # do not leak into the installer. os-release(5) defines "linux" as the value
  # of ID when the field is absent, which also keeps 'set -u' from aborting
  # with an unbound-variable error.
  distro_id=$(
    unset ID
    # shellcheck disable=SC1090
    . "$os_release"
    printf '%s' "${ID:-linux}"
  )

  case "$distro_id" in
    ubuntu | debian)
      PKG_MGR="apt"
      PKG_INSTALL="apt install -y"
      PKG_UPDATE="apt update && apt upgrade -y"
      ;;
    almalinux | rocky | centos | rhel | ol | fedora)
      PKG_MGR="dnf"
      PKG_INSTALL="dnf install -y"
      PKG_UPDATE="dnf update -y"
      ;;
    amzn)
      PKG_MGR="yum"
      PKG_INSTALL="yum install -y"
      PKG_UPDATE="yum update -y"
      ;;
    *)
      error "Unsupported distribution: $distro_id"
      exit 1
      ;;
  esac
}
# Run the distribution's package update command.
# Globals: PKG_UPDATE (read) - command string chosen by detect_distro
update_system() {
  log "[1/7] Updating system packages..."
  # PKG_UPDATE may chain commands with '&&', so it has to go through eval.
  # Quoting hands eval the string exactly as it was stored instead of letting
  # the shell word-split and glob it first; ':?' fails loudly if it is empty.
  eval "${PKG_UPDATE:?PKG_UPDATE is not set; run detect_distro first}"
}
# Create the Thanos service account (if missing) and the config, rules and
# data directories, owned by that account with mode 750.
# Globals: THANOS_USER, THANOS_HOME, THANOS_CONFIG_DIR, THANOS_RULES_DIR,
#          THANOS_DATA_DIR (read)
create_user_directories() {
  log "[2/7] Creating Thanos user and directories..."

  # Only create the account when it does not exist, so re-runs do not fail.
  if ! id "${THANOS_USER}" &>/dev/null; then
    useradd --no-create-home --shell /bin/false --system "${THANOS_USER}"
  fi

  # Every expansion is quoted so a path containing whitespace or glob
  # characters is passed as a single argument.
  mkdir -p "${THANOS_CONFIG_DIR}" "${THANOS_RULES_DIR}" "${THANOS_DATA_DIR}"
  chown -R "${THANOS_USER}:${THANOS_USER}" "${THANOS_CONFIG_DIR}" "${THANOS_HOME}"
  chmod 750 "${THANOS_CONFIG_DIR}" "${THANOS_RULES_DIR}" "${THANOS_DATA_DIR}"
}
# Download the pinned Thanos release for this machine's architecture and
# install the binary as /usr/local/bin/thanos (root:root, mode 755).
# Globals: THANOS_VERSION (read)
# Exits with status 1 on an unsupported CPU architecture.
install_thanos() {
  log "[3/7] Downloading and installing Thanos..."

  # Map the kernel's machine name onto the architecture suffix used in the
  # release tarball names.
  local machine arch
  machine=$(uname -m)
  case "$machine" in
    x86_64 | amd64) arch="amd64" ;;
    aarch64 | arm64) arch="arm64" ;;
    *)
      error "Unsupported CPU architecture: ${machine}"
      exit 1
      ;;
  esac

  local release="thanos-${THANOS_VERSION#v}.linux-${arch}"
  local tarball="${release}.tar.gz"
  local url="https://github.com/thanos-io/thanos/releases/download/${THANOS_VERSION}/${tarball}"

  # A private scratch directory avoids predictable file names in /tmp and
  # guarantees that a stale tarball from an earlier run is never the one that
  # gets extracted. Absolute paths are used so the caller's working directory
  # is left untouched.
  local workdir
  workdir=$(mktemp -d)

  if command -v wget &>/dev/null; then
    wget -q -O "${workdir}/${tarball}" "${url}"
  else
    # -f makes curl fail on HTTP errors instead of saving the error page.
    curl -fsSL -o "${workdir}/${tarball}" "${url}"
  fi

  tar -xzf "${workdir}/${tarball}" -C "${workdir}"
  cp "${workdir}/${release}/thanos" /usr/local/bin/
  chown root:root /usr/local/bin/thanos
  chmod 755 /usr/local/bin/thanos
  rm -rf -- "${workdir:?}"
}
# Write the default global alerting rules to ${THANOS_RULES_DIR}/global.yml,
# owned by the Thanos user with mode 644.
# Globals: THANOS_RULES_DIR, THANOS_USER (read)
create_alerting_rules() {
  log "[4/7] Creating global alerting rules..."

  local rules_file="${THANOS_RULES_DIR}/global.yml"

  # The delimiter is quoted so the {{ $labels... }} / {{ $value }} template
  # expressions reach the file untouched. The YAML nesting below is
  # significant: rule groups, rules and the block-scalar expressions are only
  # valid with this indentation.
  cat > "${rules_file}" << 'EOF'
groups:
  - name: global.infrastructure
    interval: 30s
    rules:
      - alert: HighMemoryUsage
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanize }}% on {{ $labels.instance }} in cluster {{ $labels.cluster }}"
      - alert: PrometheusTargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          team: monitoring
        annotations:
          summary: "Prometheus target is down"
          description: "Target {{ $labels.instance }} of job {{ $labels.job }} in cluster {{ $labels.cluster }} has been down for more than 2 minutes"
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{fstype!="tmpfs"} /
            node_filesystem_size_bytes{fstype!="tmpfs"}
          ) * 100 < 15
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk space is below 15% on {{ $labels.instance }} at {{ $labels.mountpoint }}"
EOF

  chown "${THANOS_USER}:${THANOS_USER}" "${rules_file}"
  chmod 644 "${rules_file}"
}
# Write the thanos-ruler systemd unit to /etc/systemd/system with mode 644.
# Globals: THANOS_USER, THANOS_DATA_DIR, THANOS_RULES_DIR, ALERTMANAGER_URL,
#          QUERY_URL, CLUSTER_NAME (read)
create_systemd_service() {
  log "[5/7] Creating systemd service..."

  local unit_file=/etc/systemd/system/thanos-ruler.service

  # Unquoted delimiter: the ${...} variables are expanded now, and each '\\'
  # becomes the single backslash systemd uses for line continuation.
  # The --label arguments are wrapped in single quotes as whole items so the
  # double quotes around the label value are passed through to Thanos
  # literally.
  cat > "${unit_file}" << EOF
[Unit]
Description=Thanos Ruler
After=network.target
Wants=network.target

[Service]
User=${THANOS_USER}
Group=${THANOS_USER}
Type=simple
ExecStart=/usr/local/bin/thanos rule \\
    --data-dir=${THANOS_DATA_DIR} \\
    --eval-interval=30s \\
    --rule-file=${THANOS_RULES_DIR}/*.yml \\
    --alertmanagers.url=${ALERTMANAGER_URL} \\
    --query=${QUERY_URL} \\
    --http-address=0.0.0.0:10902 \\
    --grpc-address=0.0.0.0:10901 \\
    '--label=replica="ruler-1"' \\
    '--label=cluster="${CLUSTER_NAME}"' \\
    --log.level=info \\
    --log.format=json
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=thanos-ruler
KillMode=mixed
KillSignal=SIGTERM

[Install]
WantedBy=multi-user.target
EOF

  chmod 644 "${unit_file}"
}
# Open the Thanos Ruler gRPC (10901) and HTTP (10902) ports using ufw or
# firewalld, whichever is available; otherwise only print a warning.
configure_firewall() {
  log "[6/7] Configuring firewall..."

  if command -v ufw &>/dev/null; then
    ufw allow 10901/tcp comment "Thanos Ruler gRPC"
    ufw allow 10902/tcp comment "Thanos Ruler HTTP"
  elif command -v firewall-cmd &>/dev/null; then
    # firewall-cmd needs a running firewalld daemon; when it is installed but
    # stopped the calls below would fail and abort the whole installation, so
    # probe the state first and fall back to a warning.
    if firewall-cmd --state &>/dev/null; then
      firewall-cmd --permanent --add-port=10901/tcp
      firewall-cmd --permanent --add-port=10902/tcp
      firewall-cmd --reload
    else
      warn "firewalld is installed but not running. Please manually open ports 10901 and 10902"
    fi
  else
    warn "No firewall management tool found. Please manually open ports 10901 and 10902"
  fi
}
# Reload systemd, enable and start thanos-ruler, then confirm after a short
# pause that the unit is active. Exits with status 1 when it is not.
start_service() {
  log "[7/7] Starting and enabling Thanos Ruler service..."
  systemctl daemon-reload
  systemctl enable thanos-ruler
  systemctl start thanos-ruler

  # Give the process a moment to either settle or crash before probing it.
  sleep 5

  if systemctl is-active --quiet thanos-ruler; then
    log "Thanos Ruler service started successfully"
  else
    error "Failed to start Thanos Ruler service"
    # 'systemctl status' returns non-zero for a unit that is not running;
    # tolerate that so 'set -e' does not abort here with that status before
    # the intended 'exit 1' is reached.
    systemctl status thanos-ruler --no-pager || true
    exit 1
  fi
}
# Post-install checks: the unit must be enabled and active and the binary must
# run; each failed check reports an error and returns 1. Afterwards the HTTP
# health endpoint is probed (only when curl exists; a failed probe is just a
# warning) and a summary of endpoints and directories is logged.
# Globals: THANOS_RULES_DIR, THANOS_DATA_DIR (read)
verify_installation() {
  log "Verifying installation..."

  systemctl is-enabled --quiet thanos-ruler || {
    error "Service is not enabled"
    return 1
  }

  systemctl is-active --quiet thanos-ruler || {
    error "Service is not running"
    return 1
  }

  /usr/local/bin/thanos --version &>/dev/null || {
    error "Thanos binary is not working"
    return 1
  }

  # Leave time for the HTTP listener to come up before the probe.
  sleep 10

  if ! command -v curl &> /dev/null; then
    : # no curl available: skip the probe silently
  elif curl -sf http://localhost:10902/-/healthy &>/dev/null; then
    log "Health check passed"
  else
    warn "Health check failed - service may still be starting up"
  fi

  local summary_line
  for summary_line in \
    "Installation completed successfully!" \
    "Thanos Ruler is running on:" \
    " HTTP: http://localhost:10902" \
    " gRPC: localhost:10901" \
    " Rules directory: ${THANOS_RULES_DIR}" \
    " Data directory: ${THANOS_DATA_DIR}"; do
    log "$summary_line"
  done
}
# Entry point: run every installation step in order.
# Arguments: the script's command-line arguments; -h/--help as the first
#            argument prints the usage text and exits instead of installing.
main() {
  # Without this check a help flag would be taken as the Alertmanager URL and
  # the installation would proceed.
  case "${1:-}" in
    -h | --help) usage ;;
  esac

  check_prerequisites
  detect_distro
  update_system
  create_user_directories
  install_thanos
  create_alerting_rules
  create_systemd_service
  configure_firewall
  start_service
  verify_installation
}

main "$@"
Review the script before running. It must be run as root; execute with: sudo bash install.sh [alertmanager_url] [query_url] [cluster_name]