Configure Thanos Ruler for distributed alerting across multiple Prometheus clusters

Advanced · 45 min · Apr 10, 2026
Ubuntu 24.04 Debian 12 AlmaLinux 9 Rocky Linux 9

Set up Thanos Ruler to create a unified alerting layer across distributed Prometheus instances. This tutorial covers installation, global rule configuration, and cross-cluster alert federation for enterprise monitoring.

Prerequisites

  • Existing Prometheus clusters with Thanos sidecars
  • Alertmanager instance
  • Basic understanding of PromQL
  • Network connectivity between clusters
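The network-connectivity prerequisite is worth verifying up front. A minimal sketch, assuming your sidecars expose gRPC on 10901 and Alertmanager listens on 9093; the hostnames are placeholders for your own endpoints:

```shell
#!/usr/bin/env bash
# Probe the TCP ports the rest of this tutorial relies on.
# Hostnames are placeholders -- substitute your real endpoints.
check_port() {
  local host=$1 port=$2
  if timeout 2 bash -c "exec 3<>/dev/tcp/${host}/${port}" 2>/dev/null; then
    echo "OK    ${host}:${port}"
  else
    echo "FAIL  ${host}:${port}"
    return 1
  fi
}

check_port prometheus-cluster-1 10901 || true   # Thanos sidecar gRPC
check_port alertmanager 9093 || true            # Alertmanager API
```

A FAIL here usually means a firewall rule or DNS entry is missing; fix that before continuing.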

What this solves

Thanos Ruler provides a centralized alerting layer for distributed Prometheus deployments, allowing you to create global alert rules that evaluate metrics across multiple clusters. This is essential for enterprise environments where you need consistent alerting policies across different data centers, regions, or clusters without duplicating rule configuration on each Prometheus instance.

Step-by-step installation

Update system packages

Start by updating your package manager so you have current security patches and dependencies. Use the command that matches your distribution.

On Ubuntu / Debian:

sudo apt update && sudo apt upgrade -y

On AlmaLinux / Rocky Linux:

sudo dnf update -y

Create Thanos user and directories

Create a dedicated system user for Thanos Ruler for security and correct file ownership.

sudo useradd --no-create-home --shell /bin/false thanos
sudo mkdir -p /etc/thanos/rules /var/lib/thanos/ruler
sudo chown -R thanos:thanos /etc/thanos /var/lib/thanos

Download and install Thanos

Download Thanos (v0.32.4 at the time of writing; check the releases page for newer versions) and install the binary to the system path.

cd /tmp
wget https://github.com/thanos-io/thanos/releases/download/v0.32.4/thanos-0.32.4.linux-amd64.tar.gz
tar -xzf thanos-0.32.4.linux-amd64.tar.gz
sudo cp thanos-0.32.4.linux-amd64/thanos /usr/local/bin/
sudo chown root:root /usr/local/bin/thanos
sudo chmod 755 /usr/local/bin/thanos
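Before continuing, confirm the binary landed where the service files below expect it. A quick sanity check:

```shell
# The binary should execute and report the version we just installed.
installed="$(/usr/local/bin/thanos --version 2>&1 | head -n1)"
echo "${installed}"
case "${installed}" in
  *0.32.4*) echo "version OK" ;;
  *)        echo "unexpected output: ${installed}" ;;
esac
```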

Configure Thanos Ruler

Thanos Ruler has no monolithic configuration file of its own; it is driven almost entirely by command-line flags (data directory, evaluation interval, rule-file globs, TSDB retention, logging), which the systemd unit in the next step sets. What can be externalized into YAML are the query and Alertmanager endpoints, via the --query.config-file and --alertmanagers.config-file flags; this also enables dynamic (DNS- or file-based) service discovery. Create /etc/thanos/query.yml:

- static_configs:
    - "thanos-query:9090"
  scheme: http

And /etc/thanos/alertmanagers.yml:

alertmanagers:
  - static_configs:
      - "alertmanager:9093"
    scheme: http
    timeout: 10s
    api_version: v2

For static endpoints, the simpler --query=... and --alertmanagers.url=... flags used in the unit file below achieve the same result; pick one style and stay consistent.

Create systemd service file

Configure Thanos Ruler as a systemd service for automatic startup and process management. Save the following unit as /etc/systemd/system/thanos-ruler.service.

[Unit]
Description=Thanos Ruler
After=network.target
Wants=network.target

[Service]
User=thanos
Group=thanos
Type=simple
ExecStart=/usr/local/bin/thanos rule \
  --data-dir=/var/lib/thanos/ruler \
  --tsdb.retention=30d \
  --eval-interval=30s \
  --rule-file=/etc/thanos/rules/*.yml \
  --alertmanagers.url=http://alertmanager:9093 \
  --query=http://thanos-query:9090 \
  --http-address=0.0.0.0:10902 \
  --grpc-address=0.0.0.0:10901 \
  --label=replica="ruler-1" \
  --label=cluster="global" \
  --log.level=info \
  --log.format=json
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=thanos-ruler
KillMode=mixed
KillSignal=SIGTERM

[Install]
WantedBy=multi-user.target
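systemd can lint the unit before you try to start anything; `systemd-analyze verify` catches typos in directive names and bad executable paths. A quick check (skips gracefully on hosts without systemd):

```shell
# Lint the unit file we just wrote before enabling the service.
unit=/etc/systemd/system/thanos-ruler.service
if command -v systemd-analyze >/dev/null 2>&1; then
  systemd-analyze verify "${unit}" && echo "unit file OK" || echo "verify reported problems"
else
  echo "systemd-analyze not available on this host"
fi
```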

Configure global alerting rules

Create global alert rules

Define global alerting rules that evaluate metrics across all connected Prometheus clusters. Save the file under the rules directory the unit points at, e.g. /etc/thanos/rules/global-alerts.yml, so the ruler's glob picks it up.

groups:
  - name: global.infrastructure
    interval: 30s
    rules:
      - alert: HighMemoryUsage
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanizePercentage }} on {{ $labels.instance }} in cluster {{ $labels.cluster }}"
          
      - alert: PrometheusTargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          team: monitoring
        annotations:
          summary: "Prometheus target is down"
          description: "Target {{ $labels.instance }} of job {{ $labels.job }} in cluster {{ $labels.cluster }} has been down for more than 2 minutes"
          
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{fstype!="tmpfs"} / 
            node_filesystem_size_bytes{fstype!="tmpfs"} * 100
          ) < 15
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} at {{ $labels.mountpoint }} in cluster {{ $labels.cluster }}"

  - name: global.application
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: |
          (
            rate(http_requests_total{status=~"5.."}[5m]) /
            rate(http_requests_total[5m])
          ) * 100 > 5
        for: 2m
        labels:
          severity: critical
          team: application
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.job }} in cluster {{ $labels.cluster }}"
          
      - alert: ResponseTimeHigh
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          team: application
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.job }} in cluster {{ $labels.cluster }}"
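After saving rule files, validate them before the ruler ever loads them. `thanos tools rules-check` understands standard Prometheus rule syntax plus the Thanos extensions:

```shell
# Check every rule file under the rules directory; report per-file results.
checker=/usr/local/bin/thanos
for f in /etc/thanos/rules/*.yml; do
  [ -e "$f" ] || { echo "no rule files found in /etc/thanos/rules"; break; }
  if "$checker" tools rules-check --rules="$f" >/dev/null 2>&1; then
    echo "OK    $f"
  else
    echo "FAIL  $f"
  fi
done
```

A FAIL usually means a YAML indentation mistake or a PromQL parse error; rerun without the output redirection to see the details.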

Create cross-cluster federation rules

Configure recording rules that aggregate metrics across multiple clusters for global visibility, saved alongside the alert rules, e.g. as /etc/thanos/rules/global-recording.yml.

groups:
  - name: global.aggregations
    interval: 60s
    rules:
      - record: global:node_cpu_usage:avg
        expr: |
          avg by (cluster) (
            100 - (avg by (instance, cluster) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          )
        labels:
          aggregation: "cluster_average"
          
      - record: global:memory_usage:avg
        expr: |
          avg by (cluster) (
            (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 
            node_memory_MemTotal_bytes * 100
          )
        labels:
          aggregation: "cluster_average"
          
      - record: global:disk_usage:avg
        expr: |
          avg by (cluster) (
            100 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / 
            node_filesystem_size_bytes{fstype!="tmpfs"} * 100)
          )
        labels:
          aggregation: "cluster_average"
          
      - record: global:http_requests:rate5m
        expr: |
          sum by (cluster, job) (
            rate(http_requests_total[5m])
          )
        labels:
          aggregation: "cluster_sum"
          
      - record: global:error_rate:rate5m
        expr: |
          sum by (cluster, job) (
            rate(http_requests_total{status=~"5.."}[5m])
          ) / 
          sum by (cluster, job) (
            rate(http_requests_total[5m])
          )
        labels:
          aggregation: "cluster_ratio"
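Once the ruler has completed at least one evaluation cycle, the recorded series become queryable through Thanos Query. A quick spot check (host and port taken from the unit files in this tutorial; adjust to your deployment):

```shell
# Query one of the recorded series; an empty result means the ruler has
# not evaluated the group yet (or Query is unreachable).
query='global:node_cpu_usage:avg'
curl -sG "http://thanos-query:9090/api/v1/query" \
  --data-urlencode "query=${query}" \
  || echo "Thanos Query unreachable (check host/port)"
```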

Set proper file permissions

Ensure Thanos Ruler can read the configuration files while maintaining security.

sudo chown -R thanos:thanos /etc/thanos
sudo chmod 755 /etc/thanos /etc/thanos/rules
sudo chmod 644 /etc/thanos/rules/*.yml /etc/thanos/*.yml
sudo chmod 755 /var/lib/thanos/ruler

Configure cross-cluster alert federation

Create Alertmanager configuration

Configure Alertmanager (typically /etc/alertmanager/alertmanager.yml) to handle alerts from Thanos Ruler with proper routing and grouping. Note that on Alertmanager 0.22+ the match/source_match keys used below still work but are deprecated in favor of the newer matchers syntax.

global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'your-smtp-password'

route:
  group_by: ['cluster', 'alertname', 'severity']
  group_wait: 10s
  group_interval: 30s
  repeat_interval: 4h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 1h
      
    - match:
        team: infrastructure
      receiver: 'infrastructure-team'
      
    - match:
        team: application
      receiver: 'application-team'

receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'ops-team@example.com'
        subject: '[{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
        body: |
          Cluster: {{ .GroupLabels.cluster }}
          
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
          
  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@example.com'
        subject: '[CRITICAL] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
        body: |
          CRITICAL ALERT TRIGGERED
          
          Cluster: {{ .GroupLabels.cluster }}
          
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          {{ end }}
          
  - name: 'infrastructure-team'
    email_configs:
      - to: 'infrastructure@example.com'
        subject: '[INFRA] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'
        
  - name: 'application-team'
    email_configs:
      - to: 'developers@example.com'
        subject: '[APP] [{{ .GroupLabels.cluster }}] {{ .GroupLabels.alertname }}'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['cluster', 'instance']
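Alertmanager ships with amtool, which can lint this configuration before you reload the service (the config path is an assumption; adjust to your install):

```shell
# Validate the Alertmanager configuration file before reloading.
cfg=/etc/alertmanager/alertmanager.yml
if command -v amtool >/dev/null 2>&1; then
  amtool check-config "$cfg" && echo "config OK" || echo "configuration has errors"
else
  echo "amtool not installed; skipping check"
fi
```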

Configure query endpoints

Set up Thanos Query to federate metrics from multiple Prometheus clusters for global querying. Save the unit as /etc/systemd/system/thanos-query.service. If Query runs on the same host as the Ruler, its gRPC port must not collide with the Ruler's 10901.

[Unit]
Description=Thanos Query
After=network.target
Wants=network.target

[Service]
User=thanos
Group=thanos
Type=simple
ExecStart=/usr/local/bin/thanos query \
  --http-address=0.0.0.0:9090 \
  --grpc-address=0.0.0.0:10903 \
  --store=prometheus-cluster-1:10901 \
  --store=prometheus-cluster-2:10901 \
  --store=prometheus-cluster-3:10901 \
  --query.timeout=5m \
  --query.replica-label=replica \
  --log.level=info

Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=thanos-query

[Install]
WantedBy=multi-user.target

Enable and start services

Start Thanos Ruler and Thanos Query, and enable both to start automatically on system boot.

sudo systemctl daemon-reload
sudo systemctl enable --now thanos-ruler
sudo systemctl enable --now thanos-query
sudo systemctl status thanos-ruler
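Thanos components expose /-/ready and /-/healthy HTTP endpoints; waiting on readiness before poking the API avoids confusing first-start failures. A small sketch against the Ruler's port from the unit above:

```shell
# Wait briefly for the ruler's HTTP endpoint, then list loaded rule groups.
ready=no
for _ in 1 2 3 4 5; do
  if curl -sf http://localhost:10902/-/ready >/dev/null 2>&1; then
    ready=yes
    break
  fi
  sleep 1
done
echo "ruler ready: ${ready}"
if [ "$ready" = yes ]; then
  curl -s http://localhost:10902/api/v1/rules | head -c 400
fi
```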

Monitor and troubleshoot Thanos Ruler

Configure monitoring for Thanos Ruler

Set up Prometheus scraping configuration to monitor Thanos Ruler metrics and performance.

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'monitoring'
    replica: 'prometheus-1'

rule_files:
  - "/etc/prometheus/rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

scrape_configs:
  - job_name: 'thanos-ruler'
    static_configs:
      - targets: ['localhost:10902']
    scrape_interval: 15s
    metrics_path: /metrics
    
  - job_name: 'thanos-query'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 15s
    metrics_path: /metrics
    
  - job_name: 'thanos-sidecar'
    static_configs:
      - targets:
        - 'prometheus-cluster-1:10902'
        - 'prometheus-cluster-2:10902'
        - 'prometheus-cluster-3:10902'
    scrape_interval: 30s
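promtool, shipped with Prometheus, validates this scrape configuration, including the rule_files globs (the config path is the conventional one; adjust to your install):

```shell
# Validate the Prometheus configuration before reloading Prometheus.
cfg=/etc/prometheus/prometheus.yml
if command -v promtool >/dev/null 2>&1; then
  promtool check config "$cfg" || echo "config check failed"
else
  echo "promtool not installed; skipping check"
fi
```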

Create Thanos Ruler monitoring rules

Add specific alerting rules to monitor the health and performance of Thanos Ruler itself.

groups:
  - name: thanos.ruler
    interval: 30s
    rules:
      - alert: ThanosRulerDown
        expr: up{job="thanos-ruler"} == 0
        for: 1m
        labels:
          severity: critical
          component: thanos-ruler
        annotations:
          summary: "Thanos Ruler is down"
          description: "Thanos Ruler has been down for more than 1 minute"
          
      - alert: ThanosRulerEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m]) > 0
        for: 1m
        labels:
          severity: warning
          component: thanos-ruler
        annotations:
          summary: "Thanos Ruler evaluation failures"
          description: "Thanos Ruler has {{ $value }} rule evaluation failures in the last 5 minutes"
          
      - alert: ThanosRulerHighLatency
        expr: |
          prometheus_rule_group_last_duration_seconds{job="thanos-ruler"}
            > prometheus_rule_group_interval_seconds{job="thanos-ruler"}
        for: 5m
        labels:
          severity: warning
          component: thanos-ruler
        annotations:
          summary: "High rule evaluation latency"
          description: "Rule group {{ $labels.rule_group }} took {{ $value }}s to evaluate, longer than its configured interval"
          
      - alert: ThanosQueryUnavailable
        expr: thanos_rule_query_apis_down > 0
        for: 2m
        labels:
          severity: critical
          component: thanos-ruler
        annotations:
          summary: "Thanos Query API unavailable"
          description: "{{ $value }} Thanos Query APIs are unavailable for rule evaluation"
          
      - alert: ThanosRulerConfigReloadFailure
        expr: thanos_rule_config_last_reload_successful != 1
        for: 5m
        labels:
          severity: warning
          component: thanos-ruler
        annotations:
          summary: "Thanos Ruler config reload failure"
          description: "Thanos Ruler configuration reload has failed"

Note: For production deployments, consider pointing the ruler at object storage (--objstore.config-file) so its TSDB blocks are shipped to long-term storage, enabling historical analysis of alert patterns.

Verify your setup

sudo systemctl status thanos-ruler
curl http://localhost:10902/metrics | grep thanos_rule
curl http://localhost:10902/api/v1/rules
curl http://localhost:10902/api/v1/alerts
/usr/local/bin/thanos rule --help

Check the Thanos Ruler web interface at http://your-server:10902 to view active rules and alerts.

Common issues

Symptom, cause, and fix:

  • Service fails to start — permission denied on the data directory. Fix: sudo chown -R thanos:thanos /var/lib/thanos
  • Rules not loading — invalid YAML syntax in a rule file. Fix: validate with /usr/local/bin/thanos tools rules-check --rules=/etc/thanos/rules/<file>.yml (the rule subcommand has no --dry-run flag)
  • No metrics from clusters — query endpoints unreachable. Fix: check network connectivity and firewall rules for ports 10901/9090
  • Alerts not firing — evaluation interval too long. Fix: reduce --eval-interval in the systemd unit or the per-group interval in the rule files
  • High memory usage — too many active series. Fix: reduce rule cardinality, add systemd resource limits (MemoryMax=), and tune --query.max-concurrent on the Query side
  • Alertmanager not receiving alerts — wrong Alertmanager URL. Fix: verify the --alertmanagers.url flag and network connectivity
