Deploy a complete Prometheus and Grafana monitoring stack using Docker Compose with persistent storage, custom dashboards, and alerting rules for production monitoring.

Prerequisites

Root or sudo access
At least 4GB RAM
Docker and Docker Compose
Open ports 3000, 9090, 9093

What this solves

This tutorial shows you how to deploy a complete monitoring stack with Prometheus for metrics collection and Grafana for visualization using Docker Compose. You'll configure persistent storage, create monitoring dashboards, and set up alerting rules for production infrastructure monitoring.

Step-by-step installation

Update system packages

Start by updating your package manager to ensure you get the latest versions of all dependencies.

# Ubuntu / Debian
sudo apt update && sudo apt upgrade -y

# RHEL-family (AlmaLinux, Rocky Linux, CentOS, Fedora)
sudo dnf update -y

Install Docker and Docker Compose

Install Docker Engine and Docker Compose to manage the containerized monitoring stack.

# Ubuntu: register Docker's apt repository (the URLs below are Ubuntu-specific), then install
sudo apt install -y ca-certificates curl gnupg lsb-release
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin

# RHEL-family: register Docker's CentOS repository, then install
sudo dnf install -y yum-utils
sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
sudo dnf install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin

Enable and start Docker service

Enable Docker to start automatically on boot and add your user to the docker group to run commands without sudo.

# Start Docker on every boot, and start it right now as well.
sudo systemctl enable docker
sudo systemctl start docker
# Let the current user talk to the Docker daemon without sudo.
sudo usermod --append --groups docker $USER
# Open a shell that already has the new group membership.
newgrp docker

Create project directory structure

Create a dedicated directory structure for the monitoring stack with separate folders for configuration files and persistent data.

# One configuration folder per component, plus bind-mount targets for persistent data.
mkdir -p \
  ~/monitoring-stack/prometheus \
  ~/monitoring-stack/grafana \
  ~/monitoring-stack/alertmanager \
  ~/monitoring-stack/data/prometheus \
  ~/monitoring-stack/data/grafana
cd ~/monitoring-stack

Create Prometheus configuration

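Configure Prometheus to scrape metrics from itself, Node Exporter, and cAdvisor. Save this file as prometheus/prometheus.yml. It sets the scrape and rule-evaluation intervals, the rule file to load, and the Alertmanager endpoint; data retention is configured later through a command-line flag in the Docker Compose file.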

# Defaults applied to every scrape job and to rule evaluation.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: "production-monitor"

# Alerting rule files to load.
rule_files:
  - alert_rules.yml

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# One job per metrics source; the `group` label tags every series of a job.
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"
        labels:
          group: monitoring

  - job_name: node-exporter
    static_configs:
      - targets:
          - "node-exporter:9100"
        labels:
          group: infrastructure

  - job_name: cadvisor
    static_configs:
      - targets:
          - "cadvisor:8080"
        labels:
          group: containers

Create Prometheus alert rules

Define alerting rules for common infrastructure issues like high CPU usage, memory consumption, and service availability. Save this file as prometheus/alert_rules.yml.

groups:
  - name: infrastructure_alerts
    rules:
      # CPU utilisation per host. Aggregating `by (instance)` keeps one
      # series (and one alert) per scraped host instead of a single average
      # across all hosts with no instance label. `rate` is used rather than
      # `irate` because `irate` only looks at the last two samples and is
      # meant for graphing, not for alert conditions.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High CPU usage detected on {{ $labels.instance }}'
          description: 'CPU usage on {{ $labels.instance }} has been above 80% for more than 5 minutes (current value: {{ printf "%.1f" $value }}%)'

      # Share of memory that is not available to applications, per host.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage detected on {{ $labels.instance }}'
          description: 'Memory usage on {{ $labels.instance }} has been above 85% for more than 5 minutes (current value: {{ printf "%.1f" $value }}%)'

      # `up` is 0 for any scrape target whose last scrape failed.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: 'Service {{ $labels.instance }} has been down for more than 2 minutes'

Create Alertmanager configuration

Configure Alertmanager to handle alert routing and notifications. Save this file as alertmanager/alertmanager.yml. This example includes email notifications and alert grouping.

global:
  # Must be an SMTP relay reachable from inside the Alertmanager container.
  # 'localhost' would point at the container itself, where no mail server
  # runs. Replace this placeholder with your real relay.
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  # Uncomment if the relay requires authentication:
  # smtp_auth_username: 'alerts@example.com'
  # smtp_auth_password: 'change-me'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'admin@example.com'
        # email_configs has no `subject` or `body` field; unknown fields make
        # Alertmanager reject the whole file at startup. The subject is set
        # through `headers`, the plain-text body through `text`.
        headers:
          Subject: 'Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ .Labels }}
          {{ end }}

# Mute a warning while a critical alert with the same alertname and instance
# is firing. `source_matchers`/`target_matchers` replace the deprecated
# `source_match`/`target_match`; the `dev` label was dropped from `equal`
# because nothing in this stack sets it.
inhibit_rules:
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'instance']

Create Docker Compose configuration

Define the complete monitoring stack with Prometheus, Grafana, Alertmanager, Node Exporter, and cAdvisor for comprehensive monitoring coverage. Save this file as docker-compose.yml in the ~/monitoring-stack directory.

# The top-level `version` key is intentionally omitted: Docker Compose v2
# (the `docker compose` plugin installed above) ignores it and prints an
# "obsolete" warning.

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      # Configuration is mounted read-only; only the TSDB directory is writable.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      # Allows `POST /-/reload` and `POST /-/quit` without authentication;
      # remove it if port 9090 is reachable from untrusted networks.
      - '--web.enable-lifecycle'
      # `--web.enable-admin-api` is deliberately NOT set: it exposes
      # unauthenticated endpoints that delete series and snapshots, and this
      # port is published on the host.
    restart: unless-stopped
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./data/grafana:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Override by exporting GF_SECURITY_ADMIN_PASSWORD or setting it in a
      # `.env` file next to this file; the fallback is the tutorial default
      # and must be changed for any real deployment.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin123}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
    restart: unless-stopped
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    restart: unless-stopped
    networks:
      - monitoring

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      # Prometheus scrapes this service over the `monitoring` network, so the
      # host port is bound to loopback only, for local debugging.
      - "127.0.0.1:9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose escaping for a literal `$` in the regular expression.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    networks:
      - monitoring

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      # Loopback only, for the same reason as node-exporter.
      - "127.0.0.1:8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    privileged: true
    restart: unless-stopped
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge

Create Grafana provisioning configuration

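Set up Grafana to automatically provision Prometheus as a data source and load default dashboards on startup. After creating the directories, save the first file below under grafana/provisioning/datasources/ (for example as prometheus.yml) and the second under grafana/provisioning/dashboards/ (for example as dashboards.yml).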

# Folders that hold Grafana's data source and dashboard provisioning files.
mkdir -p grafana/provisioning/datasources grafana/provisioning/dashboards

# Registers the Prometheus container as Grafana's default data source.
apiVersion: 1

datasources:
  - name: "Prometheus"
    type: "prometheus"
    # `proxy`: the Grafana backend, not the browser, issues the queries.
    access: "proxy"
    url: "http://prometheus:9090"
    isDefault: true
    editable: true

# File-based dashboard provider: Grafana loads dashboards found under
# `options.path` and re-checks that path every `updateIntervalSeconds`.
apiVersion: 1

providers:
  - name: "default"
    orgId: 1
    folder: ""
    type: "file"
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: "/etc/grafana/provisioning/dashboards"

Set correct permissions for data directories

Configure proper ownership and permissions for persistent data directories. Grafana runs as UID 472, while Prometheus uses UID 65534.

Never use chmod 777. It gives every user on the system full access to your files. Instead, fix ownership with chown and use minimal permissions.

# Grafana's container user is UID/GID 472.
sudo chown -R 472:472 data/grafana
# Prometheus' container user is UID/GID 65534.
sudo chown -R 65534:65534 data/prometheus
# sudo is required here: after the chown commands above, the invoking user no
# longer owns the sub-directories and a plain chmod -R fails with
# "Operation not permitted".
sudo chmod -R 755 data/

Deploy the monitoring stack

Start all services using Docker Compose. This will pull the required images and start the complete monitoring stack.

# Create and start every service defined in docker-compose.yml in the background
docker compose up -d
# List the services together with their current state
docker compose ps

Configure firewall rules

Open the necessary ports for accessing Grafana, Prometheus, and other monitoring services.

# Ubuntu / Debian (ufw)
sudo ufw allow 3000/tcp comment 'Grafana'
sudo ufw allow 9090/tcp comment 'Prometheus'
sudo ufw allow 9093/tcp comment 'Alertmanager'
sudo ufw reload

# RHEL-family (firewalld)
sudo firewall-cmd --permanent --add-port=3000/tcp --add-port=9090/tcp --add-port=9093/tcp
sudo firewall-cmd --reload

Import system monitoring dashboard

Create a basic system monitoring dashboard that displays CPU and memory usage from Node Exporter.

{
  "id": null,
  "title": "System Overview",
  "tags": ["system", "overview"],
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "CPU Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
          "legendFormat": "CPU Usage %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100
        }
      },
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
    },
    {
      "id": 2,
      "title": "Memory Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
          "legendFormat": "Memory Usage %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100
        }
      },
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
    }
  ],
  "time": {"from": "now-1h", "to": "now"},
  "refresh": "5s"
}

Verify your setup

Check that all services are running and accessible through their web interfaces.

docker compose ps
# Query the Prometheus health endpoint
curl -s http://localhost:9090/-/healthy
# Query the Grafana health endpoint
curl -s http://localhost:3000/api/health
# Show listening TCP sockets for Grafana, Prometheus and Alertmanager
ss -tlnp | grep -E ':(3000|9090|9093)'
# Print the Prometheus version, falling back to the binary inside the container
prometheus --version 2>/dev/null || docker exec prometheus prometheus --version

Access the web interfaces:

Grafana: http://your-server-ip:3000 (admin/admin123)
Prometheus: http://your-server-ip:9090
Alertmanager: http://your-server-ip:9093

Common issues

Symptom	Cause	Fix
Grafana shows "Permission denied" errors	Incorrect ownership of data directory	`sudo chown -R 472:472 data/grafana`
Prometheus fails to start	Invalid YAML configuration	Check config with `docker compose run --rm --entrypoint promtool prometheus check config /etc/prometheus/prometheus.yml`
Node Exporter metrics missing	Container lacks host filesystem access	Verify volume mounts in docker-compose.yml
Containers restart continuously	Port conflicts or resource limits	`docker compose logs [service-name]`
Cannot access web interfaces	Firewall blocking ports	Open ports 3000, 9090, 9093 in firewall
Alerts not firing	Alertmanager not connected to Prometheus	Check alerting section in prometheus.yml

Next steps

Automated install script

Run this to automate the entire setup

install.sh

#!/usr/bin/env bash

# -E (errtrace) lets shell functions inherit the ERR trap installed below.
# Without it the trap is never triggered by a failure inside main() or any
# other function, so the cleanup handler would not run.
set -Eeuo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Configuration
# Target directory; first positional argument, defaulting to /opt/monitoring-stack.
INSTALL_DIR="${1:-/opt/monitoring-stack}"
# NOTE(review): not referenced anywhere in the visible script - confirm before removing.
COMPOSE_VERSION="v2.24.1"

# Remember whether the target directory existed before this run, so that a
# failed run never deletes data this script did not create.
if [[ -e "$INSTALL_DIR" ]]; then
    INSTALL_DIR_PREEXISTING=1
else
    INSTALL_DIR_PREEXISTING=0
fi

# Cleanup function for error handling.
# Runs from the ERR trap: reports the failure on stderr, removes the install
# directory only if this run created it, and exits with status 1.
cleanup() {
    echo -e "${RED}[ERROR] Installation failed. Cleaning up...${NC}" >&2
    if [[ "$INSTALL_DIR_PREEXISTING" -eq 0 && -d "$INSTALL_DIR" ]]; then
        rm -rf -- "$INSTALL_DIR"
    fi
    exit 1
}

trap cleanup ERR

# Print the invocation synopsis with an example, then exit with status 1.
usage() {
    printf 'Usage: %s [install_directory]\n' "$0"
    printf 'Example: %s /opt/monitoring-stack\n' "$0"
    exit 1
}

# Print an informational message ($1) wrapped in the GREEN/NC color codes.
log() {
    local message="$1"
    printf '%b\n' "${GREEN}${message}${NC}"
}

# Print a warning message ($1) wrapped in the YELLOW/NC color codes.
warn() {
    local message="$1"
    printf '%b\n' "${YELLOW}${message}${NC}"
}

# Print an error message ($1) wrapped in the RED/NC color codes.
# Written to stderr so that diagnostics stay separate from regular output
# when stdout is piped or redirected. Does not exit; callers decide that.
error() {
    echo -e "${RED}$1${NC}" >&2
}

# Abort with status 1 unless the effective user ID is 0 (root or sudo).
check_privileges() {
    if [[ $EUID -eq 0 ]]; then
        return 0
    fi

    error "This script must be run as root or with sudo"
    exit 1
}

# Refresh the apt package index, then upgrade the installed packages.
# This is a function because PKG_UPDATE is later run as an unquoted variable
# expansion: an operator such as && stored in a variable is not interpreted
# by the shell but handed to the command as a literal argument, which makes
# "apt update && apt upgrade -y" fail.
apt_update_and_upgrade() {
    apt update
    apt upgrade -y
}

# Detect distribution and set package manager.
# Sources /etc/os-release and, based on its ID field, sets the globals
# PKG_MGR, PKG_INSTALL and PKG_UPDATE. Each of them holds a single command
# (or function name) with plain arguments, safe to expand unquoted.
# Exits with status 1 when the file is missing or the ID is not supported.
detect_distro() {
    if [[ ! -f /etc/os-release ]]; then
        error "Cannot detect distribution. /etc/os-release not found."
        exit 1
    fi

    . /etc/os-release

    # ${ID:-} keeps `set -u` from aborting with an unhelpful message when the
    # file does not define ID; the case falls through to "Unsupported" instead.
    case "${ID:-}" in
        ubuntu|debian)
            PKG_MGR="apt"
            PKG_INSTALL="apt install -y"
            PKG_UPDATE="apt_update_and_upgrade"
            ;;
        almalinux|rocky|centos|rhel|ol|fedora)
            PKG_MGR="dnf"
            PKG_INSTALL="dnf install -y"
            PKG_UPDATE="dnf update -y"
            ;;
        amzn)
            PKG_MGR="yum"
            PKG_INSTALL="yum install -y"
            PKG_UPDATE="yum update -y"
            ;;
        *)
            error "Unsupported distribution: ${ID:-unknown}"
            exit 1
            ;;
    esac
}

# Install Docker based on distribution.
# Reads the globals PKG_MGR and PKG_INSTALL, registers Docker's package
# repository for the detected distribution and installs Docker Engine together
# with the Compose plugin.
install_docker() {
    local distro_id repo_distro

    # Read ID in a subshell so this function does not depend on variables
    # that another function happened to leave behind.
    distro_id="$(. /etc/os-release && printf '%s' "${ID:-}")"

    case "$PKG_MGR" in
        apt)
            # Docker publishes separate apt repositories for Ubuntu and Debian;
            # a Debian codename does not exist in the Ubuntu repository.
            case "$distro_id" in
                debian) repo_distro="debian" ;;
                *)      repo_distro="ubuntu" ;;
            esac
            $PKG_INSTALL ca-certificates curl gnupg lsb-release
            # --yes overwrites an existing keyring so the script can be re-run.
            curl -fsSL "https://download.docker.com/linux/${repo_distro}/gpg" | gpg --dearmor --yes -o /usr/share/keyrings/docker-archive-keyring.gpg
            echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/${repo_distro} $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list
            apt update
            $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
            ;;
        dnf)
            # Fedora has its own Docker repository; the other dnf-based
            # distributions handled by this script use the CentOS one.
            case "$distro_id" in
                fedora) repo_distro="fedora" ;;
                *)      repo_distro="centos" ;;
            esac
            $PKG_INSTALL yum-utils
            yum-config-manager --add-repo "https://download.docker.com/linux/${repo_distro}/docker-ce.repo"
            $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
            ;;
        yum)
            $PKG_INSTALL yum-utils
            yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
            $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
            ;;
    esac
}

# Entry point: checks privileges, detects the distribution, installs and
# starts Docker, writes all configuration files below INSTALL_DIR, fixes
# ownership and permissions, and brings the stack up with Docker Compose.
# Every heredoc uses a quoted delimiter ('EOF'), so its content is written
# verbatim: no shell expansion happens and no backslash escaping is needed.
main() {
    check_privileges
    detect_distro
    
    log "[1/8] Updating system packages..."
    $PKG_UPDATE
    
    log "[2/8] Installing Docker and Docker Compose..."
    install_docker
    
    log "[3/8] Enabling and starting Docker service..."
    systemctl enable --now docker
    
    log "[4/8] Creating project directory structure..."
    mkdir -p "$INSTALL_DIR"/{prometheus,grafana,alertmanager}
    mkdir -p "$INSTALL_DIR"/data/{prometheus,grafana}
    
    log "[5/8] Creating Prometheus configuration..."
    cat > "$INSTALL_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'production-monitor'

rule_files:
  - "alert_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          group: 'monitoring'

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          group: 'infrastructure'

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          group: 'containers'
EOF

    # `$labels` is written without a backslash: inside a quoted heredoc a
    # backslash stays in the file, and `\$` is an invalid escape in a YAML
    # double-quoted string, which makes Prometheus reject the rule file.
    cat > "$INSTALL_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes"

      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 5 minutes"

      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.instance }} has been down for more than 2 minutes"
EOF

    log "[6/8] Creating Alertmanager configuration..."
    # email_configs has no `subject` or `body` field and Alertmanager rejects
    # unknown fields at startup; the subject goes into `headers` and the
    # plain-text body into `text`.
    cat > "$INSTALL_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: 'Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF

    log "[7/8] Creating Docker Compose configuration..."
    cat > "$INSTALL_DIR/docker-compose.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus:/etc/prometheus
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./data/grafana:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    restart: unless-stopped
EOF

    # Set proper permissions
    chown -R root:root "$INSTALL_DIR"
    chmod -R 755 "$INSTALL_DIR"
    find "$INSTALL_DIR" -type f -exec chmod 644 {} \;
    chmod 755 "$INSTALL_DIR"/data/{prometheus,grafana}
    # The containers do not run as root: Prometheus writes its TSDB as
    # UID 65534 and Grafana writes its database as UID 472. With root-owned
    # 755 data directories both would fail with "permission denied".
    chown -R 65534:65534 "$INSTALL_DIR/data/prometheus"
    chown -R 472:472 "$INSTALL_DIR/data/grafana"

    log "[8/8] Starting monitoring stack..."
    cd "$INSTALL_DIR"
    docker compose up -d

    log "Installation completed successfully!"
    echo ""
    log "Access URLs:"
    log "- Prometheus: http://localhost:9090"
    log "- Grafana: http://localhost:3000 (admin/admin123)"
    log "- Alertmanager: http://localhost:9093"
    log "- Node Exporter: http://localhost:9100"
    log "- cAdvisor: http://localhost:8080"
    echo ""
    warn "Remember to:"
    warn "1. Change default Grafana password"
    warn "2. Configure email settings in alertmanager.yml"
    warn "3. Update firewall rules if needed"
}

main "$@"

Review the script before running. It must run as root, so execute it with: sudo bash install.sh

#prometheus #grafana #docker #monitoring #observability

Set up Prometheus and Grafana monitoring stack with Docker compose