Deploy a complete Prometheus and Grafana monitoring stack using Docker Compose with persistent storage, custom dashboards, and alerting rules for production monitoring.
Prerequisites
- Root or sudo access
- At least 4GB RAM
- Docker and Docker Compose
- Open ports 3000, 9090, 9093
What this solves
This tutorial shows you how to deploy a complete monitoring stack with Prometheus for metrics collection and Grafana for visualization using Docker Compose. You'll configure persistent storage, create monitoring dashboards, and set up alerting rules for production infrastructure monitoring.
Step-by-step installation
Update system packages
Start by updating your package manager to ensure you get the latest versions of all dependencies.
# Refresh the apt package index, then (only if that succeeds) upgrade all installed packages non-interactively.
sudo apt update && sudo apt upgrade -y
Install Docker and Docker Compose
Install Docker Engine and Docker Compose to manage the containerized monitoring stack.
# Install the helper packages used by the following commands (curl, gpg, lsb_release).
sudo apt install -y ca-certificates curl gnupg lsb-release
# Download Docker's repository signing key and store it de-armored as a keyring file.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
# Register the Docker apt repository for this CPU architecture and release codename, signed by the keyring above.
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
# Refresh the package index so the newly added repository is visible.
sudo apt update
# Install Docker Engine, its CLI, containerd and the Compose plugin.
sudo apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
Enable and start Docker service
Enable Docker to start automatically on boot and add your user to the docker group to run commands without sudo.
# Enable the Docker service at boot and start it immediately.
sudo systemctl enable --now docker
# Append the current user to the docker group (-a keeps existing group memberships).
sudo usermod -aG docker $USER
# Start a shell with the docker group active so the new membership applies without logging out.
newgrp docker
Create project directory structure
Create a dedicated directory structure for the monitoring stack with separate folders for configuration files and persistent data.
# Configuration directories, one per service.
mkdir -p ~/monitoring-stack/{prometheus,grafana,alertmanager}
# Persistent data directories that are bind-mounted into the containers.
mkdir -p ~/monitoring-stack/data/{prometheus,grafana}
# All following relative paths assume this working directory.
cd ~/monitoring-stack
Create Prometheus configuration
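Configure Prometheus to scrape metrics from itself, Node Exporter, and cAdvisor, and to send alerts to Alertmanager. This configuration covers scrape and rule-evaluation intervals, the alert rule file, and the scrape targets; data retention is set later through a command-line flag in the Docker Compose file.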
# prometheus/prometheus.yml
# Global defaults applied to every scrape job and rule group.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'production-monitor'

# Alerting rules, resolved relative to this file's directory.
rule_files:
  - "alert_rules.yml"

# Where firing alerts are sent; "alertmanager" is the Compose service name.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          group: 'monitoring'

  # Host-level metrics from the node-exporter container.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          group: 'infrastructure'

  # Per-container metrics from the cadvisor container.
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          group: 'containers'
Create Prometheus alert rules
Define alerting rules for common infrastructure issues like high CPU usage, memory consumption, and service availability.
# prometheus/alert_rules.yml
groups:
  - name: infrastructure_alerts
    rules:
      # rate() rather than irate(): irate only looks at the last two samples
      # and is too spiky for an alert that must hold for 5 minutes.
      # "by (instance)" evaluates each host separately instead of averaging
      # every scraped host into a single value.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes"

      # Used memory as a percentage of total, based on MemAvailable.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 5 minutes"

      # "up" is 0 for any target whose last scrape failed.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.instance }} has been down for more than 2 minutes"
Create Alertmanager configuration
Configure Alertmanager to handle alert routing and notifications. This example includes email notifications and alert grouping.
# alertmanager/alertmanager.yml
global:
  # Placeholder SMTP settings: inside the container "localhost" is the
  # Alertmanager container itself, so point this at a real mail relay.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'

# Every alert goes to the single receiver below, batched by alert name.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'admin@example.com'
        subject: 'Alert: {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ .Labels }}
          {{ end }}

# Suppress warning notifications while a critical alert with the same
# alertname/dev/instance labels is firing. Uses the matcher syntax that
# replaces the deprecated source_match/target_match fields.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'dev', 'instance']
Create Docker Compose configuration
Define the complete monitoring stack with Prometheus, Grafana, Alertmanager, Node Exporter, and cAdvisor for comprehensive monitoring coverage.
# docker-compose.yml
# The top-level "version" key is omitted: the Compose plugin ignores it and
# only prints an "obsolete" warning.
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      # Configuration is mounted read-only; only the TSDB directory is writable.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      # Allows config reload via HTTP POST to /-/reload.
      - '--web.enable-lifecycle'
      # --web.enable-admin-api is intentionally not set: port 9090 is published
      # without authentication and the admin API can delete stored data.
    restart: unless-stopped
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./data/grafana:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Override by exporting GF_SECURITY_ADMIN_PASSWORD (or via a .env file)
      # before "docker compose up"; falls back to the tutorial default.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin123}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
    restart: unless-stopped
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    restart: unless-stopped
    networks:
      - monitoring

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      # Host filesystems mounted read-only so the exporter reports host metrics.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    networks:
      - monitoring

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    privileged: true
    restart: unless-stopped
    networks:
      - monitoring

# Shared bridge network; services reach each other by service name.
networks:
  monitoring:
    driver: bridge
Create Grafana provisioning configuration
Set up Grafana to automatically provision Prometheus as a data source and load default dashboards on startup.
# Create the datasource and dashboard provisioning directories under grafana/provisioning (relative to the current directory).
mkdir -p grafana/provisioning/{datasources,dashboards}
# Grafana datasource provisioning file (place under grafana/provisioning/datasources/).
apiVersion: 1

datasources:
  # "prometheus" in the URL is the Compose service name.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
# Grafana dashboard provider file (place under grafana/provisioning/dashboards/).
apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      # Directory inside the container that is scanned for dashboard JSON files.
      path: /etc/grafana/provisioning/dashboards
Set correct permissions for data directories
Configure proper ownership and permissions for persistent data directories. Grafana runs as UID 472, while Prometheus uses UID 65534.
# Grafana's container user (UID 472) must own its data directory.
sudo chown -R 472:472 data/grafana
# Prometheus runs as UID 65534 and must own its TSDB directory.
sudo chown -R 65534:65534 data/prometheus
# sudo is required here: after the chown above, the subdirectories are no
# longer owned by your user, so a plain chmod -R would be refused.
sudo chmod -R 755 data/
Deploy the monitoring stack
Start all services using Docker Compose. This will pull the required images and start the complete monitoring stack.
# Create and start every service from docker-compose.yml in the background.
docker compose up -d
# List the stack's containers with their current state.
docker compose ps
Configure firewall rules
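Open the necessary ports for accessing Grafana, Prometheus, and Alertmanager. Note that ports published by Docker are handled by Docker's own iptables rules and are typically reachable even without these ufw rules, so do not rely on ufw alone to restrict access to the stack.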
# Allow inbound TCP to each web interface; the comment labels the rule in ufw's rule listing.
sudo ufw allow 3000/tcp comment 'Grafana'
sudo ufw allow 9090/tcp comment 'Prometheus'
sudo ufw allow 9093/tcp comment 'Alertmanager'
# Reload ufw so the rule set is re-read.
sudo ufw reload
Import system monitoring dashboard
Create a basic system monitoring dashboard that displays CPU, memory, disk, and network metrics from Node Exporter.
{
  "id": null,
  "title": "System Overview",
  "tags": ["system", "overview"],
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "CPU Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
          "legendFormat": "CPU Usage %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100
        }
      },
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
    },
    {
      "id": 2,
      "title": "Memory Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
          "legendFormat": "Memory Usage %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100
        }
      },
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
    }
  ],
  "time": {"from": "now-1h", "to": "now"},
  "refresh": "5s"
}
Verify your setup
Check that all services are running and accessible through their web interfaces.
# Show the state of every container in the stack.
docker compose ps
# Query the Prometheus health endpoint.
curl -s http://localhost:9090/-/healthy
# Query the Grafana health API.
curl -s http://localhost:3000/api/health
# Confirm something is listening on the Grafana, Prometheus and Alertmanager ports.
ss -tlnp | grep -E ':(3000|9090|9093)'
# Print the Prometheus version: try a host binary first, otherwise ask the container.
prometheus --version 2>/dev/null || docker exec prometheus prometheus --version
Access the web interfaces:
- Grafana: http://your-server-ip:3000 (admin/admin123)
- Prometheus: http://your-server-ip:9090
- Alertmanager: http://your-server-ip:9093
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Grafana shows "Permission denied" errors | Incorrect ownership of data directory | sudo chown -R 472:472 data/grafana |
| Prometheus fails to start | Invalid YAML configuration | Check the config with promtool from the image: docker compose run --rm --entrypoint promtool prometheus check config /etc/prometheus/prometheus.yml |
| Node Exporter metrics missing | Container lacks host filesystem access | Verify volume mounts in docker-compose.yml |
| Containers restart continuously | Port conflicts or resource limits | docker compose logs [service-name] |
| Cannot access web interfaces | Firewall blocking ports | Open ports 3000, 9090, 9093 in firewall |
| Alerts not firing | Alertmanager not connected to Prometheus | Check alerting section in prometheus.yml |
Next steps
- Install and configure Loki for centralized log aggregation with Grafana integration
- Install and configure Jaeger for distributed tracing with Elasticsearch backend
- Configure Prometheus Blackbox Exporter for endpoint monitoring
- Set up Grafana alerting with Slack and Teams integration
- Monitor Kubernetes cluster with Prometheus Operator
Automated install script
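Run this script to automate the core installation: it installs Docker, writes the Prometheus, Alertmanager, and Docker Compose configuration files, and starts the stack. It does not set up Grafana provisioning, the example dashboard, or firewall rules.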
#!/usr/bin/env bash
set -euo pipefail
# Colors for output
# Escape sequences stored as literal text; the output helpers print them with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Configuration
# Target directory: first positional argument, falling back to /opt/monitoring-stack.
INSTALL_DIR="${1:-/opt/monitoring-stack}"
# NOTE(review): COMPOSE_VERSION is not referenced anywhere else in the visible script — confirm whether it is still needed.
COMPOSE_VERSION="v2.24.1"
# Cleanup function for error handling
#
# Record, before anything is created, whether the target directory already
# existed. A failed run must never delete a directory (and any monitoring
# data in it) that was there before the script started.
if [[ -e "$INSTALL_DIR" ]]; then
    INSTALL_DIR_PREEXISTING=1
else
    INSTALL_DIR_PREEXISTING=0
fi

# ERR traps are not inherited by shell functions unless errtrace is enabled.
# All real work happens inside main(), so without this the trap below would
# never fire for a failing command.
set -o errtrace

# Report the failure, remove the install directory only if this run created
# it, and exit with status 1.
cleanup() {
    # Disarm the trap so a failure during cleanup cannot re-enter it.
    trap - ERR
    echo -e "${RED}[ERROR] Installation failed. Cleaning up...${NC}"
    if [[ "$INSTALL_DIR_PREEXISTING" -eq 0 && -d "$INSTALL_DIR" ]]; then
        rm -rf -- "$INSTALL_DIR"
    elif [[ -d "$INSTALL_DIR" ]]; then
        echo -e "${RED}[ERROR] $INSTALL_DIR existed before this run and was left in place.${NC}"
    fi
    exit 1
}
trap cleanup ERR
# Print the command-line synopsis with an example invocation, then exit 1.
usage() {
    cat <<USAGE
Usage: $0 [install_directory]
Example: $0 /opt/monitoring-stack
USAGE
    exit 1
}
# Print an informational message in green.
# $1 - message text (backslash escapes are interpreted, as with echo -e)
log() {
    printf '%b\n' "${GREEN}$1${NC}"
}
# Print a warning message in yellow.
# $1 - message text (backslash escapes are interpreted, as with echo -e)
warn() {
    printf '%b\n' "${YELLOW}$1${NC}"
}
# Print an error message in red.
# $1 - message text (backslash escapes are interpreted by echo -e)
# Written to stderr so diagnostics stay out of captured or piped stdout.
error() {
    echo -e "${RED}$1${NC}" >&2
}
# Abort with exit status 1 unless the effective user is root (EUID 0).
check_privileges() {
    # Guard clause: nothing to do when already privileged.
    [[ $EUID -eq 0 ]] && return 0

    error "This script must be run as root or with sudo"
    exit 1
}
# Refresh the apt index and, if that succeeds, upgrade installed packages.
# This is a function because PKG_UPDATE is expanded as plain command words
# (`$PKG_UPDATE`): a value like "apt update && apt upgrade -y" would hand the
# literal "&&" to apt as an argument instead of chaining two commands.
_apt_update_upgrade() {
    apt update && apt upgrade -y
}

# Detect distribution and set package manager
#
# Sources /etc/os-release and, based on its ID field, sets the globals
#   PKG_MGR     - package manager family (apt, dnf or yum)
#   PKG_INSTALL - command prefix for non-interactive package installation
#   PKG_UPDATE  - command that brings the system packages up to date
# Exits with status 1 when /etc/os-release is missing or the distribution is
# not one of the supported IDs.
detect_distro() {
    if [[ ! -f /etc/os-release ]]; then
        error "Cannot detect distribution. /etc/os-release not found."
        exit 1
    fi

    # Defines ID (among other variables) in the script's global scope.
    . /etc/os-release

    # ${ID:-unknown} keeps `set -u` from aborting with an opaque
    # "unbound variable" message if os-release has no ID field.
    case "${ID:-unknown}" in
        ubuntu|debian)
            PKG_MGR="apt"
            PKG_INSTALL="apt install -y"
            PKG_UPDATE="_apt_update_upgrade"
            ;;
        almalinux|rocky|centos|rhel|ol|fedora)
            PKG_MGR="dnf"
            PKG_INSTALL="dnf install -y"
            PKG_UPDATE="dnf update -y"
            ;;
        amzn)
            PKG_MGR="yum"
            PKG_INSTALL="yum install -y"
            PKG_UPDATE="yum update -y"
            ;;
        *)
            error "Unsupported distribution: ${ID:-unknown}"
            exit 1
            ;;
    esac
}
# Install Docker based on distribution
#
# Adds Docker's package repository for the detected distribution and installs
# Docker Engine, the CLI, containerd and the Compose plugin.
# Reads the globals PKG_MGR and PKG_INSTALL, plus ID as sourced from
# /etc/os-release by detect_distro.
install_docker() {
    local distro_id="${ID:?install_docker needs ID from /etc/os-release; run detect_distro first}"
    local keyring="/usr/share/keyrings/docker-archive-keyring.gpg"
    local repo_distro

    case "$PKG_MGR" in
        apt)
            $PKG_INSTALL ca-certificates curl gnupg lsb-release
            # Docker publishes separate apt repositories for Ubuntu and Debian;
            # using the Ubuntu one on Debian fails because the Debian codename
            # does not exist there. --batch --yes lets a rerun overwrite the
            # existing keyring instead of stopping at an overwrite prompt.
            curl -fsSL "https://download.docker.com/linux/${distro_id}/gpg" \
                | gpg --batch --yes --dearmor -o "$keyring"
            echo "deb [arch=$(dpkg --print-architecture) signed-by=${keyring}] https://download.docker.com/linux/${distro_id} $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list
            apt update
            $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
            ;;
        dnf|yum)
            # Fedora and RHEL have their own Docker repositories; the other
            # supported RPM-based distributions use the CentOS one.
            case "$distro_id" in
                fedora) repo_distro="fedora" ;;
                rhel)   repo_distro="rhel" ;;
                *)      repo_distro="centos" ;;
            esac
            $PKG_INSTALL yum-utils
            yum-config-manager --add-repo "https://download.docker.com/linux/${repo_distro}/docker-ce.repo"
            $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
            ;;
    esac
}
# Entry point: installs Docker, writes the Prometheus, Alertmanager and
# Docker Compose configuration into INSTALL_DIR, fixes ownership and starts
# the stack. Positional arguments are accepted but not read here; INSTALL_DIR
# is resolved at the top of the script.
main() {
    check_privileges
    detect_distro

    log "[1/8] Updating system packages..."
    $PKG_UPDATE

    log "[2/8] Installing Docker and Docker Compose..."
    install_docker

    log "[3/8] Enabling and starting Docker service..."
    systemctl enable --now docker

    log "[4/8] Creating project directory structure..."
    mkdir -p "$INSTALL_DIR"/{prometheus,grafana,alertmanager}
    mkdir -p "$INSTALL_DIR"/data/{prometheus,grafana}

    log "[5/8] Creating Prometheus configuration..."
    # All heredocs use a quoted delimiter ('EOF'), so their bodies are written
    # verbatim: no shell expansion and no backslash processing.
    cat > "$INSTALL_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'production-monitor'

rule_files:
  - "alert_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          group: 'monitoring'

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          group: 'infrastructure'

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          group: 'containers'
EOF

    # $labels must not be backslash-escaped here: inside a quoted heredoc the
    # backslash would be written to the file and break the rule's template.
    cat > "$INSTALL_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes"

      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 5 minutes"

      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.instance }} has been down for more than 2 minutes"
EOF

    log "[6/8] Creating Alertmanager configuration..."
    cat > "$INSTALL_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'admin@example.com'
        subject: 'Alert: {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF

    log "[7/8] Creating Docker Compose configuration..."
    cat > "$INSTALL_DIR/docker-compose.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus:/etc/prometheus
      - ./data/prometheus:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./data/grafana:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    restart: unless-stopped
EOF

    # Set proper permissions
    chown -R root:root "$INSTALL_DIR"
    chmod -R 755 "$INSTALL_DIR"
    find "$INSTALL_DIR" -type f -exec chmod 644 {} \;
    chmod 755 "$INSTALL_DIR"/data/{prometheus,grafana}
    # The containers do not run as root: Grafana runs as UID 472 and
    # Prometheus as UID 65534. With root-owned 755 data directories neither
    # service can write its data, so hand each directory to its user.
    chown -R 472:472 "$INSTALL_DIR/data/grafana"
    chown -R 65534:65534 "$INSTALL_DIR/data/prometheus"

    log "[8/8] Starting monitoring stack..."
    cd "$INSTALL_DIR"
    docker compose up -d

    log "Installation completed successfully!"
    echo ""
    log "Access URLs:"
    log "- Prometheus: http://localhost:9090"
    log "- Grafana: http://localhost:3000 (admin/admin123)"
    log "- Alertmanager: http://localhost:9093"
    log "- Node Exporter: http://localhost:9100"
    log "- cAdvisor: http://localhost:8080"
    echo ""
    warn "Remember to:"
    warn "1. Change default Grafana password"
    warn "2. Configure email settings in alertmanager.yml"
    warn "3. Update firewall rules if needed"
}

main "$@"
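Review the script before running. It must be run as root, so execute it with: sudo bash install.sh [install_directory] (the directory defaults to /opt/monitoring-stack).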