Learn to build production-grade Grafana dashboards with advanced templating, custom Prometheus queries, sophisticated alerting rules, and automated provisioning for scalable monitoring infrastructure.
Prerequisites
- Running Prometheus server with metric collection
- Grafana instance with admin access
- Basic understanding of PromQL queries
- Sufficient disk space for dashboard and alert configurations
What this solves
Advanced Grafana dashboards with Prometheus integration provide comprehensive monitoring capabilities for production environments. This tutorial covers creating dynamic dashboards with variables and templating, writing custom PromQL queries for complex metrics analysis, configuring sophisticated alerting rules with multiple notification channels, and implementing dashboard provisioning for version control and automated deployment.
Prerequisites
You need a working Prometheus server collecting metrics and a Grafana instance with admin access. Both services should be accessible and properly configured with basic authentication. Ensure you have sufficient disk space for dashboard configurations and alert rule storage.
Step-by-step configuration
Configure Prometheus data source with advanced settings
Set up the Prometheus data source in Grafana with optimized query settings and authentication. This configuration enables efficient metric queries and proper security integration.
# Register the Prometheus data source through the Grafana HTTP API.
# The explicit "uid" gives the data source a stable identifier; the alert
# rules in this guide reference it as `datasourceUid: prometheus-advanced`.
# Replace admin:admin with your own credentials before running.
curl -X POST http://admin:admin@localhost:3000/api/datasources \
  -H "Content-Type: application/json" \
  -d '{
    "name": "prometheus-advanced",
    "uid": "prometheus-advanced",
    "type": "prometheus",
    "url": "http://localhost:9090",
    "access": "proxy",
    "basicAuth": false,
    "jsonData": {
      "timeInterval": "15s",
      "queryTimeout": "60s",
      "httpMethod": "POST",
      "customQueryParameters": "",
      "manageAlerts": true,
      "exemplarTraceIdDestinations": []
    }
  }'
Create dashboard provisioning directory structure
Set up the directory structure for dashboard provisioning to enable version control and automated deployment of dashboard configurations.
# Provisioning configuration lives under /etc/grafana/provisioning,
# dashboard JSON files under /var/lib/grafana/dashboards.
sudo mkdir -p \
  /etc/grafana/provisioning/dashboards \
  /etc/grafana/provisioning/datasources \
  /etc/grafana/provisioning/alerting \
  /var/lib/grafana/dashboards
# Hand both trees to the grafana service account and open them for reading.
sudo chown -R grafana:grafana /etc/grafana/provisioning /var/lib/grafana/dashboards
sudo chmod -R 755 /etc/grafana/provisioning /var/lib/grafana/dashboards
Configure dashboard provisioning provider
Create the dashboard provisioning configuration to automatically load dashboards from the file system and enable version control integration.
# Dashboard providers: each entry tells Grafana to load dashboard JSON files
# from `options.path` and to re-scan that path every `updateIntervalSeconds`.
apiVersion: 1
providers:
  - name: 'production-dashboards'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: true
  # NOTE(review): this path is nested inside the path of the provider above,
  # so files in it are likely picked up by both providers - confirm this is
  # intended or move one of the two trees.
  - name: 'infrastructure-dashboards'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: false
    options:
      path: /var/lib/grafana/dashboards/infrastructure
      foldersFromFilesStructure: true
Create advanced system monitoring dashboard with templating
Build a comprehensive system monitoring dashboard with variables for dynamic server selection and advanced templating features for scalable monitoring.
# Directory for the infrastructure dashboards; this is the path configured for
# the 'infrastructure-dashboards' provider.
sudo mkdir -p /var/lib/grafana/dashboards/infrastructure
{
  "id": null,
  "title": "Advanced System Monitoring",
  "tags": ["infrastructure", "system", "monitoring"],
  "timezone": "browser",
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "name": "instance",
        "type": "query",
        "label": "Instance",
        "multi": true,
        "includeAll": true,
        "allValue": ".*",
        "query": "label_values(up, instance)",
        "datasource": "prometheus-advanced",
        "refresh": 1,
        "regex": "",
        "sort": 1
      },
      {
        "name": "job",
        "type": "query",
        "label": "Job",
        "multi": true,
        "includeAll": true,
        "allValue": ".*",
        "query": "label_values(up{instance=~\"$instance\"}, job)",
        "datasource": "prometheus-advanced",
        "refresh": 1,
        "regex": "",
        "sort": 1
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "title": "CPU Usage by Instance",
      "type": "timeseries",
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
      "targets": [
        {
          "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\",job=~\"$job\"}[5m])) * 100)",
          "legendFormat": "{{instance}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 70},
              {"color": "red", "value": 90}
            ]
          }
        }
      }
    },
    {
      "id": 2,
      "title": "Memory Usage",
      "type": "timeseries",
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
      "targets": [
        {
          "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\",job=~\"$job\"} / node_memory_MemTotal_bytes{instance=~\"$instance\",job=~\"$job\"})) * 100",
          "legendFormat": "{{instance}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 80},
              {"color": "red", "value": 95}
            ]
          }
        }
      }
    }
  ]
}
Create application performance dashboard with custom queries
Build an application-focused dashboard with advanced PromQL queries for request rates, error tracking, and performance analysis across multiple services.
{
  "id": null,
  "uid": "app-perf",
  "title": "Application Performance Monitoring",
  "tags": ["application", "performance", "sla"],
  "timezone": "browser",
  "refresh": "15s",
  "time": {
    "from": "now-30m",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "name": "service",
        "type": "query",
        "label": "Service",
        "multi": true,
        "includeAll": true,
        "query": "label_values(http_requests_total, service)",
        "datasource": "prometheus-advanced"
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "title": "Request Rate (RPS)",
      "type": "stat",
      "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{service=~\"$service\"}[5m]))",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "color": {"mode": "thresholds"},
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 100},
              {"color": "red", "value": 500}
            ]
          }
        }
      }
    },
    {
      "id": 2,
      "title": "Error Rate",
      "type": "stat",
      "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
      "targets": [
        {
          "expr": "(sum(rate(http_requests_total{service=~\"$service\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=~\"$service\"}[5m]))) * 100",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "color": {"mode": "thresholds"},
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 1},
              {"color": "red", "value": 5}
            ]
          }
        }
      }
    },
    {
      "id": 3,
      "title": "Response Time P95",
      "type": "timeseries",
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 4},
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\"}[5m])) by (le, service))",
          "legendFormat": "{{service}} - P95",
          "refId": "A"
        },
        {
          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\"}[5m])) by (le, service))",
          "legendFormat": "{{service}} - P50",
          "refId": "B"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 0.5},
              {"color": "red", "value": 2}
            ]
          }
        }
      }
    }
  ]
}
Configure alert rule provisioning
Set up alert rule provisioning to manage alerting rules as code and enable version control for alert configurations.
# Infrastructure alert rules, provisioned as a single rule group.
# Each rule alerts directly on a PromQL expression that already contains its
# threshold, so the query must be an instant query and an empty result means
# "nothing is over the threshold" rather than "no data".
apiVersion: 1
groups:
  - name: infrastructure-alerts
    orgId: 1
    folder: Infrastructure
    interval: 1m
    rules:
      - uid: cpu-high-usage
        title: High CPU Usage
        condition: A
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90'
              # Alert conditions need a single value per series, not a time series.
              instant: true
              range: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
        # Threshold lives in the query: no series returned == healthy.
        noDataState: OK
        execErrState: Alerting
        for: 5m
        annotations:
          description: "CPU usage is above 90% on {{ $labels.instance }}"
          runbook_url: "https://example.com/runbooks/high-cpu"
          summary: "High CPU usage detected"
        labels:
          severity: warning
          team: infrastructure
      - uid: memory-high-usage
        title: High Memory Usage
        condition: A
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95'
              instant: true
              range: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
        noDataState: OK
        execErrState: Alerting
        for: 3m
        annotations:
          description: "Memory usage is above 95% on {{ $labels.instance }}"
          runbook_url: "https://example.com/runbooks/high-memory"
          summary: "High memory usage detected"
        labels:
          severity: critical
          team: infrastructure
Configure notification channels
Define the notification policy tree that routes each alert, based on its team and severity labels, to the Slack, email, and webhook contact points configured in the next step.
# Notification policy tree. The root policy catches everything; nested routes
# override the receiver and timing for alerts whose labels match.
apiVersion: 1
policies:
  - orgId: 1
    receiver: default-receiver
    group_by:
      - grafana_folder
      - alertname
    group_wait: 10s
    group_interval: 10s
    repeat_interval: 1h
    routes:
      - receiver: infrastructure-slack
        object_matchers:
          - ['team', '=', 'infrastructure']
        group_by: ['alertname', 'instance']
        group_wait: 5s
        group_interval: 5s
        repeat_interval: 30m
        routes:
          # Critical infrastructure alerts are escalated to the pager.
          - receiver: infrastructure-pager
            object_matchers:
              - ['severity', '=', 'critical']
            group_wait: 0s
            repeat_interval: 15m
      - receiver: application-team
        object_matchers:
          - ['team', '=', 'application']
        group_by: ['alertname', 'service']
        group_wait: 10s
        repeat_interval: 1h
Configure contact points for notifications
Create contact points for different notification channels with proper authentication and formatting for effective alert delivery.
# Contact points referenced by the notification policies.
# Template note: per-alert fields (.Labels, .Annotations) only exist inside
# `range .Alerts`; outside the loop use .GroupLabels / .CommonLabels /
# .CommonAnnotations.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: default-receiver
    receivers:
      - uid: default-email
        type: email
        settings:
          addresses: "admin@example.com"
          subject: "Grafana Alert: {{ .GroupLabels.alertname }}"
          message: |
            {{ range .Alerts }}
            Alert: {{ .Annotations.summary }}
            Description: {{ .Annotations.description }}
            Labels:
            {{ range .Labels.SortedPairs }}
              {{ .Name }}: {{ .Value }}
            {{ end }}
            {{ end }}
  - orgId: 1
    name: infrastructure-slack
    receivers:
      - uid: infra-slack
        type: slack
        settings:
          url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
          recipient: "#infrastructure-alerts"
          username: "Grafana"
          title: "{{ .GroupLabels.alertname }}"
          text: |
            {{ range .Alerts }}
            Alert: {{ .Annotations.summary }}
            Description: {{ .Annotations.description }}
            Instance: {{ .Labels.instance }}
            Runbook: {{ .Annotations.runbook_url }}
            {{ end }}
  - orgId: 1
    name: infrastructure-pager
    receivers:
      # The native PagerDuty integration builds the Events API payload itself;
      # a generic webhook cannot produce the body PagerDuty expects.
      - uid: infra-pagerduty
        type: pagerduty
        settings:
          integrationKey: "YOUR_INTEGRATION_KEY"
          severity: critical
          summary: "{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}"
  - orgId: 1
    name: application-team
    receivers:
      - uid: app-email
        type: email
        settings:
          addresses: "developers@example.com"
          subject: "Application Alert: {{ .GroupLabels.alertname }}"
          message: |
            Service: {{ .CommonLabels.service }}
            Alert: {{ .CommonAnnotations.summary }}
            Description: {{ .CommonAnnotations.description }}
            Dashboard: http://grafana.example.com/d/app-perf
            Runbook: {{ .CommonAnnotations.runbook_url }}
Create application alert rules
Configure application-specific alert rules for monitoring service health, error rates, and performance thresholds with appropriate severity levels.
# Application alert rules. As with the infrastructure rules, every threshold
# is part of the PromQL expression, so each query is an instant query and an
# empty result is treated as healthy (noDataState: OK).
apiVersion: 1
groups:
  - name: application-alerts
    orgId: 1
    folder: Applications
    interval: 30s
    rules:
      - uid: high-error-rate
        title: High Error Rate
        condition: A
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: '(sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service)) * 100 > 5'
              instant: true
              range: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
        noDataState: OK
        execErrState: Alerting
        for: 2m
        annotations:
          description: "Error rate is {{ $value }}% for service {{ $labels.service }}"
          runbook_url: "https://example.com/runbooks/high-error-rate"
          summary: "High error rate detected in {{ $labels.service }}"
        labels:
          severity: critical
          team: application
      - uid: slow-response-time
        title: Slow Response Time
        condition: A
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service)) > 2'
              instant: true
              range: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
        noDataState: OK
        execErrState: Alerting
        for: 5m
        annotations:
          description: "95th percentile response time is {{ $value }}s for service {{ $labels.service }}"
          runbook_url: "https://example.com/runbooks/slow-response"
          summary: "Slow response time in {{ $labels.service }}"
        labels:
          severity: warning
          team: application
      - uid: low-request-rate
        title: Low Request Rate
        condition: A
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: 'sum(rate(http_requests_total[5m])) by (service) < 1'
              instant: true
              range: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
        # With "< 1" in the query, services with normal traffic return no
        # series; treating that as Alerting would fire while traffic is fine.
        noDataState: OK
        # NoData is not a valid execErrState value.
        execErrState: Error
        for: 10m
        annotations:
          description: "Request rate is {{ $value }} RPS for service {{ $labels.service }}"
          runbook_url: "https://example.com/runbooks/low-traffic"
          summary: "Unusually low traffic in {{ $labels.service }}"
        labels:
          severity: warning
          team: application
Configure Grafana alerting settings
Update the main Grafana configuration to enable alerting features and set appropriate retention policies for alert history and evaluation intervals.
[server]
# Base URL used for links in alert notifications
root_url = http://grafana.example.com

[alerting]
# Legacy alerting must stay off: it cannot be enabled together with
# unified alerting.
enabled = false

[unified_alerting]
# Enable unified alerting
enabled = true
# Evaluate alert rules on this instance. Setting this to false stops rule
# evaluation, so no alert would ever fire.
execute_alerts = true
# Alert evaluation timeout
evaluation_timeout = 30s
# Maximum number of attempts per alert rule evaluation
max_attempts = 3
# Minimum interval between alert rule evaluations
min_interval = 10s

[unified_alerting.state_history.annotations]
# Alert state history retention (0 keeps all annotations).
# NOTE(review): this section exists in recent Grafana releases only - confirm
# against the version you run.
max_annotations_to_keep = 0

[unified_alerting.screenshots]
# Enable alert rule screenshots
# NOTE(review): screenshots need an image renderer to be available - confirm.
capture = true
capture_timeout = 10s
max_concurrent_screenshots = 2
upload_external_image_storage = true
Restart and reload Grafana configuration
Restart the Grafana service to apply the new configuration and verify that all provisioned dashboards and alert rules are loaded correctly.
# Restart Grafana so the new configuration is applied
sudo systemctl restart grafana-server
# Show the service state after the restart
sudo systemctl status grafana-server
Verify that provisioning is working correctly by checking the Grafana logs for any errors.
# Follow the grafana-server journal, starting five minutes back (-f keeps
# streaming new entries; stop with Ctrl+C)
sudo journalctl -u grafana-server -f --since "5 minutes ago"
Create custom PromQL queries for business metrics
Implement advanced PromQL queries that provide business-relevant metrics and SLA monitoring for comprehensive application observability.
{
  "id": null,
  "title": "Business Metrics Dashboard",
  "tags": ["business", "sla", "kpi"],
  "timezone": "browser",
  "refresh": "1m",
  "time": {
    "from": "now-4h",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "name": "environment",
        "type": "query",
        "label": "Environment",
        "query": "label_values(http_requests_total, environment)",
        "datasource": "prometheus-advanced",
        "current": {"value": "production", "text": "production"}
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "title": "SLA Compliance (99.9% target)",
      "type": "stat",
      "gridPos": {"h": 4, "w": 8, "x": 0, "y": 0},
      "targets": [
        {
          "expr": "(1 - (sum(rate(http_requests_total{status=~\"5..\",environment=~\"$environment\"}[1h])) / sum(rate(http_requests_total{environment=~\"$environment\"}[1h])))) * 100",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "decimals": 3,
          "color": {"mode": "thresholds"},
          "thresholds": {
            "steps": [
              {"color": "red", "value": null},
              {"color": "yellow", "value": 99.5},
              {"color": "green", "value": 99.9}
            ]
          }
        }
      }
    },
    {
      "id": 2,
      "title": "Apdex Score (T=500ms)",
      "type": "stat",
      "gridPos": {"h": 4, "w": 8, "x": 8, "y": 0},
      "targets": [
        {
          "expr": "(sum(rate(http_request_duration_seconds_bucket{le=\"0.5\",environment=~\"$environment\"}[5m])) + sum(rate(http_request_duration_seconds_bucket{le=\"2.0\",environment=~\"$environment\"}[5m]))) / 2 / sum(rate(http_request_duration_seconds_count{environment=~\"$environment\"}[5m]))",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 3,
          "min": 0,
          "max": 1,
          "color": {"mode": "thresholds"},
          "thresholds": {
            "steps": [
              {"color": "red", "value": null},
              {"color": "yellow", "value": 0.7},
              {"color": "green", "value": 0.85}
            ]
          }
        }
      }
    }
  ]
}
Verify your setup
Test that your advanced Grafana configuration is working properly by checking dashboard loading, alert rule evaluation, and notification delivery.
# Check Grafana service status
sudo systemctl status grafana-server

# Verify dashboard provisioning
# (the URL is quoted so the shell does not treat "?" as a glob character)
curl -H "Authorization: Bearer YOUR_API_KEY" \
  "http://localhost:3000/api/search?type=dash-db" | jq '.[] | .title'

# Check alert rules status
curl -H "Authorization: Bearer YOUR_API_KEY" \
  "http://localhost:3000/api/v1/provisioning/alert-rules" | jq '.[] | {title: .title, condition: .condition}'

# Test notification channels
# NOTE(review): /api/alerts/test looks like a legacy-alerting endpoint; confirm
# it is still served by your Grafana version when unified alerting is enabled.
curl -X POST -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"message": "Test alert from Grafana"}' \
  "http://localhost:3000/api/alerts/test"
Access your dashboards through the Grafana web interface and verify that variables are populated correctly and panels display appropriate data.
# Check provisioning logs
sudo journalctl -u grafana-server | grep -i provision

# Verify alert manager integration
# (the built-in Alertmanager serves its status under api/v2 and, like the other
# API calls above, needs credentials)
curl -H "Authorization: Bearer YOUR_API_KEY" \
  "http://localhost:3000/api/alertmanager/grafana/api/v2/status"
Advanced dashboard techniques
You can enhance your dashboards further with these advanced techniques. Use transformations to reshape query results before they are visualized, implement drill-down capabilities with dashboard and panel links, and create custom panel plugins for specific visualization needs. The Kubernetes monitoring tutorial provides additional examples of complex dashboard configurations for container environments.
Consider implementing dashboard templates that can be easily duplicated for different environments or services. This approach enables consistent monitoring patterns across your infrastructure while maintaining flexibility for specific requirements. The Prometheus and Grafana monitoring stack tutorial covers additional deployment patterns and configuration strategies.
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Dashboards not loading after provisioning | Incorrect file permissions or invalid JSON syntax | Check file permissions with ls -la /var/lib/grafana/dashboards/ and validate JSON syntax |
| Alert rules not evaluating | Query returns no data or invalid PromQL syntax | Test queries in Prometheus UI and verify metric availability |
| Notifications not being sent | Incorrect contact point configuration or network issues | Test notification channels manually and check network connectivity |
| Template variables showing no values | Prometheus data source not configured or no matching metrics | Verify data source connection and ensure metrics are being collected |
| High memory usage in Grafana | Too many concurrent queries or large result sets | Implement query result caching and optimize PromQL queries with appropriate time ranges |
| Alert rule evaluation timeouts | Complex queries taking too long to execute | Optimize PromQL queries and increase evaluation timeout in configuration |
Next steps
- Implement Prometheus federation for multi-cluster monitoring - Scale your monitoring across multiple Prometheus instances
- Configure Thanos Ruler for distributed alerting - Set up global alerting across federated Prometheus deployments
- Set up Grafana Enterprise SSO authentication - Implement enterprise authentication and authorization
- Create custom Grafana plugins for business metrics - Build specialized visualizations for your specific needs
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail

# Advanced Grafana Dashboards and Alerting with Prometheus Integration
# Production-quality installation script

# ANSI escape sequences used by the logging helpers (kept as literal
# backslash sequences; they are interpreted when printed).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
readonly RED GREEN YELLOW NC

# Settings taken from the positional arguments, each with a local default.
PROMETHEUS_URL=${1:-http://localhost:9090}
GRAFANA_ADMIN_USER=${2:-admin}
GRAFANA_ADMIN_PASS=${3:-admin}
GRAFANA_URL=${4:-http://localhost:3000}
# Usage message
# Print the command-line synopsis and an example to stdout, then exit with
# status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [prometheus_url] [grafana_admin_user] [grafana_admin_pass] [grafana_url]" \
    "Example: $0 http://localhost:9090 admin admin http://localhost:3000"
  exit 1
}
# Logging helpers. Each prints a colored level tag followed by the message in
# $1. Color codes are interpreted (%b); the message itself is printed verbatim
# (%s), so backslashes in it are not expanded.
# log writes to stdout; warn and error write to stderr so diagnostics do not
# mix with regular output.
log() { printf '%b[INFO]%b %s\n' "${GREEN:-}" "${NC:-}" "$1"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${NC:-}" "$1" >&2; }
error() { printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "$1" >&2; }
cleanup() {
warn "Script failed. Cleaning up..."
rm -f /tmp/grafana-*
}
trap cleanup ERR
# Check prerequisites
# Abort unless the script can do its job: it must run as root and systemctl
# must be available. curl and jq are only reported when missing, because the
# dependency installation step installs them - failing here would make that
# step unreachable.
check_prerequisites() {
  if [[ $EUID -ne 0 ]]; then
    error "This script must be run as root or with sudo"
    exit 1
  fi
  if ! command -v systemctl &> /dev/null; then
    error "Required command 'systemctl' not found"
    exit 1
  fi
  local cmd
  for cmd in curl jq; do
    if ! command -v "$cmd" &> /dev/null; then
      warn "Command '$cmd' not found; it will be installed with the dependencies"
    fi
  done
}
# Detect OS and package manager
# Read /etc/os-release and derive the package manager settings for the
# distribution named by its ID field.
# Globals written: PKG_MGR, PKG_INSTALL, GRAFANA_CONFIG, GRAFANA_USER,
#                  FIREWALL_CMD
# Exits with status 1 when the file is missing or the ID is not supported.
detect_os() {
  [ -f /etc/os-release ] || {
    error "Cannot detect OS: /etc/os-release not found"
    exit 1
  }
  . /etc/os-release

  case "$ID" in
    ubuntu|debian)
      PKG_MGR="apt"
      FIREWALL_CMD="ufw"
      ;;
    almalinux|rocky|centos|rhel|ol|fedora)
      PKG_MGR="dnf"
      FIREWALL_CMD="firewall-cmd"
      ;;
    amzn)
      PKG_MGR="yum"
      FIREWALL_CMD="firewall-cmd"
      ;;
    *)
      error "Unsupported distribution: $ID"
      exit 1
      ;;
  esac

  # Identical on every supported distribution.
  PKG_INSTALL="$PKG_MGR install -y"
  GRAFANA_CONFIG="/etc/grafana/grafana.ini"
  GRAFANA_USER="grafana"
}
# Install dependencies
# Step 1/8: install curl and jq with the package manager selected in PKG_MGR.
# Globals read: PKG_MGR, PKG_INSTALL
install_dependencies() {
  echo "[1/8] Installing dependencies..."

  # Only apt needs its package index refreshed first.
  if [[ "$PKG_MGR" == "apt" ]]; then
    apt update
  fi

  if [[ "$PKG_MGR" == "apt" || "$PKG_MGR" == "dnf" || "$PKG_MGR" == "yum" ]]; then
    # PKG_INSTALL holds a command plus its flags, so it is split on purpose.
    $PKG_INSTALL curl jq
  fi

  log "Dependencies installed successfully"
}
# Create provisioning directories
# Step 2/8: create the provisioning and dashboard directories and hand them to
# the Grafana service account.
# Globals read: GRAFANA_USER
create_directories() {
  echo "[2/8] Creating Grafana provisioning directories..."

  local dir
  for dir in \
    /etc/grafana/provisioning/dashboards \
    /etc/grafana/provisioning/datasources \
    /etc/grafana/provisioning/alerting \
    /var/lib/grafana/dashboards/infrastructure; do
    mkdir -p "$dir"
  done

  local tree
  for tree in /etc/grafana/provisioning /var/lib/grafana/dashboards; do
    chown -R "$GRAFANA_USER:$GRAFANA_USER" "$tree"
    # Capital X adds the execute bit to directories only; a plain 755 would
    # also mark every existing configuration file as executable.
    chmod -R u=rwX,go=rX "$tree"
  done

  log "Directories created successfully"
}
# Configure Prometheus datasource
# Step 3/8: write the Prometheus datasource provisioning file.
# Globals read: PROMETHEUS_URL, GRAFANA_USER
configure_datasource() {
  local target=/etc/grafana/provisioning/datasources/prometheus.yaml

  echo "[3/8] Configuring Prometheus datasource..."

  # Unquoted heredoc delimiter on purpose: ${PROMETHEUS_URL} must expand so the
  # URL given on the command line is actually used.
  # The fixed uid is what the alert rule written by configure_alerting
  # references as datasourceUid.
  cat > "$target" << EOF
apiVersion: 1
datasources:
  - name: prometheus-advanced
    uid: prometheus-advanced
    type: prometheus
    access: proxy
    url: "${PROMETHEUS_URL}"
    basicAuth: false
    isDefault: true
    jsonData:
      timeInterval: "15s"
      queryTimeout: "60s"
      httpMethod: "POST"
      manageAlerts: true
    editable: true
EOF

  chown "$GRAFANA_USER:$GRAFANA_USER" "$target"
  chmod 644 "$target"
  log "Prometheus datasource configured"
}

# Configure dashboard provisioning
# Step 4/8: write the dashboard provider configuration.
# Globals read: GRAFANA_USER
configure_dashboard_provisioning() {
  local target=/etc/grafana/provisioning/dashboards/dashboards.yaml

  echo "[4/8] Configuring dashboard provisioning..."

  # NOTE(review): the second provider's path is nested inside the first one's,
  # so dashboards in it are likely loaded by both providers - confirm this is
  # intended.
  cat > "$target" << 'EOF'
apiVersion: 1
providers:
  - name: 'production-dashboards'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: true
  - name: 'infrastructure-dashboards'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: false
    options:
      path: /var/lib/grafana/dashboards/infrastructure
      foldersFromFilesStructure: true
EOF

  chown "$GRAFANA_USER:$GRAFANA_USER" "$target"
  chmod 644 "$target"
  log "Dashboard provisioning configured"
}

# Create advanced system monitoring dashboard
# Step 5/8: write the system monitoring dashboard.
# The file holds the bare dashboard model: file provisioning reads the title
# from the top level, so the {"dashboard": ..., "overwrite": ...} envelope used
# by the HTTP API must not be present.
# Globals read: GRAFANA_USER
create_system_dashboard() {
  local target=/var/lib/grafana/dashboards/infrastructure/system-monitoring.json

  echo "[5/8] Creating advanced system monitoring dashboard..."

  # Quoted delimiter: "$instance" is a Grafana variable, not a shell variable.
  cat > "$target" << 'EOF'
{
  "id": null,
  "title": "Advanced System Monitoring",
  "tags": ["infrastructure", "system", "monitoring"],
  "timezone": "browser",
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "name": "instance",
        "type": "query",
        "label": "Instance",
        "multi": true,
        "includeAll": true,
        "allValue": ".*",
        "query": "label_values(up, instance)",
        "datasource": "prometheus-advanced",
        "refresh": 1,
        "regex": "",
        "sort": 1
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "title": "CPU Usage",
      "type": "timeseries",
      "targets": [
        {
          "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m])) * 100)",
          "legendFormat": "CPU Usage %",
          "refId": "A"
        }
      ],
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
    },
    {
      "id": 2,
      "title": "Memory Usage",
      "type": "timeseries",
      "targets": [
        {
          "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\"} / node_memory_MemTotal_bytes{instance=~\"$instance\"})) * 100",
          "legendFormat": "Memory Usage %",
          "refId": "A"
        }
      ],
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
    }
  ]
}
EOF

  chown "$GRAFANA_USER:$GRAFANA_USER" "$target"
  chmod 644 "$target"
  log "System monitoring dashboard created"
}

# Configure alerting
# Step 6/8: write the alert rule provisioning file.
# The rule fires when average CPU usage stays above 80% for five minutes; the
# threshold is part of the query, so an empty result means "healthy".
# Globals read: GRAFANA_USER
configure_alerting() {
  local alerting_dir=/etc/grafana/provisioning/alerting
  # Written directly into the alerting directory, which is the documented
  # location for alerting provisioning files.
  local target="$alerting_dir/system-alerts.yaml"

  echo "[6/8] Configuring alerting rules..."
  mkdir -p "$alerting_dir"

  cat > "$target" << 'EOF'
apiVersion: 1
groups:
  - orgId: 1
    name: system-alerts
    folder: Infrastructure
    interval: 30s
    rules:
      - uid: high-cpu-usage
        title: High CPU Usage
        condition: A
        data:
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus-advanced
            model:
              expr: '100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
              instant: true
              range: false
              intervalMs: 30000
              refId: A
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          description: "CPU usage is above 80% for more than 5 minutes"
          summary: "High CPU usage detected"
EOF

  chown -R "$GRAFANA_USER:$GRAFANA_USER" "$alerting_dir"
  chmod 644 "$target"
  log "Alerting rules configured"
}
# Restart Grafana service
# Step 7/8: restart grafana-server, enable it at boot, then pause for five
# seconds to give the service time to come up.
restart_grafana() {
  echo "[7/8] Restarting Grafana service..."

  local action
  for action in restart enable; do
    systemctl "$action" grafana-server
  done

  # Wait for Grafana to start
  sleep 5

  log "Grafana service restarted"
}
# Verify installation
# Step 8/8: check that the service is active, that the HTTP API answers, and
# that the provisioning files written earlier exist.
# Globals read: GRAFANA_URL, GRAFANA_ADMIN_USER
# Exits with status 1 when the service is down or a required file is missing.
verify_installation() {
  echo "[8/8] Verifying installation..."

  # Check if Grafana is running
  if ! systemctl is-active --quiet grafana-server; then
    error "Grafana service is not running"
    exit 1
  fi

  # Check if Grafana is responding. The health endpoint answers 200 without a
  # login; the root URL redirects anonymous users to the login page and would
  # never report 200.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" "${GRAFANA_URL%/}/api/health") || http_code=""
  if [[ "$http_code" == "200" ]]; then
    log "Grafana web interface is accessible"
  else
    warn "Grafana web interface may not be accessible yet (still starting up)"
  fi

  # Verify provisioning files exist
  local file
  for file in \
    "/etc/grafana/provisioning/datasources/prometheus.yaml" \
    "/etc/grafana/provisioning/dashboards/dashboards.yaml" \
    "/var/lib/grafana/dashboards/infrastructure/system-monitoring.json"; do
    if [[ ! -f "$file" ]]; then
      error "Required file $file is missing"
      exit 1
    fi
  done

  log "All configuration files are in place"
  log "Advanced Grafana dashboards and alerting setup completed successfully!"
  log "Access Grafana at: $GRAFANA_URL"
  # The password is deliberately not echoed to the terminal or to logs.
  log "Log in as user: $GRAFANA_ADMIN_USER"
}
# Main execution
# Run the installation steps in order.
# Arguments: the script's command-line arguments. -h/--help as the first
# argument, or more than the four supported positional arguments, prints the
# usage text and exits.
main() {
  case "${1:-}" in
    -h|--help)
      usage
      ;;
  esac
  if (( $# > 4 )); then
    error "Expected at most 4 arguments, got $#"
    usage
  fi

  check_prerequisites
  detect_os
  install_dependencies
  create_directories
  configure_datasource
  configure_dashboard_provisioning
  create_system_dashboard
  configure_alerting
  restart_grafana
  verify_installation
}
# Run main function, forwarding every command-line argument unchanged
main "$@"
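Review the script before running. It must be run as root, so execute it with: sudo bash install.sh [prometheus_url] [grafana_admin_user] [grafana_admin_pass] [grafana_url]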