Observability Stack Installation
This guide covers deploying the complete observability stack for RavenmaskOS, providing metrics collection, log aggregation, distributed tracing, and visualization.
Overview
The observability stack consists of:
| Component | Purpose | Port | URL |
|---|---|---|---|
| Prometheus | Metrics collection and alerting | 9090 | prometheus.ravenhelm.dev |
| Loki | Log aggregation | 3100 | - |
| Tempo | Distributed tracing | 3200 | - |
| Grafana | Visualization and dashboards | 3000 | grafana.ravenhelm.dev |
| Promtail | Log shipping agent | 9080 | - |
| Blackbox Exporter | Endpoint probing | 9115 | - |
Prerequisites
Before deploying the observability stack:
- Core Stack is running (PostgreSQL, Redis, Traefik)
- Identity Stack is configured (Zitadel for Grafana OAuth)
- Docker and Docker Compose installed
- The ravenhelm_net Docker network exists (the compose files below reference it as an external network)
Directory Structure
mkdir -p ~/ravenhelm/services/{prometheus,loki,tempo,grafana}
mkdir -p ~/ravenhelm/data/{prometheus,loki,tempo,grafana}
mkdir -p ~/ravenhelm/services/prometheus/rules
Step 1: Deploy Prometheus
Prometheus collects and stores metrics from all services.
Configuration
Create ~/ravenhelm/services/prometheus/prometheus.yml:
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: []
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'traefik'
static_configs:
- targets: ['traefik:8080']
- job_name: 'node'
static_configs:
- targets: ['host.docker.internal:9100']
- job_name: 'docker'
static_configs:
- targets: ['host.docker.internal:9323']
- job_name: 'postgres'
static_configs:
- targets: ['postgres-exporter:9187']
- job_name: 'redis'
static_configs:
- targets: ['redis-exporter:9121']
- job_name: 'blackbox'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- https://grafana.ravenhelm.dev
- https://auth.ravenhelm.dev
- https://n8n.ravenhelm.dev
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
Docker Compose
Create ~/ravenhelm/services/prometheus/docker-compose.yml:
services:
prometheus:
image: prom/prometheus:v2.54.1
container_name: prometheus
restart: unless-stopped
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
networks:
- ravenhelm_net
labels:
- "traefik.enable=true"
- "traefik.http.routers.prometheus.rule=Host(`prometheus.ravenhelm.dev`)"
- "traefik.http.routers.prometheus.entrypoints=websecure"
- "traefik.http.routers.prometheus.tls.certresolver=letsencrypt"
- "traefik.http.routers.prometheus.middlewares=oauth2-proxy@docker"
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
blackbox-exporter:
image: prom/blackbox-exporter:v0.25.0
container_name: blackbox-exporter
restart: unless-stopped
command:
- '--config.file=/etc/blackbox/blackbox.yml'
volumes:
- ./blackbox.yml:/etc/blackbox/blackbox.yml:ro
networks:
- ravenhelm_net
volumes:
prometheus_data:
name: prometheus_data
networks:
ravenhelm_net:
external: true
Blackbox Exporter Configuration
Create ~/ravenhelm/services/prometheus/blackbox.yml:
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
valid_status_codes: [200, 301, 302]
method: GET
follow_redirects: true
preferred_ip_protocol: "ip4"
tcp_connect:
prober: tcp
timeout: 5s
Deploy Prometheus
cd ~/ravenhelm/services/prometheus
docker compose up -d
Step 2: Deploy Loki
Loki aggregates logs from all services.
Configuration
Create ~/ravenhelm/services/loki/loki-config.yml:
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
instance_addr: 127.0.0.1
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: 2020-10-24
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: 720h
ingestion_rate_mb: 16
ingestion_burst_size_mb: 32
Docker Compose
Create ~/ravenhelm/services/loki/docker-compose.yml:
services:
loki:
image: grafana/loki:3.0.0
container_name: loki
restart: unless-stopped
command: -config.file=/etc/loki/loki-config.yml
volumes:
- ./loki-config.yml:/etc/loki/loki-config.yml:ro
- loki_data:/loki
ports:
- "3100:3100"
networks:
- ravenhelm_net
healthcheck:
test: ["CMD-SHELL", "wget -q --spider http://localhost:3100/ready || exit 1"]
interval: 30s
timeout: 10s
retries: 3
volumes:
loki_data:
name: loki_data
networks:
ravenhelm_net:
external: true
Deploy Loki
cd ~/ravenhelm/services/loki
docker compose up -d
Step 3: Deploy Promtail
Promtail ships logs from Docker containers to Loki.
Configuration
Create ~/ravenhelm/services/loki/promtail-config.yml:
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://loki:3100/loki/api/v1/push
scrape_configs:
- job_name: docker
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
relabel_configs:
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container'
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
- source_labels: ['__meta_docker_container_label_com_docker_compose_service']
target_label: 'service'
Add the Promtail service under the existing services: key in ~/ravenhelm/services/loki/docker-compose.yml:
promtail:
image: grafana/promtail:3.0.0
container_name: promtail
restart: unless-stopped
command: -config.file=/etc/promtail/promtail-config.yml
volumes:
- ./promtail-config.yml:/etc/promtail/promtail-config.yml:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- ravenhelm_net
depends_on:
- loki
Restart Loki Stack
cd ~/ravenhelm/services/loki
docker compose up -d
Step 4: Deploy Tempo
Tempo provides distributed tracing capabilities.
Configuration
Create ~/ravenhelm/services/tempo/tempo.yml:
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
zipkin:
endpoint: "0.0.0.0:9411"
storage:
trace:
backend: local
local:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
metrics_generator:
registry:
external_labels:
source: tempo
storage:
path: /var/tempo/generator/wal
remote_write:
- url: http://prometheus:9090/api/v1/write
send_exemplars: true
overrides:
defaults:
metrics_generator:
processors: [service-graphs, span-metrics]
Docker Compose
Create ~/ravenhelm/services/tempo/docker-compose.yml:
services:
tempo:
image: grafana/tempo:2.6.1
container_name: tempo
restart: unless-stopped
command: ["-config.file=/etc/tempo.yml"]
volumes:
- ./tempo.yml:/etc/tempo.yml:ro
- tempo_data:/var/tempo
ports:
- "3200:3200" # Tempo HTTP
- "4317:4317" # OTLP gRPC
- "4318:4318" # OTLP HTTP
- "9411:9411" # Zipkin
networks:
- ravenhelm_net
healthcheck:
test: ["CMD", "wget", "-q", "--spider", "http://localhost:3200/ready"]
interval: 30s
timeout: 10s
retries: 3
volumes:
tempo_data:
name: tempo_data
networks:
ravenhelm_net:
external: true
Deploy Tempo
cd ~/ravenhelm/services/tempo
docker compose up -d
Step 5: Deploy Grafana
Grafana provides visualization for all observability data.
Environment Variables
Add to ~/ravenhelm/secrets/.env:
# Grafana
GF_SECURITY_ADMIN_PASSWORD=your-admin-password
GF_DATABASE_PASSWORD=your-db-password
GRAFANA_OAUTH_CLIENT_ID=your-zitadel-client-id
GRAFANA_OAUTH_CLIENT_SECRET=your-zitadel-client-secret
Create OAuth Application in Zitadel
- Navigate to https://auth.ravenhelm.dev
- Create a new application for Grafana:
- Name: Grafana
- Type: Web Application
- Authentication Method: Code (PKCE)
- Redirect URIs: https://grafana.ravenhelm.dev/login/generic_oauth
Docker Compose
Create ~/ravenhelm/services/grafana/docker-compose.yml:
services:
grafana:
image: grafana/grafana:11.3.0
container_name: grafana
restart: unless-stopped
environment:
# Database
- GF_DATABASE_TYPE=postgres
- GF_DATABASE_HOST=postgres:5432
- GF_DATABASE_NAME=grafana
- GF_DATABASE_USER=grafana
- GF_DATABASE_PASSWORD=${GF_DATABASE_PASSWORD}
- GF_DATABASE_SSL_MODE=disable
# Server
- GF_SERVER_ROOT_URL=https://grafana.ravenhelm.dev
- GF_SERVER_DOMAIN=grafana.ravenhelm.dev
# Security
- GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD}
- GF_SECURITY_DISABLE_GRAVATAR=true
# OAuth via Zitadel
- GF_AUTH_GENERIC_OAUTH_ENABLED=true
- GF_AUTH_GENERIC_OAUTH_NAME=Zitadel
- GF_AUTH_GENERIC_OAUTH_CLIENT_ID=${GRAFANA_OAUTH_CLIENT_ID}
- GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET=${GRAFANA_OAUTH_CLIENT_SECRET}
- GF_AUTH_GENERIC_OAUTH_SCOPES=openid profile email
- GF_AUTH_GENERIC_OAUTH_AUTH_URL=https://auth.ravenhelm.dev/oauth/v2/authorize
- GF_AUTH_GENERIC_OAUTH_TOKEN_URL=https://auth.ravenhelm.dev/oauth/v2/token
- GF_AUTH_GENERIC_OAUTH_API_URL=https://auth.ravenhelm.dev/oidc/v1/userinfo
- GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP=true
- GF_AUTH_GENERIC_OAUTH_USE_PKCE=true
# Features
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor tempoSearch tempoBackendSearch tempoServiceGraph
volumes:
- grafana_data:/var/lib/grafana
networks:
- ravenhelm_net
labels:
- "traefik.enable=true"
- "traefik.http.routers.grafana.rule=Host(`grafana.ravenhelm.dev`)"
- "traefik.http.routers.grafana.entrypoints=websecure"
- "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
depends_on:
- postgres
volumes:
grafana_data:
name: grafana_data
networks:
ravenhelm_net:
external: true
Create Grafana Database
docker exec -it postgres psql -U ravenhelm -d postgres -c "
CREATE USER grafana WITH PASSWORD 'your-db-password';
CREATE DATABASE grafana OWNER grafana;
GRANT ALL PRIVILEGES ON DATABASE grafana TO grafana;
"
Deploy Grafana
cd ~/ravenhelm/services/grafana
docker compose --env-file ~/ravenhelm/secrets/.env up -d
Step 6: Configure Grafana Data Sources
After Grafana is running, configure the data sources.
Via UI
- Navigate to https://grafana.ravenhelm.dev
- Go to Connections → Data sources (named Configuration → Data Sources in older Grafana releases)
- Add each data source:
Prometheus:
- URL: http://prometheus:9090
- Default: Yes
Loki:
- URL: http://loki:3100
Tempo:
- URL: http://tempo:3200
- Enable: TraceQL, Service Graph
- Link to Loki for logs
Via Provisioning
Create ~/ravenhelm/services/grafana/provisioning/datasources/datasources.yml:
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: false
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: false
jsonData:
derivedFields:
- datasourceUid: tempo
matcherRegex: "traceID=(\\w+)"
name: TraceID
url: "$${__value.raw}"
- name: Tempo
type: tempo
access: proxy
url: http://tempo:3200
editable: false
uid: tempo
jsonData:
httpMethod: GET
tracesToLogs:
datasourceUid: loki
tags: ['container', 'service']
serviceMap:
datasourceUid: prometheus
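Mount the provisioning directory in the Grafana container by extending the grafana service's volumes in ~/ravenhelm/services/grafana/docker-compose.yml, then re-run the Deploy Grafana command: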
volumes:
- grafana_data:/var/lib/grafana
- ./provisioning:/etc/grafana/provisioning:ro
Verification
Check Service Status
# All services running
docker ps | grep -E "(prometheus|loki|tempo|grafana|promtail|blackbox)"
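# Prometheus targets (the Prometheus compose file above publishes no host port; run this from a container on ravenhelm_net against http://prometheus:9090, or publish 9090 first)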
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | length'
# Loki ready
curl -s http://localhost:3100/ready
# Tempo ready
curl -s http://localhost:3200/ready
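# Grafana health (the Grafana compose file above publishes no host port; use https://grafana.ravenhelm.dev/api/health via Traefik if localhost:3000 is unreachable)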
curl -s http://localhost:3000/api/health
Test Log Ingestion
# Generate a test log
docker run --rm --network ravenhelm_net alpine echo "Test log message"
# Query Loki
curl -G -s http://localhost:3100/loki/api/v1/query \
--data-urlencode 'query={container="alpine"}' | jq
Test Metrics
# Query Prometheus
curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result'
Troubleshooting
Prometheus Not Scraping Targets
# Check target status
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health, error: .lastError}'
# Reload configuration
curl -X POST http://localhost:9090/-/reload
Loki Not Receiving Logs
# Check Promtail status
docker logs promtail --tail 50
# Verify Promtail can reach Loki
docker exec promtail wget -q -O- http://loki:3100/ready
Grafana OAuth Issues
# Check Grafana logs
docker logs grafana --tail 100 | grep -i oauth
# Verify Zitadel connectivity
docker exec grafana wget -q -O- https://auth.ravenhelm.dev/.well-known/openid-configuration
Tempo Not Receiving Traces
# Check Tempo logs
docker logs tempo --tail 50
# Verify Tempo is ready (port 3200 is Tempo's HTTP API; the OTLP receivers listen on 4317 for gRPC and 4318 for HTTP)
curl -s http://localhost:3200/ready
Next Steps
- Dashboard Catalog - Import pre-built dashboards
- Alert Rules - Configure alerting
- SLIs and SLOs - Define service level objectives