# Observability Stack Design
[[TOC]]
## Alloy Configuration
Purpose: Parse unstructured container logs into structured fields and labels that can be queried in Loki
File: /etc/alloy/config.alloy
// Tail log streams from Docker containers and hand every entry to the
// "enrich" processing pipeline.
// NOTE: discovery.docker.containers and discovery.relabel.docker are referenced
// here but are not defined in this excerpt; they must exist elsewhere in the file.
loki.source.docker "containers" {
  // Where to read from: the local Docker daemon socket and the discovered targets.
  host    = "unix:///var/run/docker.sock"
  targets = discovery.docker.containers.targets

  // Relabeling rules applied to the entries before they are forwarded.
  relabel_rules = discovery.relabel.docker.rules

  // Downstream consumer of the collected log entries.
  forward_to = [loki.process.enrich.receiver]
}
// Enrich and structure logs.
//
// Stages run top to bottom. Inside each stage.match block the order is:
//   1. extract fields (stage.regex / stage.json) into the extracted map,
//   2. derive new fields (stage.template),
//   3. promote selected fields to labels (stage.labels).
// A label can only be promoted once its value exists in the extracted map, so
// every stage.template must come BEFORE the stage.labels that uses its output.
loki.process "enrich" {
  forward_to = [loki.write.default.receiver]

  // Parse Traefik access logs
  stage.match {
    selector = "{container=\"traefik\"}"

    // Extracts client_ip, method, path, status, bytes and duration.
    // The quotes around "<n>ms" are optional: Traefik's common log format ends
    // with an unquoted duration (e.g. `... 12ms`), which a mandatory-quote
    // pattern would never match.
    stage.regex {
      expression = "(?P<client_ip>[\\d\\.]+) .* \"(?P<method>\\w+) (?P<path>/[^\\s]*) HTTP/[\\d\\.]+\" (?P<status>\\d{3}) (?P<bytes>\\d+) .* \"?(?P<duration>\\d+)ms\"?"
    }

    // Derive status_class ("2xx", "4xx", "5xx", ...) from the first digit of
    // the status code. Must run before stage.labels below.
    stage.template {
      source   = "status_class"
      template = "{{ .status | substr 0 1 }}xx"
    }

    // Promote low-cardinality fields to labels. client_ip, path, bytes and
    // duration deliberately stay out of the label set.
    stage.labels {
      values = {
        method       = "",
        status       = "",
        status_class = "",
      }
    }
  }

  // Parse JSON logs (for services using structured logging)
  stage.match {
    selector = "{container=~\"langgraph|ollama|n8n\"}"

    stage.json {
      expressions = {
        level       = "level",
        trace_id    = "trace_id",
        service     = "service",
        message     = "message",
        duration_ms = "duration_ms",
      }
    }

    // Only level and service become labels; trace_id, message and duration_ms
    // are extracted but not indexed.
    stage.labels {
      values = {
        level   = "",
        service = "",
      }
    }
  }

  // Classify errors for agent routing.
  // These selectors match on the labels set by the parsing stages above, so
  // they must stay after them.
  stage.match {
    selector = "{status=~\"5..\"}"

    stage.static_labels {
      values = {
        error_severity = "high",
        agent_notify   = "true",
      }
    }
  }

  stage.match {
    selector = "{level=\"error\"}"

    stage.static_labels {
      values = {
        error_severity = "medium",
        agent_notify   = "true",
      }
    }
  }

  // Extract PostgreSQL query logs
  stage.match {
    selector = "{container=\"postgres\"}"

    stage.regex {
      expression = "duration: (?P<duration_ms>[\\d\\.]+) ms.*statement: (?P<query>.*)"
    }

    // Reduce the statement to its leading verb. regexReplaceAll takes
    // (pattern, input, replacement); piping .query into it would pass the
    // query as the replacement instead of the input, so it is given
    // positionally. Must run before stage.labels below.
    // NOTE(review): statements that do not contain one of the four verbs pass
    // through unchanged and would become the label value -- confirm that the
    // resulting label cardinality is acceptable.
    stage.template {
      source   = "query_type"
      template = "{{ regexReplaceAll \"(SELECT|INSERT|UPDATE|DELETE).*\" .query \"$1\" }}"
    }

    stage.labels {
      values = {
        query_type = "",
      }
    }
  }
}
// Final sink: push processed log entries to Loki's HTTP push API.
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}
## Prometheus Configuration
File: /etc/prometheus/prometheus.yml
# Defaults applied to every scrape job and rule evaluation, plus the labels
# attached to data leaving this Prometheus instance.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: "ravenhelm"
    environment: "production"

# Alert rule files
rule_files:
  - "/etc/prometheus/rules/*.yml"

# Scrape configs: one job per exporter or service, each with a single static target.
scrape_configs:
  # Node metrics
  - job_name: "node"
    static_configs:
      - targets:
          - "node-exporter:9100"

  # Container metrics
  - job_name: "cadvisor"
    static_configs:
      - targets:
          - "cadvisor:8080"

  # PostgreSQL
  - job_name: "postgres"
    static_configs:
      - targets:
          - "postgres-exporter:9187"
    # Replace the exporter address with a fixed instance name.
    relabel_configs:
      - source_labels:
          - __address__
        target_label: instance
        replacement: "ravenmaskos-db"

  # Redis
  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter:9121"

  # Redpanda
  - job_name: "redpanda"
    static_configs:
      - targets:
          - "redpanda:9644"

  # Traefik
  - job_name: "traefik"
    static_configs:
      - targets:
          - "traefik:8082"

  # Grafana
  - job_name: "grafana"
    static_configs:
      - targets:
          - "grafana:3000"

  # Custom application metrics
  - job_name: "langgraph-agent"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "sre-agent:9090"

  # LiveKit
  - job_name: "livekit"
    static_configs:
      - targets:
          - "livekit:6789"

# Alertmanager config
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
## Loki Configuration
File: /etc/loki/config.yml
# Single-binary Loki with filesystem storage and an in-memory ring.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  # NOTE(review): all state lives under /tmp/loki. Confirm this path is backed
  # by a persistent volume; otherwise chunks and index are lost when the
  # container or host is recreated.
  path_prefix: /tmp/loki
  storage:
    filesystem:
      chunks_directory: /tmp/loki/chunks
      rules_directory: /tmp/loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

# retention_period in limits_config is only enforced when the compactor runs
# with retention enabled; without this section logs are kept indefinitely.
compactor:
  working_directory: /tmp/loki/compactor
  retention_enabled: true
  # Required alongside retention_enabled on Loki 3.x.
  # NOTE(review): remove this key if running Loki 2.9 or older -- confirm the
  # deployed version.
  delete_request_store: filesystem

limits_config:
  # Increase for high-volume logging
  ingestion_rate_mb: 50
  ingestion_burst_size_mb: 100
  max_query_length: 721h  # 30 days + 1h, so a full 30-day query is not rejected
  max_query_parallelism: 32

  # Per-stream limits
  per_stream_rate_limit: 10MB
  per_stream_rate_limit_burst: 20MB

  # Query limits
  max_entries_limit_per_query: 10000
  max_streams_per_user: 0  # unlimited

  # Retention (enforced by the compactor section above)
  retention_period: 720h  # 30 days
Next: [[AIOps-Dashboard-Strategy]] - Dashboard hierarchy and queries