Skip to main content

Runbook Registry (DIS Schema)

[[TOC]]

Runbook Schema Definition

File: runbooks/schema.yaml

# Domain Intelligence Schema (DIS) v1.6.0 - Runbook Extension
#
# Field-by-field definition of a runbook file. Nesting is significant: every
# field definition below is a child of `runbook_schema`.

type: runbook_registry
version: "1.0.0"

runbook_schema:
  id:
    type: string
    required: true
    pattern: "^[a-z0-9-]+$"
    description: "Unique runbook identifier (kebab-case)"

  version:
    type: string
    required: true
    pattern: "^v\\d+$"
    description: "Runbook version (v1, v2, etc.)"

  name:
    type: string
    required: true
    description: "Human-readable runbook name"

  description:
    type: string
    required: true
    description: "What this runbook remediates"

  triggers:
    type: array
    required: true
    items:
      oneOf:
        # Log/error trigger; written in runbooks as `- pattern: <regex>`.
        - type: pattern
          properties:
            pattern:
              type: string
              description: "Regex pattern to match in logs/errors"
        # Metric trigger; the example runbooks nest these properties under a
        # `metric_condition:` key.
        - type: metric_condition
          properties:
            metric:
              type: string
            # Operators are quoted: a bare `>` would start a folded scalar.
            operator:
              type: string
              enum: [">", "<", "==", "!="]
            threshold:
              type: number

  # Optional; used by every example runbook and read by the registry with an
  # empty-list default.
  # NOTE(review): how conditions are evaluated is not defined here - confirm
  # with the executor.
  conditions:
    type: array
    description: "Optional service/status conditions attached to the runbook"
    items:
      service:
        type: string
      status:
        type: string

  steps:
    type: array
    required: true
    description: "Remediation steps (executed sequentially)"
    items:
      action:
        type: string
        enum:
          - check_status
          - restart_container
          - scale_container
          - clear_cache
          - run_script
          - wait
          - verify_health
          - escalate
      tool:
        type: string
        description: "MCP tool to invoke"
      params:
        type: object
        description: "Tool parameters"
      # NOTE(review): expression syntax/semantics are not defined in this
      # schema - confirm with the executor.
      condition:
        type: string
        description: "Optional expression attached to the step (e.g., 'health_check.failed')"
      retry:
        type: integer
        default: 0
      interval:
        type: integer
        description: "Seconds to wait before retry"
      # This is a step field named `required`, not the schema keyword.
      required:
        type: boolean
        default: true
        description: "If false, failure doesn't halt runbook"

  success_criteria:
    type: array
    required: true
    description: "How to verify remediation worked"
    items:
      metric:
        type: string
      operator:
        type: string
        enum: [">", "<", "=="]
      value:
        type: number

  estimated_duration:
    type: string
    description: "Expected execution time (e.g., '45s')"

  last_success_rate:
    type: number
    description: "Historical success rate (0.0-1.0)"

  owner:
    type: string
    description: "Team responsible for maintaining runbook"

Example Runbooks

Runbook 1: Redis Connection Failure

File: runbooks/redis-conn-failure-v1.yaml

# Runbook: inspect, restart and health-check the Redis container when a
# Redis connection error is seen; the final step notifies on-call.

id: redis-conn-failure-v1
version: v1
name: "Redis Connection Failure Recovery"
description: "Remediate Redis connection failures by restarting container"

# Regexes; RunbookRegistry.match_by_pattern applies them case-insensitively.
triggers:
  - pattern: "connection refused.*redis"
  - pattern: "ECONNREFUSED.*6379"
  - pattern: "Redis connection timeout"

conditions:
  - service: redis
    status: unhealthy

steps:
  - action: check_status
    tool: docker_inspect
    params:
      container: redis
    required: true

  - action: restart_container
    tool: docker_restart
    params:
      container: redis
      timeout: 30
    required: true

  # Pause between the restart and the health check.
  - action: wait
    params:
      seconds: 10
    required: true

  # `retry` and `interval` are step-level fields (see schema), not params.
  - action: verify_health
    tool: docker_healthcheck
    params:
      container: redis
    retry: 3
    interval: 10
    required: true

  # required: false - per the schema, a failure here does not halt the runbook.
  - action: escalate
    condition: "health_check.failed"
    tool: notify_oncall
    params:
      team: platform
      message: "Redis restart failed after 3 attempts"
    required: false

success_criteria:
  - metric: redis_up
    operator: "=="
    value: 1
  - metric: redis_connected_clients
    operator: ">"
    value: 0

estimated_duration: "45s"
last_success_rate: 0.94
owner: platform-team

Runbook 2: Container OOM Prevention

File: runbooks/oom-prevention-v1.yaml

# Runbook: raise a container's memory limit by 50% when its memory usage
# crosses the trigger threshold, then re-check container health.

id: oom-prevention-v1
version: v1
name: "OOM Prevention - Memory Limit Increase"
description: "Increase container memory limit before OOM kill occurs"

triggers:
  - metric_condition:
      metric: container_memory_usage_percentage
      operator: ">"
      threshold: 0.9

conditions:
  - service: "*"  # Apply to any service
    status: unhealthy

steps:
  - action: check_status
    tool: docker_inspect
    params:
      container: "{{ .service }}"
    required: true

  - action: run_script
    tool: bash_execute
    params:
      # NOTE(review): the script mixes runbook placeholders ({{ .service }})
      # with a Docker --format template ({{.HostConfig.Memory}}); confirm the
      # runbook renderer leaves the Docker template untouched.
      script: |
        # Abort on the first failing command so a failed inspect/update
        # fails this step instead of being masked by the final echo.
        set -euo pipefail

        # Get current memory limit
        CURRENT=$(docker inspect {{ .service }} --format='{{.HostConfig.Memory}}')

        # Docker reports 0 when no limit is set; 0 * 3 / 2 is still 0, so
        # there is nothing meaningful to increase.
        if [ "${CURRENT}" -le 0 ]; then
          echo "No memory limit set on {{ .service }}; nothing to increase" >&2
          exit 1
        fi

        # Increase by 50%
        NEW=$((CURRENT * 3 / 2))

        # Update container
        docker update --memory="${NEW}" {{ .service }}

        echo "Memory limit increased: ${CURRENT} -> ${NEW}"
    required: true

  # `retry` and `interval` are step-level fields (see schema), not params.
  - action: verify_health
    tool: docker_healthcheck
    params:
      container: "{{ .service }}"
    retry: 2
    interval: 15
    required: true

success_criteria:
  - metric: container_memory_usage_percentage
    operator: "<"
    value: 0.8

estimated_duration: "30s"
last_success_rate: 0.78
owner: platform-team

Runbook 3: LangGraph Execution Failure

File: runbooks/langgraph-execution-failure-v1.yaml

# Runbook: collect LangGraph error logs, classify the failure from the log
# file, then either restart the container or notify the owning team.

id: langgraph-execution-failure-v1
version: v1
name: "LangGraph Execution Failure Investigation"
description: "Diagnose and remediate LangGraph agent failures"

# Pattern triggers are regexes; RunbookRegistry.match_by_pattern applies them
# case-insensitively. That method does not look at metric_condition triggers.
triggers:
  - pattern: "LangGraph execution failed"
  - pattern: "Agent timeout"
  - metric_condition:
      metric: langgraph_execution_errors_rate
      operator: ">"
      threshold: 0.1

conditions:
  - service: langgraph
    status: unhealthy

steps:
  - action: check_status
    tool: query_logs
    params:
      service: langgraph
      filter: "level=error"
      limit: 50
    required: true

  # Prints exactly one FAILURE_REASON=<value> line; the first matching
  # grep wins, so the order of the checks is significant.
  - action: run_script
    tool: bash_execute
    params:
      script: |
        # Check for common failure patterns
        if grep -q "ANTHROPIC_API_KEY" /var/log/langgraph.log; then
          echo "FAILURE_REASON=api_key_invalid"
        elif grep -q "timeout" /var/log/langgraph.log; then
          echo "FAILURE_REASON=llm_timeout"
        elif grep -q "OOM" /var/log/langgraph.log; then
          echo "FAILURE_REASON=out_of_memory"
        else
          echo "FAILURE_REASON=unknown"
        fi
    required: true

  # NOTE(review): presumably FAILURE_REASON is bound from the script output
  # above - confirm how the executor exposes it to `condition`.
  - action: restart_container
    condition: "FAILURE_REASON == 'llm_timeout' or FAILURE_REASON == 'out_of_memory'"
    tool: docker_restart
    params:
      container: langgraph
    required: false

  - action: escalate
    condition: "FAILURE_REASON == 'api_key_invalid' or FAILURE_REASON == 'unknown'"
    tool: notify_oncall
    params:
      team: ai-team
      message: "LangGraph failure requires manual investigation"
    required: true

success_criteria:
  - metric: langgraph_execution_success_rate
    operator: ">"
    value: 0.9

estimated_duration: "2m"
last_success_rate: 0.65
owner: ai-team

Runbook Registry Implementation

File: sre-agent/runbook_registry.py

import yaml
import re
from pathlib import Path
from typing import Optional, List, Dict
import logging

logger = logging.getLogger(__name__)

class Runbook:
    """In-memory view of one parsed runbook document.

    Args:
        data: Mapping parsed from a runbook YAML file. ``id``, ``version``,
            ``name``, ``description``, ``triggers``, ``steps`` and
            ``success_criteria`` are required. ``conditions``,
            ``estimated_duration``, ``last_success_rate`` and ``owner`` are
            optional and fall back to defaults.

    Raises:
        TypeError: If ``data`` is not a dict (for example an empty YAML file,
            which parses to ``None``).
        KeyError: If one or more required fields are missing.
    """

    # Fields read without a default; a runbook lacking any of them is rejected.
    _REQUIRED_FIELDS = (
        'id',
        'version',
        'name',
        'description',
        'triggers',
        'steps',
        'success_criteria',
    )

    def __init__(self, data: dict):
        if not isinstance(data, dict):
            raise TypeError(
                f"runbook document must be a mapping, got {type(data).__name__}"
            )

        missing = [field for field in self._REQUIRED_FIELDS if field not in data]
        if missing:
            raise KeyError(
                f"runbook is missing required field(s): {', '.join(missing)}"
            )

        def optional(key, default):
            # A key written with no value in YAML parses to None; treat that
            # the same as an absent key so the default still applies.
            value = data.get(key)
            return default if value is None else value

        self.id = data['id']
        self.version = data['version']
        self.name = data['name']
        self.description = data['description']
        self.triggers = data['triggers']
        self.conditions = optional('conditions', [])
        self.steps = data['steps']
        self.success_criteria = data['success_criteria']
        self.estimated_duration = optional('estimated_duration', 'unknown')
        # Kept numeric so runbooks can be ordered by success rate.
        self.last_success_rate = optional('last_success_rate', 0.0)
        self.owner = optional('owner', 'platform-team')

class RunbookRegistry:
    """Load runbooks from YAML files and match them against error messages.

    Args:
        runbook_dir: Directory scanned (non-recursively) for ``*.yaml``
            files. ``schema.yaml`` is skipped.
    """

    def __init__(self, runbook_dir: str = "/app/runbooks"):
        self.runbook_dir = Path(runbook_dir)
        # Keyed by runbook id; insertion order follows load order.
        self.runbooks: Dict[str, Runbook] = {}
        self._load_runbooks()

    def _load_runbooks(self):
        """Load every runbook file in ``runbook_dir`` into ``self.runbooks``.

        Loading is best-effort: a file that cannot be read, parsed or turned
        into a ``Runbook`` is logged and skipped so one bad file does not
        prevent the others from loading.
        """
        if not self.runbook_dir.is_dir():
            logger.warning(f"Runbook directory not found: {self.runbook_dir}")

        # glob() order is filesystem-dependent; sorting keeps load order
        # (and therefore duplicate-id resolution and tie-breaking between
        # equally rated runbooks) repeatable.
        for yaml_file in sorted(self.runbook_dir.glob("*.yaml")):
            if yaml_file.name == "schema.yaml":
                continue

            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = yaml.safe_load(f)

                runbook = Runbook(data)
            except Exception as e:
                logger.error(f"Failed to load runbook {yaml_file}: {e}")
                continue

            if runbook.id in self.runbooks:
                logger.warning(
                    f"Duplicate runbook id {runbook.id} in {yaml_file}; "
                    "replacing the previously loaded definition"
                )

            self.runbooks[runbook.id] = runbook
            logger.info(f"Loaded runbook: {runbook.id}")

        logger.info(f"Loaded {len(self.runbooks)} runbooks")

    def get_by_id(self, runbook_id: str) -> Optional[Runbook]:
        """Return the runbook registered under ``runbook_id``, or ``None``."""
        return self.runbooks.get(runbook_id)

    @staticmethod
    def _trigger_matches(trigger, error_message: str) -> bool:
        """Return True if ``trigger`` is a pattern trigger matching the message."""
        # Non-mapping entries and triggers without a string pattern (for
        # example metric_condition triggers) never match an error message.
        if not isinstance(trigger, dict):
            return False
        pattern = trigger.get('pattern')
        if not isinstance(pattern, str):
            return False

        try:
            return re.search(pattern, error_message, re.IGNORECASE) is not None
        except re.error as e:
            # One malformed regex must not break matching for all runbooks.
            logger.warning(f"Skipping invalid trigger pattern {pattern!r}: {e}")
            return False

    def match_by_pattern(self, error_message: str) -> Optional[Runbook]:
        """Return the best runbook whose pattern trigger matches the message.

        Patterns are searched case-insensitively. When several runbooks
        match, the one with the highest ``last_success_rate`` wins; ties go
        to the runbook that was loaded first.

        Args:
            error_message: Text to test against each runbook's patterns.

        Returns:
            The selected runbook, or ``None`` if nothing matches or
            ``error_message`` is not a string.
        """
        if not isinstance(error_message, str):
            return None

        candidates = [
            runbook
            for runbook in self.runbooks.values()
            if any(
                self._trigger_matches(trigger, error_message)
                for trigger in (runbook.triggers or [])
            )
        ]

        if not candidates:
            return None

        # max() returns the first maximal element, which is the same winner
        # a stable descending sort would put at index 0. A missing rate is
        # ranked as 0.0 so it cannot break the comparison.
        return max(candidates, key=lambda r: r.last_success_rate or 0.0)

    def list_all(self) -> List[Runbook]:
        """Return all loaded runbooks as a new list."""
        return list(self.runbooks.values())

Next: [[AIOps-Demo-Roadmap]] - Demo choreography and implementation phases