Runbook Registry (DIS Schema)
[[TOC]]
Runbook Schema Definition
File: runbooks/schema.yaml
---
# Domain Intelligence Schema (DIS) v1.6.0 - Runbook Extension
#
# Field-by-field description of a runbook file. Each entry under
# `runbook_schema` names one top-level key of a runbook document.
type: runbook_registry
version: 1.0.0

runbook_schema:
  id:
    type: string
    required: true
    pattern: "^[a-z0-9-]+$"
    description: "Unique runbook identifier (kebab-case)"

  version:
    type: string
    required: true
    pattern: "^v\\d+$"
    description: "Runbook version (v1, v2, etc.)"

  name:
    type: string
    required: true
    description: "Human-readable runbook name"

  description:
    type: string
    required: true
    description: "What this runbook remediates"

  # A trigger is either a log/error regex or a metric threshold.
  triggers:
    type: array
    required: true
    items:
      oneOf:
        - type: pattern
          properties:
            pattern:
              type: string
              description: "Regex pattern to match in logs/errors"
        - type: metric_condition
          properties:
            metric: string
            operator: enum [>, <, ==, !=]
            threshold: number

  # Optional: the registry loader falls back to an empty list when absent.
  conditions:
    type: array
    required: false
    description: "Service/status pairs the runbook applies to"
    items:
      service: string
      status: string

  steps:
    type: array
    required: true
    description: "Remediation steps (executed sequentially)"
    items:
      action:
        type: string
        enum: [check_status, restart_container, scale_container,
               clear_cache, run_script, wait, verify_health, escalate]
      # Optional per-step gate, e.g. "health_check.failed".
      condition:
        type: string
        description: "Optional expression deciding whether the step runs"
      tool:
        type: string
        description: "MCP tool to invoke"
      params:
        type: object
        description: "Tool parameters"
      retry:
        type: integer
        default: 0
      interval:
        type: integer
        description: "Seconds to wait before retry"
      required:
        type: boolean
        default: true
        description: "If false, failure doesn't halt runbook"

  success_criteria:
    type: array
    required: true
    description: "How to verify remediation worked"
    items:
      metric: string
      operator: enum [>, <, ==]
      value: number

  estimated_duration:
    type: string
    description: "Expected execution time (e.g., '45s')"

  last_success_rate:
    type: number
    description: "Historical success rate (0.0-1.0)"

  owner:
    type: string
    description: "Team responsible for maintaining runbook"
Example Runbooks
Runbook 1: Redis Connection Failure
File: runbooks/redis-conn-failure-v1.yaml
---
# Redis connection-failure recovery: inspect the container, restart it,
# wait, health-check it, and page on-call if the health check still fails.
id: redis-conn-failure-v1
version: v1
name: "Redis Connection Failure Recovery"
description: "Remediate Redis connection failures by restarting container"

# Log/error regexes that select this runbook.
triggers:
  - pattern: "connection refused.*redis"
  - pattern: "ECONNREFUSED.*6379"
  - pattern: "Redis connection timeout"

conditions:
  - service: redis
    status: unhealthy

steps:
  - action: check_status
    tool: docker_inspect
    params:
      container: redis
    required: true

  - action: restart_container
    tool: docker_restart
    params:
      container: redis
      timeout: 30
    required: true

  - action: wait
    params:
      seconds: 10
    required: true

  # Not required: a required step halts the runbook on failure, which would
  # stop the escalate step below from ever running when the check fails.
  - action: verify_health
    tool: docker_healthcheck
    params:
      container: redis
    retry: 3
    interval: 10
    required: false

  - action: escalate
    condition: "health_check.failed"
    tool: notify_oncall
    params:
      team: platform
      message: "Redis restart failed after 3 attempts"
    required: false

success_criteria:
  - metric: redis_up
    operator: "=="
    value: 1
  - metric: redis_connected_clients
    operator: ">"
    value: 0

estimated_duration: "45s"
last_success_rate: 0.94
owner: platform-team
Runbook 2: Container OOM Prevention
File: runbooks/oom-prevention-v1.yaml
---
# OOM prevention: when a container's memory usage crosses the trigger
# threshold, raise its memory limit by 50% and re-check its health.
id: oom-prevention-v1
version: v1
name: "OOM Prevention - Memory Limit Increase"
description: "Increase container memory limit before OOM kill occurs"

triggers:
  - metric_condition:
      metric: container_memory_usage_percentage
      operator: ">"
      threshold: 0.9

conditions:
  - service: "*"  # Apply to any service
    status: unhealthy

steps:
  - action: check_status
    tool: docker_inspect
    params:
      container: "{{ .service }}"
    required: true

  - action: run_script
    tool: bash_execute
    params:
      # `set -euo pipefail` makes the step fail if docker inspect/update
      # fails; without it the trailing echo would report success regardless.
      # NOTE(review): {{.HostConfig.Memory}} is a docker --format template
      # that shares delimiters with {{ .service }} - confirm the runbook
      # templating leaves it untouched.
      script: |
        set -euo pipefail
        # Get current memory limit
        CURRENT=$(docker inspect {{ .service }} --format='{{.HostConfig.Memory}}')
        # Increase by 50%
        NEW=$((CURRENT * 3 / 2))
        # Update container
        docker update --memory=${NEW} {{ .service }}
        echo "Memory limit increased: ${CURRENT} -> ${NEW}"
    required: true

  - action: verify_health
    tool: docker_healthcheck
    params:
      container: "{{ .service }}"
    retry: 2
    interval: 15
    required: true

success_criteria:
  - metric: container_memory_usage_percentage
    operator: "<"
    value: 0.8

estimated_duration: "30s"
last_success_rate: 0.78
owner: platform-team
Runbook 3: LangGraph Execution Failure
File: runbooks/langgraph-execution-failure-v1.yaml
---
# LangGraph failure triage: pull recent error logs, classify the failure
# from the log file, then either restart the container or page the AI team.
id: langgraph-execution-failure-v1
version: v1
name: "LangGraph Execution Failure Investigation"
description: "Diagnose and remediate LangGraph agent failures"

# Two log regexes plus one error-rate threshold.
triggers:
  - pattern: "LangGraph execution failed"
  - pattern: "Agent timeout"
  - metric_condition:
      metric: langgraph_execution_errors_rate
      operator: ">"
      threshold: 0.1

conditions:
  - service: langgraph
    status: unhealthy

steps:
  - action: check_status
    tool: query_logs
    params:
      service: langgraph
      filter: "level=error"
      limit: 50
    required: true

  # Prints a single FAILURE_REASON=<value> line; the `condition`
  # expressions on the two steps below refer to FAILURE_REASON.
  - action: run_script
    tool: bash_execute
    params:
      script: |
        # Check for common failure patterns
        if grep -q "ANTHROPIC_API_KEY" /var/log/langgraph.log; then
        echo "FAILURE_REASON=api_key_invalid"
        elif grep -q "timeout" /var/log/langgraph.log; then
        echo "FAILURE_REASON=llm_timeout"
        elif grep -q "OOM" /var/log/langgraph.log; then
        echo "FAILURE_REASON=out_of_memory"
        else
        echo "FAILURE_REASON=unknown"
        fi
    required: true

  - action: restart_container
    condition: "FAILURE_REASON == 'llm_timeout' or FAILURE_REASON == 'out_of_memory'"
    tool: docker_restart
    params:
      container: langgraph
    required: false

  - action: escalate
    condition: "FAILURE_REASON == 'api_key_invalid' or FAILURE_REASON == 'unknown'"
    tool: notify_oncall
    params:
      team: ai-team
      message: "LangGraph failure requires manual investigation"
    required: true

success_criteria:
  - metric: langgraph_execution_success_rate
    operator: ">"
    value: 0.9

estimated_duration: "2m"
last_success_rate: 0.65
owner: ai-team
Runbook Registry Implementation
File: sre-agent/runbook_registry.py
import yaml
import re
from pathlib import Path
from typing import Optional, List, Dict
import logging
logger = logging.getLogger(__name__)
class Runbook:
    """In-memory view of one runbook mapping (as parsed from a YAML file).

    Mandatory keys are copied verbatim onto same-named attributes; a missing
    mandatory key raises ``KeyError``. Optional keys fall back to defaults.
    """

    def __init__(self, data: dict):
        """Populate attributes from ``data``.

        Args:
            data: Parsed runbook mapping.

        Raises:
            KeyError: If any of ``id``, ``version``, ``name``,
                ``description``, ``triggers``, ``steps`` or
                ``success_criteria`` is absent.
        """
        # Mandatory keys, checked in this order so the first missing one
        # is the one reported.
        for field in (
            'id',
            'version',
            'name',
            'description',
            'triggers',
            'steps',
            'success_criteria',
        ):
            setattr(self, field, data[field])

        # Optional keys and their fallbacks.
        self.conditions = data.get('conditions', [])
        self.estimated_duration = data.get('estimated_duration', 'unknown')
        self.last_success_rate = data.get('last_success_rate', 0.0)
        self.owner = data.get('owner', 'platform-team')
class RunbookRegistry:
    """Load runbooks from YAML files and look them up by ID or error text.

    Every ``*.yaml`` file directly inside ``runbook_dir`` (except
    ``schema.yaml``) is parsed with ``yaml.safe_load`` and wrapped in a
    ``Runbook``. Files that fail to load are logged and skipped so that one
    bad file does not prevent the others from loading.
    """

    def __init__(self, runbook_dir: str = "/app/runbooks"):
        """Create the registry and eagerly load all runbooks.

        Args:
            runbook_dir: Directory containing the runbook YAML files.
        """
        self.runbook_dir = Path(runbook_dir)
        self.runbooks: Dict[str, Runbook] = {}
        self._load_runbooks()

    def _load_runbooks(self):
        """Load all runbooks from directory"""
        # sorted() makes the load order independent of the filesystem, so
        # duplicate-ID resolution and tie-breaking in match_by_pattern are
        # reproducible.
        for yaml_file in sorted(self.runbook_dir.glob("*.yaml")):
            if yaml_file.name == "schema.yaml":
                continue
            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = yaml.safe_load(f)
                # An empty file parses to None and a list/scalar document is
                # not a runbook; report that clearly instead of letting
                # Runbook() fail with an opaque subscript error.
                if not isinstance(data, dict):
                    logger.error(
                        f"Failed to load runbook {yaml_file}: "
                        f"expected a mapping, got {type(data).__name__}"
                    )
                    continue
                runbook = Runbook(data)
                if runbook.id in self.runbooks:
                    # Last file wins (unchanged), but make it visible.
                    logger.warning(
                        f"Duplicate runbook id {runbook.id!r} in {yaml_file}; "
                        f"replacing previously loaded runbook"
                    )
                self.runbooks[runbook.id] = runbook
                logger.info(f"Loaded runbook: {runbook.id}")
            except Exception as e:
                # Deliberately broad: any per-file failure is logged and the
                # remaining files are still loaded.
                logger.error(f"Failed to load runbook {yaml_file}: {e}")
        logger.info(f"Loaded {len(self.runbooks)} runbooks")

    def get_by_id(self, runbook_id: str) -> Optional[Runbook]:
        """Get runbook by ID, or ``None`` if no such runbook is loaded."""
        return self.runbooks.get(runbook_id)

    @staticmethod
    def _has_matching_pattern(runbook, error_message: str) -> bool:
        """Return True if any ``pattern`` trigger of ``runbook`` matches."""
        for trigger in runbook.triggers or []:
            # Only mapping triggers can carry a 'pattern' key; on a plain
            # string, `'pattern' in trigger` would be a substring test.
            if not isinstance(trigger, dict) or 'pattern' not in trigger:
                continue
            pattern = trigger['pattern']
            try:
                if re.search(pattern, error_message, re.IGNORECASE):
                    return True
            except re.error as e:
                # One malformed regex must not break matching for the rest.
                logger.warning(
                    f"Invalid trigger pattern {pattern!r} in runbook "
                    f"{runbook.id}: {e}"
                )
        return False

    def match_by_pattern(self, error_message: str) -> Optional[Runbook]:
        """Match runbook by error pattern.

        Patterns are searched case-insensitively anywhere in
        ``error_message``. Among all runbooks with at least one matching
        pattern trigger, the one with the highest ``last_success_rate`` is
        returned; ties go to the runbook loaded first.

        Args:
            error_message: Log line or error text to match.

        Returns:
            The best matching runbook, or ``None`` if nothing matches.
        """
        candidates = [
            runbook
            for runbook in self.runbooks.values()
            if self._has_matching_pattern(runbook, error_message)
        ]
        if not candidates:
            return None
        # max() returns the first maximal element, the same pick as a stable
        # descending sort followed by [0].
        return max(candidates, key=lambda r: r.last_success_rate)

    def list_all(self) -> List[Runbook]:
        """List all runbooks"""
        return list(self.runbooks.values())
Next: [[AIOps-Demo-Roadmap]] - Demo choreography and implementation phases