# erp-java/rocketmq/monitoring/rocketmq_rules.yml

# RocketMQ Alert Rules for Prometheus
groups:
  # Single rule group holding every RocketMQ alert below; its rules are
  # evaluated every 30 seconds.
  - name: rocketmq
    interval: 30s
    rules:
      # NameServer availability: fires when a target of the
      # "rocketmq-namesrv" scrape job has reported up == 0 for 1 minute.
      - alert: RocketMQNameServerDown
        expr: up{job="rocketmq-namesrv"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "RocketMQ NameServer 实例 {{ $labels.instance }} 宕机"
          description: "RocketMQ NameServer 已经宕机超过1分钟"

      # Broker availability: fires when a target of the "rocketmq-broker"
      # scrape job has reported up == 0 for 1 minute.
      - alert: RocketMQBrokerDown
        expr: up{job="rocketmq-broker"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "RocketMQ Broker {{ $labels.instance }} 宕机"
          description: "RocketMQ Broker 已经宕机超过1分钟"

      # Consumer backlog: fires when, for 5 minutes, the produce rate of a
      # topic exceeds the consume rate of an active consumer group on that
      # topic by more than 1000. Backlog grows when production outpaces
      # consumption, hence producer minus consumer.
      #
      # Both sides are aggregated so the vectors match on `topic` alone;
      # `group_right` keeps the consumer-side `group` and `topic` labels that
      # the annotations below reference. The inner `> 0` filter limits the
      # alert to groups that are currently consuming.
      #
      # NOTE(review): assumes rocketmq_producer_tps carries a `topic` label
      # and rocketmq_consumer_tps carries `group` and `topic` labels (as
      # exposed by apache/rocketmq-exporter) - confirm against the exporter
      # actually deployed.
      - alert: RocketMQConsumerLag
        expr: |-
          (
              sum by (topic) (rocketmq_producer_tps)
            - on (topic) group_right
              (sum by (group, topic) (rocketmq_consumer_tps) > 0)
          ) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 消费堆积 - {{ $labels.group }}"
          description: "消费者组 {{ $labels.group }} 在主题 {{ $labels.topic }} 上消费堆积"

      # Producer send failures: fires when the per-second rate of
      # rocketmq_producer_send_failed_total, averaged over a 5-minute window,
      # stays above 10 for 5 minutes.
      - alert: RocketMQProducerSendFailed
        expr: rate(rocketmq_producer_send_failed_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 生产者发送失败率升高"
          description: "消息发送失败率超过阈值"

      # Memory usage: fires when rocketmq_memory_used / rocketmq_memory_total
      # stays above 0.85 (85%) for 10 minutes.
      - alert: RocketMQMemoryUsageHigh
        expr: (rocketmq_memory_used / rocketmq_memory_total) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 内存使用率过高 - {{ $labels.instance }}"
          description: "实例 {{ $labels.instance }} 内存使用率超过85%"

      # Disk usage: fires when rocketmq_disk_used / rocketmq_disk_total stays
      # above 0.80 (80%) for 10 minutes.
      - alert: RocketMQDiskUsageHigh
        expr: (rocketmq_disk_used / rocketmq_disk_total) > 0.80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 磁盘使用率过高 - {{ $labels.instance }}"
          description: "实例 {{ $labels.instance }} 磁盘使用率超过80%"