---
# RocketMQ alert rules for Prometheus.
#
# Defines a single rule group, `rocketmq`, evaluated every 30 seconds.
# Every rule carries a `severity` label (`critical` or `warning`).
# Annotation text is user-facing and is intentionally left in Chinese.
groups:
  - name: rocketmq
    interval: 30s
    rules:
      # NameServer availability: the scrape target of job
      # `rocketmq-namesrv` has reported `up == 0` for 1 minute.
      - alert: RocketMQNameServerDown
        expr: up{job="rocketmq-namesrv"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "RocketMQ NameServer 实例 {{ $labels.instance }} 宕机"
          description: "RocketMQ NameServer 已经宕机超过1分钟"

      # Broker availability: the scrape target of job
      # `rocketmq-broker` has reported `up == 0` for 1 minute.
      - alert: RocketMQBrokerDown
        expr: up{job="rocketmq-broker"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "RocketMQ Broker {{ $labels.instance }} 宕机"
          description: "RocketMQ Broker 已经宕机超过1分钟"

      # Consumer backlog: production throughput exceeds consumption
      # throughput by more than 1000 for 5 minutes. The left-hand
      # `rocketmq_consumer_tps > 0` guard restricts the alert to series
      # that are actively consuming and supplies the labels used in the
      # annotations.
      # NOTE(review): the subtraction and the `and` use one-to-one vector
      # matching, so both metrics must expose identical label sets; if the
      # exporter adds labels such as `group` only to the consumer metric,
      # add an explicit `on(...)` / `group_left` clause - confirm against
      # the exporter's label schema.
      # NOTE(review): if the exporter publishes a direct backlog gauge
      # (e.g. a consumer-offset difference metric), alerting on that is
      # presumably more reliable than comparing TPS - confirm.
      - alert: RocketMQConsumerLag
        expr: >-
          rocketmq_consumer_tps > 0
          and (rocketmq_producer_tps - rocketmq_consumer_tps) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 消费堆积 - {{ $labels.group }}"
          description: "消费者组 {{ $labels.group }} 在主题 {{ $labels.topic }} 上消费堆积"

      # Producer send failures: the per-second rate of
      # `rocketmq_producer_send_failed_total`, averaged over 5 minutes,
      # has stayed above 10 for 5 minutes.
      - alert: RocketMQProducerSendFailed
        expr: rate(rocketmq_producer_send_failed_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 生产者发送失败率升高"
          description: "消息发送失败率超过阈值"

      # Memory usage: used/total ratio has stayed above 0.85 for 10 minutes.
      - alert: RocketMQMemoryUsageHigh
        expr: (rocketmq_memory_used / rocketmq_memory_total) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 内存使用率过高 - {{ $labels.instance }}"
          description: "实例 {{ $labels.instance }} 内存使用率超过85%"

      # Disk usage: used/total ratio has stayed above 0.80 for 10 minutes.
      - alert: RocketMQDiskUsageHigh
        expr: (rocketmq_disk_used / rocketmq_disk_total) > 0.80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "RocketMQ 磁盘使用率过高 - {{ $labels.instance }}"
          description: "实例 {{ $labels.instance }} 磁盘使用率超过80%"