/etc/prometheus/alert_rules.yml > alert.rules
|
Labels |
State |
Active Since |
Value |
alertname="AptUpdatePending"
instance="localhost:9100"
severity="info"
|
firing |
2025-04-01 09:54:53.098519178 +0000 UTC |
20 |
Annotations |
- summary
- localhost:9100 a 20 paquets en attente de mise à jour !
|
|
Labels |
State |
Active Since |
Value |
alertname="SystemdServiceFailed"
instance="localhost:9100"
job="node"
name="nvmf-autoconnect.service"
severity="warning"
state="failed"
type="oneshot"
|
firing |
2025-03-17 16:52:53.098519178 +0000 UTC |
1 |
Annotations |
- summary
- nvmf-autoconnect.service a échoué sur localhost:9100
|
alertname="SystemdServiceFailed"
instance="localhost:9100"
job="node"
name="openipmi.service"
severity="warning"
state="failed"
type="forking"
|
firing |
2025-03-17 16:52:53.098519178 +0000 UTC |
1 |
Annotations |
- summary
- openipmi.service a échoué sur localhost:9100
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fires when a target's `up` series has been 0 for 10 minutes without
# interruption.
alert: InstanceDown
expr: up == 0
for: 10m
labels:
  severity: critical
annotations:
  # Must stay quoted: the text contains " : ", which would otherwise be read
  # as a mapping separator.
  summary: 'Federate : {{ $labels.instance }} est invisible depuis plus de 10 minutes !'
|
# Fires when the 1-minute load average (node_load1) stays above 3 for
# 10 minutes.
# NOTE(review): the threshold is an absolute value, not scaled by CPU count -
# confirm that 3 suits every scraped host.
alert: LoadUsage
expr: node_load1 > 3
for: 10m
labels:
  severity: warning
annotations:
  # Quoted so the templated text is always loaded as a plain string.
  summary: 'La charge de {{ $labels.instance }} est à {{ $value }} !'
|
|
|
|
# Fires when smartctl_device_critical_warning stays above 0 for 15 minutes.
alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar; "|-" keeps the line breaks and drops the trailing
  # newline.
  # NOTE(review): any extra indentation inside this text was lost in the dump
  # this rule was recovered from - confirm against the deployed rule file.
  description: |-
    device has critical warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart critical warning (instance {{ $labels.instance }})
|
# Fires when smartctl_device_temperature stays above 80 for 2 minutes.
# NOTE(review): the unit of the metric is not visible here (presumably
# degrees Celsius) - confirm against the exporter.
alert: SmartDeviceTemperatureCritical
expr: smartctl_device_temperature > 80
for: 2m
labels:
  severity: critical
annotations:
  # Literal block scalar; "|-" keeps the line breaks and drops the trailing
  # newline.
  # NOTE(review): any extra indentation inside this text was lost in the dump
  # this rule was recovered from - confirm against the deployed rule file.
  description: |-
    Device temperature critical (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature critical (instance {{ $labels.instance }})
|
# Fires when smartctl_device_temperature stays above 60 for 2 minutes.
# NOTE(review): the expression has no upper bound, so it also holds for any
# value that exceeds a higher threshold - confirm whether the overlap with
# other temperature alerts is handled downstream (e.g. by inhibition).
alert: SmartDeviceTemperatureWarning
expr: smartctl_device_temperature > 60
for: 2m
labels:
  severity: warning
annotations:
  # Literal block scalar; "|-" keeps the line breaks and drops the trailing
  # newline.
  # NOTE(review): any extra indentation inside this text was lost in the dump
  # this rule was recovered from - confirm against the deployed rule file.
  description: |-
    Device temperature warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature warning (instance {{ $labels.instance }})
|
# Fires when smartctl_device_media_errors stays above 0 for 15 minutes.
alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar; "|-" keeps the line breaks and drops the trailing
  # newline.
  # NOTE(review): any extra indentation inside this text was lost in the dump
  # this rule was recovered from - confirm against the deployed rule file.
  description: |-
    device has media errors (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart media errors (instance {{ $labels.instance }})
|
|
|
|
|