Alerts

Inactive (27) Pending (0) Firing (1)

/etc/prometheus/alert_rules.yml > alert.rules

SystemdServiceFailed (2 active)

# Warning: a node_systemd_unit_state series with state="failed" has been
# equal to 1 for 10 minutes. The summary names the unit and the instance.
alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
  severity: warning
annotations:
  summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

Labels	State	Active Since	Value
alertname="SystemdServiceFailed" instance="localhost:9100" job="node" name="openipmi.service" severity="warning" state="failed" type="forking"	firing	2025-06-23 00:26:53 +0000 UTC	1
Annotations
summary openipmi.service a échoué sur localhost:9100
alertname="SystemdServiceFailed" instance="localhost:9100" job="node" name="nvmf-autoconnect.service" severity="warning" state="failed" type="oneshot"	firing	2025-06-23 00:26:53 +0000 UTC	1
Annotations
summary nvmf-autoconnect.service a échoué sur localhost:9100

AptUpdatePending (0 active)

# Info: apt_upgrades_pending, summed per instance, has been above 5 for
# 2 minutes.
alert: AptUpdatePending
expr: sum by (instance) (apt_upgrades_pending) > 5
for: 2m
labels:
  severity: info
annotations:
  summary: >-
    {{ $labels.instance }} a {{ $value }} paquets en attente de mise à jour
    !

CertificateCritical (0 active)

# Critical: the difference between probe_ssl_earliest_cert_expiry and the
# current time, converted to days (24 * 3600 seconds), has been 2 or less
# for 5 minutes.
alert: CertificateCritical
expr: (probe_ssl_earliest_cert_expiry - time()) / (24 * 3600) <= 2
for: 5m
labels:
  severity: critical
annotations:
  summary: >-
    Le certificat SSL de {{ $labels.instance }} arrive à échéance dans
    {{ humanize $value }} jours !

CertificateWarning (0 active)

# Warning: the number of days until probe_ssl_earliest_cert_expiry has been
# below 10 and above 2 for 5 minutes. The lower bound keeps this rule from
# matching the range covered by the "<= 2" critical rule.
alert: CertificateWarning
expr: >-
  ((probe_ssl_earliest_cert_expiry - time()) / (24 * 3600) < 10
  and (probe_ssl_earliest_cert_expiry - time()) / (24 * 3600) > 2)
for: 5m
labels:
  severity: warning
annotations:
  summary: >-
    Le certificat SSL de {{ $labels.instance }} arrive à échéance dans
    {{ humanize $value }} jours !

CpuUsage (0 active)

# Warning: per-instance CPU usage has been above 75% for 10 minutes.
# Usage is computed as 100 minus the average idle fraction (x100) across the
# node_cpu_seconds_total{mode="idle"} series of each instance.
#
# rate() is used instead of irate(): irate() only looks at the last two
# samples of the range, so a single short dip below the threshold would
# reset the 10-minute "for" timer. rate() averages over the whole 5m window,
# which is the form the Prometheus documentation recommends for alerts.
alert: CpuUsage
expr: >-
  (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 75
for: 10m
labels:
  severity: warning
annotations:
  summary: 'CPU sur {{ $labels.instance }} à {{ humanize $value }}%.'

DHT22HighHumidity (0 active)

# Warning: dht22_humidity_percent has been above 80 for 5 minutes.
alert: DHT22HighHumidity
expr: dht22_humidity_percent > 80
for: 5m
labels:
  severity: warning
annotations:
  # Folded block scalar: the text contains an apostrophe ("L'humidité") and
  # a ": " sequence, so it can be neither single-quoted as-is nor left plain.
  description: >-
    L'humidité mesurée par DHT22 est supérieure à 80% depuis plus
    de 5 minutes (valeur actuelle: {{ $value }}%).
  summary: Humidité élevée détectée

DHT22LowHumidity (0 active)

# Warning: dht22_humidity_percent has been below 20 for 5 minutes.
alert: DHT22LowHumidity
expr: dht22_humidity_percent < 20
for: 5m
labels:
  severity: warning
annotations:
  # Folded block scalar: the text contains an apostrophe ("L'humidité") and
  # a ": " sequence, so it can be neither single-quoted as-is nor left plain.
  description: >-
    L'humidité mesurée par DHT22 est inférieure à 20% depuis plus
    de 5 minutes (valeur actuelle: {{ $value }}%).
  summary: Humidité basse détectée

EndpointDown (0 active)

# Critical: probe_success has been 0 for 10 minutes.
alert: EndpointDown
expr: probe_success == 0
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Site {{ $labels.instance }} inaccessible !'

EndpointError (0 active)

# Critical: probe_http_status_code has been something other than 0, 200 or
# 401 for 10 minutes. The alert value is the status code itself (left-hand
# side of the "and" chain), which the summary prints.
alert: EndpointError
expr: >-
  (probe_http_status_code != 0
  and probe_http_status_code != 200
  and probe_http_status_code != 401)
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Site {{ $labels.instance }} retourne une erreur HTTP {{ $value }} !'

EnvironmentalTemperature (0 active)

# Warning (low side): dht22_temperature_c has been at or below 15 for
# 10 minutes.
alert: EnvironmentalTemperature
expr: dht22_temperature_c <= 15
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Température environmentale sur {{ $labels.instance }} à {{ $value }}° !'

EnvironmentalTemperature (0 active)

# Warning (high side): dht22_temperature_c has been at or above 25 for
# 10 minutes.
alert: EnvironmentalTemperature
expr: dht22_temperature_c >= 25
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Température environmentale sur {{ $labels.instance }} à {{ $value }}° !'

EnvironmentalTemperature (0 active)

# Critical (high side): dht22_temperature_c has been at or above 35 for
# 10 minutes.
alert: EnvironmentalTemperature
expr: dht22_temperature_c >= 35
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Température environmentale sur {{ $labels.instance }} à {{ $value }}° !'

EnvironmentalTemperature (0 active)

# Critical (low side): dht22_temperature_c has been at or below 10 for
# 10 minutes.
alert: EnvironmentalTemperature
expr: dht22_temperature_c <= 10
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Température environmentale sur {{ $labels.instance }} à {{ $value }}° !'

HostRaidArrayGotInactive (0 active)

# Critical: a node_md_state series with state="inactive" is above 0.
# There is no "for" clause, so no pending duration is configured.
# The result is joined with node_uname_info on instance; group_left copies
# the nodename label onto the alert.
alert: HostRaidArrayGotInactive
expr: >-
  (node_md_state{state="inactive"} > 0)
  * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
labels:
  severity: critical
annotations:
  description: |-
    RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Host RAID array got inactive (instance {{ $labels.instance }})'

HostRaidDiskFailure (0 active)

# Warning: a node_md_disks series with state="failed" has been above 0 for
# 2 minutes. The result is joined with node_uname_info on instance;
# group_left copies the nodename label onto the alert.
alert: HostRaidDiskFailure
expr: >-
  (node_md_disks{state="failed"} > 0)
  * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
  severity: warning
annotations:
  # The array name is read from the "device" label, the same label the
  # HostRaidArrayGotInactive rule uses for node_md_* metrics. The previous
  # "md_device" label rendered as an empty string.
  # NOTE(review): confirm no relabeling renames "device" in this setup.
  description: |-
    At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Host RAID disk failure (instance {{ $labels.instance }})'

InstanceDown (0 active)

# Critical: the "up" series of a scrape target has been 0 for 10 minutes.
alert: InstanceDown
expr: up == 0
for: 10m
labels:
  severity: critical
annotations:
  # Folded block scalar because the text contains " : ".
  summary: >-
    Federate : {{ $labels.instance }} est invisible depuis plus de 10 minutes
    !

LoadUsage (0 active)

# Warning: node_load1 has been above 3 for 10 minutes.
alert: LoadUsage
expr: node_load1 > 3
for: 10m
labels:
  severity: warning
annotations:
  summary: 'La charge de {{ $labels.instance }} est à {{ $value }} !'

OutOfDiskSpace (0 active)

# Warning: on ext4 filesystems, free bytes as a percentage of total size
# has been below 10 for 5 minutes.
alert: OutOfDiskSpace
expr: >-
  node_filesystem_free_bytes{fstype="ext4"}
  / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  # NOTE(review): this summary reads the "exported_instance" label while the
  # sibling rules read "instance" — confirm that label exists on these series.
  summary: >-
    Espace libre de {{ $labels.mountpoint }} sur
    {{ $labels.exported_instance }} à {{ humanize $value }}%.

OutOfInodes (0 active)

# Warning: on ext4 filesystems, free inodes as a percentage of total inodes
# has been below 10 for 5 minutes.
alert: OutOfInodes
expr: >-
  node_filesystem_files_free{fstype="ext4"}
  / node_filesystem_files{fstype="ext4"} * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  # Folded block scalar: the text contains an apostrophe ("d'inodes") and
  # " : ", so it can be neither single-quoted as-is nor left plain.
  summary: >-
    Federate : Presque plus d'inodes disponibles ({{ $value }}% restant)
    dans {{ $labels.mountpoint }} sur {{ $labels.instance }}.

OutOfMemory (0 active)

# Warning: (MemFree + Cached + Buffers) as a percentage of MemTotal has
# been below 10 for 5 minutes.
alert: OutOfMemory
expr: >-
  (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
  / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  # Folded block scalar because the text contains " : ".
  summary: >-
    Federate : Mémoire libre de {{ $labels.instance }} à
    {{ humanize $value }}%.

SmartCriticalWarning (0 active)

# Critical: smartctl_device_critical_warning has been above 0 for
# 15 minutes.
alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    device has critical warning (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Smart critical warning (instance {{ $labels.instance }})'

SmartDeviceTemperatureCritical (0 active)

# Critical: smartctl_device_temperature has been above 80 for 2 minutes.
alert: SmartDeviceTemperatureCritical
expr: smartctl_device_temperature > 80
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Device temperature critical  (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Smart device temperature critical (instance {{ $labels.instance }})'

SmartDeviceTemperatureWarning (0 active)

# Warning: smartctl_device_temperature has been above 60 for 2 minutes.
alert: SmartDeviceTemperatureWarning
expr: smartctl_device_temperature > 60
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Device temperature  warning (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Smart device temperature warning (instance {{ $labels.instance }})'

SmartMediaErrors (0 active)

# Critical: smartctl_device_media_errors has been above 0 for 15 minutes.
alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    device has media errors (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Smart media errors (instance {{ $labels.instance }})'

SmartNvmeWearoutIndicator (0 active)

# Critical: for devices whose name matches "nvme.*", the available-spare
# value has been below its reported threshold for 15 minutes.
alert: SmartNvmeWearoutIndicator
expr: >-
  smartctl_device_available_spare{device=~"nvme.*"}
  < smartctl_device_available_spare_threshold{device=~"nvme.*"}
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    NVMe device is wearing out (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Smart NVME Wearout Indicator (instance {{ $labels.instance }})'

TemperatureCritical (0 active)

# Critical: node_hwmon_temp_celsius has been above 55 for 5 minutes.
alert: TemperatureCritical
expr: node_hwmon_temp_celsius > 55
for: 5m
labels:
  severity: critical
annotations:
  # The sensor name is wrapped in a balanced pair of parentheses; the
  # previous text opened "(" without ever closing it.
  summary: >-
    La température de {{ $labels.instance }} est à {{ $value }}°C
    ({{ $labels.sensor }}) !

TemperatureWarning (0 active)

# Warning: node_hwmon_temp_celsius has been above 45 for 5 minutes.
alert: TemperatureWarning
expr: node_hwmon_temp_celsius > 45
for: 5m
labels:
  severity: warning
annotations:
  # The sensor name is wrapped in a balanced pair of parentheses; the
  # previous text opened "(" without ever closing it.
  summary: >-
    La température de {{ $labels.instance }} est à {{ $value }}°C
    ({{ $labels.sensor }}) !

UnhealthyDisk (0 active)

# Critical: smartmon_device_smart_healthy has been below 1 for 10 minutes.
alert: UnhealthyDisk
expr: smartmon_device_smart_healthy < 1
for: 10m
labels:
  severity: critical
annotations:
  # Double-quoted because the text contains an apostrophe ("n'est").
  summary: "Disque {{ $labels.disk }} sur {{ $labels.instance }} n'est pas en bonne santé !"