# selfhost/monitoring/telegraf_host.conf

# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply surround
# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"),
# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR})

# Config Sample under https://github.com/influxdata/telegraf/blob/master/etc/telegraf.conf

# Global tags can be specified here in key="value" format.
[global_tags]
  ## Datacenter the metrics originate from.
  dc = "florianzirker.de"
  ## Source tag: identifies the telegraf instance configured by this file.
  source = "telegraf_host"

# Configuration for telegraf agent
[agent]
  ## How often every input is collected by default.
  interval = "10s"
  ## Align collection times to multiples of 'interval'
  ## (with interval = "10s": collect on :00, :10, :20, etc.).
  round_interval = true

  ## Upper bound on the number of metrics sent to an output plugin in a
  ## single write; outputs receive metrics in batches of at most this size.
  metric_batch_size = 1000

  ## Per-output cap on metrics that have not been written yet. A larger
  ## buffer tolerates longer output downtime without dropping metrics, at
  ## the cost of higher maximum memory usage.
  metric_buffer_limit = 10000

  ## Each plugin sleeps for a random time within this jitter before
  ## collecting, so that many plugins do not query things like sysfs at the
  ## same moment, which can have a measurable effect on the system.
  collection_jitter = "0s"

  ## How often all outputs are flushed by default. The maximum effective
  ## interval is flush_interval + flush_jitter.
  flush_interval = "10s"
  ## Random amount added to the flush interval, mainly to avoid large write
  ## spikes when a large number of telegraf instances are running.
  ## Example: a jitter of 5s with an interval of 10s flushes every 10-15s.
  flush_jitter = "0s"

  ## Timestamp precision. By default, or when set to "0s", it follows the
  ## timestamp order of the collection interval, with a maximum of 1s:
  ##   interval = "10s"   -> precision "1s"
  ##   interval = "250ms" -> precision "1ms"
  ## Service inputs do NOT use this setting; each service input sets its
  ## timestamps at the appropriate precision itself.
  ## Valid time units are "ns", "us" (or "µs"), "ms", "s".
  precision = ""

  ## Hostname to report instead of the default; if empty, os.Hostname() is used.
  hostname = "wong"
  ## If set to true, the telegraf agent does not set the "host" tag.
  omit_hostname = false


###############################################################################
#                            OUTPUT PLUGINS                                   #
###############################################################################

# Configuration for sending metrics to InfluxDB
[[outputs.influxdb]]
  ## Required: the full HTTP or UDP URL of the InfluxDB instance.
  ## Several URLs may be listed for a single cluster; only ONE of them is
  ## written to each interval.
  urls = ["https://influxdb.florianzirker.de"]
  # urls = ["unix:///var/run/influxdb.sock"]
  # urls = ["udp://127.0.0.1:8089"]
  # urls = ["http://127.0.0.1:8086"]

  ## HTTP Basic Auth credentials, read from environment variables.
  username = "${INFLUXDB_HTTP_BASIC_AUTH_USER}"
  password = "${INFLUXDB_HTTP_BASIC_AUTH_PASSWORD}"


###############################################################################
#                            INPUT PLUGINS                                    #
###############################################################################


# Read metrics about cpu usage
[[inputs.cpu]]
  ## Report stats for each individual CPU.
  percpu = true
  ## Report stats for the system's CPUs as a total.
  totalcpu = true
  ## When true, raw CPU time metrics are collected; disabled here.
  collect_cpu_time = false
  ## When true, the sum of all non-idle CPU states is computed and
  ## reported; disabled here.
  report_active = false


# Read metrics about disk usage by mount point
[[inputs.disk]]
  ## Restrict stats to these mount points. Without this setting, stats
  ## would be gathered for all mount points.
  mount_points = ["/hostfs", "/hostfs/boot"]

  ## Filesystem types whose mount points are ignored.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]


# Read metrics about disk IO by device
[[inputs.diskio]]
  ## Stats are gathered for all devices, disk partitions included, unless
  ## 'devices' restricts collection to the devices listed.
  # devices = ["sda", "sdb", "vd*"]
  ## Uncomment the next line if disk serial numbers are needed.
  # skip_serial_number = false
  #
  ## Device metadata can be attached as tags on systems that support it.
  ## At present only Linux is supported, through udev properties. To see
  ## which properties a device offers, run:
  ## 'udevadm info -q property -n /dev/sda'
  ## Note: most udev properties can be accessed this way, but not all;
  ## DEVTYPE, DEVNAME, and DEVPATH are among those currently inaccessible.
  # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]
  #
  ## The device name can also be customized through templates, which use the
  ## same metadata source as device_tags.
  ## 'name_templates' is a list of templates to try to apply to the device.
  ## A template may contain variables written as '$PROPERTY' or
  ## '${PROPERTY}'. The device name tag comes from the first template that
  ## contains no variable missing for the device.
  ## Typical use case: LVM volumes, to get the VG/LV name instead of the
  ## near-meaningless DM-0 name.
  # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]


# Get kernel statistics from /proc/stat
[[inputs.kernel]]
  ## Enabled as-is; no options are set for this plugin.


# Read metrics about memory usage
[[inputs.mem]]
  ## Enabled as-is; no options are set for this plugin.


# Get the number of processes and group them by status
[[inputs.processes]]
  ## Enabled as-is; no options are set for this plugin.


# Read metrics about swap memory usage
[[inputs.swap]]
  ## Enabled as-is; no options are set for this plugin.


# Read metrics about system load & uptime
[[inputs.system]]
  ## Deprecated metrics can be removed by uncommenting the next line.
  # fielddrop = ["uptime_format"]


# Read metrics about docker containers
[[inputs.docker]]
  ## Docker endpoint.
  ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
  ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
  endpoint = "unix:///var/run/docker.sock"

  ## Set to true to collect Swarm metrics (desired_replicas, running_replicas).
  ## Note: enable this on only one of the manager nodes in a Swarm cluster;
  ## enabling it on multiple Swarm managers results in duplicated metrics.
  gather_services = false

  ## When true, the source tag of the metrics is set to the container ID
  ## hostname (eg first 12 chars).
  source_tag = false

  ## Containers to include and exclude by name. Collect all if empty.
  ## Globs accepted.
  ## Use these instead of the 'container_names' option, which has been
  ## deprecated since 1.4.0.
  container_name_include = []
  container_name_exclude = []

  ## Container states to include and exclude. Globs accepted.
  ## When empty, only containers in the "running" state are captured.
  ## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
  ## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
  # container_state_include = []
  # container_state_exclude = []

  ## Timeout for docker list, info, and stats commands.
  timeout = "5s"

  ## Legacy switch: whether to report, for each container, per-device blkio
  ## (8:0, 8:1...), network (eth0, eth1, ...) and cpu (cpu0, cpu1, ...) stats.
  ## Its use is discouraged since it will be deprecated in favor of
  ## 'perdevice_include'. The default is 'true' for backwards compatibility;
  ## it is set to 'false' here so that 'perdevice_include' is honored.
  perdevice = false

  ## Classes for which a per-device metric is issued.
  ## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...)
  ## and 'network' (eth0, eth1, ...).
  ## This setting has no effect if 'perdevice' is set to 'true'.
  perdevice_include = ["cpu", "blkio", "network"]

  ## Legacy switch: whether to report, for each container, total blkio and
  ## network stats.
  ## Its use is discouraged since it will be deprecated in favor of
  ## 'total_include'. The default is 'false' for backwards compatibility;
  ## it is set to 'true' here so that 'total_include' is honored.
  total = true

  ## Classes for which a total metric is issued. A total is an aggregate of
  ## the 'perdevice' values.
  ## Possible values are 'cpu', 'blkio' and 'network'.
  ## Total 'cpu' is reported directly by the Docker daemon; 'network' and
  ## 'blkio' totals are aggregated by this plugin.
  ## This setting has no effect if 'total' is set to 'false'.
  total_include = ["cpu", "blkio", "network"]

  ## Docker labels to include and exclude as tags. Globs accepted.
  ## Note that an empty array for both includes all labels as tags.
  docker_label_include = []
  docker_label_exclude = []

  ## Environment variables to use as tags.
  tag_env = ["JAVA_HOME", "HEAP_SIZE"]

  ## Optional TLS config.
  # tls_ca = "/etc/telegraf/ca.pem"
  # tls_cert = "/etc/telegraf/cert.pem"
  # tls_key = "/etc/telegraf/key.pem"
  ## Use TLS but skip chain & host verification.
  # insecure_skip_verify = false


# Gather metrics about network interfaces
[[inputs.net]]
  ## Interfaces to gather explicitly, regardless of their status; glob-style
  ## patterns are supported. Without this setting, telegraf gathers stats
  ## from any up interface (excluding loopback).
  interfaces = ["eth*"]

  ## On linux systems telegraf also collects protocol stats. Setting
  ## ignore_protocol_stats to true skips the reporting of protocol metrics.
  # ignore_protocol_stats = false


# Read TCP metrics such as established, time wait and sockets counts.
[[inputs.netstat]]
  ## Enabled as-is; no options are set for this plugin.


# Collect kernel snmp counters and network interface statistics
[[inputs.nstat]]
  ## Paths of the proc files to read. If left empty, the default paths are used:
  ##    /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6
  ## Env variables can override these as well, see README.
  proc_net_netstat = "/proc/net/netstat"
  proc_net_snmp = "/proc/net/snmp"
  proc_net_snmp6 = "/proc/net/snmp6"
  ## Metrics with a value of 0 are dumped too.
  dump_zeros = true
Monitoring with telegraf, influxdb, grafana: - Influxdb (1.8) behind traefik proxy so that telegraf can run in host_mode. Secured with HTTP basic auth - Grafana (7.4) with Postgres database for beautiful dashboards - Telegraf (1.18) in docker host mode with local configuration file - Added docker.group to all other docker services to track them in monitoring 2021-03-28 13:50:02 +02:00			`# Telegraf Configuration`
			`#`
			`# Telegraf is entirely plugin driven. All metrics are gathered from the`
			`# declared inputs, and sent to the declared outputs.`
			`#`
			`# Plugins must be declared in here to be active.`
			`# To deactivate a plugin, comment out the name and any variables.`
			`#`
			`# Use 'telegraf -config telegraf.conf -test' to see what metrics a config`
			`# file would generate.`
			`#`
			`# Environment variables can be used anywhere in this config file, simply surround`
			`# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"),`
			`# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR})`

			`# Config Sample under https://github.com/influxdata/telegraf/blob/master/etc/telegraf.conf`

			`# Global tags can be specified here in key="value" format.`
			`[global_tags]`
			`# datacenter`
			`dc="florianzirker.de"`
Monitoring for jitsi meet - Therefore split telegraf into two instances: one for host and docker, and the other for network queries (network monitoring) 2021-03-28 15:47:32 +02:00			`source="telegraf_host"`
Monitoring with telegraf, influxdb, grafana: - Influxdb (1.8) behind traefik proxy so that telegraf can run in host_mode. Secured with HTTP basic auth - Grafana (7.4) with Postgres database for beautiful dashboards - Telegraf (1.18) in docker host mode with local configuration file - Added docker.group to all other docker services to track them in monitoring 2021-03-28 13:50:02 +02:00
			`# Configuration for telegraf agent`
			`[agent]`
			`## Default data collection interval for all inputs`
			`interval = "10s"`
			`## Rounds collection interval to 'interval'`
			`## ie, if interval="10s" then always collect on :00, :10, :20, etc.`
			`round_interval = true`

			`## Telegraf will send metrics to outputs in batches of at most`
			`## metric_batch_size metrics.`
			`## This controls the size of writes that Telegraf sends to output plugins.`
			`metric_batch_size = 1000`

			`## Maximum number of unwritten metrics per output. Increasing this value`
			`## allows for longer periods of output downtime without dropping metrics at the`
			`## cost of higher maximum memory usage.`
			`metric_buffer_limit = 10000`

			`## Collection jitter is used to jitter the collection by a random amount.`
			`## Each plugin will sleep for a random time within jitter before collecting.`
			`## This can be used to avoid many plugins querying things like sysfs at the`
			`## same time, which can have a measurable effect on the system.`
			`collection_jitter = "0s"`

			`## Default flushing interval for all outputs. Maximum flush_interval will be`
			`## flush_interval + flush_jitter`
			`flush_interval = "10s"`
			`## Jitter the flush interval by a random amount. This is primarily to avoid`
			`## large write spikes for users running a large number of telegraf instances.`
			`## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s`
			`flush_jitter = "0s"`

			`## By default or when set to "0s", precision will be set to the same`
			`## timestamp order as the collection interval, with the maximum being 1s.`
			`## ie, when interval = "10s", precision will be "1s"`
			`## when interval = "250ms", precision will be "1ms"`
			`## Precision will NOT be used for service inputs. It is up to each individual`
			`## service input to set the timestamp at the appropriate precision.`
			`## Valid time units are "ns", "us" (or "µs"), "ms", "s".`
			`precision = ""`

			`## Override default hostname, if empty use os.Hostname()`
			`hostname = "wong"`
			`## If set to true, do not set the "host" tag in the telegraf agent.`
			`omit_hostname = false`


			`###############################################################################`
			`# OUTPUT PLUGINS #`
			`###############################################################################`

			`# Configuration for sending metrics to InfluxDB`
			`[[outputs.influxdb]]`
			`## The full HTTP or UDP URL for your InfluxDB instance.`
			`##`
			`## Multiple URLs can be specified for a single cluster, only ONE of the`
			`## urls will be written to each interval.`
			`# urls = ["unix:///var/run/influxdb.sock"]`
			`# urls = ["udp://127.0.0.1:8089"]`
			`# urls = ["http://127.0.0.1:8086"]`

			`## HTTP Basic Auth`
			`username = "${INFLUXDB_HTTP_BASIC_AUTH_USER}"`
			`password = "${INFLUXDB_HTTP_BASIC_AUTH_PASSWORD}"`
			`urls = ["https://influxdb.florianzirker.de"] # required`


			`###############################################################################`
			`# INPUT PLUGINS #`
			`###############################################################################`


			`# Read metrics about cpu usage`
			`[[inputs.cpu]]`
			`## Whether to report per-cpu stats or not`
			`percpu = true`
			`## Whether to report total system cpu stats or not`
			`totalcpu = true`
			`## If true, collect raw CPU time metrics.`
			`collect_cpu_time = false`
			`## If true, compute and report the sum of all non-idle CPU states.`
			`report_active = false`


			`# Read metrics about disk usage by mount point`
			`[[inputs.disk]]`
			`## By default stats will be gathered for all mount points.`
			`## Set mount_points will restrict the stats to only the specified mount points.`
			`mount_points = ["/hostfs", "/hostfs/boot"]`

			`## Ignore mount points by filesystem type.`
			`ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]`


			`# Read metrics about disk IO by device`
			`[[inputs.diskio]]`
			`## By default, telegraf will gather stats for all devices including`
			`## disk partitions.`
			`## Setting devices will restrict the stats to the specified devices.`
			`# devices = ["sda", "sdb", "vd*"]`
			`## Uncomment the following line if you need disk serial numbers.`
			`# skip_serial_number = false`
			`#`
			`## On systems which support it, device metadata can be added in the form of`
			`## tags.`
			`## Currently only Linux is supported via udev properties. You can view`
			`## available properties for a device by running:`
			`## 'udevadm info -q property -n /dev/sda'`
			`## Note: Most, but not all, udev properties can be accessed this way. Properties`
			`## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH.`
			`# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]`
			`#`
			`## Using the same metadata source as device_tags, you can also customize the`
			`## name of the device via templates.`
			`## The 'name_templates' parameter is a list of templates to try and apply to`
			`## the device. The template may contain variables in the form of '$PROPERTY' or`
			`## '${PROPERTY}'. The first template which does not contain any variables not`
			`## present for the device is used as the device name tag.`
			`## The typical use case is for LVM volumes, to get the VG/LV name instead of`
			`## the near-meaningless DM-0 name.`
			`# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]`


			`# Get kernel statistics from /proc/stat`
			`[[inputs.kernel]]`
			`# no configuration`


			`# Read metrics about memory usage`
			`[[inputs.mem]]`
			`# no configuration`


			`# Get the number of processes and group them by status`
			`[[inputs.processes]]`
			`# no configuration`


			`# Read metrics about swap memory usage`
			`[[inputs.swap]]`
			`# no configuration`


			`# Read metrics about system load & uptime`
			`[[inputs.system]]`
			`## Uncomment to remove deprecated metrics.`
			`# fielddrop = ["uptime_format"]`



			`# Read metrics about docker containers`
			`[[inputs.docker]]`
			`## Docker Endpoint`
			`## To use TCP, set endpoint = "tcp://[ip]:[port]"`
			`## To use environment variables (ie, docker-machine), set endpoint = "ENV"`
			`endpoint = "unix:///var/run/docker.sock"`

			`## Set to true to collect Swarm metrics(desired_replicas, running_replicas)`
			`## Note: configure this in one of the manager nodes in a Swarm cluster.`
			`## configuring in multiple Swarm managers results in duplication of metrics.`
			`gather_services = false`

			`## Only collect metrics for these containers. Values will be appended to`
			`## container_name_include.`
			`## Deprecated (1.4.0), use container_name_include`
			`container_names = []`

			`## Set the source tag for the metrics to the container ID hostname, eg first 12 chars`
			`source_tag = false`

			`## Containers to include and exclude. Collect all if empty. Globs accepted.`
			`container_name_include = []`
			`container_name_exclude = []`

			`## Container states to include and exclude. Globs accepted.`
			`## When empty only containers in the "running" state will be captured.`
			`## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]`
			`## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]`
			`# container_state_include = []`
			`# container_state_exclude = []`

			`## Timeout for docker list, info, and stats commands`
			`timeout = "5s"`

			`## Whether to report for each container per-device blkio (8:0, 8:1...),`
			`## network (eth0, eth1, ...) and cpu (cpu0, cpu1, ...) stats or not.`
			`## Usage of this setting is discouraged since it will be deprecated in favor of 'perdevice_include'.`
			`## Default value is 'true' for backwards compatibility, please set it to 'false' so that 'perdevice_include' setting`
			`## is honored.`
			`perdevice = false`

			`## Specifies for which classes a per-device metric should be issued`
			`## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...)`
			`## Please note that this setting has no effect if 'perdevice' is set to 'true'`
			`perdevice_include = ["cpu", "blkio", "network"]`

			`## Whether to report for each container total blkio and network stats or not.`
			`## Usage of this setting is discouraged since it will be deprecated in favor of 'total_include'.`
			`## Default value is 'false' for backwards compatibility, please set it to 'true' so that 'total_include' setting`
			`## is honored.`
			`total = true`

			`## Specifies for which classes a total metric should be issued. Total is an aggregated of the 'perdevice' values.`
			`## Possible values are 'cpu', 'blkio' and 'network'`
			`## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin.`
			`## Please note that this setting has no effect if 'total' is set to 'false'`
			`total_include = ["cpu", "blkio", "network"]`

			`## docker labels to include and exclude as tags. Globs accepted.`
			`## Note that an empty array for both will include all labels as tags`
			`docker_label_include = []`
			`docker_label_exclude = []`

			`## Which environment variables should we use as a tag`
			`tag_env = ["JAVA_HOME", "HEAP_SIZE"]`

			`## Optional TLS Config`
			`# tls_ca = "/etc/telegraf/ca.pem"`
			`# tls_cert = "/etc/telegraf/cert.pem"`
			`# tls_key = "/etc/telegraf/key.pem"`
			`## Use TLS but skip chain & host verification`
			`# insecure_skip_verify = false`


			`# Gather metrics about network interfaces`
			`[[inputs.net]]`
			`## By default, telegraf gathers stats from any up interface (excluding loopback)`
			`## Setting interfaces will tell it to gather these explicit interfaces,`
			`## regardless of status. When specifying an interface, glob-style`
			`## patterns are also supported.`
			`##`
			`interfaces = ["eth*"]`
			`##`
			`## On linux systems telegraf also collects protocol stats.`
			`## Setting ignore_protocol_stats to true will skip reporting of protocol metrics.`
			`##`
			`# ignore_protocol_stats = false`
			`##`


			`# Read TCP metrics such as established, time wait and sockets counts.`
			`[[inputs.netstat]]`
			`# no configuration`


			`# Collect kernel snmp counters and network interface statistics`
			`[[inputs.nstat]]`
			`## file paths for proc files. If empty default paths will be used:`
			`## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6`
			`## These can also be overridden with env variables, see README.`
			`proc_net_netstat = "/proc/net/netstat"`
			`proc_net_snmp = "/proc/net/snmp"`
			`proc_net_snmp6 = "/proc/net/snmp6"`
			`## dump metrics with 0 values too`
			`dump_zeros = true`