Hi All,
This issue has been bothering me for quite some time: I'm getting a high number of stale passive check alerts, which suggests that some passive check results are not being processed. I currently have 6596 incoming passive check results every 5 minutes. The rest of the relevant configuration is as follows:
define service{
        name                            template_passive
        active_checks_enabled           0
        passive_checks_enabled          1
        parallelize_check               0
        obsess_over_service             0
        check_freshness                 1
        freshness_threshold             600
        check_command                   check_stale_passive
        notifications_enabled           1
        event_handler_enabled           0
        flap_detection_enabled          1
        failure_prediction_enabled      0
        process_perf_data               0
        retain_status_information       1
        retain_nonstatus_information    1
        is_volatile                     0
        check_period                    24x7
        max_check_attempts              1
        normal_check_interval           1
        retry_check_interval            1
        contact_groups                  admin
        notification_options            c
        notification_interval           0
        notification_period             24x7
        register                        0
        }
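For reference, the results themselves are injected through the external command file in the usual PROCESS_SERVICE_CHECK_RESULT format, roughly like this (host/service names made up here; the command file path is the real one from my setup):

now=$(date +%s)
printf "[%s] PROCESS_SERVICE_CHECK_RESULT;somehost;some_service;0;OK - all good\n" "$now" \
    > /var/log/nagios/rw/nagios.cmd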
nagios.cfg
max_check_result_reaper_time=15
check_result_reaper_frequency=5
service_freshness_check_interval=780
host_freshness_check_interval=90
status_update_interval=20
check_external_commands=1
command_check_interval=-1
external_command_buffer_slots=8192
event_broker_options=-1
use_syslog=0
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_external_commands=1
log_passive_checks=1
max_service_check_spread=30
max_host_check_spread=30
max_concurrent_checks=0
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
sleep_time=0.125
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
perfdata_timeout=5
retain_state_information=1
retention_update_interval=60
use_retained_program_state=0
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=1
enable_event_handlers=1
process_performance_data=0
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
check_host_freshness=1
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
p1_file=/usr/local/nagios/sbin/p1.pl
enable_embedded_perl=1
use_embedded_perl_implicitly=1
use_regexp_matching=1
use_true_regexp_matching=0
daemon_dumps_core=0
use_large_installation_tweaks=1
enable_environment_macros=0
free_child_process_memory=0
child_processes_fork_twice=0
debug_level=0
debug_verbosity=1
max_debug_file_size=1000000
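Since external_command_buffer_slots is set to 8192, one thing I plan to check is whether the command buffer ever fills up. Something like this with nagiostats (variable names taken from the MRTG examples in the Nagios docs; adjust the path if nagiostats lives elsewhere in your install):

/usr/local/nagios/bin/nagiostats --mrtg --data=USEDCMDBUF,HIGHCMDBUF,TOTCMDBUF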
My current situation: Nagios fails to process an average of roughly 600 of the 6596 passive check results submitted every 5 minutes.
I admit I don't know Nagios all that well; I only started installing and using it recently, and I don't know where or how to start troubleshooting this. I did install MRTG and did a good amount of trial and error with the config, especially max_check_result_reaper_time and check_result_reaper_frequency, but increasing or decreasing those values only made things worse.
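One thing I am thinking of trying next is to watch the check result spool while results come in, to see whether files pile up faster than the reaper drains them (this assumes the default check_result_path, which I have not overridden in nagios.cfg):

watch -n 5 'ls /usr/local/nagios/var/spool/checkresults | wc -l'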
In the meantime, this pstree output looks like a reasonable starting point:
[root@foobar nagios]# pstree -cpG | grep nagios
 |-nagios(7943)---{nagios}(7944)
[root@foobar tmp]# strace -s50 -p 7944
Process 7944 attached - interrupt to quit
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN}], 1, 500) = 0
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291780] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291780] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooaptm"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooaptm"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 93
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
[root@foobar tmp]# ls -l /proc/7944/fd
total 0
lr-x------ 1 root root 64 Nov 21 13:14 0 -> /dev/null
l-wx------ 1 root root 64 Nov 21 13:14 1 -> /dev/null
l-wx------ 1 root root 64 Nov 21 13:14 2 -> /dev/null
lrwx------ 1 root root 64 Nov 21 13:14 3 -> /var/run/nagios.pid
lrwx------ 1 root root 64 Nov 21 13:14 4 -> /var/log/nagios/rw/nagios.cmd
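Since fd 4 is the command pipe, I was also thinking of capturing a few minutes of the trace to count how many results the worker thread actually reads off the pipe, e.g. (this undercounts when several commands arrive in a single read(), so it is only a rough lower bound):

strace -s500 -p 7944 -o /tmp/cmdpipe.trace &
sleep 300
kill %1
grep -o PROCESS_SERVICE_CHECK_RESULT /tmp/cmdpipe.trace | wc -l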
The "EAGAIN (Resource temporarily unavailable)" results from the read() calls above: is this normal?
If yes, what kind of output do I need to produce in order to verify or abandon my gut feeling that Nagios is not processing all results?
If no, any suggestions on how to attack the problem?
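The only verification idea I have come up with so far is to compare what goes in against what gets processed: since log_passive_checks=1, every processed passive result should (if I understand the docs right) show up as a "PASSIVE SERVICE CHECK:" line in the main log. So for a given 5-minute window, something like

grep -c 'PASSIVE SERVICE CHECK:' /var/log/nagios/nagios.log

compared against the number of results submitted in the same window. (The log path is a guess based on my layout; log_file in nagios.cfg has the real one.)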
Thank you in advance.
Regards,
Marc
server specs:
[root@foobar tmp]# cat /etc/*release
Red Hat Enterprise Linux Server release 5.1 (Tikanga)
[root@foobar tmp]# free -m
             total       used       free     shared    buffers     cached
Mem:         31905      23681       8224          0        553      15672
8 CPUs (the last entry from /proc/cpuinfo):
processor : 7
vendor_id : AuthenticAMD
cpu family : 15
model : 33
model name : AMD Opteron(tm) Processor 880
stepping : 2
cpu MHz : 2400.000
cache size : 1024 KB
[root@foobar tmp]# /usr/local/nagios/sbin/nagios -v /etc/nagios/nagios.cfg
Nagios 3.0.3
Copyright (c) 1999-2008 Ethan Galstad (nagios.org)
Last Modified: 06-25-2008
License: GPL
Reading configuration data…
Running pre-flight check on configuration data…
Checking services…
Checked 7491 services.
Checking hosts…
Checked 460 hosts.
Checking host groups…
Checked 30 host groups.
Checking service groups…
Checked 0 service groups.
Checking contacts…
Checked 3 contacts.
Checking contact groups…
Checked 3 contact groups.
Checking service escalations…
Checked 0 service escalations.
Checking service dependencies…
Checked 0 service dependencies.
Checking host escalations…
Checked 0 host escalations.
Checking host dependencies…
Checked 0 host dependencies.
Checking commands…
Checked 28 commands.
Checking time periods…
Checked 6 time periods.
Checking for circular paths between hosts…
Checking for circular host and service dependencies…
Checking global event handlers…
Checking obsessive compulsive processor commands…
Checking misc settings…
Total Warnings: 0
Total Errors: 0
Things look okay - No serious problems were detected during the pre-flight check