Service/Host checking pauses during notification process


#1

We’ve got a decent sized Nagios installation running (135 hosts, 650 services) and it’s performing just fine except for when notifications are being sent out.

We have checks running every minute and have optimized the config. Average load for the box is around 4 (it’s a quad core system, so that’s fine). The problem is when notifications are triggered. For some reason mail notifications are taking ~10 seconds to execute and they’re stopping all checks until they’ve completed. We have 7 people set up in a group for notifications.

For example: if one box that has 10 services on it goes down, Nagios sends out 70 emails for those services. Each email is taking ~10 seconds and they’re running serially, so that’s 700 seconds. Until all the emails have gone out, we’re not seeing any checks being performed. This leaves us with a 10-15 minute gap in our monitoring, which isn’t acceptable.

We’re looking into why emails are taking so long to send, but in the meantime is there any way to fork the notifications? It doesn’t make sense for all checks to be delayed because of slowness in notification. Our box is far from overloaded, so it’d be fine to spawn 50-70 processes and have them send the emails while Nagios resumes service checks.

Please let me know if there’s any other information that would be helpful in figuring this out. Thanks.

Here’s our service notification command (it’s the stock out-of-the-box one):
define command{
command_name notify-service-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}

And here’s our config
log_file=/var/log/nagios3/nagios.log
cfg_dir=/etc/nagios-plugins/config
cfg_dir=/home/nagios/angel.conf.d/basic-config
cfg_dir=/home/nagios/angel.conf.d/ext-config
object_cache_file=/var/cache/nagios3/objects.cache
precached_object_file=/var/cache/nagios3/objects.precache
resource_file=/home/nagios/angel.conf.d/resource.cfg
status_file=/var/cache/nagios3/status.dat
status_update_interval=10
nagios_user=nagios
nagios_group=nagios
check_external_commands=1
command_check_interval=-1
command_file=/var/lib/nagios3/rw/nagios.cmd
external_command_buffer_slots=4096
lock_file=/var/run/nagios3/nagios3.pid
temp_file=/var/cache/nagios3/nagios.tmp
temp_path=/tmp
event_broker_options=-1
log_rotation_method=d
log_archive_path=/var/log/nagios3/archives
use_syslog=1
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_external_commands=1
log_passive_checks=1
service_inter_check_delay_method=s
max_service_check_spread=3
service_interleave_factor=s
host_inter_check_delay_method=s
max_host_check_spread=2
max_concurrent_checks=0
check_result_reaper_frequency=5
max_check_result_reaper_time=30
check_result_path=/var/lib/nagios3/spool/checkresults
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
sleep_time=0.25
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
perfdata_timeout=5
retain_state_information=1
state_retention_file=/var/lib/nagios3/retention.dat
retention_update_interval=60
use_retained_program_state=1
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=1
enable_event_handlers=1
process_performance_data=1
service_perfdata_file=/var/nagios/service-perfdata.dat
service_perfdata_file_template=$LASTSERVICECHECK$||$HOSTNAME$||$SERVICEDESC$||$SERVICEOUTPUT$||$SERVICEPERFDATA$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=60
service_perfdata_file_processing_command=process-service-perfdata-nagiosgraph
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
service_freshness_check_interval=60
check_host_freshness=0
host_freshness_check_interval=60
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
date_format=iso8601
p1_file=/usr/lib/nagios3/p1.pl
enable_embedded_perl=0
use_embedded_perl_implicitly=1
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
illegal_macro_output_chars=`~$&|'"<>
use_regexp_matching=0
use_true_regexp_matching=0
admin_email=root@localhost
admin_pager=pageroot@localhost
daemon_dumps_core=0
use_large_installation_tweaks=0
enable_environment_macros=1
debug_level=0
debug_verbosity=1
debug_file=/var/log/nagios3/nagios.debug
max_debug_file_size=1000000