Good day,
I hate to have to post about such a common problem, but I am unable to determine the root cause.
Problem: It appears that all of my active service checks are stuck in pending. The status information indicates it was scheduled in the past but that time comes and goes without an update to the service status. The passive checks appear to be working fine.
Hardware/Version information:
nagios 3.2.0
RHEL 5.3 (on vmware)
Performance - the server is not working hard. There don’t appear to be any issues with CPU/memory/IO.
I will use a single host/service to provide specifics on my problem. This host is using check-host-alive for both the host check and the service check (I know, lame, but I am just troubleshooting for now).
nagios.cfg
log_file=/opt/nagios/var/nagios.log
cfg_dir=/opt/nagios/etc/objects/hosts
cfg_dir=/opt/nagios/etc/objects/rel_hosts
cfg_dir=/opt/nagios/etc/objects/services
cfg_dir=/opt/nagios/etc/objects/host_groups
cfg_dir=/opt/nagios/etc/objects/global
object_cache_file=/opt/nagios/var/objects.cache
precached_object_file=/opt/nagios/var/objects.precache
resource_file=/opt/nagios/etc/resource.cfg
status_file=/opt/nagios/var/status.dat
status_update_interval=10
nagios_user=ccadmin
nagios_group=ccadmin
check_external_commands=1
command_check_interval=-1
command_file=/opt/nagios/var/rw/nagios.cmd
external_command_buffer_slots=4096
lock_file=/opt/nagios/var/nagios.lock
temp_file=/opt/nagios/var/nagios.tmp
temp_path=/tmp
event_broker_options=-1
broker_module=/usr/local/pnp4nagios/bin/npcdmod.o config_file=/usr/local/pnp4nagios/etc/npcd.cfg
log_rotation_method=d
log_archive_path=/opt/nagios/var/archives
use_syslog=0
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_external_commands=1
log_passive_checks=1
service_inter_check_delay_method=s
max_service_check_spread=10 #ASH was 30
service_interleave_factor=s
host_inter_check_delay_method=s
max_host_check_spread=30
max_concurrent_checks=0
check_result_reaper_frequency=10
max_check_result_reaper_time=30
check_result_path=/opt/nagios/var/spool/checkresults
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
sleep_time=0.25
service_check_timeout=30 #ASH was 60
host_check_timeout=30 #ASH was 30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
perfdata_timeout=5
retain_state_information=1
state_retention_file=/opt/nagios/var/retention.dat
retention_update_interval=60
use_retained_program_state=1
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
check_for_updates=1
bare_update_check=0
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=0
enable_event_handlers=1
process_performance_data=1
service_perfdata_file=/usr/local/pnp4nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file
host_perfdata_file=/usr/local/pnp4nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file
obsess_over_services=1
ocsp_command=submit_check_result
obsess_over_hosts=1
ochp_command=submit_host_check
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
service_freshness_check_interval=3600
check_host_freshness=0
host_freshness_check_interval=60
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
date_format=us
p1_file=/opt/nagios/bin/p1.pl
enable_embedded_perl=1
use_embedded_perl_implicitly=1
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
illegal_macro_output_chars=`~$&|'"<>
use_regexp_matching=0
use_true_regexp_matching=0
admin_email=[email protected]
admin_pager=pageccadmin@localhost
daemon_dumps_core=0
use_large_installation_tweaks=1
enable_environment_macros=1
debug_level=-1
debug_verbosity=1
debug_file=/opt/nagios/var/nagios.debug
max_debug_file_size=1000000
host.cfg
define host{
host_name xx
address xx.bla.ORG.
alias xx.bla.org
notes Created by nagiosPollerWorker as parent
hostgroups GENERIC_PING
use generic-host
}
The hostgroup definition (GENERIC_PING) is omitted here, but it exists.
service.cfg
define service{
service_description GENERIC_PING
display_name Generic ping, typically used for parent hosts
hostgroup_name GENERIC_PING
check_command check-host-alive
use generic-service
}
template.cfg for generic-service
define service{
name generic-service ; The 'name' of this service template
action_url /pnp4nagios/graph?host=$HOSTNAME$&srv=$SERVICEDESC$
active_checks_enabled 1 ; Active service checks are enabled
passive_checks_enabled 1 ; Passive service checks are enabled/accepted
parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems)
obsess_over_service 1 ; We should obsess over this service (if necessary)
check_freshness 0 ; Default is to NOT check service 'freshness'
notifications_enabled 0 ; Service notifications are disabled
event_handler_enabled 1 ; Service event handler is enabled
flap_detection_enabled 0 ; Flap detection is disabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
is_volatile 0 ; The service is not volatile
check_period 24x7 ; The service can be checked at any time of the day
max_check_attempts 2 ; Check the service up to 2 times in total in order to determine its final (hard) state
normal_check_interval 10 ; Check the service every 10 minutes under normal conditions
retry_check_interval 1 ; Re-check the service every minute until a hard state can be determined
contact_groups admins ; Notifications get sent out to everyone in the 'admins' group
notification_options w,u,c,r ; Send notifications about warning, unknown, critical, and recovery events
notification_interval 60 ; Re-notify about service problems every hour
notification_period 24x7 ; Notifications can be sent out at any time
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}
$ grep -i xxx nagios.*
nagios.log:[1269838800] CURRENT HOST STATE: xxx;UP;HARD;1;
nagios.log:[1269838800] CURRENT SERVICE STATE: xxx;GENERIC_PING;OK;HARD;1;
general info from status.dat
[code]info {
created=1269888487
version=3.2.0
last_update_check=0
update_available=0
last_version=
new_version=
}
programstatus {
modified_host_attributes=0
modified_service_attributes=0
nagios_pid=15333
daemon_mode=1
program_start=1269883473
last_command_check=1269888456
last_log_rotation=0
enable_notifications=0
active_service_checks_enabled=1
passive_service_checks_enabled=1
active_host_checks_enabled=1
passive_host_checks_enabled=1
enable_event_handlers=1
obsess_over_services=1
obsess_over_hosts=1
check_service_freshness=1
check_host_freshness=0
enable_flap_detection=1
enable_failure_prediction=1
process_performance_data=1
global_host_event_handler=
global_service_event_handler=
next_comment_id=1
next_downtime_id=1
next_event_id=40
next_problem_id=39
next_notification_id=1
total_external_command_buffer_slots=4096
used_external_command_buffer_slots=12
high_external_command_buffer_slots=185
active_scheduled_host_check_stats=0,0,0
active_ondemand_host_check_stats=4,25,96
passive_host_check_stats=0,0,0
active_scheduled_service_check_stats=0,0,0
active_ondemand_service_check_stats=0,0,0
passive_service_check_stats=0,0,31
cached_host_check_stats=4,25,96
cached_service_check_stats=0,0,0
external_command_stats=154,464,1400
parallel_host_check_stats=0,0,0
serial_host_check_stats=0,0,0
}
[/code]
Please help…