diff options
author | Laurent Bercot <ska-skaware@skarnet.org> | 2021-06-18 15:47:13 +0000 |
---|---|---|
committer | Laurent Bercot <ska@appnovation.com> | 2021-06-18 15:47:13 +0000 |
commit | ffb9de6131bb7b6b8bb2a57d135160e177587002 (patch) | |
tree | e361a4d1c5bc78f6ac48267aecc3053685cb7af7 | |
parent | 19d34ffe466d8f67988e6fab2da5e2949d306583 (diff) | |
download | s6-ffb9de6131bb7b6b8bb2a57d135160e177587002.tar.xz |
Add lock-fd feature to s6-supervise
Signed-off-by: Laurent Bercot <ska@appnovation.com>
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | doc/servicedir.html | 23 | ||||
-rw-r--r-- | src/supervision/s6-supervise.c | 118 |
3 files changed, 118 insertions, 25 deletions
@@ -5,6 +5,8 @@ In 2.10.1.0 - Bugfixes. - s6-svwait now supports -r and -R options to wait for restarts. + - New service directory file: lock-fd, to take a lock before +starting a service, protecting against concurrent instances. In 2.10.0.3 diff --git a/doc/servicedir.html b/doc/servicedir.html index 0eadb4c..75f08f9 100644 --- a/doc/servicedir.html +++ b/doc/servicedir.html @@ -94,6 +94,7 @@ them afterwards. </li> daemon. That process must not "background itself": being run by a supervision tree already makes it a "background" task. </li> </ul> </li> + <li style="margin-bottom:1em"> An optional executable file named <tt>finish</tt>. Like <tt>run</tt>, it can be any executable file. This <em>finish script</em>, if present, is executed everytime the <tt>run</tt> script dies. Generally, its main @@ -118,14 +119,17 @@ the service dies, via a <tt>s6-svc -x</tt> command or a SIGHUP, then the next invocation of <tt>finish</tt> will (obviously) be the last, and it will run with stdin and stdout pointing to <tt>/dev/null</tt>. </li> </ul> </li> + <li style="margin-bottom:1em"> A directory named <tt>supervise</tt>. It is automatically created by <a href="s6-supervise.html">s6-supervise</a> if it does not exist. This is where <a href="s6-supervise.html">s6-supervise</a> stores its information. The directory must be writable. </li> + <li style="margin-bottom:1em"> An optional, empty, regular file named <tt>down</tt>. If such a file exists, the default state of the service is considered down, not up: s6-supervise will not automatically start it until it receives a <tt>s6-svc -u</tt> command. If no <tt>down</tt> file exists, the default state of the service is up. </li> + <li style="margin-bottom:1em"> An optional regular file named <tt>notification-fd</tt>. If such a file exists, it means that the service supports <a href="notifywhenup.html">readiness notification</a>. The file must only @@ -140,6 +144,20 @@ notification from the service and broadcast readiness, i.e. 
any <a href="s6-svlisten1.html">s6-svlisten1 -U</a> or <a href="s6-svlisten.html">s6-svlisten -U</a> processes will be triggered. </li> + + <li style="margin-bottom:1em"> An optional regular file named <tt>lock-fd</tt>. If such a file +exists, it must contain an unsigned integer, representing a file descriptor that +will be open in the service. The service <em>should not write to that descriptor</em> +and <em>should not close it</em>. In other words, it should totally ignore it. That +file descriptor holds a lock, that will naturally be released when the service dies. +The point of this feature is to prevent s6-supervise from accidentally spawning several +copies of the service in case something goes wrong: for instance, the service +backgrounds itself (which it shouldn't do when running under a supervision suite), or +s6-supervise is killed, restarted by s6-svscan, and attempts to start another copy of +the service while the first copy is still alive. If s6-supervise detects that the lock +is held when it tries to start the service, it will print a warning message and delay +the starting attempt for 60 seconds. </li> + <li style="margin-bottom:1em"> An optional regular file named <tt>timeout-kill</tt>. If such a file exists, it must only contain an unsigned integer <em>t</em>. If <em>t</em> is nonzero, then on receipt of an <a href="s6-svc.html">s6-svc -d</a> command, @@ -150,12 +168,14 @@ milliseconds, then it is sent a SIGKILL. If <tt>timeout-kill</tt> does not exist, or contains 0 or an invalid value, then the service is never forcibly killed (unless, of course, an <a href="s6-svc.html">s6-svc -k</a> command is sent). </li> + <li style="margin-bottom:1em"> An optional regular file named <tt>timeout-finish</tt>. If such a file exists, it must only contain an unsigned integer, which is the number of milliseconds after which the <tt>./finish</tt> script, if it exists, will be killed with a SIGKILL. 
The default is 5000: finish scripts are killed if they're still alive after 5 seconds. A value of 0 allows finish scripts to run forever. </li> + <li style="margin-bottom:1em"> An optional regular file named <tt>max-death-tally</tt>. If such a file exists, it must only contain an unsigned integer, which is the maximum number of service death events that s6-supervise will keep track of. If the service dies @@ -163,16 +183,19 @@ more than this number of times, the oldest events will be forgotten. Tracking death events is useful, for instance, when throttling service restarts. The value cannot be greater than 4096. If the file does not exist, a default of 100 is used. </li> + <li style="margin-bottom:1em"> An optional regular file named <tt>down-signal</tt>. If such a file exists, it must only contain the name or number of a signal, followed by a newline. This signal will be used to kill the supervised process when a <a href="s6-svc.html">s6-svc -d</a> or <a href="s6-svc.html">s6-svc -r</a> command is used. If the file does not exist, SIGTERM will be used by default. </li> + <li style="margin-bottom:1em"> A <a href="fifodir.html">fifodir</a> named <tt>event</tt>. It is automatically created by <a href="s6-supervise.html">s6-supervise</a> if it does not exist. <em>foo</em><tt>/event</tt> is the rendez-vous point for listeners, where <a href="s6-supervise.html">s6-supervise</a> will send notifications when the service goes up or down. </li> + <li style="margin-bottom:1em"> An optional service directory named <tt>log</tt>. 
If it exists and <em>foo</em> is in a <a href="scandir.html">scandir</a>, and <a href="s6-svscan.html">s6-svscan</a> runs on that scandir, then <em>two</em> services are monitored: <em>foo</em> and diff --git a/src/supervision/s6-supervise.c b/src/supervision/s6-supervise.c index bda8e52..1bde604 100644 --- a/src/supervision/s6-supervise.c +++ b/src/supervision/s6-supervise.c @@ -30,12 +30,9 @@ #define USAGE "s6-supervise dir" #define CTL S6_SUPERVISE_CTLDIR "/control" #define LCK S6_SUPERVISE_CTLDIR "/lock" +#define SLCK S6_SUPERVISE_CTLDIR "/service-lock" -#ifdef PATH_MAX -# define S6_PATH_MAX PATH_MAX -#else -# define S6_PATH_MAX 4096 -#endif +#define S6_PATH_MAX 512 typedef enum trans_e trans_t, *trans_t_ref ; enum trans_e @@ -238,51 +235,108 @@ static void trystart (void) { int p[2] ; int notifyp[2] = { -1, -1 } ; - unsigned int fd ; + int lfd = -1 ; + int locked = 1 ; + unsigned int notif, lk ; pid_t pid ; - if (pipecoe(p) < 0) + if (read_uint("lock-fd", &lk)) { - settimeout(60) ; - strerr_warnwu1sys("pipe (waiting 60 seconds)") ; - return ; + if (lk > INT_MAX) strerr_warnw2x("invalid ", "lock-fd") ; + else + { + struct stat st ; + lfd = open_write(SLCK) ; + if (lfd == -1) + { + settimeout(60) ; + strerr_warnwu4sys("open ", SLCK, " for writing", " (waiting 60 seconds)") ; + return ; + } + if (fstat(lfd, &st) == -1) + { + settimeout(60) ; + strerr_warnwu3sys("stat ", SLCK, " (waiting 60 seconds)") ; + goto errl ; + } + if (st.st_size) + { + ftruncate(lfd, 0) ; + strerr_warnw1x("a previous instance of the service wrote to the lock file!") ; + } + locked = fd_lock(lfd, 1, 1) ; + if (locked == -1) + { + settimeout(60) ; + strerr_warnwu3sys("lock ", SLCK, " (waiting 60 seconds)") ; + goto errl ; + } + if (!locked) + strerr_warnw1x("another instance of the service is already running, child will block") ; + } } - if (read_uint("notification-fd", &fd) && pipe(notifyp) < 0) + if (read_uint("notification-fd", &notif)) + { + if (notif > INT_MAX) strerr_warnw2x("invalid ",
"notification-fd") ; + else if (lfd >= 0 && notif == lk) + { + settimeout_infinite() ; + strerr_warnwu1x("start service: notification-fd and lock-fd are the same") ; + goto errl ; + } + else if (pipe(notifyp) < 0) + { + settimeout(60) ; + strerr_warnwu2sys("pipe", " (waiting 60 seconds)") ; + goto errl ; + } + } + if (pipecoe(p) < 0) { settimeout(60) ; - strerr_warnwu1sys("pipe (waiting 60 seconds)") ; - fd_close(p[1]) ; fd_close(p[0]) ; - return ; + strerr_warnwu2sys("pipe", " (waiting 60 seconds)") ; + goto errn ; } pid = fork() ; if (pid < 0) { settimeout(60) ; - strerr_warnwu1sys("fork (waiting 60 seconds)") ; - if (notifyp[1] >= 0) fd_close(notifyp[1]) ; - if (notifyp[0] >= 0) fd_close(notifyp[0]) ; - fd_close(p[1]) ; fd_close(p[0]) ; - return ; + strerr_warnwu2sys("fork", " (waiting 60 seconds)") ; + goto errp ; } else if (!pid) { char const *cargv[2] = { "run", 0 } ; - PROG = "s6-supervise (child)" ; + ((char *)PROG)[strlen(PROG)] = ' ' ; selfpipe_finish() ; sig_restore(SIGPIPE) ; if (notifyp[0] >= 0) close(notifyp[0]) ; close(p[0]) ; - if (notifyp[1] >= 0 && fd_move((int)fd, notifyp[1]) < 0) + if (notifyp[1] >= 0 && fd_move(notif, notifyp[1]) < 0) { failcoe(p[1]) ; strerr_diefu1sys(127, "move notification descriptor") ; } + if (lfd >= 0) + { + if (fd_move(lk, lfd) < 0) + { + failcoe(p[1]) ; + strerr_diefu1sys(127, "move lock descriptor") ; + } + if (!locked && fd_lock(lk, 1, 0) == -1) + { + failcoe(p[1]) ; + strerr_diefu2sys(127, "lock ", SLCK) ; + } + } setsid() ; execv("./run", (char *const *)cargv) ; failcoe(p[1]) ; strerr_dieexec(127, "run") ; } - if (notifyp[1] >= 0) fd_close(notifyp[1]) ; fd_close(p[1]) ; + if (notifyp[1] >= 0) fd_close(notifyp[1]) ; + if (lfd >= 0) fd_close(lfd) ; { char c ; switch (fd_read(p[0], &c, 1)) @@ -312,6 +366,19 @@ static void trystart (void) tain_addsec_g(&dontrespawnbefore, 1) ; announce() ; ftrigw_notifyb_nosig(S6_SUPERVISE_EVENTDIR, "u", 1) ; + return ; + + errp: + fd_close(p[1]) ; + fd_close(p[0]) ; + errn: + if 
(notifyp[1] >= 0) + { + fd_close(notifyp[1]) ; + fd_close(notifyp[0]) ; + } + errl: + if (lfd >= 0) fd_close(lfd) ; } static void downtimeout (void) @@ -541,11 +608,11 @@ static action_t_ref const actions[5][24] = static inline void handle_notifyfd (void) { - char buf[4096] ; + char buf[512] ; ssize_t r = 1 ; while (r > 0) { - r = sanitize_read(fd_read(notifyfd, buf, 4096)) ; + r = sanitize_read(fd_read(notifyfd, buf, 512)) ; if (r > 0 && memchr(buf, '\n', r)) { tain_wallclock_read(&status.readystamp) ; @@ -704,10 +771,11 @@ int main (int argc, char const *const *argv) { size_t proglen = strlen(PROG) ; size_t namelen = strlen(argv[1]) ; - char progname[proglen + namelen + 2] ; + char progname[proglen + namelen + 10] ; memcpy(progname, PROG, proglen) ; progname[proglen] = ' ' ; memcpy(progname + proglen + 1, argv[1], namelen + 1) ; + memcpy(progname + proglen + 2 + namelen, "(child)", 8) ; PROG = progname ; if (!fd_sanitize()) strerr_diefu1sys(111, "sanitize stdin and stdout") ; x[1].fd = control_init() ; |