| | | 1 | | using System.Diagnostics; |
| | | 2 | | using System.Text.Json; |
| | | 3 | | using Serilog.Events; |
| | | 4 | | using Serilog; |
| | | 5 | | |
| | | 6 | | namespace Kestrun.Health; |
| | | 7 | | |
| | | 8 | | /// <summary> |
| | | 9 | | /// A health probe that runs an external process and interprets its output. |
| | | 10 | | /// </summary> |
| | | 11 | | /// <remarks> |
| | | 12 | | /// Initializes a new instance of the <see cref="ProcessProbe"/> class. |
| | | 13 | | /// </remarks> |
| | | 14 | | /// <param name="name">The name of the probe.</param> |
| | | 15 | | /// <param name="tags">The tags associated with the probe.</param> |
| | | 16 | | /// <param name="fileName">The file name of the process to run.</param> |
| | | 17 | | /// <param name="args">The arguments to pass to the process.</param> |
| | | 18 | | /// <param name="timeout">The timeout for the process to complete.</param> |
| | | 19 | | /// <param name="logger">Optional logger; if null a contextual logger is created.</param> |
| | 3 | 20 | | public sealed class ProcessProbe(string name, string[] tags, string fileName, string args = "", TimeSpan? timeout = null |
| | | 21 | | { |
/// <summary>
/// The name of the probe, as supplied to the constructor.
/// </summary>
public string Name { get; } = name;
/// <summary>
/// The tags associated with the probe, as supplied to the constructor.
/// </summary>
public string[] Tags { get; } = tags;
/// <summary>
/// Logger used for diagnostics. The constructor-supplied logger is used when it is not null;
/// otherwise a logger is obtained from <c>Log.ForContext</c> with a <c>HealthProbe</c> property set to the probe name.
/// </summary>
public Serilog.ILogger Logger { get; init; } = logger ?? Log.ForContext("HealthProbe", name);
/// <summary>
/// The file name of the process to run.
/// </summary>
private readonly string _fileName = fileName;
/// <summary>
/// The arguments to pass to the process; a null value is stored as an empty string.
/// </summary>
private readonly string _args = args ?? "";
/// <summary>
/// The timeout for the process to complete; falls back to 10 seconds when no timeout is supplied.
/// </summary>
private readonly TimeSpan _timeout = timeout ?? TimeSpan.FromSeconds(10);
| | | 46 | | |
| | | 47 | | /// <summary> |
| | | 48 | | /// Executes the process and interprets its output according to the health probe contract. |
| | | 49 | | /// </summary> |
| | | 50 | | /// <param name="ct">The cancellation token.</param> |
| | | 51 | | /// <returns>A task representing the asynchronous operation, with a <see cref="ProbeResult"/> as the result.</return |
| | | 52 | | public async Task<ProbeResult> CheckAsync(CancellationToken ct = default) |
| | | 53 | | { |
| | 3 | 54 | | var sw = Stopwatch.StartNew(); |
| | | 55 | | try |
| | | 56 | | { |
| | 3 | 57 | | using var proc = CreateProcess(); |
| | 3 | 58 | | _ = proc.Start(); |
| | 3 | 59 | | if (Logger.IsEnabled(LogEventLevel.Debug)) |
| | | 60 | | { |
| | 3 | 61 | | Logger.Debug("ProcessProbe {Probe} started process {File} {Args} (PID={Pid}) with timeout {Timeout}", Na |
| | | 62 | | } |
| | 3 | 63 | | var (outText, errText, timedOut) = await RunProcessAsync(proc, ct).ConfigureAwait(false); |
| | 3 | 64 | | if (timedOut) |
| | | 65 | | { |
| | 1 | 66 | | sw.Stop(); |
| | 1 | 67 | | if (Logger.IsEnabled(LogEventLevel.Debug)) |
| | | 68 | | { |
| | 1 | 69 | | Logger.Debug("ProcessProbe {Probe} internal timeout after {Timeout} (duration={Duration}ms)", Name, |
| | | 70 | | } |
| | 1 | 71 | | var data = string.IsNullOrWhiteSpace(outText) |
| | 1 | 72 | | ? null |
| | 1 | 73 | | : new Dictionary<string, object> { ["stdout"] = outText.Length > 500 ? outText[..500] : outText }; |
| | 1 | 74 | | return new ProbeResult(ProbeStatus.Degraded, $"Timed out after {_timeout.TotalMilliseconds}ms", data); |
| | | 75 | | } |
| | 2 | 76 | | sw.Stop(); |
| | 2 | 77 | | if (TryParseJsonContract(outText, out var contractResult)) |
| | | 78 | | { |
| | 0 | 79 | | if (Logger.IsEnabled(LogEventLevel.Debug)) |
| | | 80 | | { |
| | 0 | 81 | | Logger.Debug("ProcessProbe {Probe} parsed JSON contract (exit={ExitCode}, duration={Duration}ms)", N |
| | | 82 | | } |
| | 0 | 83 | | return contractResult; |
| | | 84 | | } |
| | 2 | 85 | | var mapped = MapExitCode(proc.ExitCode, errText); |
| | 2 | 86 | | if (Logger.IsEnabled(LogEventLevel.Debug)) |
| | | 87 | | { |
| | 2 | 88 | | Logger.Debug("ProcessProbe {Probe} completed (exit={ExitCode}, status={Status}, duration={Duration}ms)", |
| | | 89 | | } |
| | 2 | 90 | | return mapped; |
| | | 91 | | } |
| | 0 | 92 | | catch (OperationCanceledException) when (ct.IsCancellationRequested) |
| | | 93 | | { |
| | | 94 | | // Surface caller/request cancellation without converting to a health status; runner decides final response. |
| | 0 | 95 | | throw; |
| | | 96 | | } |
| | 0 | 97 | | catch (TaskCanceledException ex) |
| | | 98 | | { |
| | | 99 | | // Internal timeout (cts.CancelAfter) -> degrade instead of unhealthy so transient slowness isn't reported a |
| | 0 | 100 | | sw.Stop(); |
| | 0 | 101 | | Logger.Warning(ex, "ProcessProbe {Probe} timed out after {Timeout} (duration={Duration}ms)", Name, _timeout, |
| | 0 | 102 | | return new ProbeResult(ProbeStatus.Degraded, $"Timed out: {ex.Message}"); |
| | | 103 | | } |
| | 0 | 104 | | catch (Exception ex) |
| | | 105 | | { |
| | 0 | 106 | | sw.Stop(); |
| | 0 | 107 | | Logger.Error(ex, "ProcessProbe {Probe} failed after {Duration}ms", Name, sw.ElapsedMilliseconds); |
| | 0 | 108 | | return new ProbeResult(ProbeStatus.Unhealthy, $"Exception: {ex.Message}"); |
| | | 109 | | } |
| | 3 | 110 | | } |
| | | 111 | | |
/// <summary>
/// Builds the (not yet started) process for this probe from the configured file name and arguments.
/// </summary>
/// <returns>
/// A <see cref="Process"/> with stdout and stderr redirected, shell execution disabled,
/// no window, and exit events enabled.
/// </returns>
private Process CreateProcess()
{
    var startInfo = new ProcessStartInfo
    {
        FileName = _fileName,
        Arguments = _args,
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    return new Process
    {
        StartInfo = startInfo,
        EnableRaisingEvents = true
    };
}
| | | 128 | | |
/// <summary>
/// Waits for an already-started process to finish (or time out) and captures its standard output and error.
/// </summary>
/// <param name="proc">The started process whose streams are redirected.</param>
/// <param name="ct">The caller's cancellation token.</param>
/// <returns>
/// A task whose result holds the captured stdout text, the captured stderr text, and a flag that is
/// true when the internal timeout elapsed before the process exited.
/// </returns>
private async Task<(string StdOut, string StdErr, bool TimedOut)> RunProcessAsync(Process proc, CancellationToken ct)
{
    // Linked source: cancelled either by the caller or when the probe timeout elapses.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    cts.CancelAfter(_timeout);

    // The reads observe only the caller token, so the internal timeout does not cancel them;
    // they are started before waiting for the process to exit.
    var stdOutTask = proc.StandardOutput.ReadToEndAsync(ct);
    var stdErrTask = proc.StandardError.ReadToEndAsync(ct);

    // Kill the process when the linked token fires (timeout or caller cancellation).
    using var reg = SetupProcessKillRegistration(proc, cts.Token);

    // ConfigureAwait(false) matches the other awaits in this class; no synchronization context is needed here.
    var timedOut = await WaitForProcessWithTimeout(proc, cts.Token, ct).ConfigureAwait(false);
    var (outText, errText) = await ReadProcessStreams(stdOutTask, stdErrTask, timedOut, ct).ConfigureAwait(false);

    return (outText, errText, timedOut);
}
| | | 149 | | |
/// <summary>
/// Registers a callback on the given token that kills the process (and its process tree) if it is still running.
/// </summary>
/// <param name="proc">The process to kill when the token is cancelled.</param>
/// <param name="cancellationToken">The token whose cancellation triggers the kill attempt.</param>
/// <returns>The registration; disposing it removes the callback.</returns>
private CancellationTokenRegistration SetupProcessKillRegistration(Process proc, CancellationToken cancellationToken)
{
    return cancellationToken.Register(KillIfStillRunning);

    // Best-effort kill: failures are logged, never thrown out of the cancellation callback.
    void KillIfStillRunning()
    {
        try
        {
            if (proc.HasExited)
            {
                return;
            }

            if (Logger.IsEnabled(LogEventLevel.Debug))
            {
                Logger.Debug("ProcessProbe {Probe} cancel/timeout -> killing PID {Pid}", Name, proc.Id);
            }

            proc.Kill(entireProcessTree: true);
        }
        catch (InvalidOperationException)
        {
            // The process has already exited; nothing left to kill.
        }
        catch (Exception ex)
        {
            Logger.Warning(ex, "ProcessProbe {Probe} exception while attempting to kill PID {Pid}", Name, proc.Id);
        }
    }
}
| | | 181 | | |
/// <summary>
/// Waits for the process to exit and reports whether the internal timeout fired first.
/// </summary>
/// <param name="proc">The process to wait for.</param>
/// <param name="timeoutToken">Token that is cancelled on timeout (or caller cancellation).</param>
/// <param name="callerToken">The original caller's cancellation token.</param>
/// <returns>True if the wait was cancelled while the caller token was not cancelled (timeout); false if the process exited normally.</returns>
/// <remarks>
/// When the caller token is cancelled, the resulting <see cref="OperationCanceledException"/> is not handled
/// here and propagates to the caller.
/// </remarks>
private static async Task<bool> WaitForProcessWithTimeout(Process proc, CancellationToken timeoutToken, CancellationToken callerToken)
{
    try
    {
        await proc.WaitForExitAsync(timeoutToken).ConfigureAwait(false);
        return false; // No timeout
    }
    catch (OperationCanceledException) when (!callerToken.IsCancellationRequested)
    {
        // Internal timeout fired. Wait for the real exit so the redirected streams close,
        // but do it asynchronously: a blocking WaitForExit() here would tie up a thread and
        // ignore caller cancellation for as long as the process takes to die.
        try
        {
            await proc.WaitForExitAsync(callerToken).ConfigureAwait(false);
        }
        catch (InvalidOperationException)
        {
            // ignore - already exited
        }
        return true; // Timed out
    }
}
| | | 210 | | |
/// <summary>
/// Awaits the pending stdout and stderr reads (stdout first, then stderr) and returns their text.
/// </summary>
/// <param name="stdOutTask">Task reading standard output.</param>
/// <param name="stdErrTask">Task reading standard error.</param>
/// <param name="timedOut">Whether the process timed out.</param>
/// <param name="callerToken">The original caller's cancellation token.</param>
/// <returns>
/// The stdout and stderr text. A read that was cancelled yields an empty string, but only when the
/// process timed out and the caller token is not cancelled; any other failure propagates.
/// </returns>
private static async Task<(string StdOut, string StdErr)> ReadProcessStreams(
    Task<string> stdOutTask,
    Task<string> stdErrTask,
    bool timedOut,
    CancellationToken callerToken)
{
    var capturedOut = await DrainAsync(stdOutTask).ConfigureAwait(false);
    var capturedErr = await DrainAsync(stdErrTask).ConfigureAwait(false);
    return (capturedOut, capturedErr);

    // Awaits one read; a cancellation is tolerated only for a timed-out run not cancelled by the caller.
    async Task<string> DrainAsync(Task<string> pendingRead)
    {
        try
        {
            return await pendingRead.ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (timedOut && !callerToken.IsCancellationRequested)
        {
            return string.Empty;
        }
    }
}
| | | 248 | | |
/// <summary>
/// Attempts to read a probe result from the process output, treated as a JSON document with a
/// "status" property and optional "description" and "data" properties.
/// </summary>
/// <param name="outText">The standard output text from the process.</param>
/// <param name="result">The parsed probe result when the method returns true; otherwise the default value.</param>
/// <returns>
/// True when the output parsed and contained a "status" property; false when the output is blank,
/// has no "status" property, or parsing/reading threw (the exception is logged at debug level).
/// </returns>
private bool TryParseJsonContract(string? outText, out ProbeResult result)
{
    // Assigned up front so every failure path can simply return false.
    result = default!;

    if (string.IsNullOrWhiteSpace(outText))
    {
        return false;
    }

    try
    {
        using var document = JsonDocument.Parse(outText);
        var root = document.RootElement;

        if (!root.TryGetProperty("status", out var statusElement))
        {
            return false;
        }

        // Labels are matched case-insensitively; anything unrecognised is reported as unhealthy.
        var label = statusElement.GetString()?.ToLowerInvariant();
        var status = label switch
        {
            ProbeStatusLabels.STATUS_HEALTHY => ProbeStatus.Healthy,
            ProbeStatusLabels.STATUS_DEGRADED => ProbeStatus.Degraded,
            ProbeStatusLabels.STATUS_UNHEALTHY => ProbeStatus.Unhealthy,
            _ => ProbeStatus.Unhealthy
        };

        string? description = null;
        if (root.TryGetProperty("description", out var descriptionElement))
        {
            description = descriptionElement.GetString();
        }

        var data = ParseJsonData(root);
        result = new ProbeResult(status, description, data);
        return true;
    }
    catch (Exception ex)
    {
        if (Logger.IsEnabled(LogEventLevel.Debug))
        {
            Logger.Debug(ex, "ProcessProbe {Probe} output not valid contract JSON", Name);
        }
        return false;
    }
}
| | | 295 | | |
/// <summary>
/// Copies the members of the root element's "data" object into a dictionary of strings.
/// </summary>
/// <param name="root">The root JSON element.</param>
/// <returns>
/// A dictionary mapping each member name to the member value's <c>ToString()</c> text, or null when
/// "data" is missing, is not a JSON object, or has no members.
/// </returns>
private static IReadOnlyDictionary<string, object>? ParseJsonData(JsonElement root)
{
    if (root.TryGetProperty("data", out var payload) && payload.ValueKind == JsonValueKind.Object)
    {
        var entries = new Dictionary<string, object>();
        foreach (var member in payload.EnumerateObject())
        {
            // Indexer assignment: a repeated member name overwrites the earlier entry.
            entries[member.Name] = member.Value.ToString();
        }

        if (entries.Count > 0)
        {
            return entries;
        }
    }

    return null;
}
| | | 315 | | |
/// <summary>
/// Maps the process exit code to a ProbeResult according to the health probe contract:
/// 0 is healthy, 1 is degraded, 2 and every other code are unhealthy.
/// </summary>
/// <param name="code">The exit code of the process.</param>
/// <param name="errText">The error text from the process output; blank text is treated as absent.</param>
/// <returns>
/// The mapped ProbeResult. The description is the trimmed error text when present; otherwise
/// "OK", "Degraded" or "Unhealthy" for codes 0-2. For any other code the description is
/// "Exit {code}", followed by ": " and the trimmed error text when present.
/// </returns>
private static ProbeResult MapExitCode(int code, string? errText)
{
    var trimmedErr = string.IsNullOrWhiteSpace(errText) ? null : errText.Trim();
    return code switch
    {
        0 => new ProbeResult(ProbeStatus.Healthy, trimmedErr ?? "OK"),
        1 => new ProbeResult(ProbeStatus.Degraded, trimmedErr ?? "Degraded"),
        2 => new ProbeResult(ProbeStatus.Unhealthy, trimmedErr ?? "Unhealthy"),
        // Build the message conditionally instead of trimming ':' and ' ' off the end afterwards,
        // which also removed trailing colons that belonged to the error text itself.
        _ => new ProbeResult(
            ProbeStatus.Unhealthy,
            trimmedErr is null ? $"Exit {code}" : $"Exit {code}: {trimmedErr}")
    };
}
| | | 333 | | } |