| | | 1 | | using System.Collections.Concurrent; |
| | | 2 | | using System.Diagnostics; |
| | | 3 | | using SerilogLogger = Serilog.ILogger; |
| | | 4 | | |
| | | 5 | | namespace Kestrun.Health; |
| | | 6 | | |
/// <summary>
/// Executes registered <see cref="IProbe"/> instances and aggregates their results into a single report.
/// </summary>
internal static class HealthProbeRunner
{
    /// <summary>
    /// Executes the provided probes and builds an aggregated <see cref="HealthReport"/>.
    /// </summary>
    /// <param name="probes">The probes to execute.</param>
    /// <param name="tagFilter">
    /// Optional tag filter (case-insensitive). Entries are trimmed, blank entries are ignored and duplicates are
    /// removed. When at least one tag remains, only probes that advertise at least one matching tag are executed;
    /// otherwise every probe is executed.
    /// </param>
    /// <param name="perProbeTimeout">
    /// Maximum execution time per probe. Specify <see cref="TimeSpan.Zero"/> (or a negative value) to disable the
    /// per-probe timeout.
    /// </param>
    /// <param name="maxDegreeOfParallelism">
    /// Maximum number of probes executed concurrently. Values less than one disable throttling.
    /// </param>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <param name="ct">Cancellation token tied to the HTTP request.</param>
    /// <returns>The aggregated health report.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="probes"/>, <paramref name="tagFilter"/> or <paramref name="logger"/> is null.
    /// </exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled while probes were running.</exception>
    public static async Task<HealthReport> RunAsync(
        IReadOnlyList<IProbe> probes,
        IReadOnlyList<string> tagFilter,
        TimeSpan perProbeTimeout,
        int maxDegreeOfParallelism,
        SerilogLogger logger,
        CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(probes);
        ArgumentNullException.ThrowIfNull(tagFilter);
        ArgumentNullException.ThrowIfNull(logger);

        // Drop null/blank entries BEFORE trimming so a null element cannot cause a NullReferenceException.
        string[] normalizedTags = tagFilter.Count == 0
            ? []
            : [.. tagFilter.Where(static t => !string.IsNullOrWhiteSpace(t))
                .Select(static t => t.Trim())
                .Distinct(StringComparer.OrdinalIgnoreCase)];

        // A probe whose Tags collection is null advertises no tags and therefore never matches a filter.
        // (ExecuteProbeAsync already tolerates null Tags; the selection must agree with it.)
        var selected = normalizedTags.Length == 0
            ? probes
            : [.. probes.Where(p => p.Tags is { } tags
                && tags.Any(tag => normalizedTags.Contains(tag, StringComparer.OrdinalIgnoreCase)))];

        if (selected.Count == 0)
        {
            // Nothing to run: report Healthy with an empty probe list and an all-zero summary.
            return new HealthReport(
                ProbeStatus.Healthy,
                ProbeStatusLabels.STATUS_HEALTHY,
                DateTimeOffset.UtcNow,
                [],
                new HealthSummary(0, 0, 0, 0),
                normalizedTags);
        }

        var entries = new ConcurrentBag<HealthProbeEntry>();

        // The semaphore is only created when throttling is requested. Task.WhenAll completes only after every
        // probe task has finished, so the semaphore is never disposed while a probe still holds it.
        using var throttle = maxDegreeOfParallelism > 0 ? new SemaphoreSlim(maxDegreeOfParallelism) : null;

        var tasks = selected
            .Select(probe => ExecuteProbeAsync(probe, perProbeTimeout, throttle, logger, entries, ct))
            .ToArray();
        await Task.WhenAll(tasks).ConfigureAwait(false);

        // The bag has no defined order, so sort by name to give the report a stable layout.
        var ordered = entries.OrderBy(static e => e.Name, StringComparer.OrdinalIgnoreCase).ToArray();
        var summary = new HealthSummary(
            ordered.Length,
            ordered.Count(static e => e.Status == ProbeStatus.Healthy),
            ordered.Count(static e => e.Status == ProbeStatus.Degraded),
            ordered.Count(static e => e.Status == ProbeStatus.Unhealthy));

        // Determine overall status using explicit precedence: Unhealthy > Degraded > Healthy
        var overall = DetermineOverallStatus(ordered.Select(static e => e.Status));
        return new HealthReport(
            overall,
            overall.ToString().ToLowerInvariant(),
            DateTimeOffset.UtcNow,
            ordered,
            summary,
            normalizedTags);
    }

    /// <summary>
    /// Executes a single probe with timeout and error handling, adding the result to the provided sink.
    /// </summary>
    /// <param name="probe">The probe to execute.</param>
    /// <param name="perProbeTimeout">
    /// Maximum execution time for the probe. Specify <see cref="TimeSpan.Zero"/> (or a negative value) to disable
    /// the timeout.
    /// </param>
    /// <param name="throttle">Optional semaphore used to limit concurrency. May be null to disable throttling.</param>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <param name="sink">Concurrent sink to which the result is added.</param>
    /// <param name="ct">Cancellation token tied to the HTTP request.</param>
    /// <returns>A task representing the asynchronous operation.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled while waiting for the throttle or while the probe was running.
    /// </exception>
    private static async Task ExecuteProbeAsync(
        IProbe probe,
        TimeSpan perProbeTimeout,
        SemaphoreSlim? throttle,
        SerilogLogger logger,
        ConcurrentBag<HealthProbeEntry> sink,
        CancellationToken ct)
    {
        // Acquire outside the try block: if the wait is cancelled the slot was never taken,
        // so the finally below must not release it.
        if (throttle is not null)
        {
            await throttle.WaitAsync(ct).ConfigureAwait(false);
        }

        try
        {
            // The linked source is cancelled by the request token or, when enabled, by the per-probe timeout.
            using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            if (perProbeTimeout > TimeSpan.Zero)
            {
                linkedCts.CancelAfter(perProbeTimeout);
            }

            var sw = Stopwatch.StartNew();
            ProbeResult? result = null;
            string? error = null;

            try
            {
                result = await probe.CheckAsync(linkedCts.Token).ConfigureAwait(false)
                    ?? new ProbeResult(ProbeStatus.Unhealthy, "Probe returned null result");
            }
            catch (OperationCanceledException) when (!ct.IsCancellationRequested && linkedCts.IsCancellationRequested)
            {
                // Timeout Policy:
                // A per-probe timeout is considered a transient performance issue rather than a hard failure.
                // We classify these as Degraded to signal slowness / partial impairment while allowing
                // other Healthy probes to keep the overall status from immediately flipping to Unhealthy.
                // Rationale:
                //  * Many probes (HTTP, process, IO) may occasionally exceed a strict SLA due to load.
                //  * Treating every timeout as Unhealthy causes noisy flapping and obscures true faults.
                //  * Aggregation precedence still ensures multiple Degraded probes can surface an overall
                //    Degraded status, while a single critical failure (explicit Unhealthy) dominates.
                // If future scenarios require elevating timeouts to Unhealthy, this mapping can be made
                // configurable (e.g., via HealthProbeOptions). For now we keep policy simple & conservative.
                logger.Warning("Health probe {Probe} timed out after {Timeout}.", probe.Name, perProbeTimeout);

                // Format with the invariant culture so the report text does not vary with the server locale.
                var seconds = perProbeTimeout.TotalSeconds.ToString("N1", System.Globalization.CultureInfo.InvariantCulture);
                result = new ProbeResult(ProbeStatus.Degraded, $"Timed out after {seconds}s");
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // The request itself was cancelled: propagate instead of recording a result.
                throw;
            }
            catch (Exception ex)
            {
                // Any other failure (including a cancellation not caused by our tokens) marks the probe Unhealthy.
                logger.Error(ex, "Health probe {Probe} threw an exception.", probe.Name);
                error = ex.Message;
                result = new ProbeResult(ProbeStatus.Unhealthy, $"Exception: {ex.Message}");
            }
            finally
            {
                sw.Stop();
            }

            sink.Add(new HealthProbeEntry(
                probe.Name,
                probe.Tags ?? [],
                result.Status,
                result.Status.ToString().ToLowerInvariant(),
                result.Description,
                result.Data,
                sw.Elapsed,
                error));
        }
        finally
        {
            _ = throttle?.Release();
        }
    }

    /// <summary>
    /// Determines the overall health status using explicit precedence rules.
    /// Unhealthy takes precedence over all others, Degraded over Healthy.
    /// Returns Healthy if no statuses are provided.
    /// </summary>
    /// <param name="statuses">Collection of probe statuses.</param>
    /// <returns>The highest precedence status found.</returns>
    private static ProbeStatus DetermineOverallStatus(IEnumerable<ProbeStatus> statuses)
    {
        var foundDegraded = false;

        foreach (var status in statuses)
        {
            if (status == ProbeStatus.Unhealthy)
            {
                // Nothing outranks Unhealthy, so stop scanning.
                return ProbeStatus.Unhealthy;
            }

            if (status == ProbeStatus.Degraded)
            {
                foundDegraded = true;
            }
        }

        // An empty sequence leaves foundDegraded false and therefore yields Healthy.
        return foundDegraded ? ProbeStatus.Degraded : ProbeStatus.Healthy;
    }
}