Files
nt8-sdk/docs/architecture/circuit_breaker_implementation.md
Billy Valentine 92f3732b3d
Some checks failed
Build and Test / build (push) Has been cancelled
Phase 0 completion: NT8 SDK core framework with risk management and position sizing
2025-09-09 17:06:37 -04:00

49 KiB

Circuit Breaker Implementation Design

Overview

This document details the implementation of circuit breaker functionality in the Order Management System (OMS), which provides automatic protection against cascading failures by temporarily stopping order submissions when system instability is detected.

Circuit Breaker Architecture

Core Components

  1. Circuit Breaker: Core circuit breaker logic
  2. Circuit Breaker Configuration: Configuration for different circuit breaker settings
  3. Circuit Breaker State Tracker: Tracks circuit breaker state transitions
  4. Circuit Breaker Monitor: Monitors system health and triggers circuit breaker
  5. Circuit Breaker Reporter: Reports circuit breaker events and metrics

Circuit Breaker Models

Circuit Breaker Configuration

/// <summary>
/// Settings controlling when the circuit breaker trips, how it recovers, and how it reports.
/// </summary>
public record CircuitBreakerConfig : IConfiguration
{
    public string Id { get; set; } = "circuit-breaker-config";
    public string Name { get; set; } = "Circuit Breaker Configuration";
    public string Description { get; set; } = "Configuration for circuit breaker functionality";
    public bool IsActive { get; set; } = true;
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;
    public int Version { get; set; } = 1;

    /// <summary>
    /// Number of failures that opens the circuit.
    /// </summary>
    public int FailureThreshold { get; set; } = 5;

    /// <summary>
    /// Number of successes that closes the circuit again.
    /// </summary>
    public int SuccessThreshold { get; set; } = 3;

    /// <summary>
    /// How long the circuit stays open, in seconds.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 60;

    /// <summary>
    /// Length of the window over which failures are counted, in seconds (default: 5 minutes).
    /// </summary>
    public int WindowSizeSeconds { get; set; } = 300;

    /// <summary>
    /// Failure types that trigger the circuit breaker.
    /// </summary>
    public List<FailureType> FailureTypes { get; set; } = new()
    {
        FailureType.OrderRejection,
        FailureType.OrderTimeout,
        FailureType.VenueConnectionFailure,
        FailureType.VenueRateLimitExceeded,
        FailureType.SystemOverload,
        FailureType.RiskManagementViolation
    };

    /// <summary>
    /// When true, recovery goes through a half-open trial state instead of closing directly.
    /// </summary>
    public bool EnableHalfOpenState { get; set; } = true;

    /// <summary>
    /// When true, the circuit resets on its own once the timeout has elapsed.
    /// </summary>
    public bool EnableAutomaticReset { get; set; } = true;

    /// <summary>
    /// When true, circuit breaker events are logged.
    /// </summary>
    public bool LogEvents { get; set; } = true;

    /// <summary>
    /// When true, alerts are generated for circuit breaker events.
    /// </summary>
    public bool GenerateAlerts { get; set; } = true;

    /// <summary>
    /// Destinations for alerts (email addresses, webhook URLs, etc.).
    /// </summary>
    public List<string> AlertRecipients { get; set; } = new();

    /// <summary>
    /// Enables a separate circuit breaker per venue.
    /// </summary>
    public bool EnablePerVenueCircuitBreakers { get; set; } = true;

    /// <summary>
    /// Enables a separate circuit breaker per symbol.
    /// </summary>
    public bool EnablePerSymbolCircuitBreakers { get; set; } = false;

    /// <summary>
    /// Enables a separate circuit breaker per user.
    /// </summary>
    public bool EnablePerUserCircuitBreakers { get; set; } = false;

    /// <summary>
    /// Enables the global circuit breaker.
    /// </summary>
    public bool EnableGlobalCircuitBreaker { get; set; } = true;

    /// <summary>
    /// External endpoints to query for health checks.
    /// </summary>
    public List<string> HealthCheckEndpoints { get; set; } = new();

    /// <summary>
    /// Time between health checks, in seconds.
    /// </summary>
    public int HealthCheckIntervalSeconds { get; set; } = 30;

    /// <summary>
    /// Enables predictive circuit breaking driven by system metrics.
    /// </summary>
    public bool EnablePredictiveCircuitBreaking { get; set; } = false;

    /// <summary>
    /// Per-metric limits for predictive circuit breaking, keyed by metric name.
    /// Values are fractions (0.9 means 90%).
    /// </summary>
    public Dictionary<string, double> PredictiveThresholds { get; set; } = new()
    {
        ["cpu_usage"] = 0.9,     // 90% CPU usage
        ["memory_usage"] = 0.85, // 85% memory usage
        ["disk_io"] = 0.8,       // 80% disk I/O
        ["network_io"] = 0.75,   // 75% network I/O
        ["error_rate"] = 0.1     // 10% error rate
    };

    /// <summary>
    /// Enables manual control of the circuit breaker.
    /// </summary>
    public bool EnableManualControl { get; set; } = true;

    /// <summary>
    /// Hash of the manual override password.
    /// </summary>
    public string ManualOverridePasswordHash { get; set; }

    /// <summary>
    /// A fresh configuration instance populated with the default values above.
    /// </summary>
    public static CircuitBreakerConfig Default => new();
}

Circuit Breaker State

/// <summary>
/// The three states a circuit breaker can be in.
/// </summary>
public enum CircuitBreakerState
{
    /// <summary>
    /// Requests are allowed through.
    /// </summary>
    Closed = 0,

    /// <summary>
    /// Requests are rejected.
    /// </summary>
    Open = 1,

    /// <summary>
    /// A limited number of requests are allowed through for testing.
    /// </summary>
    HalfOpen = 2
}

/// <summary>
/// Holds the breaker's current state together with the failure and success history used to
/// decide transitions. Every public member synchronizes on a private lock.
/// </summary>
public class CircuitBreakerStateTracker
{
    // Timestamps older than this are discarded whenever a new failure or success is recorded.
    private static readonly TimeSpan MaxTimestampAge = TimeSpan.FromHours(1);

    private readonly object _lock = new object();
    private readonly Queue<DateTime> _failureTimestamps;
    private readonly Queue<DateTime> _successTimestamps;
    private CircuitBreakerState _state;
    private DateTime _lastStateChanged;
    private DateTime _openStartTime;
    private int _failureCount;
    private int _successCount;

    // Optional: transitions are logged only after SetLogger has supplied a logger.
    private ILogger _logger;

    /// <summary>
    /// Creates a tracker in the <see cref="CircuitBreakerState.Closed"/> state with empty history.
    /// </summary>
    public CircuitBreakerStateTracker()
    {
        _state = CircuitBreakerState.Closed;
        _lastStateChanged = DateTime.UtcNow;
        _failureTimestamps = new Queue<DateTime>();
        _successTimestamps = new Queue<DateTime>();
    }

    /// <summary>
    /// Current state.
    /// </summary>
    public CircuitBreakerState State
    {
        get
        {
            lock (_lock)
            {
                return _state;
            }
        }
    }

    /// <summary>
    /// Timestamp passed to the most recent transition (construction or <see cref="Clear"/> time otherwise).
    /// </summary>
    public DateTime LastStateChanged
    {
        get
        {
            lock (_lock)
            {
                return _lastStateChanged;
            }
        }
    }

    /// <summary>
    /// Timestamp of the most recent transition to <see cref="CircuitBreakerState.Open"/>;
    /// <see cref="DateTime.MinValue"/> if the circuit has not opened since construction or <see cref="Clear"/>.
    /// </summary>
    public DateTime OpenStartTime
    {
        get
        {
            lock (_lock)
            {
                return _openStartTime;
            }
        }
    }

    /// <summary>
    /// Failures recorded since the counters were last reset (on close or <see cref="Clear"/>).
    /// </summary>
    public int FailureCount
    {
        get
        {
            lock (_lock)
            {
                return _failureCount;
            }
        }
    }

    /// <summary>
    /// Successes recorded since the counters were last reset (on close or <see cref="Clear"/>).
    /// </summary>
    public int SuccessCount
    {
        get
        {
            lock (_lock)
            {
                return _successCount;
            }
        }
    }

    /// <summary>
    /// Supplies the logger used to report state transitions.
    /// </summary>
    /// <param name="logger">
    /// Any logger; typed as the non-generic <see cref="ILogger"/> so that an owning component can
    /// pass its own <c>ILogger&lt;T&gt;</c>. May be null to disable transition logging.
    /// </param>
    public void SetLogger(ILogger logger)
    {
        _logger = logger;
    }

    /// <summary>
    /// Records a failure that occurred at <paramref name="timestamp"/>.
    /// </summary>
    public void RecordFailure(DateTime timestamp)
    {
        lock (_lock)
        {
            _failureCount++;
            _failureTimestamps.Enqueue(timestamp);
            PruneOldTimestamps();
        }
    }

    /// <summary>
    /// Records a success that occurred at <paramref name="timestamp"/>.
    /// </summary>
    public void RecordSuccess(DateTime timestamp)
    {
        lock (_lock)
        {
            _successCount++;
            _successTimestamps.Enqueue(timestamp);
            PruneOldTimestamps();
        }
    }

    /// <summary>
    /// Moves to <paramref name="newState"/> and stamps the change with <paramref name="timestamp"/>.
    /// </summary>
    /// <remarks>
    /// Opening records the open start time. Entering half-open empties the success window.
    /// Closing resets all counters and history.
    /// </remarks>
    public void TransitionTo(CircuitBreakerState newState, DateTime timestamp)
    {
        lock (_lock)
        {
            var oldState = _state;
            _state = newState;
            _lastStateChanged = timestamp;

            switch (newState)
            {
                case CircuitBreakerState.Open:
                    _openStartTime = timestamp;
                    break;

                case CircuitBreakerState.HalfOpen:
                    // Successes recorded before the circuit opened must not count toward closing it:
                    // only successes observed during the trial period should be in the window.
                    _successTimestamps.Clear();
                    break;

                case CircuitBreakerState.Closed:
                    // Reset counters when closing
                    _failureCount = 0;
                    _successCount = 0;
                    _failureTimestamps.Clear();
                    _successTimestamps.Clear();
                    break;
            }

            _logger?.LogInformation("Circuit breaker transitioned from {OldState} to {NewState}", oldState, newState);
        }
    }

    /// <summary>
    /// Number of retained failures whose timestamp falls within <paramref name="timeWindow"/> of the current UTC time.
    /// </summary>
    public int GetFailureCount(TimeSpan timeWindow)
    {
        lock (_lock)
        {
            return CountSince(_failureTimestamps, DateTime.UtcNow.Subtract(timeWindow));
        }
    }

    /// <summary>
    /// Number of retained successes whose timestamp falls within <paramref name="timeWindow"/> of the current UTC time.
    /// </summary>
    public int GetSuccessCount(TimeSpan timeWindow)
    {
        lock (_lock)
        {
            return CountSince(_successTimestamps, DateTime.UtcNow.Subtract(timeWindow));
        }
    }

    /// <summary>
    /// Returns the tracker to its initial closed state and discards all history.
    /// </summary>
    public void Clear()
    {
        lock (_lock)
        {
            _state = CircuitBreakerState.Closed;
            _lastStateChanged = DateTime.UtcNow;
            _openStartTime = DateTime.MinValue;
            _failureCount = 0;
            _successCount = 0;
            _failureTimestamps.Clear();
            _successTimestamps.Clear();
        }
    }

    // Counts entries at or after the cutoff. Callers hold _lock.
    private static int CountSince(Queue<DateTime> timestamps, DateTime cutoffTime)
    {
        return timestamps.Count(t => t >= cutoffTime);
    }

    // Drops entries older than MaxTimestampAge from the head of both queues so they cannot
    // grow without bound. Callers hold _lock.
    private void PruneOldTimestamps()
    {
        var cutoffTime = DateTime.UtcNow.Subtract(MaxTimestampAge);

        while (_failureTimestamps.Count > 0 && _failureTimestamps.Peek() < cutoffTime)
        {
            _failureTimestamps.Dequeue();
        }

        while (_successTimestamps.Count > 0 && _successTimestamps.Peek() < cutoffTime)
        {
            _successTimestamps.Dequeue();
        }
    }
}

Circuit Breaker Event

/// <summary>
/// Describes a single circuit breaker state change.
/// </summary>
public record CircuitBreakerEvent
{
    /// <summary>
    /// Unique event identifier; defaults to a newly generated GUID string.
    /// </summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>
    /// When the event occurred; defaults to the UTC time at construction.
    /// </summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;

    /// <summary>
    /// Scope of the affected circuit breaker (global, venue, symbol, user).
    /// </summary>
    public CircuitBreakerScope Scope { get; set; }

    /// <summary>
    /// Identifier within the scope (venue ID, symbol, user ID, etc.).
    /// </summary>
    public string ScopeId { get; set; }

    /// <summary>
    /// State before the change.
    /// </summary>
    public CircuitBreakerState PreviousState { get; set; }

    /// <summary>
    /// State after the change.
    /// </summary>
    public CircuitBreakerState NewState { get; set; }

    /// <summary>
    /// Why the state changed.
    /// </summary>
    public string Reason { get; set; }

    /// <summary>
    /// The failure associated with the change, if any.
    /// </summary>
    public FailureRecord Failure { get; set; }

    /// <summary>
    /// True when the change was triggered manually.
    /// </summary>
    public bool IsManual { get; set; }

    /// <summary>
    /// Free-form additional data; starts empty.
    /// </summary>
    public Dictionary<string, object> Metadata { get; set; } = new();
}

/// <summary>
/// What a circuit breaker applies to.
/// </summary>
public enum CircuitBreakerScope
{
    /// <summary>
    /// The whole system.
    /// </summary>
    Global = 0,

    /// <summary>
    /// A single venue.
    /// </summary>
    Venue = 1,

    /// <summary>
    /// A single symbol.
    /// </summary>
    Symbol = 2,

    /// <summary>
    /// A single user.
    /// </summary>
    User = 3
}

/// <summary>
/// Describes a single failure reported to the circuit breaker.
/// </summary>
public record FailureRecord
{
    /// <summary>
    /// Unique identifier for this failure; defaults to a newly generated GUID string.
    /// </summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>
    /// Timestamp of failure; defaults to the UTC time at construction.
    /// </summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;

    /// <summary>
    /// Type of failure
    /// </summary>
    public FailureType Type { get; set; }

    /// <summary>
    /// Error message
    /// </summary>
    public string Message { get; set; }

    /// <summary>
    /// Stack trace (if available)
    /// </summary>
    public string StackTrace { get; set; }

    /// <summary>
    /// Order associated with failure (if applicable)
    /// </summary>
    public OrderRequest Order { get; set; }

    /// <summary>
    /// Venue associated with failure (if applicable)
    /// </summary>
    public string VenueId { get; set; }

    /// <summary>
    /// Symbol associated with failure (if applicable)
    /// </summary>
    public string Symbol { get; set; }

    /// <summary>
    /// User associated with failure (if applicable)
    /// </summary>
    public string UserId { get; set; }

    /// <summary>
    /// Exception (if available)
    /// </summary>
    public Exception Exception { get; set; }

    /// <summary>
    /// Additional metadata describing the failure; starts empty. The health-check path
    /// populates it with the metric name, its current value, and the threshold it exceeded.
    /// </summary>
    public Dictionary<string, object> Metadata { get; set; } = new Dictionary<string, object>();
}

/// <summary>
/// Categories of failure that can be reported to the circuit breaker.
/// </summary>
public enum FailureType
{
    OrderRejection = 0,
    OrderTimeout = 1,
    VenueConnectionFailure = 2,
    VenueRateLimitExceeded = 3,
    SystemOverload = 4,
    RiskManagementViolation = 5,
    MarketDataUnavailable = 6,
    InvalidOrderParameters = 7,
    InsufficientFunds = 8,
    PositionLimitExceeded = 9,
    Other = 10
}

Circuit Breaker Implementation

Circuit Breaker

/// <summary>
/// Implements circuit breaker functionality: counts reported failures, opens the circuit when
/// the configured threshold is reached, and recovers (optionally through a half-open trial
/// state) after the configured timeout. All public members synchronize on a private lock.
/// </summary>
public class CircuitBreaker : IDisposable
{
    // This implementation only models the global scope.
    private const string GlobalScopeId = "global";

    // Failure records older than this are discarded when a new failure is recorded.
    private static readonly TimeSpan FailureRetention = TimeSpan.FromHours(1);

    private readonly ILogger<CircuitBreaker> _logger;
    private readonly CircuitBreakerConfig _config;
    private readonly CircuitBreakerStateTracker _stateTracker;
    private readonly List<FailureRecord> _failures;
    private readonly List<CircuitBreakerEvent> _events;
    private readonly object _lock = new object();
    private readonly Timer _resetTimer;
    private readonly Timer _healthCheckTimer;
    private readonly IHealthChecker _healthChecker;

    /// <summary>
    /// Creates a closed circuit breaker and starts the periodic health-check timer.
    /// </summary>
    /// <param name="logger">Logger for breaker events; required.</param>
    /// <param name="healthChecker">Source of health metrics for predictive breaking; required.</param>
    /// <param name="config">Settings; <see cref="CircuitBreakerConfig.Default"/> is used when null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="logger"/> or <paramref name="healthChecker"/> is null.
    /// </exception>
    public CircuitBreaker(
        ILogger<CircuitBreaker> logger,
        IHealthChecker healthChecker,
        CircuitBreakerConfig config = null)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _healthChecker = healthChecker ?? throw new ArgumentNullException(nameof(healthChecker));
        _config = config ?? CircuitBreakerConfig.Default;

        _stateTracker = new CircuitBreakerStateTracker();
        _stateTracker.SetLogger(logger);
        _failures = new List<FailureRecord>();
        _events = new List<CircuitBreakerEvent>();

        // The reset timer stays disarmed until the circuit opens; the health-check timer runs periodically.
        _resetTimer = new Timer(ResetCircuit, null, Timeout.Infinite, Timeout.Infinite);
        var healthCheckInterval = TimeSpan.FromSeconds(_config.HealthCheckIntervalSeconds);
        _healthCheckTimer = new Timer(PerformHealthCheck, null, healthCheckInterval, healthCheckInterval);
    }

    /// <summary>
    /// Check if requests are allowed through the circuit breaker.
    /// </summary>
    /// <returns>
    /// The action to take together with the state the breaker is in when this call returns.
    /// If the open timeout has already elapsed, the recovery transition is performed here and
    /// the result reflects the new state.
    /// </returns>
    public CircuitBreakerResult CheckCircuit()
    {
        lock (_lock)
        {
            var now = DateTime.UtcNow;
            var state = _stateTracker.State;

            switch (state)
            {
                case CircuitBreakerState.Closed:
                    return new CircuitBreakerResult(CircuitBreakerAction.Allow, state);

                case CircuitBreakerState.Open:
                    var timeoutElapsed =
                        now.Subtract(_stateTracker.OpenStartTime).TotalSeconds >= _config.TimeoutSeconds;
                    if (!_config.EnableAutomaticReset || !timeoutElapsed)
                    {
                        return new CircuitBreakerResult(CircuitBreakerAction.Reject, state);
                    }

                    // The reset timer has not fired yet; perform the same recovery transition here
                    // and report the state the breaker is in now, not the stale Open state.
                    var recoveryEvent = BeginRecovery(now);
                    if (recoveryEvent.NewState == CircuitBreakerState.HalfOpen)
                    {
                        _logger.LogInformation("Circuit breaker transitioning to half-open state after timeout");
                        PublishEvent(recoveryEvent);
                        return new CircuitBreakerResult(CircuitBreakerAction.Test, recoveryEvent.NewState);
                    }

                    _logger.LogInformation("Circuit breaker closing after timeout");
                    PublishEvent(recoveryEvent);
                    return new CircuitBreakerResult(CircuitBreakerAction.Allow, recoveryEvent.NewState);

                case CircuitBreakerState.HalfOpen:
                    // Allow limited requests for testing
                    return new CircuitBreakerResult(CircuitBreakerAction.Test, state);

                default:
                    _logger.LogWarning("Unknown circuit breaker state: {State}", state);
                    return new CircuitBreakerResult(CircuitBreakerAction.Reject, state);
            }
        }
    }

    /// <summary>
    /// Record a failure and open the circuit if warranted.
    /// </summary>
    /// <remarks>
    /// Every failure is retained for diagnostics, but only types listed in
    /// <see cref="CircuitBreakerConfig.FailureTypes"/> count toward opening the circuit (a null
    /// list counts everything). The circuit opens when the windowed count reaches the failure
    /// threshold, or on any counted failure while half-open. Failures arriving while the
    /// circuit is already open are counted but do not re-open it.
    /// </remarks>
    /// <param name="failure">The failure to record; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="failure"/> is null.</exception>
    public void RecordFailure(FailureRecord failure)
    {
        if (failure == null) throw new ArgumentNullException(nameof(failure));

        lock (_lock)
        {
            _failures.Add(failure);
            PruneOldFailures();

            if (!CountsTowardThreshold(failure.Type))
            {
                return;
            }

            var previousState = _stateTracker.State;
            _stateTracker.RecordFailure(failure.Timestamp);

            if (previousState == CircuitBreakerState.Open)
            {
                // Already rejecting requests: re-opening would emit duplicate events and alerts
                // and keep pushing the automatic reset further into the future.
                return;
            }

            var failureCount = _stateTracker.GetFailureCount(TimeSpan.FromSeconds(_config.WindowSizeSeconds));
            var trialFailed = previousState == CircuitBreakerState.HalfOpen;
            if (!trialFailed && failureCount < _config.FailureThreshold)
            {
                return;
            }

            var now = DateTime.UtcNow;
            _stateTracker.TransitionTo(CircuitBreakerState.Open, now);
            ArmResetTimer();

            var reason = trialFailed && failureCount < _config.FailureThreshold
                ? "Failure during half-open trial, reopening circuit"
                : $"Failure threshold exceeded: {failureCount} failures in {_config.WindowSizeSeconds} seconds";
            var circuitEvent = CreateEvent(now, previousState, CircuitBreakerState.Open, reason);
            circuitEvent.Failure = failure;

            _logger.LogCritical("Circuit breaker opened: {Reason}", circuitEvent.Reason);
            PublishEvent(circuitEvent);
        }
    }

    /// <summary>
    /// Record a success; while half-open, closes the circuit once the windowed success count
    /// reaches <see cref="CircuitBreakerConfig.SuccessThreshold"/>.
    /// </summary>
    /// <param name="timestamp">When the success occurred.</param>
    public void RecordSuccess(DateTime timestamp)
    {
        lock (_lock)
        {
            var state = _stateTracker.State;
            _stateTracker.RecordSuccess(timestamp);

            // Successes only drive a transition during the half-open trial.
            if (state != CircuitBreakerState.HalfOpen)
            {
                return;
            }

            var successCount = _stateTracker.GetSuccessCount(TimeSpan.FromSeconds(_config.WindowSizeSeconds));
            if (successCount < _config.SuccessThreshold)
            {
                return;
            }

            _stateTracker.TransitionTo(CircuitBreakerState.Closed, timestamp);

            var circuitEvent = CreateEvent(
                timestamp,
                CircuitBreakerState.HalfOpen,
                CircuitBreakerState.Closed,
                $"Success threshold exceeded: {successCount} successes");

            _logger.LogInformation("Circuit breaker closed: {Reason}", circuitEvent.Reason);
            PublishEvent(circuitEvent);
        }
    }

    /// <summary>
    /// Manually open the circuit breaker.
    /// </summary>
    /// <param name="reason">Reason recorded on the event and in the log.</param>
    /// <param name="userId">Optional operator identifier; stored in the event metadata under "user_id".</param>
    public void OpenCircuit(string reason, string userId = null)
    {
        lock (_lock)
        {
            var now = DateTime.UtcNow;
            var previousState = _stateTracker.State;

            _stateTracker.TransitionTo(CircuitBreakerState.Open, now);
            ArmResetTimer();

            var circuitEvent = CreateManualEvent(now, previousState, CircuitBreakerState.Open, reason, userId);

            _logger.LogWarning("Circuit breaker manually opened: {Reason}", reason);
            PublishEvent(circuitEvent);
        }
    }

    /// <summary>
    /// Manually close the circuit breaker and cancel any pending automatic reset.
    /// </summary>
    /// <param name="reason">Reason recorded on the event and in the log.</param>
    /// <param name="userId">Optional operator identifier; stored in the event metadata under "user_id".</param>
    public void CloseCircuit(string reason, string userId = null)
    {
        lock (_lock)
        {
            var now = DateTime.UtcNow;
            var previousState = _stateTracker.State;

            _stateTracker.TransitionTo(CircuitBreakerState.Closed, now);

            // Stop reset timer
            _resetTimer.Change(Timeout.Infinite, Timeout.Infinite);

            var circuitEvent = CreateManualEvent(now, previousState, CircuitBreakerState.Closed, reason, userId);

            _logger.LogInformation("Circuit breaker manually closed: {Reason}", reason);
            PublishEvent(circuitEvent);
        }
    }

    /// <summary>
    /// Get current circuit breaker state.
    /// </summary>
    public CircuitBreakerState GetState()
    {
        lock (_lock)
        {
            return _stateTracker.State;
        }
    }

    /// <summary>
    /// Get a snapshot of circuit breaker metrics.
    /// </summary>
    public CircuitBreakerMetrics GetMetrics()
    {
        lock (_lock)
        {
            var failureCount = _stateTracker.FailureCount;
            var successCount = _stateTracker.SuccessCount;
            var lastStateChanged = _stateTracker.LastStateChanged;

            return new CircuitBreakerMetrics
            {
                State = _stateTracker.State,
                LastStateChanged = lastStateChanged,
                OpenStartTime = _stateTracker.OpenStartTime,
                FailureCount = failureCount,
                SuccessCount = successCount,
                TotalEvents = _events.Count,
                TotalFailures = _failures.Count,
                // Guarding on failureCount also prevents dividing by zero when nothing has been recorded.
                FailureRate = failureCount > 0
                    ? (double)failureCount / (failureCount + successCount)
                    : 0,
                Uptime = DateTime.UtcNow.Subtract(lastStateChanged).TotalSeconds
            };
        }
    }

    /// <summary>
    /// Get circuit breaker events whose timestamp falls within <paramref name="timeWindow"/> of now.
    /// </summary>
    public List<CircuitBreakerEvent> GetRecentEvents(TimeSpan timeWindow)
    {
        lock (_lock)
        {
            var cutoffTime = DateTime.UtcNow.Subtract(timeWindow);
            return _events.Where(e => e.Timestamp >= cutoffTime).ToList();
        }
    }

    /// <summary>
    /// Get retained failures whose timestamp falls within <paramref name="timeWindow"/> of now.
    /// </summary>
    public List<FailureRecord> GetRecentFailures(TimeSpan timeWindow)
    {
        lock (_lock)
        {
            var cutoffTime = DateTime.UtcNow.Subtract(timeWindow);
            return _failures.Where(f => f.Timestamp >= cutoffTime).ToList();
        }
    }

    /// <summary>
    /// Reset circuit breaker state: clears the tracker, the failure and event history, and
    /// disarms the reset timer.
    /// </summary>
    public void Reset()
    {
        lock (_lock)
        {
            _stateTracker.Clear();
            _failures.Clear();
            _events.Clear();
            _resetTimer.Change(Timeout.Infinite, Timeout.Infinite);

            _logger.LogInformation("Circuit breaker state reset");
        }
    }

    /// <summary>
    /// Disposes the reset and health-check timers.
    /// </summary>
    public void Dispose()
    {
        _resetTimer?.Dispose();
        _healthCheckTimer?.Dispose();
    }

    // Reset-timer callback: leaves the Open state once the timeout has expired.
    private void ResetCircuit(object state)
    {
        lock (_lock)
        {
            // The circuit may have been closed or moved to half-open since the timer was armed.
            if (_stateTracker.State != CircuitBreakerState.Open)
            {
                return;
            }

            var circuitEvent = BeginRecovery(DateTime.UtcNow);
            if (circuitEvent.NewState == CircuitBreakerState.HalfOpen)
            {
                _logger.LogInformation("Circuit breaker transitioning to half-open after timeout");
            }
            else
            {
                _logger.LogInformation("Circuit breaker closing after timeout");
            }

            PublishEvent(circuitEvent);
        }
    }

    // Transitions out of Open (to HalfOpen when enabled, otherwise straight to Closed) and
    // returns the describing event. The caller logs and publishes it. Callers hold _lock.
    private CircuitBreakerEvent BeginRecovery(DateTime now)
    {
        if (_config.EnableHalfOpenState)
        {
            _stateTracker.TransitionTo(CircuitBreakerState.HalfOpen, now);
            return CreateEvent(
                now,
                CircuitBreakerState.Open,
                CircuitBreakerState.HalfOpen,
                "Circuit breaker timeout expired, transitioning to half-open");
        }

        _stateTracker.TransitionTo(CircuitBreakerState.Closed, now);
        return CreateEvent(
            now,
            CircuitBreakerState.Open,
            CircuitBreakerState.Closed,
            "Circuit breaker timeout expired, closing circuit");
    }

    // Health-check timer callback. TimerCallback must return void, so the asynchronous work is
    // delegated to a Task-returning method that handles its own exceptions.
    private void PerformHealthCheck(object state)
    {
        _ = PerformHealthCheckAsync();
    }

    // Compares reported health metrics with the predictive thresholds and records at most one
    // SystemOverload failure per check. Never throws: any exception is logged.
    private async Task PerformHealthCheckAsync()
    {
        try
        {
            if (!_config.EnablePredictiveCircuitBreaking) return;

            var healthStatus = await _healthChecker.CheckHealthAsync();

            foreach (var kvp in _config.PredictiveThresholds)
            {
                var metricName = kvp.Key;
                var threshold = kvp.Value;

                // Metrics the health checker did not report are skipped.
                if (!healthStatus.Metrics.TryGetValue(metricName, out var currentValue))
                {
                    continue;
                }

                if (currentValue > threshold)
                {
                    // Record failure to trigger circuit breaker
                    var failure = new FailureRecord
                    {
                        Timestamp = DateTime.UtcNow,
                        Type = FailureType.SystemOverload,
                        Message = $"Health metric {metricName} exceeded threshold: {currentValue:F2} > {threshold:F2}",
                        Metadata = new Dictionary<string, object>
                        {
                            ["metric_name"] = metricName,
                            ["current_value"] = currentValue,
                            ["threshold"] = threshold
                        }
                    };

                    RecordFailure(failure);
                    break; // Only record one failure per health check
                }
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error performing health check");
        }
    }

    // True when failures of this type should count toward opening the circuit.
    private bool CountsTowardThreshold(FailureType type)
    {
        var triggeringTypes = _config.FailureTypes;
        return triggeringTypes == null || triggeringTypes.Contains(type);
    }

    // Schedules the one-shot automatic reset, if enabled. Callers hold _lock.
    private void ArmResetTimer()
    {
        if (_config.EnableAutomaticReset)
        {
            _resetTimer.Change(TimeSpan.FromSeconds(_config.TimeoutSeconds), Timeout.InfiniteTimeSpan);
        }
    }

    // Builds a global-scope event for an automatic transition.
    private static CircuitBreakerEvent CreateEvent(
        DateTime timestamp,
        CircuitBreakerState previousState,
        CircuitBreakerState newState,
        string reason)
    {
        return new CircuitBreakerEvent
        {
            Timestamp = timestamp,
            Scope = CircuitBreakerScope.Global,
            ScopeId = GlobalScopeId,
            PreviousState = previousState,
            NewState = newState,
            Reason = reason
        };
    }

    // Builds a global-scope event for an operator-initiated transition, keeping the operator
    // identifier (when supplied) in the event metadata.
    private static CircuitBreakerEvent CreateManualEvent(
        DateTime timestamp,
        CircuitBreakerState previousState,
        CircuitBreakerState newState,
        string reason,
        string userId)
    {
        var circuitEvent = CreateEvent(timestamp, previousState, newState, reason);
        circuitEvent.IsManual = true;
        if (userId != null)
        {
            circuitEvent.Metadata["user_id"] = userId;
        }

        return circuitEvent;
    }

    // Appends the event to the history and raises an alert when configured. Callers hold _lock.
    private void PublishEvent(CircuitBreakerEvent circuitEvent)
    {
        _events.Add(circuitEvent);

        if (_config.GenerateAlerts)
        {
            GenerateAlert(circuitEvent);
        }
    }

    // Removes failure records older than the retention period so the list cannot grow without bound.
    private void PruneOldFailures()
    {
        var cutoffTime = DateTime.UtcNow.Subtract(FailureRetention);
        _failures.RemoveAll(f => f.Timestamp < cutoffTime);
    }

    // Generates an alert for a circuit breaker event.
    private void GenerateAlert(CircuitBreakerEvent circuitEvent)
    {
        // In a real implementation, this would send alerts to configured recipients
        _logger.LogInformation("Circuit breaker alert: {NewState} - {Reason}",
            circuitEvent.NewState, circuitEvent.Reason);
    }
}

Circuit Breaker Results

/// <summary>
/// Outcome of asking the circuit breaker whether a request may proceed.
/// </summary>
public record CircuitBreakerResult
{
    /// <summary>
    /// Creates a result from its three components.
    /// </summary>
    /// <param name="action">Action to take for the request.</param>
    /// <param name="state">Circuit breaker state to report.</param>
    /// <param name="errorMessage">Optional error message; null when omitted.</param>
    public CircuitBreakerResult(CircuitBreakerAction action, CircuitBreakerState state, string errorMessage = null)
        => (Action, State, ErrorMessage) = (action, state, errorMessage);

    /// <summary>
    /// What the caller should do with the request.
    /// </summary>
    public CircuitBreakerAction Action { get; set; }

    /// <summary>
    /// Circuit breaker state reported with the decision.
    /// </summary>
    public CircuitBreakerState State { get; set; }

    /// <summary>
    /// Error message, when there is one.
    /// </summary>
    public string ErrorMessage { get; set; }
}

/// <summary>
/// Actions a circuit breaker check can prescribe for a request
/// </summary>
public enum CircuitBreakerAction
{
    /// <summary>
    /// Let the request through
    /// </summary>
    Allow = 0,

    /// <summary>
    /// Turn the request away
    /// </summary>
    Reject = 1,

    /// <summary>
    /// Let a limited number of requests through as a probe (half-open state)
    /// </summary>
    Test = 2
}

Circuit Breaker Metrics

/// <summary>
/// Snapshot of circuit breaker metrics, as populated by CircuitBreaker.GetMetrics
/// </summary>
public record CircuitBreakerMetrics
{
    /// <summary>State reported by the state tracker when the snapshot was taken</summary>
    public CircuitBreakerState State { get; set; }
    /// <summary>Time of the last state change reported by the state tracker</summary>
    public DateTime LastStateChanged { get; set; }
    /// <summary>Open start time reported by the state tracker</summary>
    public DateTime OpenStartTime { get; set; }
    /// <summary>Failure count held by the state tracker</summary>
    public int FailureCount { get; set; }
    /// <summary>Success count held by the state tracker</summary>
    public int SuccessCount { get; set; }
    /// <summary>Number of entries in the event history</summary>
    public int TotalEvents { get; set; }
    /// <summary>Number of entries in the recorded failure list</summary>
    public int TotalFailures { get; set; }
    /// <summary>FailureCount / (FailureCount + SuccessCount), or 0 when FailureCount is 0</summary>
    public double FailureRate { get; set; }
    /// <summary>Seconds elapsed between the last state change and the snapshot</summary>
    public double Uptime { get; set; } // Seconds since last state change
    /// <summary>Defaults to the UTC time at which the record is constructed</summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;
}

Health Checker Interface

/// <summary>
/// Interface for system health checking
/// </summary>
public interface IHealthChecker
{
    /// <summary>
    /// Check system health
    /// </summary>
    /// <returns>A task producing the resulting <see cref="HealthStatus"/></returns>
    Task<HealthStatus> CheckHealthAsync();
    
    /// <summary>
    /// Get health check history
    /// </summary>
    /// <param name="timeWindow">
    /// Time window the history should cover.
    /// NOTE(review): presumably measured back from the current time - confirm with the implementation
    /// </param>
    /// <returns>A task producing the list of <see cref="HealthStatus"/> entries</returns>
    Task<List<HealthStatus>> GetHealthHistoryAsync(TimeSpan timeWindow);
}

/// <summary>
/// Result of a single health check
/// </summary>
public record HealthStatus
{
    /// <summary>Defaults to the UTC time at which the record is constructed</summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;

    /// <summary>Whether the check found the system healthy; false until set</summary>
    public bool IsHealthy { get; set; }

    /// <summary>Named numeric measurements; starts as an empty dictionary</summary>
    public Dictionary<string, double> Metrics { get; set; } = new();

    /// <summary>Descriptions of detected issues; starts as an empty list</summary>
    public List<string> Issues { get; set; } = new();

    /// <summary>Free-form details; null until set</summary>
    public string Details { get; set; }
}

Integration with OrderManager

Circuit Breaker Integration

/// <summary>
/// Order manager part that wraps order submission in circuit breaker protection
/// and exposes the breaker's state, metrics and manual controls
/// </summary>
public partial class OrderManager : IOrderManager
{
    private readonly CircuitBreaker _circuitBreaker;
    
    /// <summary>
    /// Enhanced constructor with circuit breaker
    /// </summary>
    /// <param name="circuitBreaker">Circuit breaker consulted before every submission</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="circuitBreaker"/> is null</exception>
    public OrderManager(
        IRiskManager riskManager,
        IPositionSizer positionSizer,
        ILogger<OrderManager> logger,
        RoutingConfigurationManager configManager,
        RoutingMetricsCollector metricsCollector,
        TwapExecutor twapExecutor,
        VwapExecutor vwapExecutor,
        IcebergExecutor icebergExecutor,
        AlgorithmParameterProvider parameterProvider,
        RateLimiter rateLimiter,
        ValueLimiter valueLimiter,
        CircuitBreaker circuitBreaker) : base(riskManager, positionSizer, logger, configManager, metricsCollector, twapExecutor, vwapExecutor, icebergExecutor, parameterProvider, rateLimiter, valueLimiter)
    {
        _circuitBreaker = circuitBreaker ?? throw new ArgumentNullException(nameof(circuitBreaker));
        _venueManager = new VenueManager(logger);
        _omsToVenueOrderIdMap = new Dictionary<string, string>();
        _venueToOmsOrderIdMap = new Dictionary<string, string>();
        
        // Initialize with configurations
        // NOTE(review): blocking on async work inside a constructor can deadlock under a
        // synchronization context and surfaces failures wrapped in AggregateException;
        // consider an async factory method. Left unchanged to keep the exception contract.
        InitializeWithConfigurationsAsync().Wait();
    }
    
    /// <summary>
    /// Submit an order with circuit breaker protection
    /// </summary>
    /// <param name="request">Order to submit</param>
    /// <param name="context">Strategy context; its UserId is attached to recorded failures</param>
    /// <returns>
    /// A rejection result when the breaker prescribes <see cref="CircuitBreakerAction.Reject"/>,
    /// otherwise the result of the underlying submission
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when either argument is null</exception>
    public async Task<OrderResult> SubmitOrderAsync(OrderRequest request, StrategyContext context)
    {
        if (request == null) throw new ArgumentNullException(nameof(request));
        if (context == null) throw new ArgumentNullException(nameof(context));
        
        try
        {
            // Check circuit breaker
            var circuitResult = _circuitBreaker.CheckCircuit();
            
            switch (circuitResult.Action)
            {
                case CircuitBreakerAction.Reject:
                    _logger.LogWarning("Order submission rejected due to open circuit breaker");
                    return new OrderResult(false, null, "Order submission rejected due to system protection", null);
                    
                case CircuitBreakerAction.Test:
                    _logger.LogInformation("Order submission allowed for testing in half-open circuit breaker state");
                    // Proceed with order submission but monitor closely
                    break;
                    
                case CircuitBreakerAction.Allow:
                    // Proceed with normal order submission
                    break;
            }
            
            // Continue with normal order submission process
            var result = await base.SubmitOrderAsync(request, context);
            
            // Record success or failure with circuit breaker
            if (result.Success)
            {
                _circuitBreaker.RecordSuccess(DateTime.UtcNow);
            }
            else
            {
                // A failed result may carry no message. Guard against null so the check cannot
                // throw, which would otherwise land in the catch block below, record a spurious
                // failure and replace the normal failed result with an exception.
                var message = result.Message;
                
                // Record failure if order was rejected by risk management or other internal systems
                if (message != null && (message.Contains("risk") || message.Contains("validation")))
                {
                    var failure = new FailureRecord
                    {
                        Timestamp = DateTime.UtcNow,
                        Type = FailureType.RiskManagementViolation,
                        Message = message,
                        Order = request,
                        UserId = context.UserId
                    };
                    
                    _circuitBreaker.RecordFailure(failure);
                }
            }
            
            return result;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error submitting order {Symbol}", request.Symbol);
            
            // Record failure with circuit breaker
            var failure = new FailureRecord
            {
                Timestamp = DateTime.UtcNow,
                Type = FailureType.Other,
                Message = ex.Message,
                Order = request,
                UserId = context.UserId,
                Exception = ex
            };
            
            _circuitBreaker.RecordFailure(failure);
            
            // Re-throw exception to maintain normal error handling
            throw;
        }
    }
    
    /// <summary>
    /// Get circuit breaker state
    /// </summary>
    public CircuitBreakerState GetCircuitBreakerState()
    {
        return _circuitBreaker.GetState();
    }
    
    /// <summary>
    /// Get circuit breaker metrics
    /// </summary>
    public CircuitBreakerMetrics GetCircuitBreakerMetrics()
    {
        return _circuitBreaker.GetMetrics();
    }
    
    /// <summary>
    /// Manually open circuit breaker
    /// </summary>
    /// <param name="reason">Reason forwarded to the circuit breaker</param>
    /// <param name="userId">Optional user identifier forwarded to the circuit breaker</param>
    public void OpenCircuitBreaker(string reason, string userId = null)
    {
        _circuitBreaker.OpenCircuit(reason, userId);
    }
    
    /// <summary>
    /// Manually close circuit breaker
    /// </summary>
    /// <param name="reason">Reason forwarded to the circuit breaker</param>
    /// <param name="userId">Optional user identifier forwarded to the circuit breaker</param>
    public void CloseCircuitBreaker(string reason, string userId = null)
    {
        _circuitBreaker.CloseCircuit(reason, userId);
    }
    
    /// <summary>
    /// Reset circuit breaker
    /// </summary>
    public void ResetCircuitBreaker()
    {
        _circuitBreaker.Reset();
    }
}

Circuit Breaker Configuration Management

Circuit Breaker Configuration Integration

public partial class RoutingConfigurationManager
{
    /// <summary>
    /// Loads the circuit breaker configuration stored under "circuit-breaker-config",
    /// substituting <c>CircuitBreakerConfig.Default</c> when nothing is stored
    /// </summary>
    public async Task<CircuitBreakerConfig> GetCircuitBreakerConfigAsync()
    {
        var stored = await GetConfigurationAsync<CircuitBreakerConfig>("circuit-breaker-config");
        if (stored is null)
        {
            return CircuitBreakerConfig.Default;
        }

        return stored;
    }

    /// <summary>
    /// Stores a circuit breaker configuration after overwriting its Id, Name and
    /// Description with the fixed circuit breaker values
    /// </summary>
    /// <param name="config">Configuration to store; its identity fields are modified in place</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="config"/> is null</exception>
    public async Task UpdateCircuitBreakerConfigAsync(CircuitBreakerConfig config)
    {
        if (config is null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        // Pin the identity fields regardless of what the caller supplied
        config.Id = "circuit-breaker-config";
        config.Name = "Circuit Breaker Configuration";
        config.Description = "Configuration for circuit breaker functionality";

        await UpdateConfigurationAsync(config);

        _logger.LogInformation("Circuit breaker configuration updated");
    }
}

Testing Considerations

Unit Tests for Circuit Breaker

  1. State Transitions: Test circuit breaker state transitions
  2. Failure Recording: Test recording of failures and threshold checking
  3. Success Recording: Test recording of successes and threshold checking
  4. Timeout Handling: Test automatic reset after timeout
  5. Manual Control: Test manual opening and closing of circuit
  6. Half-Open State: Test half-open state behavior
  7. Health Checks: Test health check integration
  8. Metrics Collection: Test collection of circuit breaker metrics
  9. Event Logging: Test logging of circuit breaker events
  10. Reset Functionality: Test resetting of circuit breaker state

Integration Tests

  1. End-to-End Circuit Breaking: Test complete circuit breaker flow
  2. Order Manager Integration: Test integration with OrderManager
  3. Performance Testing: Test performance with circuit breaker enabled
  4. Concurrent Access: Test concurrent access to circuit breaker
  5. Error Handling: Test error handling in circuit breaker
  6. Configuration Updates: Test dynamic configuration updates

Performance Considerations

Memory Management

/// <summary>
/// Holds the memory limits for circuit breaker history and reports when the
/// current event or failure counts approach those limits
/// </summary>
public class CircuitBreakerMemoryManager
{
    // Fraction of a configured maximum above which memory pressure is reported
    private const double PressureThresholdRatio = 0.8;

    // Retention applied when the caller does not supply one
    private static readonly TimeSpan DefaultRetentionTime = TimeSpan.FromHours(24);

    private readonly int _maxEvents;
    private readonly int _maxFailures;
    private readonly TimeSpan _eventRetentionTime;

    /// <summary>
    /// Creates a memory manager with the given limits
    /// </summary>
    /// <param name="maxEvents">Maximum number of events; must not be negative</param>
    /// <param name="maxFailures">Maximum number of failures; must not be negative</param>
    /// <param name="retentionTime">
    /// Event retention time; <c>default</c> (zero) selects 24 hours. Must not be negative.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when any argument is negative</exception>
    public CircuitBreakerMemoryManager(
        int maxEvents = 10000,
        int maxFailures = 100000,
        TimeSpan retentionTime = default)
    {
        // A negative limit would make every non-negative count look like high pressure
        if (maxEvents < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(maxEvents), maxEvents, "Maximum event count cannot be negative");
        }

        if (maxFailures < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(maxFailures), maxFailures, "Maximum failure count cannot be negative");
        }

        if (retentionTime < TimeSpan.Zero)
        {
            throw new ArgumentOutOfRangeException(nameof(retentionTime), retentionTime, "Retention time cannot be negative");
        }

        _maxEvents = maxEvents;
        _maxFailures = maxFailures;
        _eventRetentionTime = retentionTime == default ? DefaultRetentionTime : retentionTime;
    }

    /// <summary>
    /// Reports whether either count exceeds 80% of its configured maximum
    /// </summary>
    /// <param name="currentEventCount">Current number of stored events</param>
    /// <param name="currentFailureCount">Current number of stored failures</param>
    /// <returns>True when the event count or the failure count is above its 80% threshold</returns>
    public bool IsMemoryPressureHigh(int currentEventCount, int currentFailureCount)
    {
        return currentEventCount > (_maxEvents * PressureThresholdRatio) ||
               currentFailureCount > (_maxFailures * PressureThresholdRatio);
    }

    /// <summary>
    /// Returns the configured event retention time
    /// </summary>
    public TimeSpan GetEventRetentionTime()
    {
        return _eventRetentionTime;
    }
}

Distributed Circuit Breaking

/// <summary>
/// Distributed circuit breaker that coordinates across multiple instances by
/// publishing every recorded failure and success to a distributed cache
/// </summary>
public class DistributedCircuitBreaker : CircuitBreaker
{
    // Lifetime of each published cache entry
    private static readonly TimeSpan ClusterEntryLifetime = TimeSpan.FromMinutes(5);

    private readonly IDistributedCache _distributedCache;
    private readonly string _instanceId;
    
    /// <summary>
    /// Creates a distributed circuit breaker
    /// </summary>
    /// <param name="logger">Logger passed to the base circuit breaker</param>
    /// <param name="healthChecker">Health checker passed to the base circuit breaker</param>
    /// <param name="distributedCache">Cache used to publish records to the cluster</param>
    /// <param name="instanceId">Identifier stamped on every published record</param>
    /// <param name="config">Optional configuration passed to the base circuit breaker</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="distributedCache"/> or <paramref name="instanceId"/> is null
    /// </exception>
    public DistributedCircuitBreaker(
        ILogger<CircuitBreaker> logger,
        IHealthChecker healthChecker,
        IDistributedCache distributedCache,
        string instanceId,
        CircuitBreakerConfig config = null) : base(logger, healthChecker, config)
    {
        _distributedCache = distributedCache ?? throw new ArgumentNullException(nameof(distributedCache));
        _instanceId = instanceId ?? throw new ArgumentNullException(nameof(instanceId));
    }
    
    /// <summary>
    /// Records the failure locally, then publishes it to the cluster without waiting
    /// </summary>
    /// <remarks>
    /// Declared public because an override must keep the accessibility of the base member,
    /// and OrderManager calls RecordFailure on a CircuitBreaker from outside the class hierarchy.
    /// </remarks>
    public override void RecordFailure(FailureRecord failure)
    {
        base.RecordFailure(failure);
        
        // Fire-and-forget: the publisher handles its own exceptions
        _ = PublishFailureToClusterAsync(failure);
    }
    
    /// <summary>
    /// Records the success locally, then publishes it to the cluster without waiting
    /// </summary>
    /// <remarks>
    /// Declared public because an override must keep the accessibility of the base member,
    /// and OrderManager calls RecordSuccess on a CircuitBreaker from outside the class hierarchy.
    /// </remarks>
    public override void RecordSuccess(DateTime timestamp)
    {
        base.RecordSuccess(timestamp);
        
        // Fire-and-forget: the publisher handles its own exceptions
        _ = PublishSuccessToClusterAsync(timestamp);
    }
    
    /// <summary>
    /// Stores the failure in the distributed cache and publishes it on the failures channel.
    /// Any exception is logged and not propagated.
    /// </summary>
    private async Task PublishFailureToClusterAsync(FailureRecord failure)
    {
        try
        {
            var clusterFailure = new ClusterFailureRecord
            {
                InstanceId = _instanceId,
                Failure = failure,
                Timestamp = DateTime.UtcNow
            };
            
            var json = JsonSerializer.Serialize(clusterFailure);
            var key = $"circuitbreaker:failure:{Guid.NewGuid()}";
            
            await _distributedCache.SetStringAsync(key, json, new DistributedCacheEntryOptions
            {
                AbsoluteExpirationRelativeToNow = ClusterEntryLifetime
            });
            
            // Also publish to a channel for real-time coordination
            // NOTE(review): PublishAsync is not part of the standard IDistributedCache
            // interface - confirm a project extension method provides it
            await _distributedCache.PublishAsync("circuitbreaker:failures", json);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error publishing failure to cluster");
        }
    }
    
    /// <summary>
    /// Stores the success in the distributed cache and publishes it on the successes channel.
    /// Any exception is logged and not propagated.
    /// </summary>
    private async Task PublishSuccessToClusterAsync(DateTime timestamp)
    {
        try
        {
            var clusterSuccess = new ClusterSuccessRecord
            {
                InstanceId = _instanceId,
                Timestamp = timestamp
            };
            
            var json = JsonSerializer.Serialize(clusterSuccess);
            var key = $"circuitbreaker:success:{Guid.NewGuid()}";
            
            await _distributedCache.SetStringAsync(key, json, new DistributedCacheEntryOptions
            {
                AbsoluteExpirationRelativeToNow = ClusterEntryLifetime
            });
            
            // Also publish to a channel for real-time coordination
            // NOTE(review): PublishAsync is not part of the standard IDistributedCache
            // interface - confirm a project extension method provides it
            await _distributedCache.PublishAsync("circuitbreaker:successes", json);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error publishing success to cluster");
        }
    }
}

/// <summary>
/// Cluster failure record: the payload DistributedCircuitBreaker serializes to JSON
/// when it publishes a failure to the distributed cache
/// </summary>
public record ClusterFailureRecord
{
    /// <summary>Identifier of the instance that published the record</summary>
    public string InstanceId { get; set; }
    /// <summary>The failure being published</summary>
    public FailureRecord Failure { get; set; }
    /// <summary>Set by the publisher to the UTC time of publication</summary>
    public DateTime Timestamp { get; set; }
}

/// <summary>
/// Cluster success record: the payload DistributedCircuitBreaker serializes to JSON
/// when it publishes a success to the distributed cache
/// </summary>
public record ClusterSuccessRecord
{
    /// <summary>Identifier of the instance that published the record</summary>
    public string InstanceId { get; set; }
    /// <summary>Set by the publisher to the timestamp passed to RecordSuccess</summary>
    public DateTime Timestamp { get; set; }
}

Monitoring and Alerting

Circuit Breaker Metrics Export

/// <summary>
/// Exports circuit breaker metrics for monitoring in the Prometheus text format
/// </summary>
public class CircuitBreakerMetricsExporter
{
    private readonly CircuitBreaker _circuitBreaker;
    
    /// <summary>
    /// Creates an exporter for the given circuit breaker
    /// </summary>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="circuitBreaker"/> is null</exception>
    public CircuitBreakerMetricsExporter(CircuitBreaker circuitBreaker)
    {
        _circuitBreaker = circuitBreaker ?? throw new ArgumentNullException(nameof(circuitBreaker));
    }
    
    /// <summary>
    /// Renders the current metrics as Prometheus exposition text
    /// </summary>
    /// <returns>
    /// One HELP line, one TYPE line and one sample line per metric. Lines end with a single
    /// line feed and numbers are formatted with the invariant culture, so the output does
    /// not depend on the host platform or the current culture.
    /// </returns>
    public string ExportToPrometheus()
    {
        var metrics = _circuitBreaker.GetMetrics();
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var sb = new StringBuilder();
        
        // Circuit breaker state
        AppendMetric(sb, "circuit_breaker_state",
            "Current circuit breaker state (0=closed, 1=open, 2=half-open)", "gauge",
            GetStateNumericValue(metrics.State).ToString(invariant));
        
        // Failure count
        AppendMetric(sb, "circuit_breaker_failures_total",
            "Total circuit breaker failures", "counter",
            metrics.FailureCount.ToString(invariant));
        
        // Success count
        AppendMetric(sb, "circuit_breaker_successes_total",
            "Total circuit breaker successes", "counter",
            metrics.SuccessCount.ToString(invariant));
        
        // Failure rate (a culture-sensitive format would print "0,2500" under comma-decimal cultures)
        AppendMetric(sb, "circuit_breaker_failure_rate",
            "Current circuit breaker failure rate", "gauge",
            metrics.FailureRate.ToString("F4", invariant));
        
        // Uptime
        AppendMetric(sb, "circuit_breaker_uptime_seconds",
            "Circuit breaker uptime in seconds", "gauge",
            metrics.Uptime.ToString("F0", invariant));
        
        // Total events
        AppendMetric(sb, "circuit_breaker_events_total",
            "Total circuit breaker events", "counter",
            metrics.TotalEvents.ToString(invariant));
        
        // Total failures
        AppendMetric(sb, "circuit_breaker_recorded_failures_total",
            "Total recorded failures", "counter",
            metrics.TotalFailures.ToString(invariant));
        
        return sb.ToString();
    }
    
    /// <summary>
    /// Appends the HELP, TYPE and sample lines for one metric
    /// </summary>
    private static void AppendMetric(StringBuilder sb, string name, string help, string type, string value)
    {
        // The exposition format separates lines with '\n'; AppendLine would emit "\r\n" on Windows
        sb.Append("# HELP ").Append(name).Append(' ').Append(help).Append('\n');
        sb.Append("# TYPE ").Append(name).Append(' ').Append(type).Append('\n');
        sb.Append(name).Append(' ').Append(value).Append('\n');
    }
    
    /// <summary>
    /// Maps a state to its exported number: Closed=0, Open=1, HalfOpen=2, anything else=-1
    /// </summary>
    private int GetStateNumericValue(CircuitBreakerState state)
    {
        return state switch
        {
            CircuitBreakerState.Closed => 0,
            CircuitBreakerState.Open => 1,
            CircuitBreakerState.HalfOpen => 2,
            _ => -1
        };
    }
}

Future Enhancements

  1. Machine Learning Circuit Breaking: Use ML to predict system failures and proactively open circuits
  2. Real-time Adaptive Circuit Breaking: Adjust circuit breaker parameters in real-time based on system conditions
  3. Cross-System Circuit Breaking: Coordinate circuit breaking across multiple interconnected systems
  4. Circuit Breaker Analytics: Advanced analytics and reporting on circuit breaker performance
  5. Circuit Breaker Strategy Builder: Visual tools for building and testing circuit breaker strategies
  6. Circuit Breaker Benchmarking: Compare circuit breaker performance against industry standards
  7. Circuit Breaker Compliance: Ensure circuit breaker complies with regulatory requirements
  8. Hierarchical Circuit Breaking: Implement hierarchical circuit breakers for complex system architectures
  9. Circuit Breaker with Chaos Engineering: Integrate circuit breaker testing with chaos engineering practices
  10. Quantum-Resistant Circuit Breaking: Prepare circuit breaker for quantum computing threats