Files
nt8-sdk/docs/architecture/routing_performance_metrics_implementation.md
Billy Valentine 92f3732b3d
Some checks failed
Build and Test / build (push) Has been cancelled
Phase 0 completion: NT8 SDK core framework with risk management and position sizing
2025-09-09 17:06:37 -04:00

1345 lines
48 KiB
Markdown

# Routing Performance Metrics Implementation Design
## Overview
This document details the implementation of routing performance metrics in the Order Management System (OMS), which tracks and analyzes the performance of different execution venues to optimize routing decisions and provide insights into system performance.
## Metrics Architecture
The routing performance metrics system consists of several components:
1. **Metrics Collection**: Real-time collection of performance data from execution venues
2. **Metrics Storage**: Efficient storage of metrics data for analysis
3. **Metrics Aggregation**: Calculation of aggregated metrics for reporting
4. **Metrics Analysis**: Analysis of metrics to identify trends and issues
5. **Metrics Reporting**: Provision of metrics to monitoring systems and dashboards
## Metrics Models
### Base Metrics Interface
```csharp
/// <summary>
/// Contract shared by every metrics model: identity, descriptive text,
/// creation and update timestamps, and free-form tags.
/// </summary>
public interface IMetric
{
    /// <summary>Unique identifier for this metric.</summary>
    string Id { get; }

    /// <summary>Name of this metric.</summary>
    string Name { get; }

    /// <summary>Description of this metric.</summary>
    string Description { get; }

    /// <summary>When this metric was created.</summary>
    DateTime CreatedAt { get; }

    /// <summary>
    /// When this metric was last updated. This is the only member that can be
    /// written through the interface.
    /// </summary>
    DateTime UpdatedAt { get; set; }

    /// <summary>Tags associated with this metric.</summary>
    Dictionary<string, string> Tags { get; }
}
```
### Routing Metrics Model
```csharp
/// <summary>
/// Routing performance metrics: overall order counts and routing-time statistics,
/// plus breakdowns keyed by venue, time, algorithm, and symbol.
/// </summary>
public record RoutingMetrics : IMetric
{
    /// <summary>Unique identifier; defaults to a newly generated GUID string.</summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>Display name of this metrics object.</summary>
    public string Name { get; set; } = "Routing Metrics";

    /// <summary>Human-readable description of this metrics object.</summary>
    public string Description { get; set; } = "Performance metrics for order routing";

    /// <summary>Creation time; defaults to the UTC time of construction.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Last update time; defaults to the UTC time of construction.</summary>
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Tags associated with this metrics object.</summary>
    public Dictionary<string, string> Tags { get; set; } = new();

    // ---- Overall routing statistics ----

    /// <summary>Number of orders routed.</summary>
    public int TotalRoutedOrders { get; set; }

    /// <summary>Number of orders routed successfully.</summary>
    public int SuccessfulRoutedOrders { get; set; }

    /// <summary>Number of orders whose routing failed.</summary>
    public int FailedRoutedOrders { get; set; }

    /// <summary>
    /// <see cref="SuccessfulRoutedOrders"/> divided by <see cref="TotalRoutedOrders"/>,
    /// or 0 when no orders have been routed.
    /// </summary>
    public double SuccessRate
    {
        get
        {
            if (TotalRoutedOrders > 0)
            {
                return (double)SuccessfulRoutedOrders / TotalRoutedOrders;
            }

            return 0;
        }
    }

    // ---- Routing time statistics, in milliseconds ----

    /// <summary>Average routing time in milliseconds.</summary>
    public double AverageRoutingTimeMs { get; set; }

    /// <summary>Median routing time in milliseconds.</summary>
    public double MedianRoutingTimeMs { get; set; }

    /// <summary>95th-percentile routing time in milliseconds.</summary>
    public double P95RoutingTimeMs { get; set; }

    /// <summary>99th-percentile routing time in milliseconds.</summary>
    public double P99RoutingTimeMs { get; set; }

    // ---- Breakdowns ----

    /// <summary>Venue-specific metrics.</summary>
    public Dictionary<string, VenueMetrics> VenuePerformance { get; set; } = new();

    /// <summary>Time-based metrics.</summary>
    public Dictionary<DateTime, TimeBasedMetrics> TimeBasedPerformance { get; set; } = new();

    /// <summary>Algorithm-specific metrics.</summary>
    public Dictionary<string, AlgorithmMetrics> AlgorithmPerformance { get; set; } = new();

    /// <summary>Symbol-specific metrics.</summary>
    public Dictionary<string, SymbolMetrics> SymbolPerformance { get; set; } = new();
}
```
### Venue Metrics Model
```csharp
/// <summary>
/// Metrics for a specific execution venue: order counts, latency, slippage,
/// routed value, and per-venue distributions.
/// </summary>
public record VenueMetrics : IMetric
{
    /// <summary>Unique identifier of this metrics object; no default is assigned.</summary>
    public string Id { get; set; }

    /// <summary>Display name of this metrics object; no default is assigned.</summary>
    public string Name { get; set; }

    /// <summary>Human-readable description of this metrics object.</summary>
    public string Description { get; set; } = "Venue performance metrics";

    /// <summary>Creation time; defaults to the UTC time of construction.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Last update time; defaults to the UTC time of construction.</summary>
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Tags associated with this metrics object.</summary>
    public Dictionary<string, string> Tags { get; set; } = new();

    // ---- Basic venue information ----

    /// <summary>Identifier of the venue these metrics describe.</summary>
    public string VenueId { get; set; }

    /// <summary>Name of the venue these metrics describe.</summary>
    public string VenueName { get; set; }

    /// <summary>Type of the venue these metrics describe.</summary>
    public VenueType VenueType { get; set; }

    // ---- Order statistics ----

    /// <summary>Total number of orders counted for this venue.</summary>
    public int TotalOrders { get; set; }

    /// <summary>Number of successful orders.</summary>
    public int SuccessfulOrders { get; set; }

    /// <summary>Number of failed orders.</summary>
    public int FailedOrders { get; set; }

    /// <summary>Number of cancelled orders.</summary>
    public int CancelledOrders { get; set; }

    /// <summary>Number of expired orders.</summary>
    public int ExpiredOrders { get; set; }

    /// <summary>
    /// <see cref="SuccessfulOrders"/> divided by <see cref="TotalOrders"/>,
    /// or 0 when no orders have been counted.
    /// </summary>
    public double FillRate
    {
        get
        {
            if (TotalOrders > 0)
            {
                return (double)SuccessfulOrders / TotalOrders;
            }

            return 0;
        }
    }

    // ---- Latency statistics, in milliseconds ----

    /// <summary>Average latency in milliseconds.</summary>
    public double AverageLatencyMs { get; set; }

    /// <summary>Median latency in milliseconds.</summary>
    public double MedianLatencyMs { get; set; }

    /// <summary>95th-percentile latency in milliseconds.</summary>
    public double P95LatencyMs { get; set; }

    /// <summary>99th-percentile latency in milliseconds.</summary>
    public double P99LatencyMs { get; set; }

    // ---- Execution quality (slippage) ----

    /// <summary>Average slippage.</summary>
    public double AverageSlippage { get; set; }

    /// <summary>Median slippage.</summary>
    public double MedianSlippage { get; set; }

    /// <summary>95th-percentile slippage.</summary>
    public double P95Slippage { get; set; }

    /// <summary>99th-percentile slippage.</summary>
    public double P99Slippage { get; set; }

    // ---- Value metrics ----

    /// <summary>Total value routed to this venue.</summary>
    public decimal TotalValueRouted { get; set; }

    /// <summary>Total commission paid at this venue.</summary>
    public decimal TotalCommissionPaid { get; set; }

    /// <summary>Average order value at this venue.</summary>
    public decimal AverageOrderValue { get; set; }

    // ---- Distributions ----

    /// <summary>Time-based performance, keyed by timestamp.</summary>
    public Dictionary<DateTime, TimeBasedMetrics> HourlyPerformance { get; set; } = new();

    /// <summary>Order size distribution, as a count per bucket label.</summary>
    public Dictionary<string, int> OrderSizeDistribution { get; set; } = new();

    /// <summary>Error statistics, as a count per error key.</summary>
    public Dictionary<string, int> ErrorCounts { get; set; } = new();
}
```
### Time-Based Metrics Model
```csharp
/// <summary>
/// Metrics for a specific time period, bounded by
/// <see cref="PeriodStart"/> and <see cref="PeriodEnd"/>.
/// </summary>
public record TimeBasedMetrics : IMetric
{
    /// <summary>Unique identifier of this metrics object; no default is assigned.</summary>
    public string Id { get; set; }

    /// <summary>Display name of this metrics object; no default is assigned.</summary>
    public string Name { get; set; }

    /// <summary>Human-readable description of this metrics object.</summary>
    public string Description { get; set; } = "Time-based performance metrics";

    /// <summary>Creation time; defaults to the UTC time of construction.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Last update time; defaults to the UTC time of construction.</summary>
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Tags associated with this metrics object.</summary>
    public Dictionary<string, string> Tags { get; set; } = new();

    // ---- Time period information ----

    /// <summary>Start of the period.</summary>
    public DateTime PeriodStart { get; set; }

    /// <summary>End of the period.</summary>
    public DateTime PeriodEnd { get; set; }

    /// <summary>Kind of period: Hourly, Daily, Weekly, Monthly.</summary>
    public string PeriodType { get; set; }

    // ---- Performance metrics for this period ----

    /// <summary>Number of orders routed during the period.</summary>
    public int OrdersRouted { get; set; }

    /// <summary>Number of successful orders during the period.</summary>
    public int SuccessfulOrders { get; set; }

    /// <summary>Average routing time in milliseconds during the period.</summary>
    public double AverageRoutingTimeMs { get; set; }

    /// <summary>Average slippage during the period.</summary>
    public double AverageSlippage { get; set; }

    /// <summary>Total value routed during the period.</summary>
    public decimal TotalValueRouted { get; set; }
}
```
### Algorithm Metrics Model
```csharp
/// <summary>
/// Metrics for algorithmic order execution: order counts, execution quality,
/// execution durations, and the venues used.
/// </summary>
public record AlgorithmMetrics : IMetric
{
    /// <summary>Unique identifier of this metrics object; no default is assigned.</summary>
    public string Id { get; set; }

    /// <summary>Display name of this metrics object; no default is assigned.</summary>
    public string Name { get; set; }

    /// <summary>Human-readable description of this metrics object.</summary>
    public string Description { get; set; } = "Algorithm performance metrics";

    /// <summary>Creation time; defaults to the UTC time of construction.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Last update time; defaults to the UTC time of construction.</summary>
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Tags associated with this metrics object.</summary>
    public Dictionary<string, string> Tags { get; set; } = new();

    // ---- Algorithm information ----

    /// <summary>Algorithm type: TWAP, VWAP, Iceberg.</summary>
    public string AlgorithmType { get; set; }

    // ---- Order counts ----

    /// <summary>Total number of orders counted for this algorithm.</summary>
    public int TotalOrders { get; set; }

    /// <summary>Number of completed orders.</summary>
    public int CompletedOrders { get; set; }

    /// <summary>
    /// <see cref="CompletedOrders"/> divided by <see cref="TotalOrders"/>,
    /// or 0 when no orders have been counted.
    /// </summary>
    public double CompletionRate
    {
        get
        {
            if (TotalOrders > 0)
            {
                return (double)CompletedOrders / TotalOrders;
            }

            return 0;
        }
    }

    // ---- Execution quality ----

    /// <summary>Average tracking error (for TWAP/VWAP).</summary>
    public double AverageTrackingError { get; set; }

    /// <summary>Average iceberg detection rate (for Iceberg orders).</summary>
    public double AverageIcebergDetectionRate { get; set; }

    /// <summary>Average participation rate (for algorithmic orders).</summary>
    public double AverageParticipationRate { get; set; }

    // ---- Time metrics ----

    /// <summary>Average execution duration.</summary>
    public TimeSpan AverageExecutionDuration { get; set; }

    /// <summary>Median execution duration.</summary>
    public TimeSpan MedianExecutionDuration { get; set; }

    // ---- Venue distribution ----

    /// <summary>Number of orders per venue identifier.</summary>
    public Dictionary<string, int> VenueDistribution { get; set; } = new();
}
```
### Symbol Metrics Model
```csharp
/// <summary>
/// Metrics for a specific trading symbol: order counts, spread and volatility,
/// volume, and a per-venue breakdown.
/// </summary>
public record SymbolMetrics : IMetric
{
    /// <summary>Unique identifier of this metrics object; no default is assigned.</summary>
    public string Id { get; set; }

    /// <summary>Display name of this metrics object; no default is assigned.</summary>
    public string Name { get; set; }

    /// <summary>Human-readable description of this metrics object.</summary>
    public string Description { get; set; } = "Symbol performance metrics";

    /// <summary>Creation time; defaults to the UTC time of construction.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Last update time; defaults to the UTC time of construction.</summary>
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Tags associated with this metrics object.</summary>
    public Dictionary<string, string> Tags { get; set; } = new();

    // ---- Symbol information ----

    /// <summary>The trading symbol these metrics describe.</summary>
    public string Symbol { get; set; }

    /// <summary>Asset class of the symbol.</summary>
    public string AssetClass { get; set; }

    // ---- Trading metrics ----

    /// <summary>Total number of orders counted for this symbol.</summary>
    public int TotalOrders { get; set; }

    /// <summary>Number of successful orders.</summary>
    public int SuccessfulOrders { get; set; }

    /// <summary>
    /// <see cref="SuccessfulOrders"/> divided by <see cref="TotalOrders"/>,
    /// or 0 when no orders have been counted.
    /// </summary>
    public double FillRate
    {
        get
        {
            if (TotalOrders > 0)
            {
                return (double)SuccessfulOrders / TotalOrders;
            }

            return 0;
        }
    }

    // ---- Price metrics ----

    /// <summary>Average spread.</summary>
    public double AverageSpread { get; set; }

    /// <summary>Median spread.</summary>
    public double MedianSpread { get; set; }

    /// <summary>Volatility.</summary>
    public double Volatility { get; set; }

    // ---- Volume metrics ----

    /// <summary>Total volume.</summary>
    public long TotalVolume { get; set; }

    /// <summary>Average order size.</summary>
    public double AverageOrderSize { get; set; }

    // ---- Venue performance for this symbol ----

    /// <summary>Per-venue metrics for this symbol, keyed by venue identifier.</summary>
    public Dictionary<string, VenueMetrics> VenuePerformance { get; set; } = new();
}
```
## Metrics Collection System
### Metrics Collector
```csharp
/// <summary>
/// Collects and manages routing performance metrics.
/// </summary>
/// <remarks>
/// Every read and write of the in-memory metrics is serialized on one private lock.
/// The public getters and the periodic flush hand out detached snapshots, so callers
/// never hold objects that the collector keeps mutating. A timer flushes a snapshot
/// to the repository every five minutes until <see cref="Dispose"/> is called.
/// </remarks>
public class RoutingMetricsCollector : IDisposable
{
    private const string CurrentMetricsId = "routing-metrics-current";
    private static readonly TimeSpan FlushInterval = TimeSpan.FromMinutes(5);

    private readonly ILogger<RoutingMetricsCollector> _logger;
    private readonly IMetricsRepository _metricsRepository;
    private readonly RoutingMetrics _currentMetrics;
    private readonly object _lock = new object();
    private readonly Timer _metricsFlushTimer;

    /// <summary>
    /// Creates the collector, restores previously saved counters from the repository,
    /// and starts the periodic flush timer.
    /// </summary>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <param name="metricsRepository">Repository metrics are loaded from and flushed to.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public RoutingMetricsCollector(
        ILogger<RoutingMetricsCollector> logger,
        IMetricsRepository metricsRepository)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _metricsRepository = metricsRepository ?? throw new ArgumentNullException(nameof(metricsRepository));
        _currentMetrics = new RoutingMetrics
        {
            Id = CurrentMetricsId,
            Name = "Current Routing Metrics",
            Description = "Current performance metrics for order routing",
            Tags = new Dictionary<string, string> { ["type"] = "current" }
        };

        // A constructor cannot await, so this blocks. InitializeMetricsAsync awaits with
        // ConfigureAwait(false), so its continuation never needs the calling thread's
        // synchronization context and blocking here cannot deadlock on it. The method
        // also catches and logs every exception, so nothing propagates from this call.
        InitializeMetricsAsync().GetAwaiter().GetResult();

        // Set up periodic metrics flush
        _metricsFlushTimer = new Timer(OnFlushTimerElapsed, null, FlushInterval, FlushInterval);
    }

    /// <summary>
    /// Record metrics for a routed order.
    /// </summary>
    /// <param name="request">The order request that was routed.</param>
    /// <param name="result">The routing outcome.</param>
    /// <param name="routingTime">How long routing took.</param>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> or <paramref name="result"/> is null.</exception>
    public void RecordOrderRouting(OrderRequest request, RoutingResult result, TimeSpan routingTime)
    {
        if (request == null) throw new ArgumentNullException(nameof(request));
        if (result == null) throw new ArgumentNullException(nameof(result));

        lock (_lock)
        {
            // Update overall routing metrics
            _currentMetrics.TotalRoutedOrders++;
            if (result.Success)
            {
                _currentMetrics.SuccessfulRoutedOrders++;
            }
            else
            {
                _currentMetrics.FailedRoutedOrders++;
            }

            UpdateRoutingTimeMetrics(routingTime.TotalMilliseconds);

            if (result.SelectedVenue != null)
            {
                UpdateVenueMetrics(request, result, routingTime);
            }

            if (!string.IsNullOrEmpty(request.Algorithm))
            {
                UpdateAlgorithmMetrics(request, result, routingTime);
            }

            // The symbol is used as a dictionary key. A null key would throw after the
            // counters above were already changed, so requests without a symbol are
            // left out of the per-symbol breakdown.
            if (!string.IsNullOrEmpty(request.Symbol))
            {
                UpdateSymbolMetrics(request, result, routingTime);
            }

            _currentMetrics.UpdatedAt = DateTime.UtcNow;
        }
    }

    /// <summary>
    /// Record metrics for an executed order.
    /// </summary>
    /// <remarks>
    /// Only venues that already have an entry (created by <see cref="RecordOrderRouting"/>)
    /// are updated; executions for any other venue ID are logged at debug level and ignored.
    /// </remarks>
    /// <param name="venueId">Identifier of the venue that handled the order.</param>
    /// <param name="result">The venue's result for the order.</param>
    /// <param name="executionTime">How long execution took.</param>
    /// <param name="slippage">Slippage measured for the order.</param>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="venueId"/> is null or empty.</exception>
    public void RecordOrderExecution(string venueId, VenueOrderResult result, TimeSpan executionTime, decimal slippage)
    {
        if (result == null) throw new ArgumentNullException(nameof(result));
        if (string.IsNullOrEmpty(venueId)) throw new ArgumentException("Venue ID required", nameof(venueId));

        var venueKnown = false;
        lock (_lock)
        {
            if (_currentMetrics.VenuePerformance.TryGetValue(venueId, out var venueMetrics))
            {
                venueKnown = true;

                // NOTE(review): RecordOrderRouting also increments these per-venue
                // counters, so an order that is routed and then executed is counted
                // twice here, and the averages below mix routing and execution
                // samples. Confirm whether that is intended.
                venueMetrics.TotalOrders++;
                if (result.Success)
                {
                    venueMetrics.SuccessfulOrders++;
                }
                else
                {
                    venueMetrics.FailedOrders++;
                }

                UpdateLatencyMetrics(venueMetrics, executionTime.TotalMilliseconds);
                UpdateSlippageMetrics(venueMetrics, (double)slippage);

                // Update value metrics
                if (result.Status != null)
                {
                    // Price falls back from the limit price to the first fill price, then to 0.
                    var orderValue = result.Status.Quantity * (result.Status.LimitPrice ?? result.Status.Fills?.FirstOrDefault()?.FillPrice ?? 0);
                    venueMetrics.TotalValueRouted += orderValue;
                    venueMetrics.AverageOrderValue = venueMetrics.TotalOrders > 0 ?
                        venueMetrics.TotalValueRouted / venueMetrics.TotalOrders : 0;
                }

                venueMetrics.UpdatedAt = DateTime.UtcNow;
            }
        }

        if (!venueKnown)
        {
            // Logged outside the lock so logging never extends the critical section.
            _logger.LogDebug("Execution metrics ignored for untracked venue {VenueId}", venueId);
        }
    }

    /// <summary>
    /// Folds one routing time into the running average and the P99 stand-in.
    /// Must be called with the lock held, after TotalRoutedOrders was incremented.
    /// </summary>
    private void UpdateRoutingTimeMetrics(double routingTimeMs)
    {
        // Incremental mean: the previous average covered (count - 1) samples.
        var previousAverage = _currentMetrics.AverageRoutingTimeMs;
        var count = _currentMetrics.TotalRoutedOrders;
        _currentMetrics.AverageRoutingTimeMs = ((previousAverage * (count - 1)) + routingTimeMs) / count;

        // Median and P95 are not maintained here. P99RoutingTimeMs holds the largest
        // routing time seen so far, as a simple approximation rather than a true percentile.
        if (routingTimeMs > _currentMetrics.P99RoutingTimeMs)
        {
            _currentMetrics.P99RoutingTimeMs = routingTimeMs;
        }
    }

    /// <summary>
    /// Updates (creating if needed) the metrics of the venue selected by a routing result.
    /// Must be called with the lock held and a non-null SelectedVenue.
    /// </summary>
    private void UpdateVenueMetrics(OrderRequest request, RoutingResult result, TimeSpan routingTime)
    {
        var venueId = result.SelectedVenue.Id;
        if (!_currentMetrics.VenuePerformance.TryGetValue(venueId, out var venueMetrics))
        {
            venueMetrics = new VenueMetrics
            {
                Id = $"venue-metrics-{venueId}",
                Name = $"Metrics for {result.SelectedVenue.Name}",
                VenueId = venueId,
                VenueName = result.SelectedVenue.Name,
                VenueType = result.SelectedVenue.Type,
                Tags = new Dictionary<string, string> { ["venue_id"] = venueId }
            };
            _currentMetrics.VenuePerformance[venueId] = venueMetrics;
        }

        venueMetrics.TotalOrders++;
        if (result.Success)
        {
            venueMetrics.SuccessfulOrders++;
        }
        else
        {
            venueMetrics.FailedOrders++;
        }

        UpdateLatencyMetrics(venueMetrics, routingTime.TotalMilliseconds);
        venueMetrics.UpdatedAt = DateTime.UtcNow;
    }

    /// <summary>
    /// Folds one latency sample into a venue's running average and bucket counts.
    /// Must be called after the venue's TotalOrders was incremented.
    /// </summary>
    private void UpdateLatencyMetrics(VenueMetrics venueMetrics, double latencyMs)
    {
        // Incremental mean: the previous average covered (count - 1) samples.
        var previousAverage = venueMetrics.AverageLatencyMs;
        var count = venueMetrics.TotalOrders;
        venueMetrics.AverageLatencyMs = ((previousAverage * (count - 1)) + latencyMs) / count;

        // NOTE(review): the bucket counted here is a latency bucket, but it is stored
        // in OrderSizeDistribution. VenueMetrics has no latency-distribution member, so
        // the existing storage is kept; confirm whether a dedicated member should be added.
        var latencyBucket = GetLatencyBucket(latencyMs);
        venueMetrics.OrderSizeDistribution.TryGetValue(latencyBucket, out var bucketCount);
        venueMetrics.OrderSizeDistribution[latencyBucket] = bucketCount + 1;
    }

    /// <summary>
    /// Folds one slippage sample into a venue's running average.
    /// Must be called after the venue's TotalOrders was incremented.
    /// </summary>
    private void UpdateSlippageMetrics(VenueMetrics venueMetrics, double slippage)
    {
        var previousAverage = venueMetrics.AverageSlippage;
        var count = venueMetrics.TotalOrders;
        venueMetrics.AverageSlippage = ((previousAverage * (count - 1)) + slippage) / count;
    }

    /// <summary>
    /// Maps a latency in milliseconds to its bucket label. Values that match no
    /// upper bound (including NaN) fall into "1000ms+".
    /// </summary>
    private static string GetLatencyBucket(double latencyMs)
    {
        return latencyMs switch
        {
            <= 10 => "0-10ms",
            <= 50 => "11-50ms",
            <= 100 => "51-100ms",
            <= 500 => "101-500ms",
            <= 1000 => "501-1000ms",
            _ => "1000ms+"
        };
    }

    /// <summary>
    /// Updates (creating if needed) the metrics of the algorithm named by the request.
    /// Must be called with the lock held and a non-empty request.Algorithm.
    /// </summary>
    private void UpdateAlgorithmMetrics(OrderRequest request, RoutingResult result, TimeSpan routingTime)
    {
        var algorithm = request.Algorithm;
        if (!_currentMetrics.AlgorithmPerformance.TryGetValue(algorithm, out var algorithmMetrics))
        {
            algorithmMetrics = new AlgorithmMetrics
            {
                Id = $"algorithm-metrics-{algorithm}",
                Name = $"Metrics for {algorithm}",
                AlgorithmType = algorithm,
                Tags = new Dictionary<string, string> { ["algorithm"] = algorithm }
            };
            _currentMetrics.AlgorithmPerformance[algorithm] = algorithmMetrics;
        }

        algorithmMetrics.TotalOrders++;
        if (result.Success)
        {
            algorithmMetrics.CompletedOrders++;
        }

        // Incremental mean of the routing time, stored as the average execution duration.
        algorithmMetrics.AverageExecutionDuration = TimeSpan.FromMilliseconds(
            (algorithmMetrics.AverageExecutionDuration.TotalMilliseconds * (algorithmMetrics.TotalOrders - 1) +
             routingTime.TotalMilliseconds) / algorithmMetrics.TotalOrders);

        // Update venue distribution
        if (result.SelectedVenue != null)
        {
            var venueId = result.SelectedVenue.Id;
            algorithmMetrics.VenueDistribution.TryGetValue(venueId, out var venueCount);
            algorithmMetrics.VenueDistribution[venueId] = venueCount + 1;
        }

        algorithmMetrics.UpdatedAt = DateTime.UtcNow;
    }

    /// <summary>
    /// Updates (creating if needed) the metrics of the request's symbol, including its
    /// per-venue breakdown. Must be called with the lock held and a non-empty request.Symbol.
    /// </summary>
    private void UpdateSymbolMetrics(OrderRequest request, RoutingResult result, TimeSpan routingTime)
    {
        var symbol = request.Symbol;
        if (!_currentMetrics.SymbolPerformance.TryGetValue(symbol, out var symbolMetrics))
        {
            symbolMetrics = new SymbolMetrics
            {
                Id = $"symbol-metrics-{symbol}",
                Name = $"Metrics for {symbol}",
                Symbol = symbol,
                Tags = new Dictionary<string, string> { ["symbol"] = symbol }
            };
            _currentMetrics.SymbolPerformance[symbol] = symbolMetrics;
        }

        symbolMetrics.TotalOrders++;
        if (result.Success)
        {
            symbolMetrics.SuccessfulOrders++;
        }

        // Update venue performance for this symbol
        if (result.SelectedVenue != null)
        {
            var venueId = result.SelectedVenue.Id;
            if (!symbolMetrics.VenuePerformance.TryGetValue(venueId, out var venueMetrics))
            {
                venueMetrics = new VenueMetrics
                {
                    Id = $"symbol-venue-metrics-{symbol}-{venueId}",
                    Name = $"Venue metrics for {symbol} at {result.SelectedVenue.Name}",
                    VenueId = venueId,
                    VenueName = result.SelectedVenue.Name,
                    VenueType = result.SelectedVenue.Type,
                    Tags = new Dictionary<string, string> { ["symbol"] = symbol, ["venue_id"] = venueId }
                };
                symbolMetrics.VenuePerformance[venueId] = venueMetrics;
            }

            venueMetrics.TotalOrders++;
            if (result.Success)
            {
                venueMetrics.SuccessfulOrders++;
            }

            venueMetrics.UpdatedAt = DateTime.UtcNow;
        }

        symbolMetrics.UpdatedAt = DateTime.UtcNow;
    }

    /// <summary>
    /// Get current routing metrics.
    /// </summary>
    /// <returns>A detached snapshot; later recordings do not change it.</returns>
    public RoutingMetrics GetCurrentMetrics()
    {
        lock (_lock)
        {
            return CreateSnapshot();
        }
    }

    /// <summary>
    /// Get venue-specific metrics.
    /// </summary>
    /// <param name="venueId">Identifier of the venue.</param>
    /// <returns>A detached copy of the venue's metrics, or null when the venue is not tracked.</returns>
    /// <exception cref="ArgumentException"><paramref name="venueId"/> is null or empty.</exception>
    public VenueMetrics GetVenueMetrics(string venueId)
    {
        if (string.IsNullOrEmpty(venueId)) throw new ArgumentException("Venue ID required", nameof(venueId));

        lock (_lock)
        {
            return _currentMetrics.VenuePerformance.TryGetValue(venueId, out var venueMetrics)
                ? CloneVenueMetrics(venueMetrics)
                : null;
        }
    }

    /// <summary>
    /// Builds a copy of the current metrics whose dictionaries and per-venue,
    /// per-algorithm, and per-symbol entries are independent of the live objects.
    /// Must be called with the lock held.
    /// </summary>
    private RoutingMetrics CreateSnapshot()
    {
        var algorithms = new Dictionary<string, AlgorithmMetrics>();
        foreach (var pair in _currentMetrics.AlgorithmPerformance)
        {
            algorithms[pair.Key] = pair.Value with
            {
                Tags = CopyOrEmpty(pair.Value.Tags),
                VenueDistribution = CopyOrEmpty(pair.Value.VenueDistribution)
            };
        }

        var symbols = new Dictionary<string, SymbolMetrics>();
        foreach (var pair in _currentMetrics.SymbolPerformance)
        {
            symbols[pair.Key] = pair.Value with
            {
                Tags = CopyOrEmpty(pair.Value.Tags),
                VenuePerformance = CloneVenueMap(pair.Value.VenuePerformance)
            };
        }

        // A record's copy constructor is not accessible from another type, so the copy
        // is made with a `with` expression. The collector never mutates TimeBasedMetrics
        // entries, so copying that dictionary without cloning its values is enough.
        return _currentMetrics with
        {
            Tags = CopyOrEmpty(_currentMetrics.Tags),
            VenuePerformance = CloneVenueMap(_currentMetrics.VenuePerformance),
            TimeBasedPerformance = CopyOrEmpty(_currentMetrics.TimeBasedPerformance),
            AlgorithmPerformance = algorithms,
            SymbolPerformance = symbols
        };
    }

    /// <summary>Copies a venue map, cloning each non-null entry; a null map yields an empty one.</summary>
    private static Dictionary<string, VenueMetrics> CloneVenueMap(Dictionary<string, VenueMetrics> source)
    {
        var copy = new Dictionary<string, VenueMetrics>();
        if (source == null)
        {
            return copy;
        }

        foreach (var pair in source)
        {
            if (pair.Value != null)
            {
                copy[pair.Key] = CloneVenueMetrics(pair.Value);
            }
        }

        return copy;
    }

    /// <summary>Copies one venue's metrics together with its own dictionaries.</summary>
    private static VenueMetrics CloneVenueMetrics(VenueMetrics source)
    {
        return source with
        {
            Tags = CopyOrEmpty(source.Tags),
            HourlyPerformance = CopyOrEmpty(source.HourlyPerformance),
            OrderSizeDistribution = CopyOrEmpty(source.OrderSizeDistribution),
            ErrorCounts = CopyOrEmpty(source.ErrorCounts)
        };
    }

    /// <summary>Returns a new dictionary with the same entries, or an empty one for null.</summary>
    private static Dictionary<TKey, TValue> CopyOrEmpty<TKey, TValue>(Dictionary<TKey, TValue> source)
    {
        return source == null
            ? new Dictionary<TKey, TValue>()
            : new Dictionary<TKey, TValue>(source);
    }

    /// <summary>
    /// Initialize metrics from repository. Failures are logged and otherwise ignored,
    /// leaving the collector with empty metrics.
    /// </summary>
    private async Task InitializeMetricsAsync()
    {
        try
        {
            var savedMetrics = await _metricsRepository
                .GetMetricsAsync<RoutingMetrics>(CurrentMetricsId)
                .ConfigureAwait(false);
            if (savedMetrics == null)
            {
                return;
            }

            lock (_lock)
            {
                // Copy relevant fields from saved metrics
                _currentMetrics.TotalRoutedOrders = savedMetrics.TotalRoutedOrders;
                _currentMetrics.SuccessfulRoutedOrders = savedMetrics.SuccessfulRoutedOrders;
                _currentMetrics.FailedRoutedOrders = savedMetrics.FailedRoutedOrders;
                _currentMetrics.AverageRoutingTimeMs = savedMetrics.AverageRoutingTimeMs;

                // Cloned so later updates never write into objects the repository still holds.
                _currentMetrics.VenuePerformance = CloneVenueMap(savedMetrics.VenuePerformance);

                // Note: We don't restore time-based or algorithm metrics to keep memory usage reasonable
            }

            _logger.LogInformation("Routing metrics initialized from repository");
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error initializing routing metrics from repository");
        }
    }

    /// <summary>
    /// Timer callback that starts a flush. The task is intentionally not awaited:
    /// <see cref="FlushMetricsAsync"/> catches and logs every exception itself.
    /// </summary>
    private void OnFlushTimerElapsed(object state)
    {
        _ = FlushMetricsAsync();
    }

    /// <summary>
    /// Flush metrics to repository. Failures are logged and never thrown.
    /// </summary>
    private async Task FlushMetricsAsync()
    {
        try
        {
            RoutingMetrics metricsToSave;
            lock (_lock)
            {
                metricsToSave = CreateSnapshot();
            }

            await _metricsRepository.SaveMetricsAsync(metricsToSave).ConfigureAwait(false);
            _logger.LogInformation("Routing metrics flushed to repository");
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error flushing routing metrics to repository");
        }
    }

    /// <summary>
    /// Reset metrics: zeroes the overall counters and routing-time statistics and
    /// clears every breakdown.
    /// </summary>
    public void ResetMetrics()
    {
        lock (_lock)
        {
            _currentMetrics.TotalRoutedOrders = 0;
            _currentMetrics.SuccessfulRoutedOrders = 0;
            _currentMetrics.FailedRoutedOrders = 0;
            _currentMetrics.AverageRoutingTimeMs = 0;
            _currentMetrics.MedianRoutingTimeMs = 0;
            _currentMetrics.P95RoutingTimeMs = 0;
            _currentMetrics.P99RoutingTimeMs = 0;
            _currentMetrics.VenuePerformance.Clear();
            _currentMetrics.TimeBasedPerformance.Clear();
            _currentMetrics.AlgorithmPerformance.Clear();
            _currentMetrics.SymbolPerformance.Clear();
            _currentMetrics.UpdatedAt = DateTime.UtcNow;
        }

        _logger.LogInformation("Routing metrics reset");
    }

    /// <summary>
    /// Stops the periodic flush by disposing the timer.
    /// </summary>
    public void Dispose()
    {
        _metricsFlushTimer?.Dispose();
    }
}
```
### Metrics Repository Interface
```csharp
/// <summary>
/// Storage abstraction for metrics: lookup by ID, save, delete, and queries by
/// time range or tags.
/// </summary>
public interface IMetricsRepository
{
    /// <summary>Get metrics by ID.</summary>
    /// <typeparam name="T">Metric type to return.</typeparam>
    /// <param name="metricsId">Identifier of the metrics to fetch.</param>
    Task<T> GetMetricsAsync<T>(string metricsId) where T : class, IMetric;

    /// <summary>Save metrics.</summary>
    /// <typeparam name="T">Metric type being saved.</typeparam>
    /// <param name="metrics">The metrics to store.</param>
    Task SaveMetricsAsync<T>(T metrics) where T : class, IMetric;

    /// <summary>Delete metrics.</summary>
    /// <param name="metricsId">Identifier of the metrics to delete.</param>
    Task DeleteMetricsAsync(string metricsId);

    /// <summary>Get metrics by time range.</summary>
    /// <typeparam name="T">Metric type to return.</typeparam>
    /// <param name="startTime">Start of the range.</param>
    /// <param name="endTime">End of the range.</param>
    Task<List<T>> GetMetricsByTimeRangeAsync<T>(DateTime startTime, DateTime endTime) where T : class, IMetric;

    /// <summary>Get metrics by tags.</summary>
    /// <typeparam name="T">Metric type to return.</typeparam>
    /// <param name="tags">Tag keys and values to match.</param>
    Task<List<T>> GetMetricsByTagsAsync<T>(Dictionary<string, string> tags) where T : class, IMetric;
}
```
### In-Memory Metrics Repository
```csharp
/// <summary>
/// In-memory implementation of metrics repository for development and testing.
/// </summary>
/// <remarks>
/// Metrics are kept in a dictionary keyed by <see cref="IMetric.Id"/> and guarded by a
/// private lock. The stored object is the instance passed to
/// <see cref="SaveMetricsAsync{T}"/>; no copy is made. Every method completes synchronously.
/// </remarks>
public class InMemoryMetricsRepository : IMetricsRepository
{
    private readonly Dictionary<string, IMetric> _metrics;
    private readonly ILogger<InMemoryMetricsRepository> _logger;
    private readonly object _lock = new object();

    /// <summary>Creates an empty repository.</summary>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public InMemoryMetricsRepository(ILogger<InMemoryMetricsRepository> logger)
    {
        _metrics = new Dictionary<string, IMetric>();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>Get metrics by ID.</summary>
    /// <param name="metricsId">Identifier of the metrics to fetch.</param>
    /// <returns>
    /// The stored metrics, or null when nothing is stored under the ID or the stored
    /// object is not a <typeparamref name="T"/>.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="metricsId"/> is null or empty.</exception>
    public Task<T> GetMetricsAsync<T>(string metricsId) where T : class, IMetric
    {
        if (string.IsNullOrEmpty(metricsId)) throw new ArgumentException("Metrics ID required", nameof(metricsId));

        lock (_lock)
        {
            // A miss leaves `stored` null, and `null as T` is null.
            _metrics.TryGetValue(metricsId, out var stored);
            return Task.FromResult(stored as T);
        }
    }

    /// <summary>Save metrics, replacing anything already stored under the same ID.</summary>
    /// <param name="metrics">The metrics to store.</param>
    /// <exception cref="ArgumentNullException"><paramref name="metrics"/> is null.</exception>
    /// <exception cref="ArgumentException">The metrics' ID is null or empty.</exception>
    public Task SaveMetricsAsync<T>(T metrics) where T : class, IMetric
    {
        if (metrics == null) throw new ArgumentNullException(nameof(metrics));
        if (string.IsNullOrEmpty(metrics.Id)) throw new ArgumentException("Metrics ID required", nameof(metrics));

        lock (_lock)
        {
            _metrics[metrics.Id] = metrics;
        }

        _logger.LogDebug("Metrics saved: {MetricsId}", metrics.Id);
        return Task.CompletedTask;
    }

    /// <summary>Delete metrics. Deleting an ID that is not stored is a no-op.</summary>
    /// <param name="metricsId">Identifier of the metrics to delete.</param>
    /// <exception cref="ArgumentException"><paramref name="metricsId"/> is null or empty.</exception>
    public Task DeleteMetricsAsync(string metricsId)
    {
        if (string.IsNullOrEmpty(metricsId)) throw new ArgumentException("Metrics ID required", nameof(metricsId));

        bool removed;
        lock (_lock)
        {
            // Remove reports whether the key existed, so no separate lookup is needed.
            removed = _metrics.Remove(metricsId);
        }

        // Only claim a deletion when one actually happened.
        if (removed)
        {
            _logger.LogDebug("Metrics deleted: {MetricsId}", metricsId);
        }
        else
        {
            _logger.LogDebug("Metrics not found for deletion: {MetricsId}", metricsId);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Get metrics of type <typeparamref name="T"/> whose <see cref="IMetric.UpdatedAt"/>
    /// lies within the inclusive range.
    /// </summary>
    /// <param name="startTime">Inclusive start of the range.</param>
    /// <param name="endTime">Inclusive end of the range.</param>
    public Task<List<T>> GetMetricsByTimeRangeAsync<T>(DateTime startTime, DateTime endTime) where T : class, IMetric
    {
        lock (_lock)
        {
            var result = _metrics.Values
                .OfType<T>()
                .Where(m => m.UpdatedAt >= startTime && m.UpdatedAt <= endTime)
                .ToList();
            return Task.FromResult(result);
        }
    }

    /// <summary>
    /// Get metrics of type <typeparamref name="T"/> that carry every requested tag with
    /// the requested value. An empty <paramref name="tags"/> matches every metric.
    /// </summary>
    /// <param name="tags">Tag keys and values that must all be present.</param>
    /// <exception cref="ArgumentNullException"><paramref name="tags"/> is null.</exception>
    public Task<List<T>> GetMetricsByTagsAsync<T>(Dictionary<string, string> tags) where T : class, IMetric
    {
        if (tags == null) throw new ArgumentNullException(nameof(tags));

        lock (_lock)
        {
            var result = _metrics.Values
                .OfType<T>()
                .Where(m => tags.All(tag =>
                    // A metric without a tag dictionary cannot carry the requested tag.
                    m.Tags != null &&
                    m.Tags.TryGetValue(tag.Key, out var value) &&
                    value == tag.Value))
                .ToList();
            return Task.FromResult(result);
        }
    }
}
```
## Integration with OrderManager
### Metrics Integration in OrderManager
```csharp
/// <summary>
/// Part of <c>OrderManager</c> that wraps order routing and submission with
/// routing and execution metrics collection.
/// </summary>
public partial class OrderManager : IOrderManager
{
    // Venue ID used for failed executions when no venue had been selected yet.
    private const string UnknownVenueId = "unknown";

    private readonly RoutingMetricsCollector _metricsCollector;

    /// <summary>
    /// Enhanced constructor with metrics collector.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="metricsCollector"/> is null.</exception>
    public OrderManager(
        IRiskManager riskManager,
        IPositionSizer positionSizer,
        ILogger<OrderManager> logger,
        RoutingConfigurationManager configManager,
        RoutingMetricsCollector metricsCollector) : base(riskManager, positionSizer, logger, configManager)
    {
        _metricsCollector = metricsCollector ?? throw new ArgumentNullException(nameof(metricsCollector));
        _venueManager = new VenueManager(logger);
        _omsToVenueOrderIdMap = new Dictionary<string, string>();
        _venueToOmsOrderIdMap = new Dictionary<string, string>();

        // Initialize with configurations.
        // NOTE(review): this blocks the constructing thread on async work, which can
        // deadlock under a synchronization context. Kept as-is because the awaited
        // method is not visible here; consider an async factory method instead.
        InitializeWithConfigurationsAsync().Wait();
    }

    /// <summary>
    /// Routes an order and records routing metrics for both successful and failed attempts.
    /// </summary>
    /// <param name="request">The order to route.</param>
    /// <param name="context">Strategy context passed through to the routing logic.</param>
    /// <returns>The routing result produced by the underlying routing logic.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    public async Task<RoutingResult> RouteOrderAsync(OrderRequest request, StrategyContext context)
    {
        // Checked up front so the failure path below never hands a null request to the
        // metrics collector, which would replace the real routing exception.
        if (request == null) throw new ArgumentNullException(nameof(request));

        // Stopwatch is monotonic and higher resolution than subtracting DateTime.UtcNow values.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        try
        {
            var result = await RouteOrderInternalAsync(request, context);

            // Record routing metrics
            _metricsCollector.RecordOrderRouting(request, result, stopwatch.Elapsed);
            return result;
        }
        catch (Exception ex)
        {
            // Record failed routing metrics, then let the original exception propagate.
            _metricsCollector.RecordOrderRouting(request,
                new RoutingResult(false, null, null, ex.Message, new Dictionary<string, object> { ["error"] = ex.Message }),
                stopwatch.Elapsed);
            throw;
        }
    }

    /// <summary>
    /// Delegates to the base routing implementation.
    /// </summary>
    private async Task<RoutingResult> RouteOrderInternalAsync(OrderRequest request, StrategyContext context)
    {
        // Existing routing logic here...
        // This is the same as the previous implementation
        return await base.RouteOrderAsync(request, context);
    }

    /// <summary>
    /// Validates, routes, and submits an order, recording execution metrics for the
    /// selected venue.
    /// </summary>
    /// <param name="request">The order to submit.</param>
    /// <param name="context">Strategy context used for risk validation and routing.</param>
    /// <returns>
    /// A successful result carrying the venue order ID and status, or a failed result
    /// with a message. Exceptions raised while routing or submitting are converted into
    /// failed results.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    public async Task<OrderResult> SubmitOrderAsync(OrderRequest request, StrategyContext context)
    {
        if (request == null) throw new ArgumentNullException(nameof(request));

        // Validate request parameters
        if (!request.IsValid(out var errors))
        {
            return new OrderResult(false, null, string.Join("; ", errors), null);
        }

        // Validate through risk management
        var riskDecision = await ValidateOrderAsync(request, context);
        if (!riskDecision.Allow)
        {
            _logger.LogWarning("Order rejected by risk management: {Reason}", riskDecision.RejectReason);
            return new OrderResult(false, null, $"Risk validation failed: {riskDecision.RejectReason}", null);
        }

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        string executionVenueId = null;   // set once routing has selected a venue
        var executionRecorded = false;    // prevents recording the same order twice
        try
        {
            // Route order to appropriate venue
            var routingResult = await RouteOrderAsync(request, context);
            if (!routingResult.Success)
            {
                _logger.LogError("Order routing failed: {Message}", routingResult.Message);
                return new OrderResult(false, null, routingResult.Message, null);
            }

            var venue = routingResult.SelectedVenue;
            executionVenueId = venue.Id;

            // Submit to selected venue
            var venueOrderRequest = ConvertToVenueOrderRequest(request);
            var venueResult = await venue.SubmitOrderAsync(venueOrderRequest);
            var executionTime = stopwatch.Elapsed;

            // Record execution metrics
            if (venueResult != null)
            {
                // Calculate slippage (simplified)
                var slippage = CalculateSlippage(request, venueResult);
                _metricsCollector.RecordOrderExecution(executionVenueId, venueResult, executionTime, slippage);
                executionRecorded = true;
            }

            if (venueResult?.Success == true)
            {
                // Convert before touching shared state so a conversion failure cannot
                // leave the ID maps populated without a stored order.
                var orderStatus = ConvertToOrderStatus(venueResult.Status, request);

                lock (_lock)
                {
                    // The venue order ID doubles as the OMS order ID in both maps and
                    // as the key of the stored order status.
                    _omsToVenueOrderIdMap[venueResult.VenueOrderId] = venueResult.VenueOrderId;
                    _venueToOmsOrderIdMap[venueResult.VenueOrderId] = venueResult.VenueOrderId;
                    _orders[venueResult.VenueOrderId] = orderStatus;
                }

                _logger.LogInformation("Order {OrderId} submitted to venue {Venue}",
                    venueResult.VenueOrderId, venue.Name);
                return new OrderResult(true, venueResult.VenueOrderId, "Order submitted successfully", orderStatus);
            }

            _logger.LogError("Order submission failed at venue {Venue}: {Message}",
                venue.Name, venueResult?.Message ?? "Unknown error");
            return new OrderResult(false, null,
                $"Venue submission failed: {venueResult?.Message ?? "Unknown error"}", null);
        }
        catch (Exception ex)
        {
            var executionTime = stopwatch.Elapsed;
            _logger.LogError(ex, "Error submitting order for {Symbol}", request.Symbol);

            // Record failed execution metrics against the venue that was selected, so
            // the failure counts toward that venue. Skipped when the execution was
            // already recorded before the exception occurred.
            if (!executionRecorded)
            {
                var venueIdForMetrics = string.IsNullOrEmpty(executionVenueId) ? UnknownVenueId : executionVenueId;
                _metricsCollector.RecordOrderExecution(venueIdForMetrics,
                    new VenueOrderResult(false, null, ex.Message, null, new Dictionary<string, object> { ["error"] = ex.Message }),
                    executionTime, 0);
            }

            return new OrderResult(false, null, $"Error submitting order: {ex.Message}", null);
        }
    }

    /// <summary>
    /// Simplified slippage: the absolute percentage difference between the average
    /// fill price and the request's limit price.
    /// </summary>
    /// <returns>
    /// The percentage, or 0 when the request has no limit price, the result has no
    /// fills, or the limit price is 0.
    /// </returns>
    private decimal CalculateSlippage(OrderRequest request, VenueOrderResult venueResult)
    {
        // In a real implementation, this would compare expected vs actual execution prices
        if (request.LimitPrice.HasValue && venueResult.Status?.Fills?.Any() == true)
        {
            var averageFillPrice = venueResult.Status.Fills.Average(f => f.FillPrice);
            var expectedPrice = request.LimitPrice.Value;

            // Guard the division: a zero limit price yields no slippage figure.
            if (expectedPrice != 0)
            {
                return Math.Abs((averageFillPrice - expectedPrice) / expectedPrice) * 100;
            }
        }

        return 0;
    }

    /// <summary>Returns the collector's current routing metrics.</summary>
    public RoutingMetrics GetRoutingMetrics()
    {
        return _metricsCollector.GetCurrentMetrics();
    }

    /// <summary>Returns the collector's metrics for one venue.</summary>
    /// <param name="venueId">Identifier of the venue.</param>
    public VenueMetrics GetVenueMetrics(string venueId)
    {
        return _metricsCollector.GetVenueMetrics(venueId);
    }
}
```
## Metrics Analysis and Alerting
### Metrics Analyzer
```csharp
/// <summary>
/// Analyzes routing metrics to identify trends and issues.
/// </summary>
/// <remarks>
/// Each analysis compares the alerts it raises against those raised by the previous
/// analysis and reports only the ones that were not already active. Alerts are
/// matched by type, severity, and (for venue alerts) venue ID.
/// </remarks>
public class RoutingMetricsAnalyzer
{
    // Thresholds for the overall routing checks.
    private const int MinOrdersForSuccessRateCheck = 100;
    private const double MinSuccessRate = 0.95;
    private const double MaxAverageRoutingTimeMs = 1000;

    // Thresholds for the per-venue checks.
    private const int MinVenueOrdersForFillRateCheck = 50;
    private const double MinVenueFillRate = 0.90;
    private const double MaxVenueAverageLatencyMs = 500;

    private const string VenueIdKey = "venue_id";

    private readonly ILogger<RoutingMetricsAnalyzer> _logger;
    private readonly RoutingMetricsCollector _metricsCollector;
    private readonly List<MetricsAlert> _activeAlerts;

    /// <summary>Creates an analyzer over the given collector.</summary>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public RoutingMetricsAnalyzer(
        ILogger<RoutingMetricsAnalyzer> logger,
        RoutingMetricsCollector metricsCollector)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _metricsCollector = metricsCollector ?? throw new ArgumentNullException(nameof(metricsCollector));
        _activeAlerts = new List<MetricsAlert>();
    }

    /// <summary>
    /// Analyze current metrics and generate alerts if needed.
    /// </summary>
    /// <returns>
    /// The alerts raised by this analysis that were not active after the previous one.
    /// </returns>
    public List<MetricsAlert> AnalyzeMetrics()
    {
        var currentMetrics = _metricsCollector.GetCurrentMetrics();

        var alerts = new List<MetricsAlert>();
        AddOverallAlerts(currentMetrics, alerts);
        foreach (var kvp in currentMetrics.VenuePerformance)
        {
            AddVenueAlerts(kvp.Key, kvp.Value, alerts);
        }

        // The key includes the venue ID, so an alert that is active for one venue does
        // not hide the same kind of alert appearing for a different venue.
        var activeKeys = new HashSet<(string, AlertSeverity, string)>(_activeAlerts.Select(GetAlertKey));
        var newAlerts = alerts.Where(a => !activeKeys.Contains(GetAlertKey(a))).ToList();

        // Update active alerts
        _activeAlerts.Clear();
        _activeAlerts.AddRange(alerts);

        if (newAlerts.Any())
        {
            _logger.LogInformation("Generated {Count} new metrics alerts", newAlerts.Count);
        }

        return newAlerts;
    }

    /// <summary>Adds alerts for the overall success rate and average routing time.</summary>
    private static void AddOverallAlerts(RoutingMetrics currentMetrics, List<MetricsAlert> alerts)
    {
        // The success rate is only judged once enough orders have been routed.
        if (currentMetrics.TotalRoutedOrders > MinOrdersForSuccessRateCheck &&
            currentMetrics.SuccessRate < MinSuccessRate)
        {
            alerts.Add(CreateAlert(
                "LOW_SUCCESS_RATE",
                AlertSeverity.High,
                $"Routing success rate is low: {currentMetrics.SuccessRate:P2}",
                new Dictionary<string, object>
                {
                    ["success_rate"] = currentMetrics.SuccessRate,
                    ["total_orders"] = currentMetrics.TotalRoutedOrders
                }));
        }

        if (currentMetrics.AverageRoutingTimeMs > MaxAverageRoutingTimeMs)
        {
            alerts.Add(CreateAlert(
                "HIGH_LATENCY",
                AlertSeverity.Medium,
                $"Average routing time is high: {currentMetrics.AverageRoutingTimeMs:F2}ms",
                new Dictionary<string, object>
                {
                    ["average_routing_time"] = currentMetrics.AverageRoutingTimeMs
                }));
        }
    }

    /// <summary>Adds alerts for one venue's fill rate and average latency.</summary>
    private static void AddVenueAlerts(string venueId, VenueMetrics venueMetrics, List<MetricsAlert> alerts)
    {
        // The fill rate is only judged once the venue has seen enough orders.
        if (venueMetrics.TotalOrders > MinVenueOrdersForFillRateCheck &&
            venueMetrics.FillRate < MinVenueFillRate)
        {
            alerts.Add(CreateAlert(
                "LOW_VENUE_FILL_RATE",
                AlertSeverity.Medium,
                $"Venue {venueMetrics.VenueName} fill rate is low: {venueMetrics.FillRate:P2}",
                new Dictionary<string, object>
                {
                    [VenueIdKey] = venueId,
                    ["venue_name"] = venueMetrics.VenueName,
                    ["fill_rate"] = venueMetrics.FillRate,
                    ["total_orders"] = venueMetrics.TotalOrders
                }));
        }

        if (venueMetrics.AverageLatencyMs > MaxVenueAverageLatencyMs)
        {
            alerts.Add(CreateAlert(
                "HIGH_VENUE_LATENCY",
                AlertSeverity.Low,
                $"Venue {venueMetrics.VenueName} latency is high: {venueMetrics.AverageLatencyMs:F2}ms",
                new Dictionary<string, object>
                {
                    [VenueIdKey] = venueId,
                    ["venue_name"] = venueMetrics.VenueName,
                    ["average_latency"] = venueMetrics.AverageLatencyMs
                }));
        }
    }

    /// <summary>Builds an alert with a fresh GUID ID and the current UTC timestamp.</summary>
    private static MetricsAlert CreateAlert(
        string alertType,
        AlertSeverity severity,
        string message,
        Dictionary<string, object> metrics)
    {
        return new MetricsAlert
        {
            Id = Guid.NewGuid().ToString(),
            AlertType = alertType,
            Severity = severity,
            Message = message,
            Timestamp = DateTime.UtcNow,
            Metrics = metrics
        };
    }

    /// <summary>
    /// Identity used to decide whether an alert was already active: its type, its
    /// severity, and its venue ID (null for alerts that are not venue-specific).
    /// </summary>
    private static (string, AlertSeverity, string) GetAlertKey(MetricsAlert alert)
    {
        object venueId = null;
        alert.Metrics?.TryGetValue(VenueIdKey, out venueId);
        return (alert.AlertType, alert.Severity, venueId as string);
    }
}
/// <summary>
/// A single alert raised by metrics analysis.
/// </summary>
public record MetricsAlert
{
    /// <summary>Identifier of this alert instance.</summary>
    public string Id { get; set; }

    /// <summary>Category code of the alert, for example <c>LOW_SUCCESS_RATE</c>.</summary>
    public string AlertType { get; set; }

    /// <summary>How serious the alert is.</summary>
    public AlertSeverity Severity { get; set; }

    /// <summary>Human-readable description of the alert.</summary>
    public string Message { get; set; }

    /// <summary>Time associated with the alert.</summary>
    public DateTime Timestamp { get; set; }

    /// <summary>
    /// Named values attached to the alert. Defaults to an empty dictionary.
    /// </summary>
    public Dictionary<string, object> Metrics { get; set; } = new();
}
/// <summary>
/// Severity levels for a metrics alert, ordered from least to most severe.
/// </summary>
public enum AlertSeverity
{
    /// <summary>Lowest severity level.</summary>
    Low = 0,

    /// <summary>Second severity level.</summary>
    Medium = 1,

    /// <summary>Third severity level.</summary>
    High = 2,

    /// <summary>Highest severity level.</summary>
    Critical = 3
}
```
## Testing Considerations
### Unit Tests for Metrics System
1. **Metrics Collection**: Test collection of different types of metrics
2. **Metrics Aggregation**: Test aggregation of metrics over time
3. **Metrics Storage**: Test persistence and retrieval of metrics
4. **Metrics Analysis**: Test analysis and alerting based on metrics
5. **Performance Impact**: Test that metrics collection doesn't significantly impact performance
### Integration Tests
1. **End-to-End Metrics**: Test complete metrics flow from order routing to reporting
2. **Metrics Repository**: Test different repository implementations
3. **Metrics Analysis**: Test alert generation based on different metric thresholds
4. **Metrics Reset**: Test metrics reset functionality
## Performance Considerations
### Metrics Sampling
```csharp
/// <summary>
/// Controls sampling of metrics to reduce performance impact: each call to
/// <see cref="ShouldSample"/> answers whether the current event should be recorded.
/// </summary>
public class MetricsSampler
{
    private readonly double _samplingRate;
    private readonly Random _random;

    // System.Random instances are not safe for concurrent use; this lock serializes
    // access so ShouldSample can be called from several threads.
    private readonly object _randomLock = new object();

    /// <summary>
    /// Creates a sampler.
    /// </summary>
    /// <param name="samplingRate">
    /// Fraction of events to sample, from 0 (none) to 1 (all). Defaults to 1.0 (100% sampling).
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="samplingRate"/> is NaN or lies outside the range 0 to 1.
    /// </exception>
    public MetricsSampler(double samplingRate = 1.0)
    {
        // Written as a negated in-range test so that NaN, for which every comparison is
        // false, is rejected instead of producing a sampler that never samples.
        if (!(samplingRate >= 0.0 && samplingRate <= 1.0))
        {
            throw new ArgumentOutOfRangeException(nameof(samplingRate), "Sampling rate must be between 0 and 1");
        }

        _samplingRate = samplingRate;
        _random = new Random();
    }

    /// <summary>
    /// Decides whether the current event should be sampled.
    /// </summary>
    /// <returns>
    /// Always true at a rate of 1, always false at a rate of 0, otherwise true with a
    /// probability equal to the sampling rate.
    /// </returns>
    public bool ShouldSample()
    {
        // The two boundary rates need no random draw, which keeps the default (100%)
        // path free of locking.
        if (_samplingRate >= 1.0)
        {
            return true;
        }

        if (_samplingRate <= 0.0)
        {
            return false;
        }

        lock (_randomLock)
        {
            return _random.NextDouble() < _samplingRate;
        }
    }
}
```
### Metrics Batching
```csharp
/// <summary>
/// Batches metrics updates to reduce storage overhead: metrics are buffered in memory and
/// written to the repository once the buffer reaches the configured batch size, or when
/// <see cref="FlushAsync"/> is called.
/// </summary>
public class MetricsBatcher
{
    private readonly List<IMetric> _batch;
    private readonly int _batchSize;
    private readonly IMetricsRepository _repository;
    private readonly object _lock = new object();

    /// <summary>
    /// Creates a batcher that writes to <paramref name="repository"/>.
    /// </summary>
    /// <param name="repository">Repository the batched metrics are saved to.</param>
    /// <param name="batchSize">Number of buffered metrics that triggers a save. Must be at least 1.</param>
    /// <exception cref="ArgumentNullException"><paramref name="repository"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is less than 1.</exception>
    public MetricsBatcher(IMetricsRepository repository, int batchSize = 10)
    {
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));

        if (batchSize < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(batchSize), "Batch size must be at least 1");
        }

        _batchSize = batchSize;
        _batch = new List<IMetric>();
    }

    /// <summary>
    /// Buffers a metric and, if this addition fills the batch, saves the whole batch.
    /// </summary>
    /// <param name="metric">Metric to buffer.</param>
    /// <exception cref="ArgumentNullException"><paramref name="metric"/> is null.</exception>
    public async Task AddMetricAsync(IMetric metric)
    {
        // Reject null up front so that one bad entry cannot poison a whole batch at save time.
        if (metric == null)
        {
            throw new ArgumentNullException(nameof(metric));
        }

        List<IMetric> batchToProcess = null;
        lock (_lock)
        {
            _batch.Add(metric);
            if (_batch.Count >= _batchSize)
            {
                batchToProcess = DrainBatch();
            }
        }

        // The save runs outside the lock so other callers can keep buffering meanwhile.
        if (batchToProcess != null)
        {
            await ProcessBatchAsync(batchToProcess);
        }
    }

    /// <summary>
    /// Saves whatever is currently buffered, even if the batch is not full. Intended for
    /// shutdown or periodic flushing so that a partial batch is not left unsaved.
    /// </summary>
    public async Task FlushAsync()
    {
        List<IMetric> batchToProcess = null;
        lock (_lock)
        {
            if (_batch.Count > 0)
            {
                batchToProcess = DrainBatch();
            }
        }

        if (batchToProcess != null)
        {
            await ProcessBatchAsync(batchToProcess);
        }
    }

    // Copies the buffered metrics into a new list and empties the buffer.
    // Must be called while holding _lock.
    private List<IMetric> DrainBatch()
    {
        var drained = new List<IMetric>(_batch);
        _batch.Clear();
        return drained;
    }

    // Saves every metric of the batch; failures are reported but never propagated.
    private async Task ProcessBatchAsync(List<IMetric> batch)
    {
        try
        {
            // Save all metrics in the batch
            var saveTasks = batch.Select(metric => _repository.SaveMetricsAsync(metric));
            await Task.WhenAll(saveTasks);
        }
        catch (Exception ex)
        {
            // Report the failure but don't throw, to avoid breaking the main flow.
            // The metrics of a failed batch are dropped; in a real implementation you
            // might want to queue them for retry.
            System.Diagnostics.Trace.TraceError(
                "MetricsBatcher failed to save a batch of {0} metrics: {1}",
                batch.Count,
                ex);
        }
    }
}
```
## Monitoring and Dashboard Integration
### Metrics Exporter
```csharp
/// <summary>
/// Exports metrics in formats suitable for monitoring systems
/// </summary>
public class MetricsExporter
{
    // Created once and reused: building serializer options on every call is wasteful.
    private static readonly JsonSerializerOptions IndentedJsonOptions =
        new JsonSerializerOptions { WriteIndented = true };

    private readonly RoutingMetricsCollector _metricsCollector;

    /// <summary>
    /// Creates an exporter that reads from <paramref name="metricsCollector"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="metricsCollector"/> is null.</exception>
    public MetricsExporter(RoutingMetricsCollector metricsCollector)
    {
        _metricsCollector = metricsCollector ?? throw new ArgumentNullException(nameof(metricsCollector));
    }

    /// <summary>
    /// Export metrics in Prometheus format
    /// </summary>
    /// <returns>
    /// The overall routing metrics followed by the per-venue metrics. Each metric name is
    /// declared once (HELP and TYPE) and followed by all of its samples; lines end with a
    /// line feed and numbers are formatted with the invariant culture.
    /// </returns>
    public string ExportToPrometheus()
    {
        var metrics = _metricsCollector.GetCurrentMetrics();
        var sb = new StringBuilder();

        // Overall routing metrics
        AppendHeader(sb, "routing_total_orders", "counter", "Total number of routed orders");
        AppendSample(sb, $"routing_total_orders {metrics.TotalRoutedOrders}");
        AppendHeader(sb, "routing_success_rate", "gauge", "Ratio of successful routed orders");
        AppendSample(sb, $"routing_success_rate {metrics.SuccessRate:F4}");
        AppendHeader(sb, "routing_average_time_ms", "gauge", "Average routing time in milliseconds");
        AppendSample(sb, $"routing_average_time_ms {metrics.AverageRoutingTimeMs:F2}");

        // Venue-specific metrics. Samples are collected per metric name in a single pass so
        // that each name's HELP/TYPE header is written exactly once, directly above all of
        // its samples; repeating the header for every venue makes the output invalid.
        var venueOrders = new StringBuilder();
        var venueFillRates = new StringBuilder();
        var venueLatencies = new StringBuilder();
        foreach (var kvp in metrics.VenuePerformance)
        {
            var venueMetrics = kvp.Value;
            var venueLabel = EscapeLabelValue(venueMetrics.VenueName);
            AppendSample(venueOrders, $"venue_orders_total{{venue=\"{venueLabel}\"}} {venueMetrics.TotalOrders}");
            AppendSample(venueFillRates, $"venue_fill_rate{{venue=\"{venueLabel}\"}} {venueMetrics.FillRate:F4}");
            AppendSample(venueLatencies, $"venue_average_latency_ms{{venue=\"{venueLabel}\"}} {venueMetrics.AverageLatencyMs:F2}");
        }

        AppendFamily(sb, "venue_orders_total", "counter", "Total orders per venue", venueOrders);
        AppendFamily(sb, "venue_fill_rate", "gauge", "Fill rate per venue", venueFillRates);
        AppendFamily(sb, "venue_average_latency_ms", "gauge", "Average latency per venue in milliseconds", venueLatencies);

        return sb.ToString();
    }

    /// <summary>
    /// Export metrics in JSON format
    /// </summary>
    /// <returns>The current metrics serialized as indented JSON.</returns>
    public string ExportToJson()
    {
        var metrics = _metricsCollector.GetCurrentMetrics();
        return JsonSerializer.Serialize(metrics, IndentedJsonOptions);
    }

    // Writes the HELP and TYPE lines that declare a metric name.
    private static void AppendHeader(StringBuilder sb, string name, string type, string help)
    {
        sb.Append("# HELP ").Append(name).Append(' ').Append(help).Append('\n');
        sb.Append("# TYPE ").Append(name).Append(' ').Append(type).Append('\n');
    }

    // Writes one sample line, formatting its values with the invariant culture so the
    // decimal separator is always '.', and ending it with '\n' rather than the platform newline.
    private static void AppendSample(StringBuilder sb, FormattableString sample)
    {
        sb.Append(FormattableString.Invariant(sample)).Append('\n');
    }

    // Writes a metric's header followed by its collected samples; writes nothing when
    // there are no samples.
    private static void AppendFamily(StringBuilder sb, string name, string type, string help, StringBuilder samples)
    {
        if (samples.Length == 0)
        {
            return;
        }

        AppendHeader(sb, name, type, help);
        sb.Append(samples.ToString());
    }

    // Escapes backslash, double quote and line feed, which would otherwise break a quoted label value.
    private static string EscapeLabelValue(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        return value
            .Replace("\\", "\\\\")
            .Replace("\"", "\\\"")
            .Replace("\n", "\\n");
    }
}
```
## Future Enhancements
1. **Real-time Metrics Streaming**: Stream metrics to monitoring systems in real-time
2. **Advanced Analytics**: Use machine learning to predict routing performance
3. **Custom Metrics**: Allow users to define custom metrics and alerts
4. **Metrics Retention**: Implement configurable metrics retention policies
5. **Metrics Compression**: Compress historical metrics to save storage space
6. **Metrics Visualization**: Built-in visualization of metrics trends
7. **Metrics Correlation**: Correlate metrics with market conditions and events
8. **Metrics Forecasting**: Predict future performance based on historical metrics