From c34492069abacae67482af4c8356241958a524f7 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Fri, 9 Jan 2026 13:28:11 +0000 Subject: feat(sync): add Syncing connection status to track historic sync progress - Add ConnectionStatus::Syncing state between Connecting and Connected - Track historic_sync_completed and historic_sync_completed_at in RelayState - Auto-detect sync completion via check_and_complete_historic_sync() - Update metrics: ngit_sync_relay_connected now shows 0-3 (disconnected/connecting/syncing/connected) - Update Prometheus metric documentation with new status values - Add state machine diagram showing Syncing transition - Operators can now distinguish 'connected but catching up' vs 'fully synced' --- docs/explanation/grasp-02-proactive-sync.md | 54 +++++++++++++++---- docs/explanation/monitoring.md | 83 +++++++++++++++++------------ 2 files changed, 94 insertions(+), 43 deletions(-) (limited to 'docs') diff --git a/docs/explanation/grasp-02-proactive-sync.md b/docs/explanation/grasp-02-proactive-sync.md index 461bde7..e1fb367 100644 --- a/docs/explanation/grasp-02-proactive-sync.md +++ b/docs/explanation/grasp-02-proactive-sync.md @@ -75,7 +75,9 @@ pub enum ConnectionStatus { Disconnected, /// Connection attempt in progress Connecting, - /// Successfully connected and subscribed + /// Successfully connected, historic sync in progress + Syncing, + /// Successfully connected, historic sync completed Connected, } @@ -97,6 +99,11 @@ pub struct RelayState { /// Whether announcement filter historic sync has completed for this relay /// Used to determine if we can use `since` filter on reconnect for Layer 1 pub announcements_synced: bool, + /// Whether initial historic sync has fully completed (all layers) + /// Used to transition from Syncing -> Connected status + pub historic_sync_completed: bool, + /// When historic sync completed (None if never completed or cleared on fresh_start) + pub historic_sync_completed_at: Option, } impl RelayState { @@ 
-198,25 +205,52 @@ When a relay doesn't support NIP-77 Negentropy, historic sync falls back to trad stateDiagram-v2 [*] --> Disconnected: discover relay → register_relay() Disconnected --> Connecting: retry_disconnected_relays → try_connect_relay - Connecting --> Connected: success → handle_connect_or_reconnect + Connecting --> Syncing: success → handle_connect_or_reconnect Connecting --> Disconnected: failure + record in health tracker + Syncing --> Connected: all historic batches complete → check_and_complete_historic_sync + Syncing --> Disconnected: connection lost → handle_disconnect Connected --> Disconnected: connection lost → handle_disconnect Connected --> [*]: intentional disconnect via check_disconnects note right of Disconnected: disconnected_at set for 15min rule
RelayConnection kept in HashMap - note right of Connected: last_connected tracked for since filter
Event loop spawned here note right of Connecting: connection attempt with timeout + note right of Syncing: historic sync in progress
event loop spawned here + note right of Connected: historic sync complete
last_connected tracked for since filter ``` ### Connection Flow Methods -| Method | Purpose | When Called | Actions | -| ------------------------------- | ------------------------- | --------------------------------- | --------------------------------------------------------------- | -| `register_relay()` | Initialize relay tracking | Discovery via RepoSyncIndex | Creates RelayConnection, stores in HashMap, returns immediately | -| `try_connect_relay()` | Attempt connection | Health tracker allows retry | Calls connection.connect(), sends notification on success | -| `handle_connect_or_reconnect()` | Setup after connection | ConnectNotification received | Spawns event loop, updates state, decides sync strategy | -| `handle_disconnect()` | Cleanup after disconnect | DisconnectNotification received | Updates state, clears pending, KEEPS RelayConnection | -| `retry_disconnected_relays()` | Periodic reconnection | Every 2s (health & metrics timer) | For each ready relay: try_connect_relay() | +| Method | Purpose | When Called | Actions | +| ----------------------------------- | ---------------------------- | --------------------------------- | --------------------------------------------------------------- | +| `register_relay()` | Initialize relay tracking | Discovery via RepoSyncIndex | Creates RelayConnection, stores in HashMap, returns immediately | +| `try_connect_relay()` | Attempt connection | Health tracker allows retry | Calls connection.connect(), sends notification on success | +| `handle_connect_or_reconnect()` | Setup after connection | ConnectNotification received | Spawns event loop, sets Syncing, decides sync strategy | +| `check_and_complete_historic_sync()` | Detect sync completion | After each batch confirmation | Transitions Syncing → Connected when no pending batches | +| `handle_disconnect()` | Cleanup after disconnect | DisconnectNotification received | Updates state, clears pending, KEEPS RelayConnection | +| `retry_disconnected_relays()` | 
Periodic reconnection | Every 2s (health & metrics timer) | For each ready relay: try_connect_relay() | + +### Historic Sync Completion + +When a relay first connects, it enters the **Syncing** state and begins historic sync: + +1. **Layer 1 (Announcements)**: Generic filter for all repository announcements +2. **Layer 2 (Repo Events)**: Filters for events tagging discovered repositories +3. **Layer 3 (Root Events)**: Filters for events tagging discovered PRs/Issues/Patches + +Each layer creates one or more `PendingBatch` entries tracked in `PendingSyncIndex`. As EOSE messages arrive: + +- `handle_eose()` confirms each batch via `confirm_batch()` +- `confirm_batch()` moves items to confirmed state and calls `check_and_complete_historic_sync()` +- `check_and_complete_historic_sync()` checks if `PendingSyncIndex` is empty for this relay +- When empty: transitions `Syncing` → `Connected`, sets `historic_sync_completed = true` + +**Metrics tracking**: The `ngit_sync_relay_connected` metric shows: +- `0` = Disconnected +- `1` = Connecting +- `2` = Syncing (historic sync in progress) +- `3` = Connected (historic sync complete, live sync active) + +This allows operators to monitor sync progress and distinguish between "connected but still catching up" vs "fully synced and live". 
### Event Loop Lifecycle diff --git a/docs/explanation/monitoring.md b/docs/explanation/monitoring.md index 9368bf4..d2d20c0 100644 --- a/docs/explanation/monitoring.md +++ b/docs/explanation/monitoring.md @@ -98,54 +98,64 @@ When GRASP-02 proactive sync is implemented, the following metrics will be added | Metric | Type | Labels | Description | |--------|------|--------|-------------| -| `ngit_sync_relay_connected` | Gauge | relay | 1 if connected, 0 if not | +| `ngit_sync_relay_connected` | Gauge | relay | Connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected) | | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes | -| `ngit_sync_relay_status` | Gauge | relay, status | 1 for current status, 0 otherwise | +| `ngit_sync_relay_status` | Gauge | relay | Health status (1=healthy, 2=disconnected, 3=degraded, 4=dead, 5=rate_limited) | | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count | -| `ngit_sync_events_total` | Counter | source | Events received by source type | -| `ngit_sync_gap_events_total` | Counter | relay | Events found during catchup | +| `ngit_sync_events_synced_total` | Counter | - | Events synced (newly saved events only) | | `ngit_sync_relays_tracked_total` | Gauge | - | Total relays discovered | | `ngit_sync_relays_connected_total` | Gauge | - | Currently connected relay count | | `ngit_sync_relays_dead_total` | Gauge | - | Relays marked as dead | -### Event Sources +### Connection Status Values -The `source` label on `ngit_sync_events_total` tracks how events were received: +The `ngit_sync_relay_connected` metric tracks the connection lifecycle: -- `direct` - Submitted directly to our relay by a user -- `live_sync` - Received via live WebSocket subscription (expected path) -- `catchup` - Found during negentropy catchup after reconnect -- `daily_catchup` - Found during daily reconciliation +- `0` = **Disconnected** - Not currently connected +- `1` = **Connecting** 
- Connection attempt in progress +- `2` = **Syncing** - Connected, historic sync in progress +- `3` = **Connected** - Connected, historic sync complete, live sync active -**Catchup events indicate sync failures** - these should have been received via live sync. High catchup rates suggest connectivity issues or filter mismatches. +This allows operators to distinguish between "connected but still catching up" (Syncing) vs "fully synced and live" (Connected). ### Relay Health States -The `status` label on `ngit_sync_relay_status` tracks relay health: +The `ngit_sync_relay_status` metric tracks relay health: -- `healthy` - Normal operation, connections working -- `backoff` - Exponential backoff after failures (5s → 10s → ... → 1h) -- `dead` - 24h of continuous failures, daily retry only +- `1` = **Healthy** - Connected and stable +- `2` = **Disconnected** - Not connected, but no issues detected +- `3` = **Degraded** - Connection problems or unstable after recovery +- `4` = **Dead** - 24h+ of continuous failures +- `5` = **RateLimited** - Rate limit cooldown active (65s) ### Example Grafana Queries ```promql -# Relay health overview - count by status -sum by (status) (ngit_sync_relay_status == 1) +# Relay connection status overview - count by status +sum by (relay) (ngit_sync_relay_connected == 0) # Disconnected +sum by (relay) (ngit_sync_relay_connected == 1) # Connecting +sum by (relay) (ngit_sync_relay_connected == 2) # Syncing +sum by (relay) (ngit_sync_relay_connected == 3) # Connected + +# Relays still syncing (not yet fully caught up) +count(ngit_sync_relay_connected == 2) # Connection success rate over last hour sum(rate(ngit_sync_connection_attempts_total{result="success"}[1h])) / sum(rate(ngit_sync_connection_attempts_total[1h])) -# Sync gap detection - events that should have been live synced -sum(rate(ngit_sync_gap_events_total[1h])) by (relay) - -# Live sync effectiveness (lower is better - fewer gaps) 
-sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h])) -/ sum(rate(ngit_sync_events_total[1h])) +# Event sync rate (newly saved events) +rate(ngit_sync_events_synced_total[5m]) # Relays with high failure counts (potential issues) topk(10, ngit_sync_relay_failures) + +# Relay health overview - count by health state +count(ngit_sync_relay_status == 1) # Healthy +count(ngit_sync_relay_status == 2) # Disconnected +count(ngit_sync_relay_status == 3) # Degraded +count(ngit_sync_relay_status == 4) # Dead +count(ngit_sync_relay_status == 5) # RateLimited ``` ### Example Alerts @@ -153,23 +163,30 @@ topk(10, ngit_sync_relay_failures) ```yaml # Alert if relay stuck in dead state for > 1 day - alert: SyncRelayDead - expr: ngit_sync_relay_status{status="dead"} == 1 + expr: ngit_sync_relay_status == 4 # Dead state for: 1d labels: severity: warning annotations: - summary: "Sync relay {{ $labels.relay }} is dead" - -# Alert if sync gap rate is high (>10% of events from catchup) -- alert: SyncGapHigh - expr: > - sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h])) - / sum(rate(ngit_sync_events_total[1h])) > 0.1 - for: 30m + summary: "Sync relay {{ $labels.relay }} is dead (24h+ failures)" + +# Alert if relay stuck in syncing state for > 1 hour +- alert: SyncRelaySlow + expr: ngit_sync_relay_connected == 2 # Syncing state + for: 1h + labels: + severity: info + annotations: + summary: "Sync relay {{ $labels.relay }} taking >1h to complete historic sync" + +# Alert if too many relays are degraded +- alert: SyncManyDegraded + expr: count(ngit_sync_relay_status == 3) > 5 # Degraded state + for: 15m labels: severity: warning annotations: - summary: "{{ $value }} relays in degraded state" ``` ### Design Rationale -- cgit v1.2.3