From c34492069abacae67482af4c8356241958a524f7 Mon Sep 17 00:00:00 2001
From: DanConwayDev <DanConwayDev@protonmail.com>
Date: Fri, 9 Jan 2026 13:28:11 +0000
Subject: feat(sync): add Syncing connection status to track historic sync
 progress

- Add ConnectionStatus::Syncing state between Connecting and Connected
- Track historic_sync_completed and historic_sync_completed_at in RelayState
- Auto-detect sync completion via check_and_complete_historic_sync()
- Update metrics: ngit_sync_relay_connected now shows 0-3 (disconnected/connecting/syncing/connected)
- Update Prometheus metric documentation with new status values
- Add state machine diagram showing Syncing transition
- Operators can now distinguish 'connected but catching up' vs 'fully synced'
---
 docs/explanation/grasp-02-proactive-sync.md | 54 +++++++++++++++----
 docs/explanation/monitoring.md              | 83 +++++++++++++++++------------
 2 files changed, 94 insertions(+), 43 deletions(-)

(limited to 'docs')
diff --git a/docs/explanation/grasp-02-proactive-sync.md b/docs/explanation/grasp-02-proactive-sync.md
index 461bde7..e1fb367 100644
--- a/docs/explanation/grasp-02-proactive-sync.md
+++ b/docs/explanation/grasp-02-proactive-sync.md
@@ -75,7 +75,9 @@ pub enum ConnectionStatus {
     Disconnected,
     /// Connection attempt in progress
     Connecting,
-    /// Successfully connected and subscribed
+    /// Successfully connected, historic sync in progress
+    Syncing,
+    /// Successfully connected, historic sync completed
     Connected,
 }
 
@@ -97,6 +99,11 @@ pub struct RelayState {
     /// Whether announcement filter historic sync has completed for this relay
     /// Used to determine if we can use `since` filter on reconnect for Layer 1
     pub announcements_synced: bool,
+    /// Whether initial historic sync has fully completed (all layers)
+    /// Used to transition from Syncing -> Connected status
+    pub historic_sync_completed: bool,
+    /// When historic sync completed (None if it never completed, or if it was cleared on fresh_start)
+    pub historic_sync_completed_at: Option<Timestamp>,
 }
 
 impl RelayState {
@@ -198,25 +205,52 @@ When a relay doesn't support NIP-77 Negentropy, historic sync falls back to trad
 stateDiagram-v2
     [*] --> Disconnected: discover relay → register_relay()
     Disconnected --> Connecting: retry_disconnected_relays → try_connect_relay
-    Connecting --> Connected: success → handle_connect_or_reconnect
+    Connecting --> Syncing: success → handle_connect_or_reconnect
     Connecting --> Disconnected: failure + record in health tracker
+    Syncing --> Connected: all historic batches complete → check_and_complete_historic_sync
+    Syncing --> Disconnected: connection lost → handle_disconnect
     Connected --> Disconnected: connection lost → handle_disconnect
     Connected --> [*]: intentional disconnect via check_disconnects
 
     note right of Disconnected: disconnected_at set for 15min rule<br/>RelayConnection kept in HashMap
-    note right of Connected: last_connected tracked for since filter<br/>Event loop spawned here
     note right of Connecting: connection attempt with timeout
+    note right of Syncing: historic sync in progress<br/>event loop spawned here
+    note right of Connected: historic sync complete<br/>last_connected tracked for since filter
 ```
 
 ### Connection Flow Methods
 
-| Method                          | Purpose                   | When Called                       | Actions                                                         |
-| ------------------------------- | ------------------------- | --------------------------------- | --------------------------------------------------------------- |
-| `register_relay()`              | Initialize relay tracking | Discovery via RepoSyncIndex       | Creates RelayConnection, stores in HashMap, returns immediately |
-| `try_connect_relay()`           | Attempt connection        | Health tracker allows retry       | Calls connection.connect(), sends notification on success       |
-| `handle_connect_or_reconnect()` | Setup after connection    | ConnectNotification received      | Spawns event loop, updates state, decides sync strategy         |
-| `handle_disconnect()`           | Cleanup after disconnect  | DisconnectNotification received   | Updates state, clears pending, KEEPS RelayConnection            |
-| `retry_disconnected_relays()`   | Periodic reconnection     | Every 2s (health & metrics timer) | For each ready relay: try_connect_relay()                       |
+| Method                              | Purpose                      | When Called                       | Actions                                                         |
+| ----------------------------------- | ---------------------------- | --------------------------------- | --------------------------------------------------------------- |
+| `register_relay()`                  | Initialize relay tracking    | Discovery via RepoSyncIndex       | Creates RelayConnection, stores in HashMap, returns immediately |
+| `try_connect_relay()`               | Attempt connection           | Health tracker allows retry       | Calls connection.connect(), sends notification on success       |
+| `handle_connect_or_reconnect()`     | Setup after connection       | ConnectNotification received      | Spawns event loop, sets Syncing, decides sync strategy          |
+| `check_and_complete_historic_sync()` | Detect sync completion       | After each batch confirmation     | Transitions Syncing → Connected when no pending batches        |
+| `handle_disconnect()`               | Cleanup after disconnect     | DisconnectNotification received   | Updates state, clears pending, KEEPS RelayConnection            |
+| `retry_disconnected_relays()`       | Periodic reconnection        | Every 2s (health & metrics timer) | For each ready relay: try_connect_relay()                       |
+
+### Historic Sync Completion
+
+When a relay first connects, it enters the **Syncing** state and begins historic sync:
+
+1. **Layer 1 (Announcements)**: Generic filter for all repository announcements
+2. **Layer 2 (Repo Events)**: Filters for events tagging discovered repositories  
+3. **Layer 3 (Root Events)**: Filters for events tagging discovered PRs/Issues/Patches
+
+Each layer creates one or more `PendingBatch` entries tracked in `PendingSyncIndex`. As EOSE messages arrive:
+
+- `handle_eose()` confirms each batch via `confirm_batch()`
+- `confirm_batch()` moves items to confirmed state and calls `check_and_complete_historic_sync()`
+- `check_and_complete_historic_sync()` checks if `PendingSyncIndex` is empty for this relay
+- When empty: transitions `Syncing` → `Connected`, sets `historic_sync_completed = true`
+
+**Metrics tracking**: The `ngit_sync_relay_connected` metric shows:
+- `0` = Disconnected
+- `1` = Connecting  
+- `2` = Syncing (historic sync in progress)
+- `3` = Connected (historic sync complete, live sync active)
+
+This allows operators to monitor sync progress and distinguish between "connected but still catching up" vs "fully synced and live".
 
 ### Event Loop Lifecycle
 
diff --git a/docs/explanation/monitoring.md b/docs/explanation/monitoring.md
index 9368bf4..d2d20c0 100644
--- a/docs/explanation/monitoring.md
+++ b/docs/explanation/monitoring.md
@@ -98,54 +98,64 @@ When GRASP-02 proactive sync is implemented, the following metrics will be added
 
 | Metric | Type | Labels | Description |
 |--------|------|--------|-------------|
-| `ngit_sync_relay_connected` | Gauge | relay | 1 if connected, 0 if not |
+| `ngit_sync_relay_connected` | Gauge | relay | Connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected) |
 | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes |
-| `ngit_sync_relay_status` | Gauge | relay, status | 1 for current status, 0 otherwise |
+| `ngit_sync_relay_status` | Gauge | relay | Health status (1=healthy, 2=disconnected, 3=degraded, 4=dead, 5=rate_limited) |
 | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count |
-| `ngit_sync_events_total` | Counter | source | Events received by source type |
-| `ngit_sync_gap_events_total` | Counter | relay | Events found during catchup |
+| `ngit_sync_events_synced_total` | Counter | - | Events synced (newly saved events only) |
 | `ngit_sync_relays_tracked_total` | Gauge | - | Total relays discovered |
 | `ngit_sync_relays_connected_total` | Gauge | - | Currently connected relay count |
 | `ngit_sync_relays_dead_total` | Gauge | - | Relays marked as dead |
 
-### Event Sources
+### Connection Status Values
 
-The `source` label on `ngit_sync_events_total` tracks how events were received:
+The `ngit_sync_relay_connected` metric tracks the connection lifecycle:
 
-- `direct` - Submitted directly to our relay by a user
-- `live_sync` - Received via live WebSocket subscription (expected path)
-- `catchup` - Found during negentropy catchup after reconnect
-- `daily_catchup` - Found during daily reconciliation
+- `0` = **Disconnected** - Not currently connected
+- `1` = **Connecting** - Connection attempt in progress
+- `2` = **Syncing** - Connected, historic sync in progress
+- `3` = **Connected** - Connected, historic sync complete, live sync active
 
-**Catchup events indicate sync failures** - these should have been received via live sync. High catchup rates suggest connectivity issues or filter mismatches.
+This allows operators to distinguish between "connected but still catching up" (Syncing) vs "fully synced and live" (Connected).
 
 ### Relay Health States
 
-The `status` label on `ngit_sync_relay_status` tracks relay health:
+The `ngit_sync_relay_status` metric tracks relay health:
 
-- `healthy` - Normal operation, connections working
-- `backoff` - Exponential backoff after failures (5s → 10s → ... → 1h)
-- `dead` - 24h of continuous failures, daily retry only
+- `1` = **Healthy** - Connected and stable
+- `2` = **Disconnected** - Not connected, but no issues detected
+- `3` = **Degraded** - Connection problems or unstable after recovery
+- `4` = **Dead** - 24h+ of continuous failures
+- `5` = **RateLimited** - Rate limit cooldown active (65s)
 
 ### Example Grafana Queries
 
 ```promql
-# Relay health overview - count by status
-sum by (status) (ngit_sync_relay_status == 1)
+# Relay connection status overview - count by status
+count(ngit_sync_relay_connected == 0)  # Disconnected
+count(ngit_sync_relay_connected == 1)  # Connecting
+count(ngit_sync_relay_connected == 2)  # Syncing
+count(ngit_sync_relay_connected == 3)  # Connected
+
+# Relays still syncing (not yet fully caught up)
+count(ngit_sync_relay_connected == 2)
 
 # Connection success rate over last hour
 sum(rate(ngit_sync_connection_attempts_total{result="success"}[1h]))
 / sum(rate(ngit_sync_connection_attempts_total[1h]))
 
-# Sync gap detection - events that should have been live synced
-sum(rate(ngit_sync_gap_events_total[1h])) by (relay)
-
-# Live sync effectiveness (lower is better - fewer gaps)
-sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h]))
-/ sum(rate(ngit_sync_events_total[1h]))
+# Event sync rate (newly saved events)
+rate(ngit_sync_events_synced_total[5m])
 
 # Relays with high failure counts (potential issues)
 topk(10, ngit_sync_relay_failures)
+
+# Relay health overview - count by health state
+count(ngit_sync_relay_status == 1)  # Healthy
+count(ngit_sync_relay_status == 2)  # Disconnected
+count(ngit_sync_relay_status == 3)  # Degraded
+count(ngit_sync_relay_status == 4)  # Dead
+count(ngit_sync_relay_status == 5)  # RateLimited
 ```
 
 ### Example Alerts
@@ -153,23 +163,30 @@ topk(10, ngit_sync_relay_failures)
 ```yaml
 # Alert if relay stuck in dead state for > 1 day
 - alert: SyncRelayDead
-  expr: ngit_sync_relay_status{status="dead"} == 1
+  expr: ngit_sync_relay_status == 4  # Dead state
   for: 1d
   labels:
     severity: warning
   annotations:
-    summary: "Sync relay {{ $labels.relay }} is dead"
-
-# Alert if sync gap rate is high (>10% of events from catchup)
-- alert: SyncGapHigh
-  expr: >
-    sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h]))
-    / sum(rate(ngit_sync_events_total[1h])) > 0.1
-  for: 30m
+    summary: "Sync relay {{ $labels.relay }} is dead (24h+ failures)"
+
+# Alert if relay stuck in syncing state for > 1 hour
+- alert: SyncRelaySlow
+  expr: ngit_sync_relay_connected == 2  # Syncing state
+  for: 1h
+  labels:
+    severity: info
+  annotations:
+    summary: "Sync relay {{ $labels.relay }} taking >1h to complete historic sync"
+
+# Alert if too many relays are degraded
+- alert: SyncManyDegraded
+  expr: count(ngit_sync_relay_status == 3) > 5  # Degraded state
+  for: 15m
   labels:
     severity: warning
   annotations:
-    summary: "High sync gap rate - {{ $value | humanizePercentage }} of events from catchup"
+    summary: "{{ $value }} relays in degraded state"
 ```
 
 ### Design Rationale
-- 
cgit v1.2.3