From 1fb3e978cbd7cee7f11b3a2f8c73b3ca4543b35d Mon Sep 17 00:00:00 2001 From: Mark Date: Sat, 16 May 2026 14:37:22 +0200 Subject: [PATCH] feat(option-c-offscreen): port health probe + request-id'd BUFFER + H1 try/catch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the offscreen-side architectural refactor per .planning/debug/empty-archive-port-race.md "Fix Strategy: Option C": 1. **Retired** the 290_000 ms pre-emptive reconnect setTimeout. Its race window between the synchronous .disconnect() and the onDisconnect handler running was the bisect-confirmed proximate cause of the H1 "Attempting to use a disconnected port object" Uncaught Errors. 2. **Added** PONG-based health probe: each ping increments missedPongs; if MAX_MISSED_PONGS (3) consecutive PINGs go without echo, reconnect via the same clean teardown path the onDisconnect handler uses. PONG receipt resets the counter. Liveness-based replacement for the time-based pre-emptive rotation. 3. **H1 fix** — wrap PING postMessage in try/catch. The port object can transition to disconnected synchronously (SW eviction, port glitch) between the interval-callback being queued and it running. The catch absorbs the throw and routes through reconnectPort() — no more uncaught throws bubble out to the offscreen console. 4. **Request-id'd protocol** — REQUEST_BUFFER carries the SW-generated requestId; BUFFER response echoes it. The offscreen now posts on the CURRENT keepalivePort (no more portAtRequest stale-port refuse-to- post). The SW matches BUFFER → request by id, so port replacement mid-encode no longer drops the response — the SW retries on the new port and the matching BUFFER routes correctly. 5. **reconnectPort(reason)** — new helper consolidating the teardown+disconnect+reconnect dance used by both the missed-PONG path and the synchronous-throw path. Idempotent w.r.t. the chained onDisconnect callback. Test updates: - H2 now sends REQUEST_BUFFER with a requestId (Option C contract). - H1.b refactored to test the externally-disconnected path (since the pre-emptive timeout path is gone): port._disconnected=true, fire ping, assert no throw + a fresh port appears. - Top-level snapshots of timer globals + afterEach restoration so a failing test doesn't leak overridden globals into the next test. Status: 48 GREEN, 4 RED (the remaining RED is all SW-side — addressed in next commit). All H1 + H1.b + H2 contracts now GREEN. Pinning contracts (D-12 port-serialization, D-13 segment-rotation, A3 webm- playback) untouched. tsc --noEmit exit 0; type-safety grep clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/offscreen/recorder.ts | 170 +++++++++++++++----- tests/offscreen/port-reconnect-race.test.ts | 117 ++++++-------- 2 files changed, 181 insertions(+), 106 deletions(-) diff --git a/src/offscreen/recorder.ts b/src/offscreen/recorder.ts index a226982..4202511 100644 --- a/src/offscreen/recorder.ts +++ b/src/offscreen/recorder.ts @@ -36,7 +36,15 @@ const VIDEO_BITRATE = 400_000; // CON-video-codec // контракт ротации. const PORT_NAME = 'video-keepalive'; // long-lived port name (D-17, Pattern 5) const PORT_PING_MS = 25_000; // < 30 s SW idle threshold -const PORT_RECONNECT_MS = 290_000; // pre-empt the ~5 min port cap (Pitfall 4) +// Option C health-probe: how many consecutive PINGs may go unanswered +// (no PONG echo) before we declare the port dead and reconnect. With +// PORT_PING_MS = 25_000 ms, 3 missed PONGs = 75 s of unresponsive SW — +// well past the ~30 s Chrome SW idle threshold, so a real disconnect +// already would have fired its onDisconnect handler by then. This +// threshold catches the failure mode the old 290 s pre-emptive setTimeout +// pretended to: the SW is up but the port has gone half-open. +// See .planning/debug/empty-archive-port-race.md (Fix Strategy: Option C). +const MAX_MISSED_PONGS = 3; const logger = new OffscreenLogger('Recorder'); @@ -466,29 +474,56 @@ function isFromOwnExtension(sender: chrome.runtime.MessageSender | undefined): b return sender?.id === chrome.runtime.id; } -// Stable handles for the ping interval and the pre-emptive reconnect timer, -// so we can clear them on disconnect / re-init. +// Stable handle for the ping interval. The legacy 290 s pre-emptive +// reconnect setTimeout was retired in the Option C refactor — its race +// window between the synchronous .disconnect() and the onDisconnect +// handler running was the proximate cause of the H1 "Attempting to use a +// disconnected port object" Uncaught Errors per the bisect notes in +// .planning/debug/empty-archive-port-race.md. let pingIntervalId: ReturnType | null = null; -let preemptiveReconnectId: ReturnType | null = null; + +// Option C health-probe state. The offscreen pings every PORT_PING_MS and +// expects a PONG echo from the SW. If MAX_MISSED_PONGS consecutive PINGs +// go unanswered, the port is considered dead and a clean reconnect is +// initiated via the same teardown path the onDisconnect handler uses. +// This replaces the time-based pre-emptive reconnect with a liveness- +// based one: the port lives until it stops responding, not until a wall +// clock says it should rotate. +let missedPongs = 0; function teardownPortTimers(): void { if (pingIntervalId !== null) { clearInterval(pingIntervalId); pingIntervalId = null; } - if (preemptiveReconnectId !== null) { - clearTimeout(preemptiveReconnectId); - preemptiveReconnectId = null; - } } function onPortMessage(message: unknown): void { - // Defense-in-depth: explicit shape check before destructuring (T-1-04) + // Defense-in-depth: explicit shape check before destructuring (T-1-04). if (typeof message !== 'object' || message === null) { return; } const type = (message as { type?: unknown }).type; + if (type === 'PONG') { + // Health-probe echo. Resetting missedPongs to 0 closes the probe loop + // — the port is demonstrably alive. + missedPongs = 0; + return; + } if (type === 'REQUEST_BUFFER') { + // Option C: REQUEST_BUFFER carries a requestId the SW generated per + // call. We echo it on the BUFFER response so the SW's per-request + // listener can match by id (preventing cross-talk between concurrent + // requests and between retries on port replacement). + const requestId = (message as { requestId?: unknown }).requestId; + if (typeof requestId !== 'string' || requestId.length === 0) { + // Defense-in-depth: a REQUEST_BUFFER without a valid id is invalid + // under the Option C protocol — drop with a warn. Legacy callers + // are absent from the codebase; this is purely a guard against + // future regressions or hostile peers. + logger.warn('REQUEST_BUFFER without a valid requestId — dropping'); + return; + } // Fire-and-forget: the chrome.runtime.Port.onMessage listener API // ignores the return value. We do the async base64 encoding inside // an IIFE so the listener stays synchronous-typed (void return). @@ -497,7 +532,7 @@ function onPortMessage(message: unknown): void { // because chrome.runtime.Port JSON-serializes across extension contexts, // and JSON.stringify(blob) === "{}". See src/shared/binary.ts and // tests/offscreen/port-serialization.test.ts for the contract. - void encodeAndSendBuffer(); + void encodeAndSendBuffer(requestId); } // Any unknown port message type is silently dropped (T-1-04 defense-in-depth). } @@ -519,32 +554,24 @@ function onPortMessage(message: unknown): void { // post-completion state. let encodeInFlight = false; -async function encodeAndSendBuffer(): Promise { +async function encodeAndSendBuffer(requestId: string): Promise { if (encodeInFlight) { logger.warn('encodeAndSendBuffer already running — dropping concurrent call'); return; } - // CR-01 fix: capture the port identity BEFORE the await. If `keepalivePort` - // is replaced by a fresh reconnect during base64 encoding, posting on the - // new port would silently leak the BUFFER to a stranger — the SW's - // per-request listener is still bound to the OLD port. The SW already - // times out cleanly (BUFFER_FETCH_TIMEOUT_MS = 2 s), so dropping a stale - // response on the floor is the correct behaviour: the next SAVE_ARCHIVE - // round-trip will REQUEST_BUFFER on the fresh port. - const portAtRequest = keepalivePort; - if (portAtRequest === null) { + if (keepalivePort === null) { logger.warn('encodeAndSendBuffer called without an active port — drop'); return; } encodeInFlight = true; try { - await doEncodeAndSendBuffer(portAtRequest); + await doEncodeAndSendBuffer(requestId); } finally { encodeInFlight = false; } } -async function doEncodeAndSendBuffer(portAtRequest: chrome.runtime.Port): Promise { +async function doEncodeAndSendBuffer(requestId: string): Promise { // WR-09 fix: an in-flight segment lacks the Matroska finalization that // MediaRecorder.stop() performs (SegmentSize, Cues) — splicing it onto // a finalized tail re-introduces the "File ended prematurely" symptom @@ -589,18 +616,62 @@ async function doEncodeAndSendBuffer(portAtRequest: chrome.runtime.Port): Promis ); } } - // After the await: refuse to post on a port that has been REPLACED - // by reconnect. The SW listens on the OLD port; posting on the NEW - // port would silently lose the data. Letting the SW time out is - // correct — the next SAVE_ARCHIVE will re-issue REQUEST_BUFFER on - // the fresh port. - if (keepalivePort !== portAtRequest) { + // After the await: post the BUFFER on the CURRENT keepalivePort. Under + // the Option C protocol the BUFFER carries the requestId the SW issued, + // so even if the port was replaced mid-encode (and the SW's matching + // listener migrated to the new port via the SW-side retry — see + // src/background/index.ts:getVideoBufferFromOffscreen), the BUFFER + // routes correctly by id. This retires the H2 silent-drop class that + // the bisect (.planning/debug/empty-archive-port-race.md) traced to + // the CR-01 "refuse-to-post on swapped port" pattern. + // + // Edge case: if the encode took so long that keepalivePort is now null + // (e.g. mid-reconnect with no fresh port yet) we drop with a warn. + // The SW's outer timeout will eventually fire and surface the error to + // the operator (no more silent video-less archive). + if (keepalivePort === null) { logger.warn( - 'port replaced during encode; dropping BUFFER response (SW will time out and retry)', + 'port unavailable at post time; dropping BUFFER (SW outer timeout will surface)', ); return; } - portAtRequest.postMessage({ type: 'BUFFER', segments: transferred }); + try { + keepalivePort.postMessage({ + type: 'BUFFER', + requestId, + segments: transferred, + }); + } catch (err) { + // The current port may have been disconnected between the null-check + // above and postMessage. Treat as a dropped response — the SW retries + // on the fresh port. + logger.warn('BUFFER post failed (port disconnected mid-post):', err); + } +} + +// Triggers a clean reconnect: tear down timers, disconnect the current +// port (if any), null the reference, and re-enter connectPort(). The +// disconnect call fires the onDisconnect handler on the local side too +// (Chrome's documented behaviour for explicit .disconnect()), and that +// handler also calls connectPort() — but the missedPongs reset + the +// teardownPortTimers idempotency keep the second pass a no-op. +function reconnectPort(reason: string): void { + logger.warn('Initiating port reconnect:', reason); + teardownPortTimers(); + const stale = keepalivePort; + keepalivePort = null; + missedPongs = 0; + if (stale !== null) { + try { + stale.disconnect(); + } catch (err) { + // Disconnecting an already-disconnected port throws on some Chrome + // versions; silently ignored — the local-side reference has already + // been cleared so no further use is possible. + logger.warn('stale.disconnect() during reconnect failed:', err); + } + } + connectPort(); } function connectPort(): void { @@ -612,22 +683,47 @@ function connectPort(): void { keepalivePort = null; return; } + missedPongs = 0; keepalivePort.onMessage.addListener(onPortMessage); keepalivePort.onDisconnect.addListener(() => { logger.warn('port disconnected — reconnecting'); teardownPortTimers(); keepalivePort = null; - // Synchronous reconnect — tests/offscreen/port.test.ts pins this + missedPongs = 0; + // Synchronous reconnect — tests/offscreen/port.test.ts pins this. connectPort(); }); pingIntervalId = setInterval(() => { - keepalivePort?.postMessage({ type: 'PING' }); + // H1 fix: wrap postMessage in try/catch. The port object can become + // disconnected between the interval-callback being queued and it + // running (race window between local .disconnect() and onDisconnect + // firing). Chrome's documented runtime semantics make postMessage + // throw synchronously with "Attempting to use a disconnected port + // object". Without the catch, that became an Uncaught Error in the + // offscreen console (3x at 290 s in UAT Test 3). + // + // Health probe: each ping costs us one missed-PONG slot. The SW + // echoes PONG on PING (src/background/index.ts), and onPortMessage + // resets missedPongs to 0 on receipt. If MAX_MISSED_PONGS pings go + // without a PONG, the SW (or the port) is unresponsive and we + // reconnect cleanly. + if (keepalivePort === null) { + return; + } + missedPongs++; + if (missedPongs > MAX_MISSED_PONGS) { + reconnectPort(`missed ${missedPongs} PONGs — port unresponsive`); + return; + } + try { + keepalivePort.postMessage({ type: 'PING' }); + } catch (err) { + // The port went disconnected synchronously — initiate the same + // clean reconnect path. The throw caught here is exactly the H1 + // "Attempting to use a disconnected port object" surface. + reconnectPort(`PING postMessage threw: ${String(err)}`); + } }, PORT_PING_MS); - preemptiveReconnectId = setTimeout(() => { - logger.log('pre-emptive port reconnect (290 s cap)'); - keepalivePort?.disconnect(); - // onDisconnect handler above triggers a fresh connectPort() call - }, PORT_RECONNECT_MS); } // Бутстрап (handshake + port lifecycle). diff --git a/tests/offscreen/port-reconnect-race.test.ts b/tests/offscreen/port-reconnect-race.test.ts index 5da7a8b..82a20b0 100644 --- a/tests/offscreen/port-reconnect-race.test.ts +++ b/tests/offscreen/port-reconnect-race.test.ts @@ -25,7 +25,7 @@ // tests/offscreen/port.test.ts + handshake.test.ts). No source-code // re-implementation — the production module is what we're testing. -import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; interface PortStub { name: string; @@ -122,21 +122,27 @@ function buildChromeStub(ports: PortStub[]): ChromeStub { }; } -describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', () => { - let originalSetInterval: typeof globalThis.setInterval; - let originalClearInterval: typeof globalThis.clearInterval; - let originalSetTimeout: typeof globalThis.setTimeout; - let originalClearTimeout: typeof globalThis.clearTimeout; +// Top-level snapshots of the real timer APIs — captured at module load, +// BEFORE any test runs, so they survive a test that fails before its +// inline restore code. The afterEach below restores unconditionally. +const ORIGINAL_SET_INTERVAL = globalThis.setInterval; +const ORIGINAL_CLEAR_INTERVAL = globalThis.clearInterval; +const ORIGINAL_SET_TIMEOUT = globalThis.setTimeout; +const ORIGINAL_CLEAR_TIMEOUT = globalThis.clearTimeout; +describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', () => { beforeEach(() => { vi.resetModules(); (globalThis as unknown as GlobalWithChrome).MediaRecorder = { isTypeSupported: vi.fn().mockReturnValue(true), }; - originalSetInterval = globalThis.setInterval; - originalClearInterval = globalThis.clearInterval; - originalSetTimeout = globalThis.setTimeout; - originalClearTimeout = globalThis.clearTimeout; + }); + + afterEach(() => { + globalThis.setInterval = ORIGINAL_SET_INTERVAL; + globalThis.clearInterval = ORIGINAL_CLEAR_INTERVAL; + globalThis.setTimeout = ORIGINAL_SET_TIMEOUT; + globalThis.clearTimeout = ORIGINAL_CLEAR_TIMEOUT; }); // ────────────────────────────────────────────────────────────────────── @@ -199,9 +205,6 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', // operator observed in UAT Test 3. expect(() => pingCb()).not.toThrow(); - // Restore globals. - globalThis.setInterval = originalSetInterval; - globalThis.clearInterval = originalClearInterval; }); // ────────────────────────────────────────────────────────────────────── @@ -272,7 +275,16 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', // 2) SW issues REQUEST_BUFFER — synthetically dispatch to the // captured onPortMessage listener. The production code kicks off // encodeAndSendBuffer which awaits 3 blobToBase64 calls. - oldPort.onMessage._listeners[0]({ type: 'REQUEST_BUFFER' }); + // + // Under the Option C protocol the request carries a requestId the + // SW generates per call; the offscreen echoes it on the BUFFER + // response. This test uses a literal id since it's not driving + // the SW — only the recorder's onPortMessage path. + const requestId = 'h2-test-id'; + oldPort.onMessage._listeners[0]({ + type: 'REQUEST_BUFFER', + requestId, + }); // 3) Mid-encode, the pre-emptive reconnect fires. Simulate by // firing onDisconnect on the old port — production code's @@ -346,82 +358,49 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', }); // ────────────────────────────────────────────────────────────────────── - // H1.b — pre-emptive reconnect path specifically. Walk the same - // sequence the operator hit at 290 s: the production code calls - // `keepalivePort?.disconnect()` and the test then invokes the ping - // callback that was installed BEFORE the reconnect. This proves the - // race window in the production timer-clear ordering. + // H1.b — externally-disconnected port path. The original 290 s + // pre-emptive setTimeout is gone (Option C retired it — see the + // "Bisect Results" section of empty-archive-port-race.md). What + // remains is the same H1 race shape, just triggered by a different + // path: the SW side (or Chrome's idle eviction, or a network glitch) + // can mark the remote end of the port as disconnected; the local + // postMessage then throws synchronously with the same Chrome error + // string. The fix's try/catch + reconnect path must absorb that. // ────────────────────────────────────────────────────────────────────── - it('H1.b: pre-emptive reconnect path — ping does not throw against just-disconnected port', async () => { + it('H1.b: externally-disconnected port — ping path absorbs the throw and reconnects cleanly', async () => { const ports: PortStub[] = []; const stub = buildChromeStub(ports); (globalThis as unknown as GlobalWithChrome).chrome = stub; - // Capture both interval callbacks AND timeout callbacks for direct - // invocation. We need to fire the pre-emptive setTimeout - // synthetically (it would otherwise wait 290 s of wall-clock time). const intervalCallbacks: Array<() => void> = []; - const timeoutCallbacks: Array<() => void> = []; - globalThis.setInterval = ((cb: () => void, _ms: number) => { intervalCallbacks.push(cb); return 100 as unknown as ReturnType; }) as typeof globalThis.setInterval; - globalThis.setTimeout = ((cb: () => void, _ms: number) => { - timeoutCallbacks.push(cb); - return 200 as unknown as ReturnType; - }) as typeof globalThis.setTimeout; - // No-op the clear* so the captured callbacks remain callable — - // surfaces the race-window assumption in the production code. globalThis.clearInterval = (() => {}) as typeof globalThis.clearInterval; - globalThis.clearTimeout = (() => {}) as typeof globalThis.clearTimeout; await import('../../src/offscreen/recorder'); expect(ports.length).toBe(1); - const port = ports[0]; - - // intervalCallbacks[0] is the ping callback (first setInterval call). - // timeoutCallbacks contains at least the pre-emptive reconnect timer - // (recorder.ts:626). Other setTimeouts may exist (e.g. queueMicrotask - // shims, scheduleRotation — but the latter is gated on mediaStream - // which is null in this test). expect(intervalCallbacks.length).toBeGreaterThanOrEqual(1); - expect(timeoutCallbacks.length).toBeGreaterThanOrEqual(1); + const oldPort = ports[0]; const pingCb = intervalCallbacks[0]; - // The pre-emptive reconnect timeout is the only setTimeout the - // bootstrap installs (rotation is gated on mediaStream). Pick the - // first to keep the test resilient. - const preemptiveReconnect = timeoutCallbacks[0]; - // 1) Fire pre-emptive reconnect — the production code calls - // keepalivePort?.disconnect(). Our stub flips _disconnected=true - // synchronously (matching Chrome's local disconnect semantics). - preemptiveReconnect(); - expect(port._disconnected).toBe(true); - expect(port.disconnect).toHaveBeenCalled(); + // Simulate an EXTERNAL disconnect: the remote end of the port has + // been torn down (SW eviction, port glitch, network blip), but the + // local onDisconnect handler hasn't fired yet — this is exactly the + // half-open state that the original H1 race weaponised. + oldPort._disconnected = true; - // 2) Now the race: an already-queued ping callback (e.g. scheduled - // before the reconnect) fires against the just-disconnected port. - // Production code: keepalivePort?.postMessage({type:'PING'}) — - // keepalivePort is still the old non-null reference at this point - // (the onDisconnect handler may not have run yet in the same - // macrotask), so postMessage runs against the disconnected port - // and our stub throws — exactly matching the UAT error message. - // - // The fix must make this safe. Acceptable strategies: - // - Clear the interval as the FIRST step of the pre-emptive - // reconnect, before .disconnect(). - // - Wrap postMessage in try/catch inside the ping callback. - // - Switch to a per-port closure (capturedPort) so a stale ping - // can detect it's pinging a swapped port. + // Fire the ping. Under the legacy code this threw with "Attempting + // to use a disconnected port object"; under the Option C fix the + // ping callback wraps postMessage in try/catch and initiates a + // clean reconnect via reconnectPort(). The test asserts BOTH that + // no Uncaught Error escapes AND that a fresh port has been + // requested (the visible side-effect of the reconnect path). expect(() => pingCb()).not.toThrow(); + expect(ports.length).toBeGreaterThanOrEqual(2); - // Restore globals. - globalThis.setInterval = originalSetInterval; - globalThis.clearInterval = originalClearInterval; - globalThis.setTimeout = originalSetTimeout; - globalThis.clearTimeout = originalClearTimeout; }); });