Milestone v1 (v2.0.0): Mokosh — Session Capture #1

Merged
strategy155 merged 297 commits from gsd/phase-04-harden-clean-up-optional into main 2026-05-31 15:34:17 +00:00
2 changed files with 181 additions and 106 deletions
Showing only changes of commit 1fb3e978cb - Show all commits

View File

@@ -36,7 +36,15 @@ const VIDEO_BITRATE = 400_000; // CON-video-codec
// контракт ротации.
const PORT_NAME = 'video-keepalive'; // long-lived port name (D-17, Pattern 5)
const PORT_PING_MS = 25_000; // < 30 s SW idle threshold
const PORT_RECONNECT_MS = 290_000; // pre-empt the ~5 min port cap (Pitfall 4)
// Option C health-probe: how many consecutive PINGs may go unanswered
// (no PONG echo) before we declare the port dead and reconnect. With
// PORT_PING_MS = 25_000 ms, 3 missed PONGs = 75 s of unresponsive SW —
// well past the ~30 s Chrome SW idle threshold, so a real disconnect
// already would have fired its onDisconnect handler by then. This
// threshold catches the failure mode the old 290 s pre-emptive setTimeout
// pretended to: the SW is up but the port has gone half-open.
// See .planning/debug/empty-archive-port-race.md (Fix Strategy: Option C).
const MAX_MISSED_PONGS = 3;
const logger = new OffscreenLogger('Recorder');
@@ -466,29 +474,56 @@ function isFromOwnExtension(sender: chrome.runtime.MessageSender | undefined): b
return sender?.id === chrome.runtime.id;
}
// Stable handles for the ping interval and the pre-emptive reconnect timer,
// so we can clear them on disconnect / re-init.
// Stable handle for the ping interval. The legacy 290 s pre-emptive
// reconnect setTimeout was retired in the Option C refactor — its race
// window between the synchronous .disconnect() and the onDisconnect
// handler running was the proximate cause of the H1 "Attempting to use a
// disconnected port object" Uncaught Errors per the bisect notes in
// .planning/debug/empty-archive-port-race.md.
let pingIntervalId: ReturnType<typeof setInterval> | null = null;
let preemptiveReconnectId: ReturnType<typeof setTimeout> | null = null;
// Option C health-probe state. The offscreen pings every PORT_PING_MS and
// expects a PONG echo from the SW. If MAX_MISSED_PONGS consecutive PINGs
// go unanswered, the port is considered dead and a clean reconnect is
// initiated via the same teardown path the onDisconnect handler uses.
// This replaces the time-based pre-emptive reconnect with a liveness-
// based one: the port lives until it stops responding, not until a wall
// clock says it should rotate.
let missedPongs = 0;
function teardownPortTimers(): void {
if (pingIntervalId !== null) {
clearInterval(pingIntervalId);
pingIntervalId = null;
}
if (preemptiveReconnectId !== null) {
clearTimeout(preemptiveReconnectId);
preemptiveReconnectId = null;
}
}
function onPortMessage(message: unknown): void {
// Defense-in-depth: explicit shape check before destructuring (T-1-04)
// Defense-in-depth: explicit shape check before destructuring (T-1-04).
if (typeof message !== 'object' || message === null) {
return;
}
const type = (message as { type?: unknown }).type;
if (type === 'PONG') {
// Health-probe echo. Resetting missedPongs to 0 closes the probe loop
// — the port is demonstrably alive.
missedPongs = 0;
return;
}
if (type === 'REQUEST_BUFFER') {
// Option C: REQUEST_BUFFER carries a requestId the SW generated per
// call. We echo it on the BUFFER response so the SW's per-request
// listener can match by id (preventing cross-talk between concurrent
// requests and between retries on port replacement).
const requestId = (message as { requestId?: unknown }).requestId;
if (typeof requestId !== 'string' || requestId.length === 0) {
// Defense-in-depth: a REQUEST_BUFFER without a valid id is invalid
// under the Option C protocol — drop with a warn. Legacy callers
// are absent from the codebase; this is purely a guard against
// future regressions or hostile peers.
logger.warn('REQUEST_BUFFER without a valid requestId — dropping');
return;
}
// Fire-and-forget: the chrome.runtime.Port.onMessage listener API
// ignores the return value. We do the async base64 encoding inside
// an IIFE so the listener stays synchronous-typed (void return).
@@ -497,7 +532,7 @@ function onPortMessage(message: unknown): void {
// because chrome.runtime.Port JSON-serializes across extension contexts,
// and JSON.stringify(blob) === "{}". See src/shared/binary.ts and
// tests/offscreen/port-serialization.test.ts for the contract.
void encodeAndSendBuffer();
void encodeAndSendBuffer(requestId);
}
// Any unknown port message type is silently dropped (T-1-04 defense-in-depth).
}
@@ -519,32 +554,24 @@ function onPortMessage(message: unknown): void {
// post-completion state.
let encodeInFlight = false;
async function encodeAndSendBuffer(): Promise<void> {
async function encodeAndSendBuffer(requestId: string): Promise<void> {
if (encodeInFlight) {
logger.warn('encodeAndSendBuffer already running — dropping concurrent call');
return;
}
// CR-01 fix: capture the port identity BEFORE the await. If `keepalivePort`
// is replaced by a fresh reconnect during base64 encoding, posting on the
// new port would silently leak the BUFFER to a stranger — the SW's
// per-request listener is still bound to the OLD port. The SW already
// times out cleanly (BUFFER_FETCH_TIMEOUT_MS = 2 s), so dropping a stale
// response on the floor is the correct behaviour: the next SAVE_ARCHIVE
// round-trip will REQUEST_BUFFER on the fresh port.
const portAtRequest = keepalivePort;
if (portAtRequest === null) {
if (keepalivePort === null) {
logger.warn('encodeAndSendBuffer called without an active port — drop');
return;
}
encodeInFlight = true;
try {
await doEncodeAndSendBuffer(portAtRequest);
await doEncodeAndSendBuffer(requestId);
} finally {
encodeInFlight = false;
}
}
async function doEncodeAndSendBuffer(portAtRequest: chrome.runtime.Port): Promise<void> {
async function doEncodeAndSendBuffer(requestId: string): Promise<void> {
// WR-09 fix: an in-flight segment lacks the Matroska finalization that
// MediaRecorder.stop() performs (SegmentSize, Cues) — splicing it onto
// a finalized tail re-introduces the "File ended prematurely" symptom
@@ -589,18 +616,62 @@ async function doEncodeAndSendBuffer(portAtRequest: chrome.runtime.Port): Promis
);
}
}
// After the await: refuse to post on a port that has been REPLACED
// by reconnect. The SW listens on the OLD port; posting on the NEW
// port would silently lose the data. Letting the SW time out is
// correct — the next SAVE_ARCHIVE will re-issue REQUEST_BUFFER on
// the fresh port.
if (keepalivePort !== portAtRequest) {
// After the await: post the BUFFER on the CURRENT keepalivePort. Under
// the Option C protocol the BUFFER carries the requestId the SW issued,
// so even if the port was replaced mid-encode (and the SW's matching
// listener migrated to the new port via the SW-side retry — see
// src/background/index.ts:getVideoBufferFromOffscreen), the BUFFER
// routes correctly by id. This retires the H2 silent-drop class that
// the bisect (.planning/debug/empty-archive-port-race.md) traced to
// the CR-01 "refuse-to-post on swapped port" pattern.
//
// Edge case: if the encode took so long that keepalivePort is now null
// (e.g. mid-reconnect with no fresh port yet) we drop with a warn.
// The SW's outer timeout will eventually fire and surface the error to
// the operator (no more silent video-less archive).
if (keepalivePort === null) {
logger.warn(
'port replaced during encode; dropping BUFFER response (SW will time out and retry)',
'port unavailable at post time; dropping BUFFER (SW outer timeout will surface)',
);
return;
}
portAtRequest.postMessage({ type: 'BUFFER', segments: transferred });
try {
keepalivePort.postMessage({
type: 'BUFFER',
requestId,
segments: transferred,
});
} catch (err) {
// The current port may have been disconnected between the null-check
// above and postMessage. Treat as a dropped response — the SW retries
// on the fresh port.
logger.warn('BUFFER post failed (port disconnected mid-post):', err);
}
}
// Triggers a clean reconnect: tear down timers, disconnect the current
// port (if any), null the reference, and re-enter connectPort(). The
// disconnect call fires the onDisconnect handler on the local side too
// (Chrome's documented behaviour for explicit .disconnect()), and that
// handler also calls connectPort() — but the missedPongs reset + the
// teardownPortTimers idempotency keep the second pass a no-op.
function reconnectPort(reason: string): void {
logger.warn('Initiating port reconnect:', reason);
teardownPortTimers();
const stale = keepalivePort;
keepalivePort = null;
missedPongs = 0;
if (stale !== null) {
try {
stale.disconnect();
} catch (err) {
// Disconnecting an already-disconnected port throws on some Chrome
// versions; silently ignored — the local-side reference has already
// been cleared so no further use is possible.
logger.warn('stale.disconnect() during reconnect failed:', err);
}
}
connectPort();
}
function connectPort(): void {
@@ -612,22 +683,47 @@ function connectPort(): void {
keepalivePort = null;
return;
}
missedPongs = 0;
keepalivePort.onMessage.addListener(onPortMessage);
keepalivePort.onDisconnect.addListener(() => {
logger.warn('port disconnected — reconnecting');
teardownPortTimers();
keepalivePort = null;
// Synchronous reconnect — tests/offscreen/port.test.ts pins this
missedPongs = 0;
// Synchronous reconnect — tests/offscreen/port.test.ts pins this.
connectPort();
});
pingIntervalId = setInterval(() => {
keepalivePort?.postMessage({ type: 'PING' });
// H1 fix: wrap postMessage in try/catch. The port object can become
// disconnected between the interval-callback being queued and it
// running (race window between local .disconnect() and onDisconnect
// firing). Chrome's documented runtime semantics make postMessage
// throw synchronously with "Attempting to use a disconnected port
// object". Without the catch, that became an Uncaught Error in the
// offscreen console (3x at 290 s in UAT Test 3).
//
// Health probe: each ping costs us one missed-PONG slot. The SW
// echoes PONG on PING (src/background/index.ts), and onPortMessage
// resets missedPongs to 0 on receipt. If MAX_MISSED_PONGS pings go
// without a PONG, the SW (or the port) is unresponsive and we
// reconnect cleanly.
if (keepalivePort === null) {
return;
}
missedPongs++;
if (missedPongs > MAX_MISSED_PONGS) {
reconnectPort(`missed ${missedPongs} PONGs — port unresponsive`);
return;
}
try {
keepalivePort.postMessage({ type: 'PING' });
} catch (err) {
// The port went disconnected synchronously — initiate the same
// clean reconnect path. The throw caught here is exactly the H1
// "Attempting to use a disconnected port object" surface.
reconnectPort(`PING postMessage threw: ${String(err)}`);
}
}, PORT_PING_MS);
preemptiveReconnectId = setTimeout(() => {
logger.log('pre-emptive port reconnect (290 s cap)');
keepalivePort?.disconnect();
// onDisconnect handler above triggers a fresh connectPort() call
}, PORT_RECONNECT_MS);
}
// Бутстрап (handshake + port lifecycle).

View File

@@ -25,7 +25,7 @@
// tests/offscreen/port.test.ts + handshake.test.ts). No source-code
// re-implementation — the production module is what we're testing.
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
interface PortStub {
name: string;
@@ -122,21 +122,27 @@ function buildChromeStub(ports: PortStub[]): ChromeStub {
};
}
describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', () => {
let originalSetInterval: typeof globalThis.setInterval;
let originalClearInterval: typeof globalThis.clearInterval;
let originalSetTimeout: typeof globalThis.setTimeout;
let originalClearTimeout: typeof globalThis.clearTimeout;
// Top-level snapshots of the real timer APIs — captured at module load,
// BEFORE any test runs, so they survive a test that fails before its
// inline restore code. The afterEach below restores unconditionally.
const ORIGINAL_SET_INTERVAL = globalThis.setInterval;
const ORIGINAL_CLEAR_INTERVAL = globalThis.clearInterval;
const ORIGINAL_SET_TIMEOUT = globalThis.setTimeout;
const ORIGINAL_CLEAR_TIMEOUT = globalThis.clearTimeout;
describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)', () => {
beforeEach(() => {
vi.resetModules();
(globalThis as unknown as GlobalWithChrome).MediaRecorder = {
isTypeSupported: vi.fn().mockReturnValue(true),
};
originalSetInterval = globalThis.setInterval;
originalClearInterval = globalThis.clearInterval;
originalSetTimeout = globalThis.setTimeout;
originalClearTimeout = globalThis.clearTimeout;
});
afterEach(() => {
globalThis.setInterval = ORIGINAL_SET_INTERVAL;
globalThis.clearInterval = ORIGINAL_CLEAR_INTERVAL;
globalThis.setTimeout = ORIGINAL_SET_TIMEOUT;
globalThis.clearTimeout = ORIGINAL_CLEAR_TIMEOUT;
});
// ──────────────────────────────────────────────────────────────────────
@@ -199,9 +205,6 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)',
// operator observed in UAT Test 3.
expect(() => pingCb()).not.toThrow();
// Restore globals.
globalThis.setInterval = originalSetInterval;
globalThis.clearInterval = originalClearInterval;
});
// ──────────────────────────────────────────────────────────────────────
@@ -272,7 +275,16 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)',
// 2) SW issues REQUEST_BUFFER — synthetically dispatch to the
// captured onPortMessage listener. The production code kicks off
// encodeAndSendBuffer which awaits 3 blobToBase64 calls.
oldPort.onMessage._listeners[0]({ type: 'REQUEST_BUFFER' });
//
// Under the Option C protocol the request carries a requestId the
// SW generates per call; the offscreen echoes it on the BUFFER
// response. This test uses a literal id since it's not driving
// the SW — only the recorder's onPortMessage path.
const requestId = 'h2-test-id';
oldPort.onMessage._listeners[0]({
type: 'REQUEST_BUFFER',
requestId,
});
// 3) Mid-encode, the pre-emptive reconnect fires. Simulate by
// firing onDisconnect on the old port — production code's
@@ -346,82 +358,49 @@ describe('port reconnect race (RED — confirms empty-archive-port-race H1+H2)',
});
// ──────────────────────────────────────────────────────────────────────
// H1.b — pre-emptive reconnect path specifically. Walk the same
// sequence the operator hit at 290 s: the production code calls
// `keepalivePort?.disconnect()` and the test then invokes the ping
// callback that was installed BEFORE the reconnect. This proves the
// race window in the production timer-clear ordering.
// H1.b — externally-disconnected port path. The original 290 s
// pre-emptive setTimeout is gone (Option C retired it — see the
// "Bisect Results" section of empty-archive-port-race.md). What
// remains is the same H1 race shape, just triggered by a different
// path: the SW side (or Chrome's idle eviction, or a network glitch)
// can mark the remote end of the port as disconnected; the local
// postMessage then throws synchronously with the same Chrome error
// string. The fix's try/catch + reconnect path must absorb that.
// ──────────────────────────────────────────────────────────────────────
it('H1.b: pre-emptive reconnect path — ping does not throw against just-disconnected port', async () => {
it('H1.b: externally-disconnected port — ping path absorbs the throw and reconnects cleanly', async () => {
const ports: PortStub[] = [];
const stub = buildChromeStub(ports);
(globalThis as unknown as GlobalWithChrome).chrome = stub;
// Capture both interval callbacks AND timeout callbacks for direct
// invocation. We need to fire the pre-emptive setTimeout
// synthetically (it would otherwise wait 290 s of wall-clock time).
const intervalCallbacks: Array<() => void> = [];
const timeoutCallbacks: Array<() => void> = [];
globalThis.setInterval = ((cb: () => void, _ms: number) => {
intervalCallbacks.push(cb);
return 100 as unknown as ReturnType<typeof setInterval>;
}) as typeof globalThis.setInterval;
globalThis.setTimeout = ((cb: () => void, _ms: number) => {
timeoutCallbacks.push(cb);
return 200 as unknown as ReturnType<typeof setTimeout>;
}) as typeof globalThis.setTimeout;
// No-op the clear* so the captured callbacks remain callable —
// surfaces the race-window assumption in the production code.
globalThis.clearInterval = (() => {}) as typeof globalThis.clearInterval;
globalThis.clearTimeout = (() => {}) as typeof globalThis.clearTimeout;
await import('../../src/offscreen/recorder');
expect(ports.length).toBe(1);
const port = ports[0];
// intervalCallbacks[0] is the ping callback (first setInterval call).
// timeoutCallbacks contains at least the pre-emptive reconnect timer
// (recorder.ts:626). Other setTimeouts may exist (e.g. queueMicrotask
// shims, scheduleRotation — but the latter is gated on mediaStream
// which is null in this test).
expect(intervalCallbacks.length).toBeGreaterThanOrEqual(1);
expect(timeoutCallbacks.length).toBeGreaterThanOrEqual(1);
const oldPort = ports[0];
const pingCb = intervalCallbacks[0];
// The pre-emptive reconnect timeout is the only setTimeout the
// bootstrap installs (rotation is gated on mediaStream). Pick the
// first to keep the test resilient.
const preemptiveReconnect = timeoutCallbacks[0];
// 1) Fire pre-emptive reconnect the production code calls
// keepalivePort?.disconnect(). Our stub flips _disconnected=true
// synchronously (matching Chrome's local disconnect semantics).
preemptiveReconnect();
expect(port._disconnected).toBe(true);
expect(port.disconnect).toHaveBeenCalled();
// Simulate an EXTERNAL disconnect: the remote end of the port has
// been torn down (SW eviction, port glitch, network blip), but the
// local onDisconnect handler hasn't fired yet — this is exactly the
// half-open state that the original H1 race weaponised.
oldPort._disconnected = true;
// 2) Now the race: an already-queued ping callback (e.g. scheduled
// before the reconnect) fires against the just-disconnected port.
// Production code: keepalivePort?.postMessage({type:'PING'}) —
// keepalivePort is still the old non-null reference at this point
// (the onDisconnect handler may not have run yet in the same
// macrotask), so postMessage runs against the disconnected port
// and our stub throws — exactly matching the UAT error message.
//
// The fix must make this safe. Acceptable strategies:
// - Clear the interval as the FIRST step of the pre-emptive
// reconnect, before .disconnect().
// - Wrap postMessage in try/catch inside the ping callback.
// - Switch to a per-port closure (capturedPort) so a stale ping
// can detect it's pinging a swapped port.
// Fire the ping. Under the legacy code this threw with "Attempting
// to use a disconnected port object"; under the Option C fix the
// ping callback wraps postMessage in try/catch and initiates a
// clean reconnect via reconnectPort(). The test asserts BOTH that
// no Uncaught Error escapes AND that a fresh port has been
// requested (the visible side-effect of the reconnect path).
expect(() => pingCb()).not.toThrow();
expect(ports.length).toBeGreaterThanOrEqual(2);
// Restore globals.
globalThis.setInterval = originalSetInterval;
globalThis.clearInterval = originalClearInterval;
globalThis.setTimeout = originalSetTimeout;
globalThis.clearTimeout = originalClearTimeout;
});
});