File size: 11,119 Bytes
5871090
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
/**
 * task-ledger — durable per-task state for long-running multi-task jobs
 * (Task #176, Wave A).
 *
 * Each plan (execution_plans row) owns N task_ledger rows keyed by
 * (plan_id, task_key). The ledger survives restarts: on resume the
 * runner re-walks rows in `pending|running` status, retrying up to
 * `max_attempts` and respecting the bounded concurrency.
 *
 * This module is storage + a simple bounded-pool scheduler. The actual
 * task work is supplied by the caller as an async function.
 */
import { and, eq, inArray, sql } from "drizzle-orm";
import { db, taskLedger, type TaskLedgerRow } from "@workspace/db";
import { newId } from "./ids";
import { logger } from "./logger";

/**
 * Lifecycle states of a task_ledger row. Within this module rows are
 * inserted as `pending`, claimed as `running`, and end up `done` or
 * `failed`. Nothing in this file writes `skipped` — presumably it is set
 * by other code; confirm before relying on it.
 */
export type TaskLedgerStatus =
  "pending" | "running" | "done" | "failed" | "skipped";

/** Description of a single task to be seeded into the ledger. */
export interface SeedTaskInput {
  /** Plan that owns the task; stored in the row's `planId` column. */
  planId: string;
  /** Task identifier within the plan; stored in the row's `taskKey` column. */
  taskKey: string;
  /** Optional task parameters; `seedTasks` stores `{}` when omitted. */
  params?: Record<string, unknown>;
  /** Optional attempt budget; `seedTasks` stores 3 when omitted. */
  maxAttempts?: number;
}

/**
 * Idempotently insert task rows. Rows that collide with an existing row
 * (presumably via the unique key on (plan_id, task_key) — confirm against
 * the schema) are left untouched, so re-seeding during a retry is safe.
 *
 * Conflicts are resolved by the database with `ON CONFLICT DO NOTHING`
 * rather than by catching the unique-violation error and matching its
 * message text: message matching breaks when the driver wraps or localizes
 * the error, and a failed INSERT would abort any enclosing transaction.
 *
 * @param rows Tasks to seed; an empty array is a no-op.
 * @returns The number of rows actually inserted (conflicting rows are not
 *   counted).
 * @throws Any database error other than a uniqueness conflict.
 */
export async function seedTasks(rows: SeedTaskInput[]): Promise<number> {
  if (rows.length === 0) return 0;
  // Insert in bounded chunks: one round trip per chunk instead of one per
  // row, while staying well below the driver's bind-parameter limit.
  const CHUNK_SIZE = 500;
  let inserted = 0;
  for (let offset = 0; offset < rows.length; offset += CHUNK_SIZE) {
    const values = rows.slice(offset, offset + CHUNK_SIZE).map((r) => ({
      id: newId("tldg"),
      planId: r.planId,
      taskKey: r.taskKey,
      params: r.params ?? {},
      maxAttempts: r.maxAttempts ?? 3,
      status: "pending" as const,
    }));
    // RETURNING only yields rows that were really inserted, so its length
    // is the exact number of newly seeded tasks in this chunk.
    const created = await db
      .insert(taskLedger)
      .values(values)
      .onConflictDoNothing()
      .returning({ id: taskLedger.id });
    inserted += created.length;
  }
  return inserted;
}

/**
 * Fetch the ledger rows that belong to `planId`.
 *
 * @param planId Plan whose rows are returned.
 * @param statusFilter Optional list of statuses to keep. When omitted or
 *   empty, every row of the plan is returned.
 * @returns The matching rows, in whatever order the database yields them.
 */
export async function listTasks(
  planId: string,
  statusFilter?: TaskLedgerStatus[],
): Promise<TaskLedgerRow[]> {
  const byPlan = eq(taskLedger.planId, planId);
  if (!statusFilter || statusFilter.length === 0) {
    return db.select().from(taskLedger).where(byPlan);
  }
  const byPlanAndStatus = and(
    byPlan,
    inArray(taskLedger.status, statusFilter),
  );
  return db.select().from(taskLedger).where(byPlanAndStatus);
}

/**
 * Count the ledger rows of a plan, overall and per status.
 *
 * Only the `status` column is fetched. A row whose status is not one of
 * the five known values still contributes to `total` but to no per-status
 * counter.
 *
 * @param planId Plan to summarize.
 * @returns `total` (all rows of the plan) plus one counter per known status.
 */
export async function ledgerSummary(planId: string): Promise<{
  total: number;
  pending: number;
  running: number;
  done: number;
  failed: number;
  skipped: number;
}> {
  const rows = await db
    .select({ status: taskLedger.status })
    .from(taskLedger)
    .where(eq(taskLedger.planId, planId));
  const counts = {
    pending: 0,
    running: 0,
    done: 0,
    failed: 0,
    skipped: 0,
  };
  for (const { status } of rows) {
    // Own-property check on the status-only map: unlike `status in out`
    // this can neither hit the `total` key nor inherited keys such as
    // "toString" or "constructor".
    if (Object.prototype.hasOwnProperty.call(counts, status)) {
      counts[status as keyof typeof counts]++;
    }
  }
  return { total: rows.length, ...counts };
}

/**
 * Claim up to `n` pending rows of `planId` in one statement.
 *
 * A CTE selects the oldest pending rows (`created_at`, then `id`) under
 * `FOR UPDATE SKIP LOCKED`, and the surrounding UPDATE flips them to
 * `running`, increments `attempts`, and stamps `started_at`/`updated_at`.
 * Because select and update are a single statement, concurrent callers
 * (or successive drain-loop iterations) cannot claim the same row.
 *
 * @param planId Plan whose pending rows are claimed.
 * @param n Maximum number of rows to claim; values `<= 0` claim nothing.
 * @returns The claimed rows, converted from the snake_case RETURNING
 *   columns into the camelCase `TaskLedgerRow` shape.
 */
async function claimNextBatch(
  planId: string,
  n: number,
): Promise<TaskLedgerRow[]> {
  if (n <= 0) return [];

  // Raw row as produced by `RETURNING task_ledger.*`. Timestamps are typed
  // `Date | string` and always passed through `new Date(...)` below.
  type ClaimedRow = {
    id: string;
    plan_id: string;
    task_key: string;
    params: Record<string, unknown>;
    status: TaskLedgerStatus;
    attempts: number;
    max_attempts: number;
    result: Record<string, unknown> | null;
    error_text: string | null;
    metrics: Record<string, unknown> | null;
    duration_ms: number | null;
    started_at: Date | string | null;
    finished_at: Date | string | null;
    created_at: Date | string;
    updated_at: Date | string;
  };

  const result = await db.execute<ClaimedRow>(sql`
    WITH claimed AS (
      SELECT id FROM task_ledger
      WHERE plan_id = ${planId} AND status = 'pending'
      ORDER BY created_at, id
      LIMIT ${n}
      FOR UPDATE SKIP LOCKED
    )
    UPDATE task_ledger
    SET status = 'running',
        attempts = task_ledger.attempts + 1,
        started_at = NOW(),
        updated_at = NOW()
    FROM claimed
    WHERE task_ledger.id = claimed.id
    RETURNING task_ledger.*
  `);

  // The result is accepted in two shapes: an object exposing `.rows`, or
  // the row array itself.
  const wrapped = result as unknown as { rows?: ClaimedRow[] };
  const rawRows = wrapped.rows ?? (result as unknown as ClaimedRow[]);

  const toDate = (value: Date | string): Date => new Date(value);
  const toDateOrNull = (value: Date | string | null): Date | null =>
    value ? new Date(value) : null;

  const claimedRows = rawRows.map((raw) => ({
    id: raw.id,
    planId: raw.plan_id,
    taskKey: raw.task_key,
    params: raw.params,
    status: raw.status,
    attempts: raw.attempts,
    maxAttempts: raw.max_attempts,
    result: raw.result,
    errorText: raw.error_text,
    metrics: raw.metrics,
    durationMs: raw.duration_ms,
    startedAt: toDateOrNull(raw.started_at),
    finishedAt: toDateOrNull(raw.finished_at),
    createdAt: toDate(raw.created_at),
    updatedAt: toDate(raw.updated_at),
  }));
  return claimedRows as TaskLedgerRow[];
}

/**
 * Persist a successful attempt: set the row to `done` and store the
 * runner's result and metrics together with the elapsed time.
 *
 * @param row Ledger row that was executed; only its `id` is used.
 * @param result Result payload returned by the runner.
 * @param metrics Optional metrics from the runner, or `null`.
 * @param startMs `Date.now()` value captured when the attempt started;
 *   used to compute `durationMs`.
 */
async function markDone(
  row: TaskLedgerRow,
  result: Record<string, unknown>,
  metrics: Record<string, unknown> | null,
  startMs: number,
): Promise<void> {
  const durationMs = Date.now() - startMs;
  const finishedAt = new Date();
  const updatedAt = new Date();
  const completion = db.update(taskLedger).set({
    status: "done",
    result,
    metrics,
    durationMs,
    finishedAt,
    updatedAt,
  });
  await completion.where(eq(taskLedger.id, row.id));
}

/**
 * Persist a failed attempt. The row goes back to `pending` when it will be
 * retried and is parked in `failed` otherwise; in both cases the error
 * text, elapsed time and finish timestamp of this attempt are recorded.
 *
 * @param row Ledger row that was executed; only its `id` is used.
 * @param error Whatever the runner threw. `Error` instances contribute
 *   their `message`; other values are stringified, with a JSON fallback
 *   for plain objects so the stored text is not just "[object Object]".
 * @param startMs `Date.now()` value captured when the attempt started;
 *   used to compute `durationMs`.
 * @param willRetry Whether the row still has attempts left.
 */
async function markFailed(
  row: TaskLedgerRow,
  error: unknown,
  startMs: number,
  willRetry: boolean,
): Promise<void> {
  let errText: string;
  if (error instanceof Error) {
    errText = error.message;
  } else {
    errText = String(error);
    if (errText === "[object Object]") {
      // A thrown plain object carries its information in its fields;
      // the default string tag would discard all of it.
      try {
        errText = JSON.stringify(error) ?? errText;
      } catch {
        // Circular or otherwise unserializable: keep the generic tag.
      }
    }
  }
  await db
    .update(taskLedger)
    .set({
      status: willRetry ? "pending" : "failed",
      errorText: errText,
      durationMs: Date.now() - startMs,
      finishedAt: new Date(),
      updatedAt: new Date(),
    })
    .where(eq(taskLedger.id, row.id));
}

/**
 * Caller-supplied worker that performs the actual work for one ledger row.
 * It resolves with the task's `result` and, optionally, `metrics`; a
 * rejection is recorded by `runPlan` as a failed attempt.
 */
export type TaskRunner<O> = (
  row: TaskLedgerRow,
) => Promise<{ result: O; metrics?: Record<string, unknown> }>;

/** Tuning knobs and hooks accepted by `runPlan`. */
export interface RunPlanOptions {
  /**
   * Maximum number of tasks run at the same time. `runPlan` defaults this
   * to 8 and clamps it to the range 1..32.
   */
  concurrency?: number;
  /**
   * Invoked by `runPlan` after each attempt settles. `completed` and
   * `failed` are running totals for the current call, `total` is the
   * number of ledger rows of the plan, and `last` describes the attempt
   * that just finished (`pending` there means the row will be retried).
   */
  onProgress?: (snapshot: {
    completed: number;
    failed: number;
    total: number;
    last?: { taskKey: string; status: TaskLedgerStatus; durationMs?: number };
  }) => void;
}

/**
 * Walk all `pending` rows for a plan, dispatching them to `runner` with
 * bounded concurrency. Each row exhausts its `max_attempts` before
 * being parked in `failed`. Returns when no more `pending` rows remain
 * (regardless of failures), or when the iteration cap is hit (a warning
 * is logged in that case).
 *
 * Rows left in `running` by a previous, interrupted run are first reset to
 * `pending` so they are picked up again.
 *
 * @param planId Plan whose ledger rows are executed.
 * @param runner Async worker invoked once per claimed row. A rejection (or
 *   a failure to persist its result) counts as a failed attempt.
 * @param opts Optional concurrency limit (default 8, clamped to 1..32,
 *   fractional values floored) and progress callback. Exceptions thrown by
 *   `onProgress` are logged and ignored; they never change task state.
 * @returns Counters for this call: rows completed, rows that exhausted
 *   their retries, and the total number of ledger rows of the plan.
 * @throws Database errors from claiming rows or from recording a failure.
 *   In-flight workers are awaited before the error propagates.
 */
export async function runPlan<O extends Record<string, unknown>>(
  planId: string,
  runner: TaskRunner<O>,
  opts: RunPlanOptions = {},
): Promise<{
  completed: number;
  failed: number;
  total: number;
}> {
  // Sanitize the requested concurrency: NaN would make every slot
  // computation NaN (so nothing would ever be claimed) and a fractional
  // value would end up as the SQL LIMIT.
  const requested = opts.concurrency ?? 8;
  const concurrency = Number.isNaN(requested)
    ? 8
    : Math.max(1, Math.min(Math.floor(requested), 32));
  // Recover stale `running` rows from a previous crashed/killed runner so
  // `seedTasks → process death → runPlan` resumes correctly. We can do
  // this safely here because runPlan is the single writer for a planId
  // and the API only allows one in-flight execute at a time (status gate
  // in benchmark.ts: only `approved` may run, and we flip to `running`
  // before claiming). For multi-process deployments this should later
  // grow a heartbeat / claim-lease check.
  const stale = await db
    .update(taskLedger)
    .set({
      status: "pending",
      errorText: "recovered_from_stale_running",
      updatedAt: new Date(),
    })
    .where(
      and(eq(taskLedger.planId, planId), eq(taskLedger.status, "running")),
    )
    .returning({ id: taskLedger.id });
  if (stale.length > 0) {
    logger.warn(
      { planId, recovered: stale.length },
      "task-ledger: recovered stale running rows on resume",
    );
  }
  const totalRows = await db
    .select({ id: taskLedger.id })
    .from(taskLedger)
    .where(eq(taskLedger.planId, planId));
  const total = totalRows.length;
  let completed = 0;
  let failed = 0;
  // Report progress without letting a faulty callback influence the run:
  // a throw from onProgress must not be mistaken for a task failure (which
  // would flip an already-`done` row back to `pending`/`failed`) and must
  // not reject the worker.
  const notify = (last: {
    taskKey: string;
    status: TaskLedgerStatus;
    durationMs?: number;
  }): void => {
    if (!opts.onProgress) return;
    try {
      opts.onProgress({ completed, failed, total, last });
    } catch (err) {
      logger.warn(
        { err, planId, taskKey: last.taskKey },
        "task-ledger: onProgress callback threw; ignoring",
      );
    }
  };
  // NOTE: rows passed to runOne have already been atomically transitioned
  // to "running" by claimNextBatch (with attempts incremented), so the
  // attempt counter must not be touched again here.
  const runOne = async (row: TaskLedgerRow): Promise<void> => {
    const startMs = Date.now();
    try {
      const out = await runner(row);
      await markDone(row, out.result, out.metrics ?? null, startMs);
    } catch (err) {
      // `row.attempts` already reflects the in-flight attempt because
      // claimNextBatch incremented it during the atomic claim. So if
      // attempts == maxAttempts, this attempt was the final one.
      const willRetry = row.attempts < row.maxAttempts;
      await markFailed(row, err, startMs, willRetry);
      if (!willRetry) {
        failed++;
        logger.warn(
          { err, taskKey: row.taskKey, attempts: row.attempts },
          "task-ledger: row exhausted retries",
        );
      }
      notify({
        taskKey: row.taskKey,
        status: willRetry ? "pending" : "failed",
        durationMs: Date.now() - startMs,
      });
      return;
    }
    completed++;
    notify({
      taskKey: row.taskKey,
      status: "done",
      durationMs: Date.now() - startMs,
    });
  };
  // Drain loop with race-free claim + slot-availability waiting:
  // 1) try to atomically claim up to `slots` rows via FOR UPDATE SKIP LOCKED
  // 2) launch a worker per claimed row
  // 3) if no rows were claimed but inflight workers exist, wait for the
  //    next one to finish (Promise.race) — failed-with-retry rows surface
  //    back to `pending` via markFailed and become claimable again next
  //    iteration.
  const inflight = new Set<Promise<void>>();
  // Cap unbounded growth in catastrophic-retry scenarios.
  let budget = total * 8 + 64;
  let drained = false;
  try {
    while (budget-- > 0) {
      const slots = concurrency - inflight.size;
      const claimed = slots > 0 ? await claimNextBatch(planId, slots) : [];
      if (claimed.length > 0) {
        for (const row of claimed) {
          const p = runOne(row).finally(() => {
            inflight.delete(p);
          });
          inflight.add(p);
        }
        continue;
      }
      if (inflight.size === 0) {
        // No claimable work and nothing inflight → drain complete.
        drained = true;
        break;
      }
      await Promise.race(inflight);
    }
    if (!drained) {
      // The cap was reached before the ledger was observed empty; make
      // that visible instead of returning as if the plan had drained.
      logger.warn(
        { planId, total },
        "task-ledger: drain loop hit its iteration cap; pending rows may remain",
      );
    }
  } finally {
    // Wait for any tail tasks to settle before returning — also when the
    // loop is unwinding because of an error, so no worker keeps running
    // (and writing to the ledger) after runPlan has rejected.
    if (inflight.size > 0) {
      await Promise.allSettled(inflight);
    }
  }
  return { completed, failed, total };
}