File size: 6,863 Bytes

ff78003

/**
 * Wave B — pre-promotion regression suite.
 *
 * The auto-promote gate runs the full archived suite against any
 * candidate variant before flipping `active_variant_id`. The samples
 * themselves are owned by admins (manually curated representative
 * inputs), so this module only knows how to:
 *   - list/archive samples, and
 *   - replay them against a variant via the registered NetworkRunner.
 *
 * The runner is the same `runNetwork()` that powers production, so the
 * suite genuinely exercises the candidate's internal subgraph rather
 * than a stub. Runtime failures (no handler, throw) score 0 on that
 * sample — the gate's pass-criterion is "every sample meets its floor",
 * so a single thrown handler will block promotion.
 */
import { and, desc, eq } from "drizzle-orm";
import {
  db,
  networkRegressionSamples,
  networkVersions,
  toolNetworks,
  type NetworkRegressionSampleRow,
} from "@workspace/db";
import { newId } from "../ids";
import { runNetwork } from "../tool-network";
import { gradeNetworkResult } from "../reviewer";

/**
 * Fields accepted by `archiveSample` when storing a new regression sample.
 * Optional fields are given defaults by `archiveSample` itself.
 */
export interface ArchiveSampleInput {
  /** Id of the network the sample belongs to; `listSamples` filters on it. */
  networkId: string;
  /** Stored verbatim on the row; this module does not interpret it. */
  problemClassPath: string;
  /** Optional display label; `archiveSample` stores "" when omitted. */
  label?: string;
  /** Input that is replayed through the network runner when the suite runs. */
  inputPayload: Record<string, unknown>;
  /**
   * Minimum score the sample must reach to pass. `archiveSample` defaults
   * it to 0.6 and clamps the stored value into [0, 1].
   */
  expectedFloor?: number;
  /**
   * Optional structural spec (`must_include_keys` / `must_include_paths`)
   * checked against the runner output; `archiveSample` stores `{}` when omitted.
   */
  expectedShape?: Record<string, unknown>;
  /** Creator recorded on the row; `archiveSample` stores "system" when omitted. */
  createdBy?: string;
}

/**
 * Insert a new regression sample with status "active" and return the
 * stored row.
 *
 * Defaults applied here: `label` → "", `expectedFloor` → 0.6 (then clamped
 * into [0, 1]), `expectedShape` → {}, `createdBy` → "system".
 *
 * @param input - Sample fields; see `ArchiveSampleInput`.
 * @returns The row as read back from the database after the insert.
 * @throws Error if the freshly inserted row cannot be read back.
 */
export async function archiveSample(
  input: ArchiveSampleInput,
): Promise<NetworkRegressionSampleRow> {
  const id = newId("nrgs");
  await db.insert(networkRegressionSamples).values({
    id,
    networkId: input.networkId,
    problemClassPath: input.problemClassPath,
    label: input.label ?? "",
    inputPayload: input.inputPayload as Record<string, unknown>,
    expectedFloor: clamp01(input.expectedFloor ?? 0.6),
    expectedShape: (input.expectedShape ?? {}) as Record<string, unknown>,
    status: "active",
    createdBy: input.createdBy ?? "system",
  });

  // Read the row back rather than echoing the input — presumably so that
  // database-populated columns (e.g. createdAt) are included; confirm
  // against the table definition.
  const [row] = await db
    .select()
    .from(networkRegressionSamples)
    .where(eq(networkRegressionSamples.id, id))
    .limit(1);

  // Fail loudly instead of returning `undefined` typed as a row.
  if (!row) {
    throw new Error(`regression sample ${id} not found after insert`);
  }
  return row;
}

/**
 * Fetch the regression samples of one network that are in the given
 * status, ordered by `createdAt` descending (newest first).
 *
 * @param networkId - Network whose samples are listed.
 * @param status - Sample status to filter on; defaults to "active".
 * @returns The matching rows; empty when nothing matches.
 */
export async function listSamples(
  networkId: string,
  status: "active" | "archived" = "active",
): Promise<NetworkRegressionSampleRow[]> {
  const belongsToNetwork = eq(networkRegressionSamples.networkId, networkId);
  const hasStatus = eq(networkRegressionSamples.status, status);

  const rows = await db
    .select()
    .from(networkRegressionSamples)
    .where(and(belongsToNetwork, hasStatus))
    .orderBy(desc(networkRegressionSamples.createdAt));
  return rows;
}

/** Outcome of replaying a single regression sample against a variant. */
export interface SampleResult {
  /** Id of the replayed sample row. */
  sampleId: string;
  /** Label copied from the sample row. */
  label: string;
  /** Floor copied from the sample row; the score must be >= this to pass. */
  expectedFloor: number;
  /** Score obtained for this run; stays 0 when the run threw before grading. */
  achievedScore: number;
  /** True only when the run did not throw, the floor was met, and the shape check passed. */
  passed: boolean;
  /** Message of the error thrown while running or grading; absent on a clean run. */
  errorText?: string;
  /** Result of the structural (`expected_shape`) check on the runner output. */
  shapeOk: boolean;
  /** Human-readable descriptions of each failed structural assertion. */
  shapeIssues: string[];
}

/** Aggregate outcome of replaying a network's active samples against one variant. */
export interface SuiteResult {
  /** Network whose samples were replayed. */
  networkId: string;
  /** Variant the samples were replayed against. */
  variantId: string;
  /** Number of samples replayed (equals `results.length`). */
  totalSamples: number;
  /** Count of samples whose `passed` flag is true. */
  passed: number;
  /** Count of samples that did not pass (`totalSamples - passed`). */
  failed: number;
  /** Per-sample outcomes, in the order the samples were replayed. */
  results: SampleResult[];
  /**
   * True ⇔ there is at least one sample and every sample met its floor
   * and structural assertions. An empty suite yields false.
   */
  allPassed: boolean;
}

/**
 * Replay every active sample of a network against the given variant.
 *
 * Each sample is run through `runNetwork` with `variantOverride` set to
 * the candidate, and the result is scored with
 * `gradeNetworkResult(...).score`. A sample passes only when:
 *   - neither the run nor the grading threw,
 *   - the score is >= the sample's `expectedFloor`, and
 *   - the structural check on the output succeeds.
 *
 * `expected_shape` is a best-effort structural check:
 *   - `must_include_keys: string[]` ⇒ every key must appear at the top
 *     level of `result.output`.
 *   - `must_include_paths: string[]` ⇒ dot-paths must resolve to a value
 *     that is neither `undefined` nor `null`.
 *
 * Samples are replayed one at a time, in the order `listSamples` returns
 * them. A thrown error is recorded on that sample (score 0) and never
 * aborts the rest of the suite.
 *
 * NOTE(review): the variant is only checked for existence, not for
 * belonging to `networkId` — confirm whether callers guarantee that.
 *
 * @param networkId - Network whose active samples are replayed.
 * @param variantId - Candidate variant passed to the runner as override.
 * @returns Per-sample results plus aggregate counts; `allPassed` is false
 *   when there are no samples.
 * @throws Error if the network or the variant row does not exist.
 */
export async function runSuiteAgainstVariant(
  networkId: string,
  variantId: string,
): Promise<SuiteResult> {
  const [network] = await db
    .select()
    .from(toolNetworks)
    .where(eq(toolNetworks.id, networkId))
    .limit(1);
  if (!network) throw new Error(`network ${networkId} not found`);

  const [variant] = await db
    .select()
    .from(networkVersions)
    .where(eq(networkVersions.id, variantId))
    .limit(1);
  if (!variant) throw new Error(`variant ${variantId} not found`);

  const samples = await listSamples(networkId, "active");
  const results: SampleResult[] = [];
  for (const s of samples) {
    let achieved = 0;
    let errorText: string | undefined;
    let output: Record<string, unknown> | null = null;
    try {
      const r = await runNetwork({
        networkName: network.name,
        input: (s.inputPayload as Record<string, unknown>) ?? {},
        variantOverride: variantId,
        actor: "regression_suite",
      });
      output = (r.output as Record<string, unknown>) ?? null;
      achieved = gradeNetworkResult({
        network: { id: networkId, name: network.name },
        result: r,
      }).score;
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // An empty message must still count as a failure: keep errorText
      // non-empty so neither this gate nor downstream truthiness checks
      // mistake a thrown run for a clean one.
      errorText =
        message !== "" ? message : "runner threw with an empty error message";
    }
    const shape = checkShape(
      output,
      (s.expectedShape as Record<string, unknown>) ?? {},
    );
    // Compare against undefined rather than relying on truthiness so that
    // any thrown error blocks the sample, whatever its message.
    const threw = errorText !== undefined;
    const passed = !threw && achieved >= s.expectedFloor && shape.ok;
    results.push({
      sampleId: s.id,
      label: s.label,
      expectedFloor: s.expectedFloor,
      achievedScore: achieved,
      passed,
      errorText,
      shapeOk: shape.ok,
      shapeIssues: shape.issues,
    });
  }
  const passedN = results.filter((r) => r.passed).length;
  return {
    networkId,
    variantId,
    totalSamples: results.length,
    passed: passedN,
    failed: results.length - passedN,
    results,
    // An empty suite proves nothing, so it never counts as "all passed".
    allPassed: results.length > 0 && passedN === results.length,
  };
}

/**
 * Best-effort structural check of a runner output against a sample's
 * `expected_shape` spec.
 *
 * Recognised spec fields (non-string entries are ignored):
 *   - `must_include_keys`: each key must be an own property of `output`.
 *   - `must_include_paths`: each dot-path must resolve, through own
 *     properties only, to a value that is neither `undefined` nor `null`.
 *
 * @param output - Runner output, or null when the run produced none.
 * @param expected - The spec; an empty spec always passes.
 * @returns `ok` plus one human-readable issue per failed assertion.
 */
function checkShape(
  output: Record<string, unknown> | null,
  expected: Record<string, unknown>,
): { ok: boolean; issues: string[] } {
  if (!expected || Object.keys(expected).length === 0) {
    return { ok: true, issues: [] };
  }
  if (!output) {
    return { ok: false, issues: ["output is null"] };
  }
  // The declared type is only a compile-time promise; a truthy primitive
  // here would make property checks throw, so report it as a shape issue.
  if (typeof output !== "object") {
    return { ok: false, issues: ["output is not an object"] };
  }

  const issues: string[] = [];
  for (const key of stringList(expected.must_include_keys)) {
    // Own-property check: inherited names such as "toString" must not
    // satisfy a required key.
    if (!Object.prototype.hasOwnProperty.call(output, key)) {
      issues.push(`missing top-level key: ${key}`);
    }
  }
  for (const path of stringList(expected.must_include_paths)) {
    if (!resolveDotPath(output, path)) issues.push(`missing path: ${path}`);
  }
  return { ok: issues.length === 0, issues };
}

/** Return the string entries of `value` when it is an array, else an empty list. */
function stringList(value: unknown): string[] {
  if (!Array.isArray(value)) return [];
  return (value as unknown[]).filter(
    (x): x is string => typeof x === "string",
  );
}

/**
 * Report whether `path` (segments separated by ".") resolves inside `obj`
 * to a value that is neither `undefined` nor `null`.
 *
 * Every segment must be an own property of an object (arrays included);
 * inherited properties do not count.
 */
function resolveDotPath(obj: unknown, path: string): boolean {
  let cur: unknown = obj;
  for (const segment of path.split(".")) {
    if (
      cur === null ||
      typeof cur !== "object" ||
      !Object.prototype.hasOwnProperty.call(cur, segment)
    ) {
      return false;
    }
    cur = (cur as Record<string, unknown>)[segment];
  }
  return cur !== undefined && cur !== null;
}

/**
 * Restrict `n` to the closed interval [0, 1].
 * Non-finite input (NaN, +Infinity, -Infinity) maps to 0.
 */
function clamp01(n: number): number {
  if (!Number.isFinite(n)) {
    return 0;
  }
  return n < 0 ? 0 : n > 1 ? 1 : n;
}