// doatlas-2 / artifacts /api-server /src /lib /evolution /regression-suite.ts
// Iostream-Li's picture
// Add files using upload-large-folder tool
// ff78003 verified
/**
* Wave B — pre-promotion regression suite.
*
* The auto-promote gate runs the full archived suite against any
* candidate variant before flipping `active_variant_id`. The samples
* themselves are owned by admins (manually curated representative
* inputs), so this module only knows how to:
* - list/archive samples, and
* - replay them against a variant via the registered NetworkRunner.
*
* The runner is the same `runNetwork()` that powers production, so the
* suite genuinely exercises the candidate's internal subgraph rather
* than a stub. Runtime failures (no handler, throw) score 0 on that
* sample — the gate's pass-criterion is "every sample meets its floor",
* so a single thrown handler will block promotion.
*/
import { and, desc, eq } from "drizzle-orm";
import {
db,
networkRegressionSamples,
networkVersions,
toolNetworks,
type NetworkRegressionSampleRow,
} from "@workspace/db";
import { newId } from "../ids";
import { runNetwork } from "../tool-network";
import { gradeNetworkResult } from "../reviewer";
/** Input accepted by `archiveSample` when storing a new regression sample. */
export interface ArchiveSampleInput {
/** Stored as the sample's network id; `listSamples` filters on this column. */
networkId: string;
/** Stored verbatim on the sample row. */
problemClassPath: string;
/** Optional label; stored as "" when omitted. */
label?: string;
/** Payload that the suite replays as the `input` of `runNetwork`. */
inputPayload: Record<string, unknown>;
/** Minimum score the sample must reach; defaults to 0.6 and is clamped to [0, 1] on insert. */
expectedFloor?: number;
/** Structural expectations (`must_include_keys`, `must_include_paths`); stored as {} when omitted. */
expectedShape?: Record<string, unknown>;
/** Stored as the row's creator; defaults to "system" when omitted. */
createdBy?: string;
}
/**
 * Insert a new regression sample with status "active" and return the stored row.
 *
 * Defaults applied on insert: `label` → "", `expectedFloor` → 0.6 (then
 * clamped to [0, 1]), `expectedShape` → {}, `createdBy` → "system".
 *
 * @param input - Sample definition supplied by the caller.
 * @returns The row as read back from the database after the insert.
 * @throws Error if the freshly inserted row cannot be read back.
 */
export async function archiveSample(
  input: ArchiveSampleInput,
): Promise<NetworkRegressionSampleRow> {
  const id = newId("nrgs");
  await db.insert(networkRegressionSamples).values({
    id,
    networkId: input.networkId,
    problemClassPath: input.problemClassPath,
    label: input.label ?? "",
    inputPayload: input.inputPayload as Record<string, unknown>,
    expectedFloor: clamp01(input.expectedFloor ?? 0.6),
    expectedShape: (input.expectedShape ?? {}) as Record<string, unknown>,
    status: "active",
    createdBy: input.createdBy ?? "system",
  });

  // Read the row back so the caller gets any database-populated columns.
  const rows = await db
    .select()
    .from(networkRegressionSamples)
    .where(eq(networkRegressionSamples.id, id))
    .limit(1);
  const row = rows[0];
  // Fail loudly instead of returning `undefined` typed as a row.
  if (!row) throw new Error(`regression sample ${id} not found after insert`);
  return row;
}
/**
 * Fetch the regression samples of one network that are in the given status,
 * ordered by `createdAt` descending.
 *
 * @param networkId - Network whose samples are listed.
 * @param status - Sample status to match; defaults to "active".
 * @returns The matching sample rows.
 */
export async function listSamples(
  networkId: string,
  status: "active" | "archived" = "active",
): Promise<NetworkRegressionSampleRow[]> {
  const belongsToNetwork = eq(networkRegressionSamples.networkId, networkId);
  const hasStatus = eq(networkRegressionSamples.status, status);
  const rows = await db
    .select()
    .from(networkRegressionSamples)
    .where(and(belongsToNetwork, hasStatus))
    .orderBy(desc(networkRegressionSamples.createdAt));
  return rows;
}
/** Outcome of replaying a single regression sample against a variant. */
export interface SampleResult {
/** Id of the replayed sample row. */
sampleId: string;
/** Label copied from the sample row. */
label: string;
/** Floor copied from the sample row; the achieved score is compared against it. */
expectedFloor: number;
/** Score returned by `gradeNetworkResult`; stays 0 when the run or grading threw. */
achievedScore: number;
/** True when no error was recorded, the score met the floor, and the shape check passed. */
passed: boolean;
/** Message of the error thrown while running or grading the sample, if any. */
errorText?: string;
/** Whether the output satisfied the sample's `expectedShape`. */
shapeOk: boolean;
/** Human-readable reasons the shape check failed; empty when `shapeOk` is true. */
shapeIssues: string[];
}
/** Aggregate outcome of replaying every active sample of a network against one variant. */
export interface SuiteResult {
/** Network whose samples were replayed. */
networkId: string;
/** Variant the samples were replayed against. */
variantId: string;
/** Number of samples replayed (length of `results`). */
totalSamples: number;
/** Count of entries in `results` with `passed === true`. */
passed: number;
/** `totalSamples - passed`. */
failed: number;
/** One entry per replayed sample. */
results: SampleResult[];
/**
* True ⇔ at least one sample was replayed and every sample passed
* (met its floor and structural assertions). An empty suite is NOT
* considered passing.
*/
allPassed: boolean;
}
/**
 * Replay every active sample of `networkId` against the given variant,
 * one sample at a time, and summarise the outcome.
 *
 * Each sample's score is `gradeNetworkResult(...).score`. How the
 * reviewer derives that score from the runner result is defined in
 * `../reviewer` (an earlier note here said `result.fitness` or
 * `result.metrics.reviewerScore`, falling back to a binary 1.0/0.0
 * from `result.ok` — confirm against the reviewer implementation).
 *
 * A sample passes only when all of the following hold:
 *   - neither the run nor the grading threw,
 *   - the score is >= the sample's `expectedFloor`,
 *   - the output satisfies the sample's `expected_shape`.
 *
 * `expected_shape` is a best-effort structural check:
 *   - `must_include_keys: string[]` ⇒ every key must appear at the top
 *     level of `result.output`.
 *   - `must_include_paths: string[]` ⇒ dot-paths must resolve to a
 *     value that is neither `undefined` nor `null`.
 *
 * @param networkId - Network whose active samples are replayed.
 * @param variantId - Variant passed to the runner as `variantOverride`.
 * @returns Per-sample results plus pass/fail counts; `allPassed` is
 *   false for an empty suite.
 * @throws Error if the network or the variant row does not exist.
 */
export async function runSuiteAgainstVariant(
  networkId: string,
  variantId: string,
): Promise<SuiteResult> {
  const network = (
    await db.select().from(toolNetworks).where(eq(toolNetworks.id, networkId)).limit(1)
  )[0];
  if (!network) throw new Error(`network ${networkId} not found`);

  // NOTE(review): the variant is only checked for existence here, not for
  // membership in `networkId` — confirm whether the runner enforces that.
  const variant = (
    await db.select().from(networkVersions).where(eq(networkVersions.id, variantId)).limit(1)
  )[0];
  if (!variant) throw new Error(`variant ${variantId} not found`);

  const samples = await listSamples(networkId, "active");
  const results: SampleResult[] = [];

  for (const s of samples) {
    let achieved = 0;
    let errorText: string | undefined;
    let output: Record<string, unknown> | null = null;
    try {
      const r = await runNetwork({
        networkName: network.name,
        input: (s.inputPayload as Record<string, unknown>) ?? {},
        variantOverride: variantId,
        actor: "regression_suite",
      });
      output = (r.output as Record<string, unknown>) ?? null;
      achieved = gradeNetworkResult({
        network: { id: networkId, name: network.name },
        result: r,
      }).score;
    } catch (err) {
      // A failure leaves `achieved` at 0 and records the message.
      errorText = err instanceof Error ? err.message : String(err);
    }

    const shape = checkShape(
      output,
      (s.expectedShape as Record<string, unknown>) ?? {},
    );

    // Compare against `undefined`, not truthiness: an error with an empty
    // message (e.g. `throw new Error()`) must still fail the sample, even
    // when the floor is 0 and no shape is required.
    const passed =
      errorText === undefined && achieved >= s.expectedFloor && shape.ok;

    results.push({
      sampleId: s.id,
      label: s.label,
      expectedFloor: s.expectedFloor,
      achievedScore: achieved,
      passed,
      errorText,
      shapeOk: shape.ok,
      shapeIssues: shape.issues,
    });
  }

  const passedN = results.filter((r) => r.passed).length;
  return {
    networkId,
    variantId,
    totalSamples: results.length,
    passed: passedN,
    failed: results.length - passedN,
    results,
    // An empty suite never counts as "all passed".
    allPassed: results.length > 0 && passedN === results.length,
  };
}
/**
 * Best-effort structural check of a run's output against a sample's
 * expected shape.
 *
 * Recognised keys of `expected` (non-string array entries are ignored):
 *   - `must_include_keys`: each key must be an own top-level property of `output`.
 *   - `must_include_paths`: each dot-path must resolve via `resolveDotPath`.
 *
 * @param output - Output of the run, or null when none was produced.
 * @param expected - Shape expectations; an empty object means "no constraints".
 * @returns `ok` plus one issue string per violated expectation.
 */
function checkShape(
  output: Record<string, unknown> | null,
  expected: Record<string, unknown>,
): { ok: boolean; issues: string[] } {
  if (!expected || Object.keys(expected).length === 0) {
    return { ok: true, issues: [] };
  }
  if (!output) {
    return { ok: false, issues: ["output is null"] };
  }
  // The caller obtains `output` through a cast, so it may be a primitive at
  // runtime; report that instead of letting a property probe misbehave.
  if (typeof output !== "object") {
    return { ok: false, issues: ["output is not an object"] };
  }

  // Keep only the string entries of an expectation list; anything else → [].
  const stringEntries = (value: unknown): string[] =>
    Array.isArray(value)
      ? value.filter((x): x is string => typeof x === "string")
      : [];

  const issues: string[] = [];

  for (const k of stringEntries(expected.must_include_keys)) {
    // Own-property check: `in` would also accept inherited names such as
    // "toString" or "constructor", making the assertion vacuous.
    if (!Object.prototype.hasOwnProperty.call(output, k)) {
      issues.push(`missing top-level key: ${k}`);
    }
  }

  for (const p of stringEntries(expected.must_include_paths)) {
    if (!resolveDotPath(output, p)) issues.push(`missing path: ${p}`);
  }

  return { ok: issues.length === 0, issues };
}
/**
 * Report whether a dot-separated path resolves to a usable value.
 *
 * Each segment must be an own property of the current object (array
 * indices such as "0" work because they are own properties of arrays).
 * Inherited properties are not followed, so paths like "constructor" or
 * "a.toString" do not resolve on plain objects.
 *
 * @param obj - Root value to walk.
 * @param path - Dot-separated property path, e.g. "a.b.0.c".
 * @returns True only when every segment exists and the final value is
 *   neither `undefined` nor `null`.
 */
function resolveDotPath(obj: unknown, path: string): boolean {
  let cur: unknown = obj;
  for (const segment of path.split(".")) {
    if (
      cur === null ||
      typeof cur !== "object" ||
      !Object.prototype.hasOwnProperty.call(cur, segment)
    ) {
      return false;
    }
    cur = (cur as Record<string, unknown>)[segment];
  }
  return cur !== undefined && cur !== null;
}
/**
 * Restrict a number to the closed interval [0, 1].
 *
 * Non-finite inputs (NaN, +Infinity, -Infinity) all map to 0.
 *
 * @param n - Value to clamp.
 * @returns 0 for non-finite or negative input, 1 for input above 1,
 *   otherwise `n` unchanged.
 */
function clamp01(n: number): number {
  return !Number.isFinite(n) || n < 0 ? 0 : n > 1 ? 1 : n;
}