// openskynet/experiments/skynet/benchmark-cycle.test.ts
// (page header residue: "Darochin's picture")
// Mirror OpenSkyNet workspace snapshot from Git HEAD
// fc93158 (verified)
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { resolveInternalProjectBenchmarkAuditFile } from "../../src/omega/internal-project.js";
import { runOmegaSelfTimeTick } from "../../src/omega/self-time-daemon.js";
import {
resolveOpenSkynetBenchmarkCycleFile,
resolveOpenSkynetBenchmarkCycleResultFile,
syncOpenSkynetBenchmarkCycleSnapshot,
} from "./benchmark-cycle.js";
describe("benchmark cycle snapshot", () => {
// Scratch workspace shared by every test in this suite: assigned to a fresh
// temp directory before each test and removed again afterwards.
let workspaceRoot = "";

beforeEach(async () => {
  const tempPrefix = path.join(os.tmpdir(), "openskynet-benchmark-cycle-");
  workspaceRoot = await fs.mkdtemp(tempPrefix);
});

afterEach(async () => {
  // `force` keeps cleanup from throwing if the directory no longer exists.
  await fs.rm(workspaceRoot, { force: true, recursive: true });
});
// Happy path on an empty workspace: run one self-time tick, sync the benchmark
// snapshot, then check both the returned snapshot and the file persisted on disk.
it("writes a compact authoritative benchmark snapshot for autonomous cycles", async () => {
  const sessionKey = "agent:openskynet:main";

  // The assertions below expect at least one recorded self-time tick.
  await runOmegaSelfTimeTick({ workspaceRoot, sessionKey });
  const snapshot = await syncOpenSkynetBenchmarkCycleSnapshot({ workspaceRoot, sessionKey });
  const { benchmark, cycleRules, reportingRules, metabolism, coldDirective } = snapshot;

  // Identity, focus and reporting contract.
  expect(snapshot.project.name).toBe("Skynet");
  expect(benchmark.focusKey).toBe("endogenous_science_agenda");
  expect(benchmark.recommendedAction).toContain("Empujar foco activo");
  expect(cycleRules.oneConcreteStepOnly).toBe(true);
  expect(cycleRules.benchmarkSnapshotIsDerived).toBe(true);
  expect(reportingRules.tone).toBe("sober");
  expect(reportingRules.sourcePriority[0]).toContain(".openskynet/internal-project-benchmark");
  expect(reportingRules.migrationDirective).toContain("OpenSkyNet/Omega");
  expect(snapshot.runtime.cycleResultFile).toContain("agent_openskynet_main-last-cycle.json");

  // Budget, acceptance and self-time state expected for a brand-new workspace.
  expect(metabolism.budgetStatus).toBe("healthy");
  expect(metabolism.killRecommended).toBe(false);
  expect(snapshot.acceptance.passing).toBe(false);
  expect(snapshot.selfTime?.tickCount).toBeGreaterThanOrEqual(1);
  expect(typeof snapshot.selfTime?.latestPredictionError).toBe("number");
  expect(snapshot.policyMutation?.candidateFile).toContain("policy-candidate.json");

  // Either directive is accepted; "maintain" carries extra invariants.
  expect(["maintain", "advance"]).toContain(coldDirective.action);
  expect(typeof coldDirective.allowHotPath).toBe("boolean");
  if (coldDirective.action === "maintain") {
    const maintenanceIdPattern = /^maintenance:/;
    expect(coldDirective.allowHotPath).toBe(false);
    expect(coldDirective.enforcedWorkItemId).toMatch(maintenanceIdPattern);
    expect(coldDirective.enforcedWorkItemDetail?.length ?? 0).toBeGreaterThan(0);
    expect(coldDirective.maintenanceItemId).toMatch(maintenanceIdPattern);
    expect(coldDirective.maintenanceDetail?.length ?? 0).toBeGreaterThan(0);
  }

  // The snapshot must also have been written to the resolved cycle file.
  const snapshotPath = resolveOpenSkynetBenchmarkCycleFile({ workspaceRoot, sessionKey });
  const rawSnapshot = await fs.readFile(snapshotPath, "utf-8");
  const persisted = JSON.parse(rawSnapshot) as {
    benchmark?: { score?: number; commitmentTask?: string | null };
  };
  expect(persisted.benchmark?.score).toBeGreaterThan(0);
  expect(persisted.benchmark?.commitmentTask).toContain("Implement one executable");
});
// Seeds a project policy with maxNoProgressCycles = 1 plus a single
// "no-progress" cycle result, then expects the synced snapshot to report an
// exhausted budget with a kill recommendation.
it("marks the benchmark as exhausted when configured no-progress budget is exceeded", async () => {
  const sessionKey = "agent:openskynet:main";

  const projectConfig = {
    name: "Skynet",
    benchmarkPolicy: {
      maxNoProgressCycles: 1,
      maxCyclesWithoutImprovement: 4,
      maxCyclesWithoutArtifact: 5,
      measurementResetsImprovementStreak: false,
    },
  };
  const projectConfigPath = path.join(workspaceRoot, "INTERNAL_PROJECT.json");
  await fs.writeFile(projectConfigPath, JSON.stringify(projectConfig, null, 2), "utf-8");

  // Create the benchmark directory up front, before the cycle result is written.
  const benchmarkDir = path.join(workspaceRoot, ".openskynet", "internal-project-benchmark");
  await fs.mkdir(benchmarkDir, { recursive: true });

  const lastCycle = {
    cycleId: "cycle-1",
    result: {
      kind: "no-progress",
      artifactRef: null,
    },
  };
  const cycleResultPath = resolveOpenSkynetBenchmarkCycleResultFile({ workspaceRoot, sessionKey });
  await fs.writeFile(cycleResultPath, JSON.stringify(lastCycle, null, 2), "utf-8");

  const { metabolism } = await syncOpenSkynetBenchmarkCycleSnapshot({ workspaceRoot, sessionKey });

  expect(metabolism.lastResultKind).toBe("no-progress");
  expect(metabolism.noProgressStreak).toBe(1);
  expect(metabolism.budgetStatus).toBe("exhausted");
  expect(metabolism.killRecommended).toBe(true);
  expect(metabolism.killReason).toContain("no_progress_budget_exhausted");
});
// Seeds a previous snapshot whose metabolism is in "warning" state
// (cyclesSinceImprovement = 3) together with a fresh "artifact" cycle result
// for the autonomy-improvement work item, then expects the sync to reset the
// improvement streak instead of reporting exhaustion.
it("counts an autonomy-improvement artifact as real improvement and avoids false exhaustion", async () => {
  const sessionKey = "agent:openskynet:main";

  const previousSnapshot = {
    metabolism: {
      evaluatedCycleCount: 3,
      lastEvaluatedCycleId: "cycle-3",
      lastResultKind: "measurement",
      lastArtifactRef: null,
      noProgressStreak: 0,
      cyclesSinceImprovement: 3,
      cyclesSinceArtifact: 1,
      budgetStatus: "warning",
      killRecommended: false,
      killReason: null,
    },
  };
  const cycleFile = resolveOpenSkynetBenchmarkCycleFile({ workspaceRoot, sessionKey });
  // The workspace is a freshly created temp directory and nothing in this test
  // has created sub-directories yet. Make sure the parent of each fixture file
  // exists before writing, so the write cannot fail with ENOENT when the
  // resolved path is nested. With `recursive: true` this is a no-op if the
  // directory is already there.
  await fs.mkdir(path.dirname(cycleFile), { recursive: true });
  await fs.writeFile(cycleFile, JSON.stringify(previousSnapshot, null, 2), "utf-8");

  const latestCycle = {
    cycleId: "cycle-4",
    enforcedWorkItemId: "maintenance:agenda:initiative:autonomy_improvement",
    result: {
      kind: "artifact",
      artifactRef: "artifacts/autonomy-probe.py",
    },
  };
  const cycleResultFile = resolveOpenSkynetBenchmarkCycleResultFile({ workspaceRoot, sessionKey });
  await fs.mkdir(path.dirname(cycleResultFile), { recursive: true });
  await fs.writeFile(cycleResultFile, JSON.stringify(latestCycle, null, 2), "utf-8");

  const snapshot = await syncOpenSkynetBenchmarkCycleSnapshot({ workspaceRoot, sessionKey });

  expect(snapshot.metabolism.lastResultKind).toBe("artifact");
  expect(snapshot.metabolism.cyclesSinceImprovement).toBe(0);
  expect(snapshot.metabolism.budgetStatus).not.toBe("exhausted");
  expect(snapshot.metabolism.killRecommended).toBe(false);
});
// Configures an acceptance window of 3 runs (3 clean, 2 resultful required),
// writes three audit records as one JSON document per line, and expects the
// synced snapshot to report acceptance as passing.
it("passes acceptance when enough recent benchmark runs are clean and resultful", async () => {
  const sessionKey = "agent:openskynet:main";

  const projectConfig = {
    benchmarkPolicy: {
      maxNoProgressCycles: 2,
      maxCyclesWithoutImprovement: 4,
      maxCyclesWithoutArtifact: 5,
      measurementResetsImprovementStreak: false,
      acceptanceWindowRuns: 3,
      requiredCleanRuns: 3,
      requiredResultfulRuns: 2,
    },
  };
  await fs.writeFile(
    path.join(workspaceRoot, "INTERNAL_PROJECT.json"),
    JSON.stringify(projectConfig, null, 2),
    "utf-8",
  );

  const auditFile = resolveInternalProjectBenchmarkAuditFile({ workspaceRoot, sessionKey });
  await fs.mkdir(path.dirname(auditFile), { recursive: true });

  // [runId, startedAt, resultKind]; every run ends one tick after it starts
  // and is otherwise identical (status ok, clean contract, fresh result).
  const runTable = [
    ["r1", 1, "artifact"],
    ["r2", 3, "measurement"],
    ["r3", 5, "no-progress"],
  ] as const;
  const auditLines = runTable.map(([runId, startedAt, resultKind]) =>
    JSON.stringify({
      runId,
      sessionKey,
      startedAt,
      endedAt: startedAt + 1,
      status: "ok",
      resultKind,
      contractClean: true,
      usedForbiddenSources: false,
      hasFreshCycleResult: true,
    }),
  );
  await fs.writeFile(auditFile, `${auditLines.join("\n")}\n`, "utf-8");

  const { acceptance } = await syncOpenSkynetBenchmarkCycleSnapshot({ workspaceRoot, sessionKey });

  expect(acceptance.recentRuns).toBe(3);
  expect(acceptance.cleanRuns).toBe(3);
  // Two of the three runs are expected to count as resultful
  // (presumably the "no-progress" run does not).
  expect(acceptance.resultfulRuns).toBe(2);
  expect(acceptance.passing).toBe(true);
});
});