File size: 9,408 Bytes
ff78003 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | /**
* Wave B — the four triggers that wake the evolution worker.
*
* The detection layer: it inspects the recent metrics window for the
* active variant of a network and returns one independent signal per
* trigger. The caller decides whether to schedule a Mode B candidate
* search, an extra shadow run, or to no-op. Gate evaluation is
* deterministic — calling `evaluateTriggers` repeatedly with the same
* DB state yields the same answer — but it is not side-effect free: an
* event is written for every signal that fires, so the admin can see
* the cadence.
*
* Trigger semantics:
* - cadence : the active variant has accumulated ≥ N samples in
* the rolling window since its last cadence event.
* - regression : current rolling fitness < floor, with at least 10
*   samples in the window; the floor is read from
*   `tool_networks.config` (or a sane default) and is compared against
*   `weightedMean` so problem-class weights are honoured.
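* - coverage : ≥ K reviewer rows in the last 7 days flagged
*   `budget_exceeded` or `retries >= maxRetries`. Acts as a coarse
*   "uncovered semantic cluster" proxy until the optional clustering
*   layer lands.
* - external truth : the latest `external_truth_backfilled` event is
*   newer than the latest `external_truth_trigger` event (or no such
*   trigger event exists yet).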
*/
import { and, count, desc, eq, gte, sql } from "drizzle-orm";
import {
db,
networkEvolutionEvents,
networkVersionMetrics,
toolNetworks,
} from "@workspace/db";
import { rollingFitness, type FitnessSummary } from "./fitness";
import { recordEvent, type EvolutionEventKind } from "./events";
/** Fallback for `cadenceN` when the network config supplies no usable value. */
const DEFAULT_CADENCE_N = 30;
/** Fallback for `regressionFloor` when the network config supplies no usable value. */
const DEFAULT_REGRESSION_FLOOR = 0.6;
/** Fallback for `coverageK` when the network config supplies no usable value. */
const DEFAULT_COVERAGE_K = 5;
/** Retry count at or above which a metrics row is counted as a coverage miss. */
const DEFAULT_MAX_RETRIES = 2;
/** Per-network thresholds gating the cadence, regression and coverage triggers. */
export interface TriggerThresholds {
/** Cadence fires once at least this many metric rows exist since the last cadence event. */
cadenceN: number;
/** Regression fires when the weighted-mean fitness is strictly below this value. */
regressionFloor: number;
/** Coverage fires once at least this many coverage-miss rows are in the window. */
coverageK: number;
}
/** Outcome of evaluating a single trigger. */
export interface TriggerSignal {
/** Event kind that is recorded when this signal fires. */
kind: EvolutionEventKind;
/** Whether the trigger condition held during this evaluation. */
fired: boolean;
/** Diagnostic values behind the decision; stored as the event payload when fired. */
payload: Record<string, unknown>;
}
/** Full result of one `evaluateTriggers` call for a network. */
export interface TriggerEvaluation {
/** Id of the evaluated network. */
networkId: string;
/** Active variant the triggers were evaluated against; `null` when the network has none. */
activeVersionId: string | null;
/** Problem-class path copied from the network row. */
problemClassPath: string;
/** Rolling fitness of the active variant; `null` when there is no active variant. */
fitness: FitnessSummary | null;
/** Thresholds that were in effect for this evaluation. */
thresholds: TriggerThresholds;
/** One entry per evaluated trigger, fired or not; empty when there is no active variant. */
signals: TriggerSignal[];
/** True when at least one signal fired. */
anyFired: boolean;
}
/**
 * Resolve the trigger thresholds for a network from the `evolution`
 * block of its `tool_networks.config`.
 *
 * Each field falls back to its module default when it is missing, not
 * coercible to a finite number, or not strictly positive — this keeps
 * cold-start safe. `regressionFloor` is additionally capped at 1.
 *
 * NOTE(review): because only strictly positive values are accepted, a
 * configured `regressionFloor` of 0 silently becomes the default.
 * Confirm that "0 = never fire" is not an expected configuration.
 *
 * Side effect: when the config carries no `evolution` block, a CONT-006
 * quarantine hit is reported in the background (fire-and-forget). A
 * failure to report is logged and never propagates to the caller.
 *
 * @param config Raw network config, or `null` when none is stored.
 * @returns Fully populated thresholds.
 */
function readThresholds(
  config: Record<string, unknown> | null,
): TriggerThresholds {
  const c: Record<string, unknown> = config ?? {};
  const rawEvolution = c.evolution;
  // Only a real object can carry threshold fields; anything else is
  // treated as an empty block so every field resolves to its default.
  const evo: Record<string, unknown> =
    typeof rawEvolution === "object" && rawEvolution !== null
      ? (rawEvolution as Record<string, unknown>)
      : {};

  // QUARANTINE-CONT-006 — trigger thresholds fall back to hard-coded defaults
  // when network config doesn't carry an `evolution` block. B5 will surface a
  // per-problem-class config UI + sensible learned defaults.
  // @deprecated CONT-006. Defaults-as-thresholds. Real fix in B5.
  const usingDefaults = !rawEvolution;
  if (usingDefaults) {
    // Fire-and-forget telemetry: the rejection handler keeps a failed
    // import or a failing recorder from surfacing as an unhandled
    // promise rejection in the worker process.
    void import("../quarantine/index.ts")
      .then((q) =>
        q.recordQuarantineHit("CONT-006", {
          gate: "trigger_thresholds_default",
          site: "evolution/triggers.ts:readThresholds",
          defaults: {
            cadenceN: DEFAULT_CADENCE_N,
            regressionFloor: DEFAULT_REGRESSION_FLOOR,
            coverageK: DEFAULT_COVERAGE_K,
          },
        }),
      )
      .catch((err: unknown) => {
        console.warn(
          "[evolution/triggers] failed to record CONT-006 quarantine hit",
          err,
        );
      });
  }

  // Accept numbers (or values coercible to one) that are finite and > 0;
  // everything else yields the supplied default `d`.
  const num = (v: unknown, d: number): number => {
    const n = typeof v === "number" ? v : Number(v);
    return Number.isFinite(n) && n > 0 ? n : d;
  };

  return {
    cadenceN: num(evo.cadenceN, DEFAULT_CADENCE_N),
    regressionFloor: clamp01(num(evo.regressionFloor, DEFAULT_REGRESSION_FLOOR)),
    coverageK: num(evo.coverageK, DEFAULT_COVERAGE_K),
  };
}
/**
 * Clamp `n` into the closed interval [0, 1].
 *
 * Values below 0 become 0, values above 1 become 1, and anything else
 * (including NaN, for which both comparisons are false) is returned
 * unchanged.
 */
function clamp01(n: number): number {
  return n < 0 ? 0 : n > 1 ? 1 : n;
}
/**
 * Look up the newest `networkEvolutionEvents` row of the given kind for
 * a network. Resolves to `undefined` when no such event exists.
 */
async function latestEventOfKind(
  networkId: string,
  kind: NonNullable<(typeof networkEvolutionEvents.$inferSelect)["kind"]>,
) {
  const [latest] = await db
    .select()
    .from(networkEvolutionEvents)
    .where(
      and(
        eq(networkEvolutionEvents.networkId, networkId),
        eq(networkEvolutionEvents.kind, kind),
      ),
    )
    .orderBy(desc(networkEvolutionEvents.createdAt))
    .limit(1);
  return latest;
}

/**
 * Evaluate the wake-up triggers against the active variant of `networkId`.
 *
 * Four signals are computed — cadence, regression, coverage and external
 * truth — and one event is recorded (via `recordEvent`) for each signal
 * that fired. The full evaluation is returned so the caller
 * (orchestrator / admin endpoint) can render a decision panel even when
 * nothing fired.
 *
 * @param networkId Id of the `tool_networks` row to evaluate.
 * @returns The evaluation. When the network has no active variant it
 *   carries `fitness: null` and no signals, and nothing is recorded.
 * @throws Error when no network with `networkId` exists.
 */
export async function evaluateTriggers(
  networkId: string,
): Promise<TriggerEvaluation> {
  // Named here so the queries and the reported payloads cannot drift apart.
  const MIN_REGRESSION_SAMPLES = 10;
  const COVERAGE_WINDOW_DAYS = 7;

  const [network] = await db
    .select()
    .from(toolNetworks)
    .where(eq(toolNetworks.id, networkId))
    .limit(1);
  if (!network) throw new Error(`network ${networkId} not found`);

  const thresholds = readThresholds(network.config as Record<string, unknown>);
  const { activeVariantId: activeVersionId, problemClassPath } = network;

  // Without an active variant there is nothing to measure.
  if (!activeVersionId) {
    return {
      networkId,
      activeVersionId: null,
      problemClassPath,
      fitness: null,
      thresholds,
      signals: [],
      anyFired: false,
    };
  }

  const fitness = await rollingFitness(
    networkId,
    activeVersionId,
    problemClassPath,
  );

  // ----- Cadence: count samples since last cadence_trigger event ----
  const previousCadence = await latestEventOfKind(networkId, "cadence_trigger");
  // With no earlier cadence event, every row of the active variant counts.
  const sinceLastCadence = previousCadence
    ? gte(networkVersionMetrics.createdAt, previousCadence.createdAt)
    : sql`true`;
  const [cadenceTally] = await db
    .select({ c: count() })
    .from(networkVersionMetrics)
    .where(
      and(
        eq(networkVersionMetrics.networkId, networkId),
        eq(networkVersionMetrics.versionId, activeVersionId),
        sinceLastCadence,
      ),
    );
  const samplesSinceCadence = Number(cadenceTally?.c ?? 0);
  const cadenceFired = samplesSinceCadence >= thresholds.cadenceN;

  // ----- Regression: weighted mean below floor with enough samples ---
  const regressionFired =
    fitness.sampleCount >= MIN_REGRESSION_SAMPLES &&
    fitness.weightedMean < thresholds.regressionFloor;

  // ----- Coverage: budget-exceeded or saturated-retry rows ≥ K ------
  const coverageWindowStart = new Date(
    Date.now() - COVERAGE_WINDOW_DAYS * 24 * 60 * 60 * 1000,
  );
  const recentRows = await db
    .select({
      retries: networkVersionMetrics.retries,
      budgetExceeded: networkVersionMetrics.budgetExceeded,
    })
    .from(networkVersionMetrics)
    .where(
      and(
        eq(networkVersionMetrics.networkId, networkId),
        eq(networkVersionMetrics.versionId, activeVersionId),
        gte(networkVersionMetrics.createdAt, coverageWindowStart),
      ),
    );
  let coverageMisses = 0;
  for (const row of recentRows) {
    if (row.budgetExceeded || (row.retries ?? 0) >= DEFAULT_MAX_RETRIES) {
      coverageMisses += 1;
    }
  }
  const coverageFired = coverageMisses >= thresholds.coverageK;

  // ----- External truth: backfill happened since last consumption -----
  // Task #227 — once the organizer's ground truth has been written to the
  // ledger through the admin import, an `external_truth_backfilled` event
  // is recorded. "The latest backfill is newer than the latest
  // external_truth_trigger" is treated as a wake-up signal, so the
  // scheduler recomputes fitness as soon as new truth arrives (the
  // LEFT JOIN on the ledger already reflects the new weights) and proposes
  // a new candidate when needed. The signal carries `latestBackfillId` in
  // its payload so the admin UI can link back to the originating event.
  const latestBackfill = await latestEventOfKind(
    networkId,
    "external_truth_backfilled",
  );
  const latestConsumption = await latestEventOfKind(
    networkId,
    "external_truth_trigger",
  );
  const externalTruthFired = latestBackfill
    ? !latestConsumption ||
      latestBackfill.createdAt > latestConsumption.createdAt
    : false;
  const latestBackfillAt = latestBackfill?.createdAt
    ? new Date(latestBackfill.createdAt).toISOString()
    : null;

  const signals: TriggerSignal[] = [
    {
      kind: "cadence_trigger",
      fired: cadenceFired,
      payload: {
        sampleCountSinceLast: samplesSinceCadence,
        threshold: thresholds.cadenceN,
        sinceEventId: previousCadence?.id ?? null,
      },
    },
    {
      kind: "regression_trigger",
      fired: regressionFired,
      payload: {
        weightedMean: fitness.weightedMean,
        floor: thresholds.regressionFloor,
        sampleCount: fitness.sampleCount,
        ciLower: fitness.ciLower,
      },
    },
    {
      kind: "coverage_trigger",
      fired: coverageFired,
      payload: {
        misses: coverageMisses,
        threshold: thresholds.coverageK,
        windowDays: COVERAGE_WINDOW_DAYS,
      },
    },
    {
      kind: "external_truth_trigger",
      fired: externalTruthFired,
      payload: {
        latestBackfillId: latestBackfill?.id ?? null,
        latestBackfillAt,
        sinceTriggerId: latestConsumption?.id ?? null,
      },
    },
  ];

  // Persist fired signals one at a time, in declaration order.
  for (const signal of signals.filter((s) => s.fired)) {
    await recordEvent({
      networkId,
      kind: signal.kind,
      variantId: activeVersionId,
      payload: signal.payload,
    });
  }

  const anyFired = signals.some((s) => s.fired);
  return {
    networkId,
    activeVersionId,
    problemClassPath,
    fitness,
    thresholds,
    signals,
    anyFired,
  };
}
|