| """ |
| Phase 1 (part 2): RuleShaping-grounded negative-constraint phrasing for L3 rules. |
| |
| Grounded in RuleShaping — "Do Agent Rules Shape or Distort?" (arXiv:2604.11088, |
| verified against the paper): rules improve +7–14pp; NEGATIVE constraints are the |
| ONLY individually beneficial rule type, POSITIVE directives actively HURT, and rules |
| work through CONTEXT PRIMING (random rules help nearly as much as curated). The |
| highest-benefit category is STATE-DEPENDENT process guardrails (do not X until Y), |
| 63.8%. So L3 rules the slug emits are phrased as negative, preferably state-dependent, |
| guardrails. Individual rules can be harmful in isolation but are collectively helpful |
| (no degradation up to 50 rules), so emitting several is fine. |
| |
| This module provides: |
| - is_negative_constraint(text): heuristic check |
| - to_negative_constraint(gotcha): deterministic reshaping of a gotcha into a |
| guardrail-phrased rule WITHOUT inventing content (it only re-frames what's there). |
| |
| We keep this deterministic and conservative: if we cannot confidently reshape, we |
| return the original prefixed minimally, rather than fabricate a cause. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
# Openers that mark a POSITIVE directive. Each ends with a space so that it only
# matches as a leading word/phrase of a lower-cased gotcha.
_POSITIVE_OPENERS = (
    "always ", "make sure to ", "be sure to ", "remember to ", "you should ",
    "ensure that ", "ensure you ", "prefer ", "use ",
)
# Words/phrases whose presence marks a text as already guardrail-phrased.
# Inflections of "avoid" are listed explicitly because markers are matched as
# whole words (see _NEGATIVE_MARKER_RE), not as raw substrings.
_NEGATIVE_MARKERS = (
    "avoid", "avoids", "avoided", "avoiding",
    "do not", "don't", "never", "without", "fails", "breaks",
    "silently", "instead of", "rather than", "watch out", "beware",
)
# A marker only counts when it is not glued to a letter or digit on either side.
# `[^\W_]` means "alphanumeric", so underscores and punctuation still count as
# boundaries ("fails_silently" matches; "nevertheless", "unavoidable" do not).
_NEGATIVE_MARKER_RE = re.compile(
    r"(?<![^\W_])(?:"
    + "|".join(re.escape(marker) for marker in _NEGATIVE_MARKERS)
    + r")(?![^\W_])"
)


def is_negative_constraint(text: str) -> bool:
    """Heuristically decide whether *text* is already phrased as a guardrail.

    The text is lower-cased, typographic apostrophes are folded to ``'`` and runs
    of whitespace are collapsed, so "Don’t" and "do  not" are recognised. A match
    requires one of ``_NEGATIVE_MARKERS`` to appear as a whole word or phrase;
    plain substring hits such as "never" inside "nevertheless" are ignored.

    Args:
        text: Candidate rule or gotcha.

    Returns:
        True if any negative marker occurs in *text*, otherwise False.
    """
    normalized = " ".join(text.lower().replace("\u2019", "'").split())
    return _NEGATIVE_MARKER_RE.search(normalized) is not None
|
|
|
|
def _split_cause(gotcha: str) -> tuple[str, str | None]:
    """Separate the statement (the 'what') from its stated cause (the 'why').

    Connectives are tried in a fixed priority order; the first one that occurs
    anywhere in *gotcha* wins, and the split happens at its first occurrence.

    Returns:
        ``(what, why)`` with surrounding whitespace removed and trailing periods
        dropped from ``why``; ``why`` is None when no connective is present.
    """
    connectives = (" because ", " since ", "; ", " — ", " - ", ", which ", ", as ")
    for connective in connectives:
        statement, found, reason = gotcha.partition(connective)
        if found:
            return statement.strip(), reason.strip().rstrip(".")
    return gotcha.strip(), None
|
|
|
|
def to_negative_constraint(gotcha: str) -> str:
    """Re-frame a gotcha as a guardrail-phrased rule without adding content.

    Text that already reads as a negative constraint is returned unchanged apart
    from a normalised trailing period. Otherwise the gotcha is split into a claim
    and an optional cause, one leading positive opener is dropped from the claim,
    and the pieces are slotted into a fixed guardrail template. No cause is ever
    invented: templates that mention a cause are used only when one was present.

    Args:
        gotcha: Free-text gotcha.

    Returns:
        The guardrail sentence, or "" when the gotcha is blank.
    """
    text = gotcha.strip().rstrip(".")
    if not text:
        return text
    if is_negative_constraint(text):
        return f"{text}."

    claim, reason = _split_cause(text)

    # Drop at most one positive opener (the first in tuple order that matches).
    lowered = claim.lower()
    opener = next((p for p in _POSITIVE_OPENERS if lowered.startswith(p)), None)
    if opener is not None:
        claim = claim[len(opener):].strip()

    # Claims that already talk about assumptions/guarantees get the
    # "Do not rely on the assumption" template instead of the generic one.
    lowered = claim.lower()
    states_assumption = any(
        cue in lowered for cue in ("assume", "assumes", "always", "is guaranteed")
    )

    if states_assumption and reason:
        return f"Do not rely on the assumption that {claim} — {reason}."
    if states_assumption:
        return f"Do not rely on the assumption that {claim}; verify it explicitly."
    if reason:
        return f"Avoid assuming {claim} holds by default — {reason}."
    return f"Watch out: {claim} is not guaranteed; verify it rather than assuming it."
|
|
|
|
def reshape_rules(gotchas: list[str]) -> list[str]:
    """Map gotchas to guardrail-phrased rules, de-duplicated and order-stable.

    Args:
        gotchas: Free-text gotchas, in the order they should be emitted.

    Returns:
        One rule per distinct gotcha, in first-seen order. Duplicates are
        detected case-insensitively on the reshaped rule. Gotchas that reshape
        to an empty string (blank or period-only input) are skipped, so no empty
        rule is ever emitted.
    """
    seen: set[str] = set()
    rules: list[str] = []
    for gotcha in gotchas:
        rule = to_negative_constraint(gotcha)
        if not rule:
            # A blank gotcha reshapes to ""; an empty rule carries no guidance.
            continue
        key = rule.lower()
        if key not in seen:
            seen.add(key)
            rules.append(rule)
    return rules
|
|
|
|
| |
| |
| |
| |
|
|
# Instruction preamble sent to the completer ahead of the gotcha. It is a single
# line of text: the segments below (intro, then one per numbered rule) are joined
# with exactly one space.
_RESHAPE_SYSTEM = " ".join((
    "You rewrite a single engineering 'gotcha' into a guardrail-style rule. Rules:",
    "(1) Phrase as a NEGATIVE constraint (do not / never / avoid), never a positive"
    " directive (no 'always', 'use', 'prefer') — per RuleShaping (arXiv:2604.11088),"
    " negative constraints are the only individually beneficial rule type; positive"
    " directives actively hurt.",
    "(2) PREFER a STATE-DEPENDENT PROCESS guardrail when the gotcha supports one:"
    " 'do not X until/unless/before Y' (the highest-benefit category, 63.8%), rather"
    " than a state-independent architectural 'don't'.",
    "(3) Do NOT invent any cause/mechanism/detail not present in the input.",
    "(4) One sentence, under 30 words.",
    "(5) Output ONLY the rewritten rule.",
))
|
|
|
|
def to_negative_constraint_llm(gotcha: str, completer) -> str:
    """Reshape a gotcha via a model completer, with a deterministic fallback.

    The completer's answer is accepted only if it passes every guard below;
    otherwise the result of ``to_negative_constraint`` is returned instead.

    Guards (any failure triggers the fallback):
      * the completer raises, or returns something that is not string-like;
      * the answer is empty once whitespace, wrapping double quotes and
        trailing periods are removed;
      * the answer contains no negative marker, or opens with a positive
        directive such as "always " / "use " / "prefer ";
      * the answer introduces a " because " / " since " clause although the
        gotcha itself stated no cause (i.e. the model invented one).

    Args:
        gotcha: Free-text gotcha.
        completer: Callable taking the prompt string and returning the model's
            text completion.

    Returns:
        A single guardrail sentence ending in a period, or "" for a blank gotcha.
    """
    g = gotcha.strip().rstrip(".")
    if not g:
        return g

    try:
        raw = completer(f"{_RESHAPE_SYSTEM}\n\nGOTCHA:\n{g}\n\nGUARDRAIL RULE:")
        # Trim again after removing quotes so `"Do not X. "` does not end up
        # as "Do not X. ." once the period is re-appended.
        body = raw.strip().strip('"').strip().rstrip(".")
    except Exception:
        # Deliberately broad: this path is best-effort, and any completer
        # failure (transport error, non-string result, ...) must degrade to the
        # deterministic reshaper rather than propagate.
        return to_negative_constraint(g)

    # Check emptiness BEFORE appending the period; afterwards it is never empty.
    if not body:
        return to_negative_constraint(g)
    out = body + "."
    lowered = out.lower()

    # Must read as a guardrail and must not open as a positive directive.
    if not is_negative_constraint(out) or lowered.startswith(_POSITIVE_OPENERS):
        return to_negative_constraint(g)

    # Reject a cause the input never stated.
    _, why = _split_cause(g)
    if why is None and (" because " in lowered or " since " in lowered):
        return to_negative_constraint(g)
    return out
|
|
|
|
def reshape_rules_llm(gotchas: list[str], completer) -> list[str]:
    """Map gotchas to guardrail rules via a model completer, de-duplicated.

    Each gotcha is reshaped with ``to_negative_constraint_llm``.

    Args:
        gotchas: Free-text gotchas, in the order they should be emitted.
        completer: Callable taking a prompt string and returning the model's
            text completion; passed through unchanged for every gotcha.

    Returns:
        One rule per distinct gotcha, in first-seen order. Duplicates are
        detected case-insensitively on the reshaped rule, and gotchas that
        reshape to an empty string are skipped so no empty rule is emitted.
    """
    seen: set[str] = set()
    rules: list[str] = []
    for gotcha in gotchas:
        rule = to_negative_constraint_llm(gotcha, completer)
        if not rule:
            # A blank gotcha reshapes to ""; an empty rule carries no guidance.
            continue
        key = rule.lower()
        if key not in seen:
            seen.add(key)
            rules.append(rule)
    return rules
|
|