Phase 1.2: fix STALE-before-CONTRADICTED bug — both checks now run, CONTRADICTED wins when both fire
Browse files
- CHANGELOG.md +9 -0
- src/agents/critic.py +35 -33
CHANGELOG.md
CHANGED
|
@@ -2,6 +2,15 @@
|
|
| 2 |
|
| 3 |
## [Unreleased]
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
### Phase 0 — Branch Setup
|
| 6 |
- Created v2-edge-reliability branch from main
|
| 7 |
- Added CHANGELOG.md
|
|
|
|
| 2 |
|
| 3 |
## [Unreleased]
|
| 4 |
|
| 5 |
+
### Phase 1 — Integrity + Critic Fix
|
| 6 |
+
- 1.1: Archived patch_contradiction.py to eval/archived/ with README
|
| 7 |
+
- 1.2: Fixed critic_node — STALE and CONTRADICTED checks are now both evaluated before a verdict is chosen
|
| 8 |
+
- Removed short-circuit: STALE no longer blocks CONTRADICTED
|
| 9 |
+
- When both fire, CONTRADICTED wins (richer signal)
|
| 10 |
+
- Removed `**state` spread from all return paths for consistency
|
| 11 |
+
- All 5 return paths now have identical key shape
|
| 12 |
+
- Added missing retry_count to FORCED_PASS path
|
| 13 |
+
|
| 14 |
### Phase 0 — Branch Setup
|
| 15 |
- Created v2-edge-reliability branch from main
|
| 16 |
- Added CHANGELOG.md
|
src/agents/critic.py
CHANGED
|
@@ -157,10 +157,10 @@ def critic_node(state: ResearchState) -> ResearchState:
|
|
| 157 |
if retry_count >= 2:
|
| 158 |
logger.info("Critic: max retries reached, forcing PASS")
|
| 159 |
return {
|
| 160 |
-
**state,
|
| 161 |
"critic_verdict": Verdict.FORCED_PASS,
|
| 162 |
"critic_notes": "Max retries reached. Passing with available evidence.",
|
| 163 |
"rewritten_questions": [],
|
|
|
|
| 164 |
"calibration_bin": Verdict.FORCED_PASS,
|
| 165 |
}
|
| 166 |
|
|
@@ -169,7 +169,6 @@ def critic_node(state: ResearchState) -> ResearchState:
|
|
| 169 |
logger.info(f"Critic: insufficient papers ({len(papers)})")
|
| 170 |
rewritten = _rewrite_questions(state.get("sub_questions") or [], "broaden")
|
| 171 |
return {
|
| 172 |
-
**state,
|
| 173 |
"critic_verdict": Verdict.INSUFFICIENT,
|
| 174 |
"critic_notes": f"Only {len(papers)} papers retrieved. Need at least 3.",
|
| 175 |
"rewritten_questions": rewritten,
|
|
@@ -183,7 +182,6 @@ def critic_node(state: ResearchState) -> ResearchState:
|
|
| 183 |
logger.info("Critic: insufficient high-score papers")
|
| 184 |
rewritten = _rewrite_questions(state.get("sub_questions") or [], "broaden")
|
| 185 |
return {
|
| 186 |
-
**state,
|
| 187 |
"critic_verdict": Verdict.INSUFFICIENT,
|
| 188 |
"critic_notes": "Fewer than 3 papers with hybrid_score >= 0.40.",
|
| 189 |
"rewritten_questions": rewritten,
|
|
@@ -191,41 +189,45 @@ def critic_node(state: ResearchState) -> ResearchState:
|
|
| 191 |
"calibration_bin": Verdict.INSUFFICIENT,
|
| 192 |
}
|
| 193 |
|
| 194 |
-
# STALE
|
| 195 |
mean_age = _mean_age_months(papers)
|
| 196 |
-
|
| 197 |
-
logger.info(f"Critic: evidence is stale (mean age: {mean_age:.1f} months)")
|
| 198 |
-
rewritten = _rewrite_questions(state.get("sub_questions") or [], "recent")
|
| 199 |
-
return {
|
| 200 |
-
**state,
|
| 201 |
-
"critic_verdict": Verdict.STALE,
|
| 202 |
-
"critic_notes": f"Mean paper age is {mean_age:.0f} months. Evidence may be outdated.",
|
| 203 |
-
"rewritten_questions": rewritten,
|
| 204 |
-
"retry_count": retry_count + 1,
|
| 205 |
-
"calibration_bin": Verdict.STALE,
|
| 206 |
-
}
|
| 207 |
|
| 208 |
-
# CONTRADICTED — papers disagree
|
| 209 |
contradictions = _detect_contradictions(papers)
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
return {
|
| 215 |
-
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"rewritten_questions":
|
| 219 |
-
"
|
| 220 |
-
"calibration_bin": Verdict.CONTRADICTED,
|
| 221 |
}
|
| 222 |
|
| 223 |
-
# PASS
|
| 224 |
-
|
|
|
|
| 225 |
return {
|
| 226 |
-
|
| 227 |
-
"
|
| 228 |
-
"
|
| 229 |
-
"
|
| 230 |
-
"calibration_bin":
|
| 231 |
}
|
|
|
|
| 157 |
if retry_count >= 2:
|
| 158 |
logger.info("Critic: max retries reached, forcing PASS")
|
| 159 |
return {
|
|
|
|
| 160 |
"critic_verdict": Verdict.FORCED_PASS,
|
| 161 |
"critic_notes": "Max retries reached. Passing with available evidence.",
|
| 162 |
"rewritten_questions": [],
|
| 163 |
+
"retry_count": retry_count,
|
| 164 |
"calibration_bin": Verdict.FORCED_PASS,
|
| 165 |
}
|
| 166 |
|
|
|
|
| 169 |
logger.info(f"Critic: insufficient papers ({len(papers)})")
|
| 170 |
rewritten = _rewrite_questions(state.get("sub_questions") or [], "broaden")
|
| 171 |
return {
|
|
|
|
| 172 |
"critic_verdict": Verdict.INSUFFICIENT,
|
| 173 |
"critic_notes": f"Only {len(papers)} papers retrieved. Need at least 3.",
|
| 174 |
"rewritten_questions": rewritten,
|
|
|
|
| 182 |
logger.info("Critic: insufficient high-score papers")
|
| 183 |
rewritten = _rewrite_questions(state.get("sub_questions") or [], "broaden")
|
| 184 |
return {
|
|
|
|
| 185 |
"critic_verdict": Verdict.INSUFFICIENT,
|
| 186 |
"critic_notes": "Fewer than 3 papers with hybrid_score >= 0.40.",
|
| 187 |
"rewritten_questions": rewritten,
|
|
|
|
| 189 |
"calibration_bin": Verdict.INSUFFICIENT,
|
| 190 |
}
|
| 191 |
|
| 192 |
+
# --- Run STALE and CONTRADICTED checks in parallel (both always run) ---
|
| 193 |
mean_age = _mean_age_months(papers)
|
| 194 |
+
is_stale = mean_age > 24
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
|
|
|
| 196 |
contradictions = _detect_contradictions(papers)
|
| 197 |
+
is_contradicted = len(contradictions) > 0
|
| 198 |
+
|
| 199 |
+
# --- Combine signals: CONTRADICTED wins when both fire ---
|
| 200 |
+
if is_contradicted and is_stale:
|
| 201 |
+
verdict = Verdict.CONTRADICTED
|
| 202 |
+
contradiction_details = "; ".join(f"'{c[0]}' vs '{c[1]}': {c[2]}" for c in contradictions)
|
| 203 |
+
notes = f"CONTRADICTED (also stale, mean age {mean_age:.0f} months). Contradictions found: {contradiction_details}"
|
| 204 |
+
strategy = "probe_contradiction"
|
| 205 |
+
elif is_contradicted:
|
| 206 |
+
verdict = Verdict.CONTRADICTED
|
| 207 |
+
contradiction_details = "; ".join(f"'{c[0]}' vs '{c[1]}': {c[2]}" for c in contradictions)
|
| 208 |
+
notes = f"Contradictions found: {contradiction_details}"
|
| 209 |
+
strategy = "probe_contradiction"
|
| 210 |
+
elif is_stale:
|
| 211 |
+
verdict = Verdict.STALE
|
| 212 |
+
notes = f"Evidence is stale (mean age {mean_age:.0f} months > 24 month threshold)"
|
| 213 |
+
strategy = "recent"
|
| 214 |
+
else:
|
| 215 |
+
# PASS — all checks clear
|
| 216 |
return {
|
| 217 |
+
"critic_verdict": Verdict.PASS,
|
| 218 |
+
"critic_notes": f"Evidence passes all checks (mean age {mean_age:.0f} months, {len(papers)} papers, no contradictions detected)",
|
| 219 |
+
"retry_count": retry_count,
|
| 220 |
+
"rewritten_questions": [],
|
| 221 |
+
"calibration_bin": Verdict.PASS,
|
|
|
|
| 222 |
}
|
| 223 |
|
| 224 |
+
# --- Non-PASS path: rewrite questions and return ---
|
| 225 |
+
sub_questions = state.get("sub_questions") or []
|
| 226 |
+
rewritten = _rewrite_questions(sub_questions, strategy)
|
| 227 |
return {
|
| 228 |
+
"critic_verdict": verdict,
|
| 229 |
+
"critic_notes": notes,
|
| 230 |
+
"rewritten_questions": rewritten,
|
| 231 |
+
"retry_count": retry_count + 1,
|
| 232 |
+
"calibration_bin": verdict,
|
| 233 |
}
|