File size: 8,802 Bytes
e53f10b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | """
Planning Quality Score (PQS) — multi-dimensional assessment.
Motivation:
When we steer planning, a drop in trigger count could mean:
(a) The model truly lost planning capability, OR
(b) The model bypassed the trigger words while still planning internally.
We need to distinguish these by measuring deeper quality signals.
Four dimensions:
Q1. Structural Depth — sequential structure + forward-references
Q2. Strategy Diversity — explicit strategy naming + comparison
Q3. Long-Range Coherence — later text referring back to plan declaration
Q4. Premature Execution — how early the first computation appears (NEGATIVE signal)
Combined:
PQS = 0.25 × Q1 + 0.25 × Q2 + 0.25 × Q3 + 0.25 × (1 - Q4)
"""
import re
from typing import Dict, List
# ============================================================
# Q1. Structural Depth
# ============================================================
_ORDINAL_PATTERNS = [
r"(?i)\bstep\s+\d+\b",
r"(?i)\bfirst(ly)?[,.]",
r"(?i)\bsecond(ly)?[,.]",
r"(?i)\bthird(ly)?[,.]",
r"(?i)\bnext[,.]",
r"(?i)\bthen[,.]",
r"(?i)\bfinally[,.]",
r"(?i)\blast(ly)?[,.]",
]
_FORWARD_REF_PATTERNS = [
r"(?i)\bwe\s+will\s+need\s+to\b",
r"(?i)\blater\s+(we'?ll|we\s+will|i'?ll|i\s+will)\b",
r"(?i)\bthis\s+will\s+help\s+(us|me)\s+later\b",
r"(?i)\bin\s+a\s+(later|subsequent)\s+step\b",
r"(?i)\b(we|i)'?ll\s+(come|get)\s+back\s+to\s+(this|that)\b",
r"(?i)\bfor\s+now[,.]",
r"(?i)\blater\s+on\b",
]
def compute_q1_structural_depth(text: str) -> float:
"""
Structural depth score in [0, 1].
Combines:
- Ordinal phrase density (step 1, first, next, ...)
- Forward-reference density (we'll need to, later we'll, ...)
- Completeness of ordinal sequence (do we see first→next→finally?)
"""
n_tokens = max(len(text.split()), 1)
# Density of ordinal phrases
n_ordinal = sum(len(re.findall(p, text)) for p in _ORDINAL_PATTERNS)
ordinal_density = min(n_ordinal / (n_tokens / 100), 1.0) # up to 1 per 100 words = full score
# Forward references
n_forward = sum(len(re.findall(p, text)) for p in _FORWARD_REF_PATTERNS)
forward_density = min(n_forward / (n_tokens / 200), 1.0) # 1 per 200 words = full score
# Ordinal completeness: does the text contain at least 2 distinct ordinal markers?
distinct_ords = set()
for ord_label in ["first", "second", "third", "next", "then", "finally", "lastly"]:
if re.search(rf"(?i)\b{ord_label}\b", text):
distinct_ords.add(ord_label)
completeness = min(len(distinct_ords) / 3.0, 1.0) # 3+ distinct = full score
return 0.4 * ordinal_density + 0.3 * forward_density + 0.3 * completeness
# ============================================================
# Q2. Strategy Diversity
# ============================================================
_STRATEGY_TERMS = [
# proof techniques
r"(?i)\binduction\b", r"(?i)\bcontradiction\b", r"(?i)\bcontrapositive\b",
r"(?i)\bconstructive\s+proof\b", r"(?i)\bpigeonhole\b",
# algebraic
r"(?i)\bsubstitution\b", r"(?i)\belimination\b", r"(?i)\bfactoring\b",
r"(?i)\bcompleting\s+the\s+square\b",
# case-based
r"(?i)\bcase\s+analysis\b", r"(?i)\bby\s+cases\b",
# geometric / graphical
r"(?i)\bcoordinate\s+geometry\b", r"(?i)\bgraphing\b",
# combinatorial / number-theoretic
r"(?i)\bgenerating\s+function\b", r"(?i)\bmodular\s+arithmetic\b",
r"(?i)\bchinese\s+remainder\s+theorem\b",
# generic
r"(?i)\bwithout\s+loss\s+of\s+generality\b", r"(?i)\bwlog\b",
r"(?i)\bworking?\s+backward(s)?\b",
]
_STRATEGY_COMPARISON_PATTERNS = [
r"(?i)\banother\s+(way|approach|method)\b",
r"(?i)\balternatively[,.]",
r"(?i)\bwe\s+could\s+(also|instead)\b",
r"(?i)\bone\s+approach\s+(is|would\s+be)\b",
r"(?i)\ba\s+different\s+(way|approach|method)\b",
]
def compute_q2_strategy_diversity(text: str) -> float:
"""
Strategy diversity score in [0, 1].
Combines:
- Unique strategy terms mentioned
- Strategy comparison (only in first 20% of CoT — else it's backtracking)
"""
if not text.strip():
return 0.0
length = len(text)
first_20pct = text[: int(length * 0.20)]
# Unique strategy terms
strategies_found = set()
for pat in _STRATEGY_TERMS:
if re.search(pat, text):
strategies_found.add(pat)
strategy_unique = min(len(strategies_found) / 3.0, 1.0) # 3+ distinct strategies = full
# Strategy comparison in first 20% (planning-phase comparison, not monitoring backtrack)
n_compare_early = sum(len(re.findall(p, first_20pct)) for p in _STRATEGY_COMPARISON_PATTERNS)
compare_score = min(n_compare_early / 2.0, 1.0) # 2+ in first 20% = full
# Strategy term density
n_strategy_total = sum(len(re.findall(p, text)) for p in _STRATEGY_TERMS)
n_tokens = max(len(text.split()), 1)
density = min(n_strategy_total / (n_tokens / 300), 1.0)
return 0.4 * strategy_unique + 0.3 * compare_score + 0.3 * density
# ============================================================
# Q3. Long-Range Coherence
# ============================================================
_BACK_REFERENCE_PATTERNS = [
r"(?i)\bas\s+(planned|outlined|mentioned)\b",
r"(?i)\bfollowing\s+(the|my|our)\s+(plan|approach|strategy)\b",
r"(?i)\bas\s+i\s+(planned|said|outlined|mentioned)\s+(earlier|before|above)\b",
r"(?i)\bas\s+we\s+(planned|said|outlined)\s+(earlier|before|above)\b",
r"(?i)\bgoing\s+back\s+to\s+(step\s+\d+|my\s+plan)\b",
r"(?i)\brecall(ing)?\s+(that|my|the)\s+(plan|approach|strategy)\b",
r"(?i)\bper\s+(my|the)\s+(plan|strategy)\b",
]
def compute_q3_long_range_coherence(text: str) -> float:
"""
Long-range coherence score in [0, 1].
Measure back-references to early planning, which is a signature of structured planning.
"""
if not text.strip():
return 0.0
length = len(text)
if length < 200:
return 0.0
# Back-references must appear in the second half of CoT
second_half = text[length // 2:]
n_back = sum(len(re.findall(p, second_half)) for p in _BACK_REFERENCE_PATTERNS)
score = min(n_back / 2.0, 1.0) # 2+ back-references in second half = full
return score
# ============================================================
# Q4. Premature Execution (NEGATIVE signal)
# ============================================================
def compute_q4_premature_execution(text: str) -> float:
"""
Premature execution score in [0, 1]. Higher = more premature (BAD for planning).
Measure: position of the first equation/computation as a fraction of total length.
- If first computation appears in first 5% of text → strong premature (→ 1.0)
- If it appears in first 20% → still premature (→ ~0.6)
- If only after 30% → not premature (→ 0.0)
"""
if not text.strip():
return 0.0
# Find first occurrence of a computation pattern
computation_patterns = [
r"[-+]?\d+\s*[+\-*/]\s*\d+\s*=", # "3+5="
r"[a-zA-Z]\s*=\s*\d", # "x = 5"
r"\d+\s*=\s*\d+", # "10 = 10"
r"\$.*?=.*?\$", # LaTeX: "$...=...$"
]
first_pos = len(text) # default: no computation
for p in computation_patterns:
m = re.search(p, text)
if m and m.start() < first_pos:
first_pos = m.start()
if first_pos >= len(text):
return 0.0 # no computation found, not premature
ratio = first_pos / len(text)
# Map: ratio 0-0.05 → score 1.0 (very premature)
# ratio 0.05-0.20 → score 0.6 → 0.2 (premature)
# ratio 0.20+ → score 0.0 (not premature)
if ratio < 0.05:
return 1.0
elif ratio < 0.20:
return 1.0 - (ratio - 0.05) / 0.15 * 0.8 # linear 1.0 → 0.2
elif ratio < 0.30:
return 0.2 - (ratio - 0.20) / 0.10 * 0.2 # linear 0.2 → 0.0
else:
return 0.0
# ============================================================
# Combined PQS
# ============================================================
def compute_pqs(text: str) -> Dict:
"""Compute all 4 dimensions and combined PQS."""
q1 = compute_q1_structural_depth(text)
q2 = compute_q2_strategy_diversity(text)
q3 = compute_q3_long_range_coherence(text)
q4 = compute_q4_premature_execution(text)
pqs = 0.25 * q1 + 0.25 * q2 + 0.25 * q3 + 0.25 * (1.0 - q4)
return {
"q1_structural_depth": float(q1),
"q2_strategy_diversity": float(q2),
"q3_long_range_coherence": float(q3),
"q4_premature_execution": float(q4),
"pqs": float(pqs),
}
|