Spaces:
Running
Running
Enforce per-term n-gram loop and underrepresented-only targeting.
Browse files
Limit n-gram stage targets to underrepresented terms, run three attempts per n-gram before advancing, and document the per-term attempt policy in optimizer principles.
Made-with: Cursor
- docs/TEXT_OPTIMIZER_PRINCIPLES.md +3 -2
- optimizer.py +43 -25
docs/TEXT_OPTIMIZER_PRINCIPLES.md
CHANGED
|
@@ -45,8 +45,9 @@ Update it whenever optimization policy changes.
|
|
| 45 |
- bi-grams and tri-grams are eligible when present in `>= 2` competitors;
|
| 46 |
- unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
|
| 47 |
- Iteration behavior:
|
| 48 |
-
- optimizer
|
| 49 |
-
-
|
|
|
|
| 50 |
|
| 51 |
## 5.1 Summary logic memory (current)
|
| 52 |
|
|
|
|
| 45 |
- bi-grams and tri-grams are eligible when present in `>= 2` competitors;
|
| 46 |
- unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
|
| 47 |
- Iteration behavior:
|
| 48 |
+
- optimizer works on one n-gram target at a time per step;
|
| 49 |
+
- per eligible n-gram target it allocates `3` attempts, then moves to the next target;
|
| 50 |
+
- if target list ends, stage advances to the next optimization stage.
|
| 51 |
|
| 52 |
## 5.1 Summary logic memory (current)
|
| 53 |
|
optimizer.py
CHANGED
|
@@ -26,6 +26,7 @@ TITLE_TARGET_THRESHOLD = 0.65
|
|
| 26 |
SEMANTIC_GAP_TOLERANCE_PCT = 0.15
|
| 27 |
SEMANTIC_GAP_MIN_ABS = 3.0
|
| 28 |
STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def _tokenize(text: str) -> List[str]:
|
|
@@ -487,27 +488,34 @@ def _choose_optimization_goal(
|
|
| 487 |
continue
|
| 488 |
if not _is_ngram_outside_tolerance(target, comp_avg):
|
| 489 |
continue
|
|
|
|
|
|
|
|
|
|
| 490 |
tol = _ngram_tolerance_pct(comp_avg)
|
| 491 |
dev_ratio = _ngram_deviation_ratio(target, comp_avg)
|
| 492 |
ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
|
| 493 |
if ngram_rows:
|
| 494 |
ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
|
| 495 |
-
pick = max(0, int(stage_cursor))
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
"
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
|
| 513 |
title_target_score = title_bert.get("target_score")
|
|
@@ -1110,6 +1118,16 @@ def _is_stage_complete(stage: str, metrics: Dict[str, Any], bert_stage_target: f
|
|
| 1110 |
return True
|
| 1111 |
|
| 1112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1113 |
def _candidate_utility(
|
| 1114 |
*,
|
| 1115 |
prev_metrics: Dict[str, Any],
|
|
@@ -1246,7 +1264,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1246 |
queued_candidates: List[Dict[str, Any]] = []
|
| 1247 |
stage_idx = 0
|
| 1248 |
stage_no_progress_steps = 0
|
| 1249 |
-
stage_goal_cursor: Dict[str, int] = {}
|
| 1250 |
|
| 1251 |
for step in range(max_iterations):
|
| 1252 |
while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
|
|
@@ -1266,7 +1284,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1266 |
language,
|
| 1267 |
stage=active_stage,
|
| 1268 |
bert_stage_target=bert_stage_target,
|
| 1269 |
-
stage_cursor=int(stage_goal_cursor.get(active_stage, 0)),
|
| 1270 |
)
|
| 1271 |
if goal["type"] == "none":
|
| 1272 |
stage_idx += 1
|
|
@@ -1594,8 +1612,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1594 |
stage_no_progress_steps = 0
|
| 1595 |
else:
|
| 1596 |
stage_no_progress_steps += 1
|
| 1597 |
-
|
| 1598 |
-
|
| 1599 |
applied_changes += 1
|
| 1600 |
queued_candidates = []
|
| 1601 |
|
|
@@ -1739,8 +1757,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1739 |
stage_no_progress_steps = 0
|
| 1740 |
else:
|
| 1741 |
stage_no_progress_steps += 1
|
| 1742 |
-
|
| 1743 |
-
|
| 1744 |
applied_changes += 1
|
| 1745 |
batch_applied = True
|
| 1746 |
batch_info = {
|
|
@@ -1860,7 +1878,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1860 |
)
|
| 1861 |
stage_no_progress_steps += 1
|
| 1862 |
if active_stage == "ngram":
|
| 1863 |
-
|
| 1864 |
if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
|
| 1865 |
stage_idx += 1
|
| 1866 |
stage_no_progress_steps = 0
|
|
@@ -1895,8 +1913,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1895 |
stage_no_progress_steps = 0
|
| 1896 |
else:
|
| 1897 |
stage_no_progress_steps += 1
|
| 1898 |
-
|
| 1899 |
-
|
| 1900 |
applied_changes += 1
|
| 1901 |
queued_candidates = []
|
| 1902 |
|
|
|
|
| 26 |
SEMANTIC_GAP_TOLERANCE_PCT = 0.15
|
| 27 |
SEMANTIC_GAP_MIN_ABS = 3.0
|
| 28 |
STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
|
| 29 |
+
NGRAM_ATTEMPTS_PER_TERM = 3
|
| 30 |
|
| 31 |
|
| 32 |
def _tokenize(text: str) -> List[str]:
|
|
|
|
| 488 |
continue
|
| 489 |
if not _is_ngram_outside_tolerance(target, comp_avg):
|
| 490 |
continue
|
| 491 |
+
# N-gram stage is for underrepresented terms only.
|
| 492 |
+
if target >= comp_avg:
|
| 493 |
+
continue
|
| 494 |
tol = _ngram_tolerance_pct(comp_avg)
|
| 495 |
dev_ratio = _ngram_deviation_ratio(target, comp_avg)
|
| 496 |
ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
|
| 497 |
if ngram_rows:
|
| 498 |
ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
|
| 499 |
+
pick = max(0, int(stage_cursor))
|
| 500 |
+
if pick >= len(ngram_rows):
|
| 501 |
+
# No more n-gram targets in current stage cursor window.
|
| 502 |
+
pass
|
| 503 |
+
else:
|
| 504 |
+
label, target, comp_avg, tol, _, _ = ngram_rows[pick]
|
| 505 |
+
candidates["ngram"] = {
|
| 506 |
+
"type": "ngram",
|
| 507 |
+
"label": label,
|
| 508 |
+
"focus_terms": [label],
|
| 509 |
+
"avoid_terms": [],
|
| 510 |
+
"ngram_target_count": target,
|
| 511 |
+
"ngram_comp_avg": comp_avg,
|
| 512 |
+
"ngram_tolerance_pct": tol,
|
| 513 |
+
"ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
|
| 514 |
+
"ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
|
| 515 |
+
"ngram_direction": "increase" if target < comp_avg else "decrease",
|
| 516 |
+
"ngram_rank_index": pick,
|
| 517 |
+
"ngram_candidates_total": len(ngram_rows),
|
| 518 |
+
}
|
| 519 |
|
| 520 |
title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
|
| 521 |
title_target_score = title_bert.get("target_score")
|
|
|
|
| 1118 |
return True
|
| 1119 |
|
| 1120 |
|
| 1121 |
+
def _advance_ngram_term_cursor(cursor_state: Dict[str, Dict[str, int]], stage_key: str) -> None:
|
| 1122 |
+
state = cursor_state.get(stage_key) or {"term_index": 0, "attempt_count": 0}
|
| 1123 |
+
attempts = int(state.get("attempt_count", 0)) + 1
|
| 1124 |
+
term_index = int(state.get("term_index", 0))
|
| 1125 |
+
if attempts >= NGRAM_ATTEMPTS_PER_TERM:
|
| 1126 |
+
term_index += 1
|
| 1127 |
+
attempts = 0
|
| 1128 |
+
cursor_state[stage_key] = {"term_index": term_index, "attempt_count": attempts}
|
| 1129 |
+
|
| 1130 |
+
|
| 1131 |
def _candidate_utility(
|
| 1132 |
*,
|
| 1133 |
prev_metrics: Dict[str, Any],
|
|
|
|
| 1264 |
queued_candidates: List[Dict[str, Any]] = []
|
| 1265 |
stage_idx = 0
|
| 1266 |
stage_no_progress_steps = 0
|
| 1267 |
+
stage_goal_cursor: Dict[str, Dict[str, int]] = {}
|
| 1268 |
|
| 1269 |
for step in range(max_iterations):
|
| 1270 |
while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
|
|
|
|
| 1284 |
language,
|
| 1285 |
stage=active_stage,
|
| 1286 |
bert_stage_target=bert_stage_target,
|
| 1287 |
+
stage_cursor=int((stage_goal_cursor.get(active_stage) or {}).get("term_index", 0)),
|
| 1288 |
)
|
| 1289 |
if goal["type"] == "none":
|
| 1290 |
stage_idx += 1
|
|
|
|
| 1612 |
stage_no_progress_steps = 0
|
| 1613 |
else:
|
| 1614 |
stage_no_progress_steps += 1
|
| 1615 |
+
if active_stage == "ngram":
|
| 1616 |
+
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 1617 |
applied_changes += 1
|
| 1618 |
queued_candidates = []
|
| 1619 |
|
|
|
|
| 1757 |
stage_no_progress_steps = 0
|
| 1758 |
else:
|
| 1759 |
stage_no_progress_steps += 1
|
| 1760 |
+
if active_stage == "ngram":
|
| 1761 |
+
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 1762 |
applied_changes += 1
|
| 1763 |
batch_applied = True
|
| 1764 |
batch_info = {
|
|
|
|
| 1878 |
)
|
| 1879 |
stage_no_progress_steps += 1
|
| 1880 |
if active_stage == "ngram":
|
| 1881 |
+
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 1882 |
if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
|
| 1883 |
stage_idx += 1
|
| 1884 |
stage_no_progress_steps = 0
|
|
|
|
| 1913 |
stage_no_progress_steps = 0
|
| 1914 |
else:
|
| 1915 |
stage_no_progress_steps += 1
|
| 1916 |
+
if active_stage == "ngram":
|
| 1917 |
+
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 1918 |
applied_changes += 1
|
| 1919 |
queued_candidates = []
|
| 1920 |
|