lsdf committed on
Commit
a52efa4
·
1 Parent(s): f364498

Enforce per-term ngram loop and underrepresented-only targeting.

Browse files

Limit n-gram stage targets to underrepresented terms, run three attempts per n-gram before advancing, and document the per-term attempt policy in optimizer principles.

Made-with: Cursor

Files changed (2) hide show
  1. docs/TEXT_OPTIMIZER_PRINCIPLES.md +3 -2
  2. optimizer.py +43 -25
docs/TEXT_OPTIMIZER_PRINCIPLES.md CHANGED
@@ -45,8 +45,9 @@ Update it whenever optimization policy changes.
45
  - bi-grams and tri-grams are eligible when present in `>= 2` competitors;
46
  - unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
47
  - Iteration behavior:
48
- - optimizer may work on one n-gram at a time per step,
49
- - if no primary progress on current n-gram target, it rotates to the next eligible n-gram candidate.
 
50
 
51
  ## 5.1 Summary logic memory (current)
52
 
 
45
  - bi-grams and tri-grams are eligible when present in `>= 2` competitors;
46
  - unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
47
  - Iteration behavior:
48
+ - optimizer works on one n-gram target at a time per step;
49
+ - per eligible n-gram target it allocates `3` attempts, then moves to the next target;
50
+ - if target list ends, stage advances to the next optimization stage.
51
 
52
  ## 5.1 Summary logic memory (current)
53
 
optimizer.py CHANGED
@@ -26,6 +26,7 @@ TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
  STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
 
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
@@ -487,27 +488,34 @@ def _choose_optimization_goal(
487
  continue
488
  if not _is_ngram_outside_tolerance(target, comp_avg):
489
  continue
 
 
 
490
  tol = _ngram_tolerance_pct(comp_avg)
491
  dev_ratio = _ngram_deviation_ratio(target, comp_avg)
492
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
493
  if ngram_rows:
494
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
495
- pick = max(0, int(stage_cursor)) % len(ngram_rows)
496
- label, target, comp_avg, tol, _, _ = ngram_rows[pick]
497
- candidates["ngram"] = {
498
- "type": "ngram",
499
- "label": label,
500
- "focus_terms": [label],
501
- "avoid_terms": [],
502
- "ngram_target_count": target,
503
- "ngram_comp_avg": comp_avg,
504
- "ngram_tolerance_pct": tol,
505
- "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
506
- "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
507
- "ngram_direction": "increase" if target < comp_avg else "decrease",
508
- "ngram_rank_index": pick,
509
- "ngram_candidates_total": len(ngram_rows),
510
- }
 
 
 
 
511
 
512
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
513
  title_target_score = title_bert.get("target_score")
@@ -1110,6 +1118,16 @@ def _is_stage_complete(stage: str, metrics: Dict[str, Any], bert_stage_target: f
1110
  return True
1111
 
1112
 
 
 
 
 
 
 
 
 
 
 
1113
  def _candidate_utility(
1114
  *,
1115
  prev_metrics: Dict[str, Any],
@@ -1246,7 +1264,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1246
  queued_candidates: List[Dict[str, Any]] = []
1247
  stage_idx = 0
1248
  stage_no_progress_steps = 0
1249
- stage_goal_cursor: Dict[str, int] = {}
1250
 
1251
  for step in range(max_iterations):
1252
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
@@ -1266,7 +1284,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1266
  language,
1267
  stage=active_stage,
1268
  bert_stage_target=bert_stage_target,
1269
- stage_cursor=int(stage_goal_cursor.get(active_stage, 0)),
1270
  )
1271
  if goal["type"] == "none":
1272
  stage_idx += 1
@@ -1594,8 +1612,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1594
  stage_no_progress_steps = 0
1595
  else:
1596
  stage_no_progress_steps += 1
1597
- if active_stage == "ngram":
1598
- stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1599
  applied_changes += 1
1600
  queued_candidates = []
1601
 
@@ -1739,8 +1757,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1739
  stage_no_progress_steps = 0
1740
  else:
1741
  stage_no_progress_steps += 1
1742
- if active_stage == "ngram":
1743
- stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1744
  applied_changes += 1
1745
  batch_applied = True
1746
  batch_info = {
@@ -1860,7 +1878,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1860
  )
1861
  stage_no_progress_steps += 1
1862
  if active_stage == "ngram":
1863
- stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1864
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1865
  stage_idx += 1
1866
  stage_no_progress_steps = 0
@@ -1895,8 +1913,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1895
  stage_no_progress_steps = 0
1896
  else:
1897
  stage_no_progress_steps += 1
1898
- if active_stage == "ngram":
1899
- stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1900
  applied_changes += 1
1901
  queued_candidates = []
1902
 
 
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
  STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
29
+ NGRAM_ATTEMPTS_PER_TERM = 3
30
 
31
 
32
  def _tokenize(text: str) -> List[str]:
 
488
  continue
489
  if not _is_ngram_outside_tolerance(target, comp_avg):
490
  continue
491
+ # N-gram stage is for underrepresented terms only.
492
+ if target >= comp_avg:
493
+ continue
494
  tol = _ngram_tolerance_pct(comp_avg)
495
  dev_ratio = _ngram_deviation_ratio(target, comp_avg)
496
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
497
  if ngram_rows:
498
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
499
+ pick = max(0, int(stage_cursor))
500
+ if pick >= len(ngram_rows):
501
+ # No more n-gram targets in current stage cursor window.
502
+ pass
503
+ else:
504
+ label, target, comp_avg, tol, _, _ = ngram_rows[pick]
505
+ candidates["ngram"] = {
506
+ "type": "ngram",
507
+ "label": label,
508
+ "focus_terms": [label],
509
+ "avoid_terms": [],
510
+ "ngram_target_count": target,
511
+ "ngram_comp_avg": comp_avg,
512
+ "ngram_tolerance_pct": tol,
513
+ "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
514
+ "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
515
+ "ngram_direction": "increase" if target < comp_avg else "decrease",
516
+ "ngram_rank_index": pick,
517
+ "ngram_candidates_total": len(ngram_rows),
518
+ }
519
 
520
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
521
  title_target_score = title_bert.get("target_score")
 
1118
  return True
1119
 
1120
 
1121
def _advance_ngram_term_cursor(cursor_state: Dict[str, Dict[str, int]], stage_key: str) -> None:
    """Charge one attempt against the active n-gram term for *stage_key*.

    Each term is given ``NGRAM_ATTEMPTS_PER_TERM`` attempts; once that budget
    is spent, the cursor rotates to the next term and the attempt counter
    restarts at zero. Mutates ``cursor_state`` in place and returns ``None``.
    """
    prior = cursor_state.get(stage_key) or {"term_index": 0, "attempt_count": 0}
    term_pos = int(prior.get("term_index", 0))
    spent = int(prior.get("attempt_count", 0)) + 1
    if spent >= NGRAM_ATTEMPTS_PER_TERM:
        # Budget for this term is exhausted: advance to the next term.
        term_pos, spent = term_pos + 1, 0
    cursor_state[stage_key] = {"term_index": term_pos, "attempt_count": spent}
1129
+
1130
+
1131
  def _candidate_utility(
1132
  *,
1133
  prev_metrics: Dict[str, Any],
 
1264
  queued_candidates: List[Dict[str, Any]] = []
1265
  stage_idx = 0
1266
  stage_no_progress_steps = 0
1267
+ stage_goal_cursor: Dict[str, Dict[str, int]] = {}
1268
 
1269
  for step in range(max_iterations):
1270
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
 
1284
  language,
1285
  stage=active_stage,
1286
  bert_stage_target=bert_stage_target,
1287
+ stage_cursor=int((stage_goal_cursor.get(active_stage) or {}).get("term_index", 0)),
1288
  )
1289
  if goal["type"] == "none":
1290
  stage_idx += 1
 
1612
  stage_no_progress_steps = 0
1613
  else:
1614
  stage_no_progress_steps += 1
1615
+ if active_stage == "ngram":
1616
+ _advance_ngram_term_cursor(stage_goal_cursor, active_stage)
1617
  applied_changes += 1
1618
  queued_candidates = []
1619
 
 
1757
  stage_no_progress_steps = 0
1758
  else:
1759
  stage_no_progress_steps += 1
1760
+ if active_stage == "ngram":
1761
+ _advance_ngram_term_cursor(stage_goal_cursor, active_stage)
1762
  applied_changes += 1
1763
  batch_applied = True
1764
  batch_info = {
 
1878
  )
1879
  stage_no_progress_steps += 1
1880
  if active_stage == "ngram":
1881
+ _advance_ngram_term_cursor(stage_goal_cursor, active_stage)
1882
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1883
  stage_idx += 1
1884
  stage_no_progress_steps = 0
 
1913
  stage_no_progress_steps = 0
1914
  else:
1915
  stage_no_progress_steps += 1
1916
+ if active_stage == "ngram":
1917
+ _advance_ngram_term_cursor(stage_goal_cursor, active_stage)
1918
  applied_changes += 1
1919
  queued_candidates = []
1920