lsdf committed on
Commit
d5937ae
·
1 Parent(s): 84ebb46

Implement n-gram tolerance stage rules and optimizer principles doc.

Browse files

Apply user-defined n-gram gap policy (20% for avg>=4, 50% for avg<4), optimize n-gram stage toward competitor averages with local-safe acceptance, and add a living optimizer principles document.

Made-with: Cursor

docs/FULL_FUNCTIONAL_DOCUMENTATION.md CHANGED
@@ -44,6 +44,7 @@
44
  - `search.py` — смысловой поиск в графе (фразы + слова).
45
  - `url_fetcher.py` — извлечение текста/title из URL с выбором user-agent.
46
  - `optimizer.py` — LLM-оптимизация с обратной связью от метрик.
 
47
  - `templates/index.html` — frontend (UI + клиентская логика JS).
48
 
49
  ---
 
44
  - `search.py` — смысловой поиск в графе (фразы + слова).
45
  - `url_fetcher.py` — извлечение текста/title из URL с выбором user-agent.
46
  - `optimizer.py` — LLM-оптимизация с обратной связью от метрик.
47
+ - `docs/TEXT_OPTIMIZER_PRINCIPLES.md` — живой регламент принципов оптимизатора (stage-пайплайн, допуски, guardrails).
48
  - `templates/index.html` — frontend (UI + клиентская логика JS).
49
 
50
  ---
docs/TEXT_OPTIMIZER_PRINCIPLES.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Optimizer Principles
2
+
3
+ This document is a living spec for iterative text optimization behavior.
4
+ Update it whenever optimization policy changes.
5
+
6
+ ## 1) Multi-objective optimization model
7
+
8
+ - **Primary objective (by stage):**
9
+ - Stage A: BERT phrase relevance
10
+ - Stage B: BM25 remove cleanup
11
+ - Stage C: N-gram balancing
12
+ - Stage D: Semantic gap balancing
13
+ - Stage E: Title alignment
14
+ - **Guardrails (always active):**
15
+ - Do not allow critical metric regressions beyond mode tolerances.
16
+ - Keep grammar, coherence, and non-spam writing.
17
+
18
+ ## 2) Stage order and skipping
19
+
20
+ - Stage order:
21
+ - `bert -> bm25 -> ngram -> semantic -> title`
22
+ - A stage is skipped if no actionable goal exists.
23
+ - Plateau rule:
24
+ - If no primary progress for 3 steps, move to next stage.
25
+
26
+ ## 3) BERT stage policy
27
+
28
+ - Default Stage A threshold: `0.70`.
29
+ - User may set custom threshold via UI (`BERT target A-stage`), e.g. `0.61`.
30
+ - Stage A is complete when max target phrase score reaches configured threshold.
31
+
32
+ ## 4) BM25 stage policy
33
+
34
+ - Main target: reduce/remove over-optimization signals.
35
+ - The stage is considered healthy when `bm25_remove_count <= 3`.
36
+
37
+ ## 5) N-gram stage policy (quantitative)
38
+
39
+ - Goal: bring target counts closer to competitor average, not force exact equality.
40
+ - Tolerance bands:
41
+ - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
+ - if `avg < 4`: acceptable range is `avg +/- 50%`
43
+ - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
44
+
45
+ ## 6) Local acceptance and batch accumulation
46
+
47
+ - First evaluate candidate locally (chunk-level), then globally (document-level).
48
+ - Locally improved candidates may be queued when global score does not move yet.
49
+ - Non-conflicting queued edits can be applied as a batch (2-4 edits) if guardrails pass.
50
+
51
+ ## 7) Text quality constraints
52
+
53
+ - Reject candidates with:
54
+ - duplicated entities/words,
55
+ - suspicious token joins,
56
+ - excessive sentence count for current cascade level,
57
+ - obvious stuffing/redundancy.
58
+ - Keep narrative continuity and original subject/entity focus.
59
+
60
+ ## 8) Diagnostics requirements
61
+
62
+ - For every iteration, store:
63
+ - stage, goal, cascade level,
64
+ - candidate validity, local improvement, metric deltas,
65
+ - selected strategy and prompt debug payload.
66
+ - UI must show:
67
+ - stage progression,
68
+ - stage transitions,
69
+ - candidate strategy and reason for rejection.
70
+
optimizer.py CHANGED
@@ -25,7 +25,7 @@ BERT_GOAL_DELTA_MIN = 0.005
25
  TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
- STAGE_ORDER = ["bert", "bm25", "semantic", "ngram", "title"]
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
@@ -232,6 +232,44 @@ def _is_semantic_gap(target_weight: float, competitor_avg_weight: float) -> bool
232
  return (competitor_avg_weight > rel_threshold) and (abs_gap >= SEMANTIC_GAP_MIN_ABS)
233
 
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  def _compute_metrics(
236
  analysis: Dict[str, Any],
237
  semantic: Dict[str, Any],
@@ -254,6 +292,7 @@ def _compute_metrics(
254
  bm25_remove_count = len(bm25_remove)
255
 
256
  ngram_signal_count = 0
 
257
  ngrams = analysis.get("ngram_stats", {}) or {}
258
  for bucket_name in ("unigrams", "bigrams"):
259
  for item in (ngrams.get(bucket_name) or []):
@@ -262,9 +301,9 @@ def _compute_metrics(
262
  continue
263
  target = float(item.get("target_count", 0))
264
  comp_avg = float(item.get("competitor_avg", 0))
265
- ratio_signal = comp_avg > 0 if target == 0 else comp_avg >= target * 2
266
- if ratio_signal:
267
  ngram_signal_count += 1
 
268
 
269
  title_score = None
270
  title_bert = analysis.get("title_analysis", {}).get("bert", {})
@@ -335,6 +374,7 @@ def _compute_metrics(
335
  "bert_phrase_scores": bert_phrase_scores,
336
  "bm25_remove_count": bm25_remove_count,
337
  "ngram_signal_count": ngram_signal_count,
 
338
  "title_bert_score": title_score,
339
  "semantic_gap_count": semantic_gap_count,
340
  "semantic_gap_sum": round(semantic_gap_sum, 4),
@@ -391,22 +431,41 @@ def _choose_optimization_goal(
391
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
392
  candidates["semantic"] = {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
393
 
394
- # Fallback: ngram add signal
395
- for bucket_name in ("unigrams", "bigrams"):
396
- bucket = analysis.get("ngram_stats", {}).get(bucket_name, []) or []
 
 
 
397
  for item in bucket:
 
 
 
398
  target = float(item.get("target_count", 0))
399
  comp_avg = float(item.get("competitor_avg", 0))
400
- if (target == 0 and comp_avg > 0) or (target > 0 and comp_avg >= target * 2):
401
- candidates["ngram"] = {
402
- "type": "ngram",
403
- "label": str(item.get("ngram", "")),
404
- "focus_terms": _tokenize(str(item.get("ngram", "")))[:3],
405
- "avoid_terms": [],
406
- }
407
- break
408
- if "ngram" in candidates:
409
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
412
  title_target_score = title_bert.get("target_score")
@@ -584,6 +643,7 @@ def _chunk_goal_relevance(
584
  goal_label: str,
585
  focus_terms: List[str],
586
  language: str,
 
587
  ) -> float:
588
  chunk = (text or "").strip()
589
  if not chunk:
@@ -595,6 +655,8 @@ def _chunk_goal_relevance(
595
  return float(logic.util.cos_sim(embeddings[0:1], embeddings[1:2])[0][0].item())
596
  except Exception:
597
  pass
 
 
598
 
599
  # Lexical fallback for non-BERT goals or if embedding scoring is unavailable.
600
  toks = _filter_stopwords(_tokenize(chunk), language)
@@ -620,15 +682,26 @@ def _chunk_goal_delta(
620
  goal_label: str,
621
  focus_terms: List[str],
622
  language: str,
 
623
  ) -> float:
624
- before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language)
625
- after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language)
 
 
 
 
 
 
 
626
  return round(after_rel - before_rel, 4)
627
 
628
 
629
  def _min_chunk_delta(goal_type: str) -> float:
630
  if goal_type == "bert":
631
  return 0.01
 
 
 
632
  return 0.05
633
 
634
 
@@ -639,9 +712,10 @@ def _chunk_relevance_pair(
639
  goal_label: str,
640
  focus_terms: List[str],
641
  language: str,
 
642
  ) -> Tuple[float, float]:
643
- before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language)
644
- after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language)
645
  return round(before_rel, 4), round(after_rel, 4)
646
 
647
 
@@ -690,6 +764,7 @@ def _metrics_delta(prev_metrics: Dict[str, Any], next_metrics: Dict[str, Any]) -
690
  "bert_low_count",
691
  "bm25_remove_count",
692
  "ngram_signal_count",
 
693
  "semantic_gap_count",
694
  "semantic_gap_sum",
695
  ]
@@ -925,7 +1000,10 @@ def _goal_improved(
925
  if goal_type == "semantic":
926
  return next_metrics["semantic_gap_count"] < prev_metrics["semantic_gap_count"]
927
  if goal_type == "ngram":
928
- return next_metrics["ngram_signal_count"] < prev_metrics["ngram_signal_count"]
 
 
 
929
  return next_metrics["score"] > prev_metrics["score"]
930
 
931
 
@@ -960,7 +1038,10 @@ def _stage_primary_progress(stage: str, prev_metrics: Dict[str, Any], next_metri
960
  or float(next_metrics.get("semantic_gap_sum", 0.0)) < float(prev_metrics.get("semantic_gap_sum", 0.0))
961
  )
962
  if stage == "ngram":
963
- return int(next_metrics.get("ngram_signal_count", 0)) < int(prev_metrics.get("ngram_signal_count", 0))
 
 
 
964
  if stage == "title":
965
  pv = prev_metrics.get("title_bert_score")
966
  nv = next_metrics.get("title_bert_score")
@@ -1250,6 +1331,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1250
  goal["label"],
1251
  goal.get("focus_terms", []) or [],
1252
  language,
 
1253
  )
1254
  chunk_delta = _chunk_goal_delta(
1255
  original_span_text,
@@ -1258,6 +1340,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1258
  goal["label"],
1259
  goal.get("focus_terms", []) or [],
1260
  language,
 
1261
  )
1262
  local_chunk_improved = chunk_delta >= _min_chunk_delta(goal["type"])
1263
 
 
25
  TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
+ STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
 
232
  return (competitor_avg_weight > rel_threshold) and (abs_gap >= SEMANTIC_GAP_MIN_ABS)
233
 
234
 
235
+ def _ngram_tolerance_pct(competitor_avg: float) -> float:
236
+ # User rule:
237
+ # - avg >= 4 -> +/-20%
238
+ # - avg < 4 -> +/-50%
239
+ return 0.20 if competitor_avg >= 4.0 else 0.50
240
+
241
+
242
def _is_ngram_outside_tolerance(target_count: float, competitor_avg: float) -> bool:
    """Check whether a target n-gram count falls outside the tolerance band.

    The band is centered on the competitor average and sized by
    `_ngram_tolerance_pct`. With no competitor signal (avg <= 0) the
    count is never flagged.
    """
    if competitor_avg <= 0:
        return False
    tolerance = _ngram_tolerance_pct(competitor_avg)
    lower = competitor_avg * (1.0 - tolerance)
    upper = competitor_avg * (1.0 + tolerance)
    return not (lower <= target_count <= upper)
249
+
250
+
251
+ def _ngram_deviation_ratio(target_count: float, competitor_avg: float) -> float:
252
+ if competitor_avg <= 0:
253
+ return 0.0
254
+ return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
+
256
+
257
def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
    """Count occurrences of the (stopword-filtered) n-gram `ngram_label` in `text`.

    Both the chunk and the label are tokenized and stopword-filtered with
    the same pipeline, then matched as contiguous token windows.
    """
    tokens = _filter_stopwords(_tokenize(text), language)
    phrase = _filter_stopwords(_tokenize(ngram_label), language)
    if not tokens or not phrase:
        return 0
    size = len(phrase)
    # A window of size 1 degenerates to plain token equality, so a single
    # sliding scan covers both the unigram and the multi-word case.
    return sum(
        1
        for start in range(len(tokens) - size + 1)
        if tokens[start : start + size] == phrase
    )
271
+
272
+
273
  def _compute_metrics(
274
  analysis: Dict[str, Any],
275
  semantic: Dict[str, Any],
 
292
  bm25_remove_count = len(bm25_remove)
293
 
294
  ngram_signal_count = 0
295
+ ngram_gap_sum = 0.0
296
  ngrams = analysis.get("ngram_stats", {}) or {}
297
  for bucket_name in ("unigrams", "bigrams"):
298
  for item in (ngrams.get(bucket_name) or []):
 
301
  continue
302
  target = float(item.get("target_count", 0))
303
  comp_avg = float(item.get("competitor_avg", 0))
304
+ if _is_ngram_outside_tolerance(target, comp_avg):
 
305
  ngram_signal_count += 1
306
+ ngram_gap_sum += _ngram_deviation_ratio(target, comp_avg)
307
 
308
  title_score = None
309
  title_bert = analysis.get("title_analysis", {}).get("bert", {})
 
374
  "bert_phrase_scores": bert_phrase_scores,
375
  "bm25_remove_count": bm25_remove_count,
376
  "ngram_signal_count": ngram_signal_count,
377
+ "ngram_gap_sum": round(ngram_gap_sum, 4),
378
  "title_bert_score": title_score,
379
  "semantic_gap_count": semantic_gap_count,
380
  "semantic_gap_sum": round(semantic_gap_sum, 4),
 
431
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
432
  candidates["semantic"] = {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
433
 
434
+ # N-gram balancing (toward competitor average with tolerance policy).
435
+ ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
436
+ ngram_stats = analysis.get("ngram_stats", {}) or {}
437
+ for bucket_name, bucket in ngram_stats.items():
438
+ if not isinstance(bucket, list):
439
+ continue
440
  for item in bucket:
441
+ ngram_label = str(item.get("ngram", "")).strip()
442
+ if not ngram_label:
443
+ continue
444
  target = float(item.get("target_count", 0))
445
  comp_avg = float(item.get("competitor_avg", 0))
446
+ comp_occ = int(item.get("comp_occurrence", 0))
447
+ if comp_occ < 2:
448
+ continue
449
+ if not _is_ngram_outside_tolerance(target, comp_avg):
450
+ continue
451
+ tol = _ngram_tolerance_pct(comp_avg)
452
+ dev_ratio = _ngram_deviation_ratio(target, comp_avg)
453
+ ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
454
+ if ngram_rows:
455
+ ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
456
+ label, target, comp_avg, tol, _, _ = ngram_rows[0]
457
+ candidates["ngram"] = {
458
+ "type": "ngram",
459
+ "label": label,
460
+ "focus_terms": [label],
461
+ "avoid_terms": [],
462
+ "ngram_target_count": target,
463
+ "ngram_comp_avg": comp_avg,
464
+ "ngram_tolerance_pct": tol,
465
+ "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
466
+ "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
467
+ "ngram_direction": "increase" if target < comp_avg else "decrease",
468
+ }
469
 
470
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
471
  title_target_score = title_bert.get("target_score")
 
643
  goal_label: str,
644
  focus_terms: List[str],
645
  language: str,
646
+ goal_meta: Optional[Dict[str, Any]] = None,
647
  ) -> float:
648
  chunk = (text or "").strip()
649
  if not chunk:
 
655
  return float(logic.util.cos_sim(embeddings[0:1], embeddings[1:2])[0][0].item())
656
  except Exception:
657
  pass
658
+ if goal_type == "ngram" and (goal_label or "").strip():
659
+ return float(_chunk_ngram_count(chunk, goal_label, language))
660
 
661
  # Lexical fallback for non-BERT goals or if embedding scoring is unavailable.
662
  toks = _filter_stopwords(_tokenize(chunk), language)
 
682
  goal_label: str,
683
  focus_terms: List[str],
684
  language: str,
685
+ goal_meta: Optional[Dict[str, Any]] = None,
686
  ) -> float:
687
+ before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language, goal_meta)
688
+ after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language, goal_meta)
689
+ if goal_type == "ngram":
690
+ target_avg = float((goal_meta or {}).get("ngram_comp_avg", 0.0))
691
+ if target_avg > 0:
692
+ # Positive delta means closer to competitor average regardless of direction.
693
+ before_dist = abs(before_rel - target_avg)
694
+ after_dist = abs(after_rel - target_avg)
695
+ return round(before_dist - after_dist, 4)
696
  return round(after_rel - before_rel, 4)
697
 
698
 
699
  def _min_chunk_delta(goal_type: str) -> float:
700
  if goal_type == "bert":
701
  return 0.01
702
+ if goal_type == "ngram":
703
+ # Require at least one occurrence-equivalent movement toward target zone.
704
+ return 0.5
705
  return 0.05
706
 
707
 
 
712
  goal_label: str,
713
  focus_terms: List[str],
714
  language: str,
715
+ goal_meta: Optional[Dict[str, Any]] = None,
716
  ) -> Tuple[float, float]:
717
+ before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language, goal_meta)
718
+ after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language, goal_meta)
719
  return round(before_rel, 4), round(after_rel, 4)
720
 
721
 
 
764
  "bert_low_count",
765
  "bm25_remove_count",
766
  "ngram_signal_count",
767
+ "ngram_gap_sum",
768
  "semantic_gap_count",
769
  "semantic_gap_sum",
770
  ]
 
1000
  if goal_type == "semantic":
1001
  return next_metrics["semantic_gap_count"] < prev_metrics["semantic_gap_count"]
1002
  if goal_type == "ngram":
1003
+ return (
1004
+ next_metrics["ngram_signal_count"] < prev_metrics["ngram_signal_count"]
1005
+ or float(next_metrics.get("ngram_gap_sum", 0.0)) < float(prev_metrics.get("ngram_gap_sum", 0.0))
1006
+ )
1007
  return next_metrics["score"] > prev_metrics["score"]
1008
 
1009
 
 
1038
  or float(next_metrics.get("semantic_gap_sum", 0.0)) < float(prev_metrics.get("semantic_gap_sum", 0.0))
1039
  )
1040
  if stage == "ngram":
1041
+ return (
1042
+ int(next_metrics.get("ngram_signal_count", 0)) < int(prev_metrics.get("ngram_signal_count", 0))
1043
+ or float(next_metrics.get("ngram_gap_sum", 0.0)) < float(prev_metrics.get("ngram_gap_sum", 0.0))
1044
+ )
1045
  if stage == "title":
1046
  pv = prev_metrics.get("title_bert_score")
1047
  nv = next_metrics.get("title_bert_score")
 
1331
  goal["label"],
1332
  goal.get("focus_terms", []) or [],
1333
  language,
1334
+ goal,
1335
  )
1336
  chunk_delta = _chunk_goal_delta(
1337
  original_span_text,
 
1340
  goal["label"],
1341
  goal.get("focus_terms", []) or [],
1342
  language,
1343
+ goal,
1344
  )
1345
  local_chunk_improved = chunk_delta >= _min_chunk_delta(goal["type"])
1346
 
templates/index.html CHANGED
@@ -859,7 +859,7 @@
859
  const before = it.metrics_before ? it.metrics_before.score : '-';
860
  const after = it.metrics_after ? it.metrics_after.score : '-';
861
  const baseline = (it.current_score ?? before);
862
- const reason = it.reason || (it.candidates ? 'all candidates rejected by constraints' : '-');
863
  const stage = (it.stage || (it.goal && it.goal.type) || '-');
864
  const advanced = it.advanced_to_stage ? ` → ${it.advanced_to_stage}` : '';
865
  return `<tr>
 
859
  const before = it.metrics_before ? it.metrics_before.score : '-';
860
  const after = it.metrics_after ? it.metrics_after.score : '-';
861
  const baseline = (it.current_score ?? before);
862
+ const reason = it.reason || ((it.status === 'rejected' && it.candidates) ? 'all candidates rejected by constraints' : '-');
863
  const stage = (it.stage || (it.goal && it.goal.type) || '-');
864
  const advanced = it.advanced_to_stage ? ` → ${it.advanced_to_stage}` : '';
865
  return `<tr>