lsdf committed on
Commit
86b2387
·
1 Parent(s): d5937ae

Align n-gram selection rules across summary and optimizer.

Browse files

Apply stage candidate rules for bi/tri-grams and keyword-derived unigrams, add n-gram target rotation to avoid single-term stalls, and sync summary n-gram logic with optimizer tolerance policy.

Made-with: Cursor

docs/TEXT_OPTIMIZER_PRINCIPLES.md CHANGED
@@ -41,6 +41,23 @@ Update it whenever optimization policy changes.
41
  - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
  - if `avg < 4`: acceptable range is `avg +/- 50%`
43
  - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  ## 6) Local acceptance and batch accumulation
46
 
 
41
  - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
  - if `avg < 4`: acceptable range is `avg +/- 50%`
43
  - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
44
+ - Selection rules (multi-competitor mode, `competitors > 1`):
45
+ - bi-grams and tri-grams are eligible when present in `>= 2` competitors;
46
+ - unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
47
+ - Iteration behavior:
48
+ - optimizer may work on one n-gram at a time per step,
49
+ - if no primary progress on current n-gram target, it rotates to the next eligible n-gram candidate.
50
+
51
+ ## 5.1 Summary logic memory (current)
52
+
53
+ - Summary recommendation triggers:
54
+ - BERT warning when phrase score `< 0.70`;
55
+ - BM25 warning when `REMOVE >= 4`;
56
+ - N-gram warning when term is underrepresented among competitors;
57
+ - Title warning when Title BERT `< 0.65`;
58
+ - Semantic warning when keyword terms are weaker than competitor average.
59
+ - For N-grams in summary:
60
+ - summary renders top rows for readability, but optimizer runs against the full eligible candidate set.
61
 
62
  ## 6) Local acceptance and batch accumulation
63
 
optimizer.py CHANGED
@@ -254,6 +254,38 @@ def _ngram_deviation_ratio(target_count: float, competitor_avg: float) -> float:
254
  return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
 
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
258
  toks = _filter_stopwords(_tokenize(text), language)
259
  phrase_toks = _filter_stopwords(_tokenize(ngram_label), language)
@@ -293,11 +325,15 @@ def _compute_metrics(
293
 
294
  ngram_signal_count = 0
295
  ngram_gap_sum = 0.0
 
296
  ngrams = analysis.get("ngram_stats", {}) or {}
297
- for bucket_name in ("unigrams", "bigrams"):
298
- for item in (ngrams.get(bucket_name) or []):
 
 
299
  comp_occ = int(item.get("comp_occurrence", 0))
300
- if comp_occ < min_signal:
 
301
  continue
302
  target = float(item.get("target_count", 0))
303
  comp_avg = float(item.get("competitor_avg", 0))
@@ -393,6 +429,7 @@ def _choose_optimization_goal(
393
  language: str,
394
  stage: str = "bert",
395
  bert_stage_target: float = BERT_TARGET_THRESHOLD,
 
396
  ) -> Dict[str, Any]:
397
  candidates: Dict[str, Dict[str, Any]] = {}
398
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
@@ -434,6 +471,8 @@ def _choose_optimization_goal(
434
  # N-gram balancing (toward competitor average with tolerance policy).
435
  ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
436
  ngram_stats = analysis.get("ngram_stats", {}) or {}
 
 
437
  for bucket_name, bucket in ngram_stats.items():
438
  if not isinstance(bucket, list):
439
  continue
@@ -444,7 +483,7 @@ def _choose_optimization_goal(
444
  target = float(item.get("target_count", 0))
445
  comp_avg = float(item.get("competitor_avg", 0))
446
  comp_occ = int(item.get("comp_occurrence", 0))
447
- if comp_occ < 2:
448
  continue
449
  if not _is_ngram_outside_tolerance(target, comp_avg):
450
  continue
@@ -453,7 +492,8 @@ def _choose_optimization_goal(
453
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
454
  if ngram_rows:
455
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
456
- label, target, comp_avg, tol, _, _ = ngram_rows[0]
 
457
  candidates["ngram"] = {
458
  "type": "ngram",
459
  "label": label,
@@ -465,6 +505,8 @@ def _choose_optimization_goal(
465
  "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
466
  "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
467
  "ngram_direction": "increase" if target < comp_avg else "decrease",
 
 
468
  }
469
 
470
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
@@ -1204,6 +1246,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1204
  queued_candidates: List[Dict[str, Any]] = []
1205
  stage_idx = 0
1206
  stage_no_progress_steps = 0
 
1207
 
1208
  for step in range(max_iterations):
1209
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
@@ -1223,6 +1266,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1223
  language,
1224
  stage=active_stage,
1225
  bert_stage_target=bert_stage_target,
 
1226
  )
1227
  if goal["type"] == "none":
1228
  stage_idx += 1
@@ -1545,10 +1589,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1545
  current_analysis = best_local["analysis"]
1546
  current_semantic = best_local["semantic"]
1547
  current_metrics = best_local["metrics"]
1548
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1549
  stage_no_progress_steps = 0
1550
  else:
1551
  stage_no_progress_steps += 1
 
 
1552
  applied_changes += 1
1553
  queued_candidates = []
1554
 
@@ -1687,10 +1734,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1687
  current_analysis = best_batch["batch_analysis"]
1688
  current_semantic = best_batch["batch_semantic"]
1689
  current_metrics = best_batch["batch_metrics"]
1690
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1691
  stage_no_progress_steps = 0
1692
  else:
1693
  stage_no_progress_steps += 1
 
 
1694
  applied_changes += 1
1695
  batch_applied = True
1696
  batch_info = {
@@ -1809,6 +1859,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1809
  }
1810
  )
1811
  stage_no_progress_steps += 1
 
 
1812
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1813
  stage_idx += 1
1814
  stage_no_progress_steps = 0
@@ -1838,10 +1890,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1838
  current_analysis = best["analysis"]
1839
  current_semantic = best["semantic"]
1840
  current_metrics = best["metrics"]
1841
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1842
  stage_no_progress_steps = 0
1843
  else:
1844
  stage_no_progress_steps += 1
 
 
1845
  applied_changes += 1
1846
  queued_candidates = []
1847
 
 
254
  return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
 
256
 
257
def _keyword_unigram_set(keywords: List[str], language: str) -> set:
    """Collect the stopword-filtered tokens appearing in user keyword phrases.

    Used by the n-gram candidate rules: in multi-competitor mode a unigram is
    only eligible when it is part of a user keyword phrase.

    Args:
        keywords: Raw user keyword phrases (each phrase may contain several words).
        language: Language code forwarded to the stopword filter.

    Returns:
        Set of individual tokens (as produced by ``_tokenize``) drawn from all
        keyword phrases, with stopwords removed. Duplicates collapse naturally.
    """
    # Set comprehension replaces the original manual loop-and-add idiom.
    return {
        token
        for kw in keywords
        for token in _filter_stopwords(_tokenize(kw), language)
    }
264
+
265
+
266
def _is_ngram_stage_candidate(
    ngram_label: str,
    comp_occurrence: int,
    competitor_count: int,
    keyword_unigrams: set,
) -> bool:
    """Decide whether an n-gram may be targeted by the optimization stage.

    Multi-competitor mode (``competitor_count > 1``): the term must occur in
    at least two competitors; bi-/tri-grams then always qualify, while
    unigrams qualify only when they come from a user keyword phrase.
    Single-competitor mode keeps broader eligibility: any term seen at least
    once qualifies.
    """
    normalized = (ngram_label or "").strip().lower()
    if not normalized:
        return False
    if competitor_count <= 1:
        # Single-competitor mode: keep broader eligibility.
        return comp_occurrence >= 1
    if comp_occurrence < 2:
        return False
    parts = _tokenize(normalized)
    if len(parts) >= 2:
        # Bi-/tri-grams present in >= 2 competitors are always candidates.
        return True
    # Unigrams are candidates only if they belong to keyword phrases.
    return len(parts) == 1 and parts[0] in keyword_unigrams
287
+
288
+
289
  def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
290
  toks = _filter_stopwords(_tokenize(text), language)
291
  phrase_toks = _filter_stopwords(_tokenize(ngram_label), language)
 
325
 
326
  ngram_signal_count = 0
327
  ngram_gap_sum = 0.0
328
+ keyword_unigrams = _keyword_unigram_set(keywords, language)
329
  ngrams = analysis.get("ngram_stats", {}) or {}
330
+ for bucket in ngrams.values():
331
+ if not isinstance(bucket, list):
332
+ continue
333
+ for item in bucket:
334
  comp_occ = int(item.get("comp_occurrence", 0))
335
+ ngram_label = str(item.get("ngram", ""))
336
+ if not _is_ngram_stage_candidate(ngram_label, comp_occ, competitor_count, keyword_unigrams):
337
  continue
338
  target = float(item.get("target_count", 0))
339
  comp_avg = float(item.get("competitor_avg", 0))
 
429
  language: str,
430
  stage: str = "bert",
431
  bert_stage_target: float = BERT_TARGET_THRESHOLD,
432
+ stage_cursor: int = 0,
433
  ) -> Dict[str, Any]:
434
  candidates: Dict[str, Dict[str, Any]] = {}
435
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
 
471
  # N-gram balancing (toward competitor average with tolerance policy).
472
  ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
473
  ngram_stats = analysis.get("ngram_stats", {}) or {}
474
+ competitor_count = len((analysis.get("word_counts", {}) or {}).get("competitors", []) or [])
475
+ keyword_unigrams = _keyword_unigram_set(keywords, language)
476
  for bucket_name, bucket in ngram_stats.items():
477
  if not isinstance(bucket, list):
478
  continue
 
483
  target = float(item.get("target_count", 0))
484
  comp_avg = float(item.get("competitor_avg", 0))
485
  comp_occ = int(item.get("comp_occurrence", 0))
486
+ if not _is_ngram_stage_candidate(ngram_label, comp_occ, competitor_count, keyword_unigrams):
487
  continue
488
  if not _is_ngram_outside_tolerance(target, comp_avg):
489
  continue
 
492
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
493
  if ngram_rows:
494
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
495
+ pick = max(0, int(stage_cursor)) % len(ngram_rows)
496
+ label, target, comp_avg, tol, _, _ = ngram_rows[pick]
497
  candidates["ngram"] = {
498
  "type": "ngram",
499
  "label": label,
 
505
  "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
506
  "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
507
  "ngram_direction": "increase" if target < comp_avg else "decrease",
508
+ "ngram_rank_index": pick,
509
+ "ngram_candidates_total": len(ngram_rows),
510
  }
511
 
512
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
 
1246
  queued_candidates: List[Dict[str, Any]] = []
1247
  stage_idx = 0
1248
  stage_no_progress_steps = 0
1249
+ stage_goal_cursor: Dict[str, int] = {}
1250
 
1251
  for step in range(max_iterations):
1252
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
 
1266
  language,
1267
  stage=active_stage,
1268
  bert_stage_target=bert_stage_target,
1269
+ stage_cursor=int(stage_goal_cursor.get(active_stage, 0)),
1270
  )
1271
  if goal["type"] == "none":
1272
  stage_idx += 1
 
1589
  current_analysis = best_local["analysis"]
1590
  current_semantic = best_local["semantic"]
1591
  current_metrics = best_local["metrics"]
1592
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1593
+ if progressed_stage:
1594
  stage_no_progress_steps = 0
1595
  else:
1596
  stage_no_progress_steps += 1
1597
+ if active_stage == "ngram":
1598
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1599
  applied_changes += 1
1600
  queued_candidates = []
1601
 
 
1734
  current_analysis = best_batch["batch_analysis"]
1735
  current_semantic = best_batch["batch_semantic"]
1736
  current_metrics = best_batch["batch_metrics"]
1737
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1738
+ if progressed_stage:
1739
  stage_no_progress_steps = 0
1740
  else:
1741
  stage_no_progress_steps += 1
1742
+ if active_stage == "ngram":
1743
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1744
  applied_changes += 1
1745
  batch_applied = True
1746
  batch_info = {
 
1859
  }
1860
  )
1861
  stage_no_progress_steps += 1
1862
+ if active_stage == "ngram":
1863
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1864
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1865
  stage_idx += 1
1866
  stage_no_progress_steps = 0
 
1890
  current_analysis = best["analysis"]
1891
  current_semantic = best["semantic"]
1892
  current_metrics = best["metrics"]
1893
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1894
+ if progressed_stage:
1895
  stage_no_progress_steps = 0
1896
  else:
1897
  stage_no_progress_steps += 1
1898
+ if active_stage == "ngram":
1899
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1900
  applied_changes += 1
1901
  queued_candidates = []
1902
 
templates/index.html CHANGED
@@ -1412,22 +1412,51 @@
1412
 
1413
  // === 3) N-grams: сигнал только если 2+ конкурента ===
1414
  const ngramSignals = [];
1415
- const ngramBuckets = analysisData.ngram_stats
1416
- ? [analysisData.ngram_stats.unigrams, analysisData.ngram_stats.bigrams]
1417
- : [];
1418
- ngramBuckets.forEach(bucket => {
1419
- (bucket || []).forEach(item => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1420
  const compOcc = Number(item.comp_occurrence || 0);
1421
  const targetCount = Number(item.target_count || 0);
1422
  const compAvg = Number(item.competitor_avg || 0);
1423
- if (compOcc < minCompetitorSignal) return;
1424
- const ratioSignal = targetCount === 0 ? compAvg > 0 : compAvg >= targetCount * 2;
1425
- if (!ratioSignal) return;
1426
  ngramSignals.push({
1427
  ngram: item.ngram,
1428
  compOcc,
1429
  targetCount,
1430
- compAvg
 
1431
  });
1432
  });
1433
  });
@@ -1439,7 +1468,7 @@
1439
  if (ngramSignals.length > 0) {
1440
  const topSignals = ngramSignals
1441
  .slice()
1442
- .sort((a, b) => (b.compOcc - a.compOcc) || (b.compAvg - a.compAvg))
1443
  .slice(0, 10)
1444
  .map(x => ({
1445
  ngram: x.ngram,
@@ -1661,7 +1690,7 @@
1661
  container.innerHTML = `
1662
  <div class="stat-card">
1663
  <h5 class="card-title mb-3">Итоговые рекомендации (что сделать в первую очередь)</h5>
1664
- <p class="text-muted small mb-3">Сводка формируется по правилам: BERT &lt; 0.70, BM25 remove ≥ 4, n-граммы с сигналом от 2+ конкурентов, Title BERT &lt; 0.65, Semantic Core-разрыв по словам из ключей.</p>
1665
  ${recCards}
1666
  </div>
1667
  <div class="stat-card">
 
1412
 
1413
  // === 3) N-grams: сигнал только если 2+ конкурента ===
1414
  const ngramSignals = [];
1415
+ const ngramStats = analysisData.ngram_stats || {};
1416
+ const kwUnigrams = new Set();
1417
+ keywordsRaw.forEach(kw => {
1418
+ String(kw || '')
1419
+ .toLowerCase()
1420
+ .replace(/[^\p{L}\p{N}\s-]+/gu, ' ')
1421
+ .split(/\s+/)
1422
+ .map(v => v.trim())
1423
+ .filter(v => v.length >= 2)
1424
+ .forEach(t => kwUnigrams.add(t));
1425
+ });
1426
// True when targetCount falls outside the tolerance band around compAvg:
// +/-20% when the competitor average is >= 4, +/-50% otherwise.
// A non-positive average carries no signal, so it is never "outside".
const isOutsideTolerance = (targetCount, compAvg) => {
  if (compAvg <= 0) {
    return false;
  }
  const tolerance = compAvg >= 4 ? 0.20 : 0.50;
  const lowerBound = compAvg * (1 - tolerance);
  const upperBound = compAvg * (1 + tolerance);
  return targetCount < lowerBound || targetCount > upperBound;
};
1431
// Client-side mirror of the optimizer's n-gram candidate policy:
// with multiple competitors the term needs comp_occurrence >= 2; bi/tri-grams
// then always qualify, unigrams only when drawn from a user keyword phrase.
// Reads competitorCount and kwUnigrams from the enclosing scope.
const isEligibleNgram = (ngram, compOcc) => {
  const normalized = String(ngram || '')
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s-]+/gu, ' ');
  const tokens = normalized
    .split(/\s+/)
    .map(part => part.trim())
    .filter(part => part.length >= 2);
  if (tokens.length === 0) {
    return false;
  }
  if (competitorCount <= 1) {
    // Single-competitor mode keeps the broad rule.
    return compOcc >= 1;
  }
  if (compOcc < 2) {
    return false;
  }
  // bi/tri-grams always pass; unigram must come from a keyword phrase
  return tokens.length >= 2 || kwUnigrams.has(tokens[0]);
};
1446
+ Object.values(ngramStats).forEach(bucket => {
1447
+ (Array.isArray(bucket) ? bucket : []).forEach(item => {
1448
  const compOcc = Number(item.comp_occurrence || 0);
1449
  const targetCount = Number(item.target_count || 0);
1450
  const compAvg = Number(item.competitor_avg || 0);
1451
+ if (!isEligibleNgram(item.ngram, compOcc)) return;
1452
+ if (!isOutsideTolerance(targetCount, compAvg)) return;
1453
+ const devRatio = compAvg > 0 ? Math.abs(targetCount - compAvg) / compAvg : 0;
1454
  ngramSignals.push({
1455
  ngram: item.ngram,
1456
  compOcc,
1457
  targetCount,
1458
+ compAvg,
1459
+ devRatio
1460
  });
1461
  });
1462
  });
 
1468
  if (ngramSignals.length > 0) {
1469
  const topSignals = ngramSignals
1470
  .slice()
1471
+ .sort((a, b) => (b.devRatio - a.devRatio) || (b.compOcc - a.compOcc) || (b.compAvg - a.compAvg))
1472
  .slice(0, 10)
1473
  .map(x => ({
1474
  ngram: x.ngram,
 
1690
  container.innerHTML = `
1691
  <div class="stat-card">
1692
  <h5 class="card-title mb-3">Итоговые рекомендации (что сделать в первую очередь)</h5>
1693
+ <p class="text-muted small mb-3">Сводка формируется по правилам: BERT &lt; 0.70, BM25 remove ≥ 4, n-граммы по допускам (±20% при Avg≥4, ±50% при Avg&lt;4) с фильтром K>=2 для multi-competitor, Title BERT &lt; 0.65, Semantic Core-разрыв по словам из ключей.</p>
1694
  ${recCards}
1695
  </div>
1696
  <div class="stat-card">