Spaces:
Running
Running
refactor(optimizer): apply unified per-goal iteration budget across all stages
Build per-stage goal lists and iterate each goal with max_iterations attempts before moving on. Total run budget now scales with number of actionable goals, and docs are updated to match the universal rule.
Made-with: Cursor
- docs/FULL_FUNCTIONAL_DOCUMENTATION.md +2 -2
- optimizer.py +91 -48
docs/FULL_FUNCTIONAL_DOCUMENTATION.md
CHANGED
|
@@ -472,7 +472,7 @@ HTML extraction pipeline:
|
|
| 472 |
- `_is_stage_complete` для `bert`:
|
| 473 |
- этап считается завершённым только когда **каждая** отслеживаемая ключевая фраза достигает `bert_stage_target` (проверка по `min(bert_phrase_scores)`);
|
| 474 |
- достижение порога одной «сильной» фразой больше не завершает BERT-этап.
|
| 475 |
-
-
|
| 476 |
- `_validate_candidate_text`:
|
| 477 |
- отклоняет некачественные/спамные кандидаты (дубли слов/сущностей, подозрительные склейки токенов);
|
| 478 |
- добавляет anti-stuffing фильтр для цели BERT (повторы exact phrase и чрезмерные повторы focus-термов).
|
|
@@ -480,7 +480,7 @@ HTML extraction pipeline:
|
|
| 480 |
### Главная функция `optimize_text`
|
| 481 |
Итерационный цикл:
|
| 482 |
1. baseline metrics.
|
| 483 |
-
-
|
| 484 |
2. выбрать goal.
|
| 485 |
3. выбрать пул чанков и операцию каскада.
|
| 486 |
- **Этап `title`:** если средняя BERT-близость Title к ключам (`title_bert_score`) ниже порога (`TITLE_TARGET_THRESHOLD` ≈ 0.65), цель — **только переписать текст из поля Title** (`target_title`), а не абзац основного текста. LLM получает текущий title, выдержку из body и ключевые слова; метрики пересчитываются с новым title. Пакетные правки по body с title не смешиваются.
|
|
|
|
| 472 |
- `_is_stage_complete` для `bert`:
|
| 473 |
- этап считается завершённым только когда **каждая** отслеживаемая ключевая фраза достигает `bert_stage_target` (проверка по `min(bert_phrase_scores)`);
|
| 474 |
- достижение порога одной «сильной» фразой больше не завершает BERT-этап.
|
| 475 |
+
- унифицированный цикл по целям: на каждой стадии для **каждой** найденной цели/фразы действует одинаковый бюджет `max_iterations` попыток; после исчерпания лимита оптимизатор переходит к следующей цели той же стадии.
|
| 476 |
- `_validate_candidate_text`:
|
| 477 |
- отклоняет некачественные/спамные кандидаты (дубли слов/сущностей, подозрительные склейки токенов);
|
| 478 |
- добавляет anti-stuffing фильтр для цели BERT (повторы exact phrase и чрезмерные повторы focus-термов).
|
|
|
|
| 480 |
### Главная функция `optimize_text`
|
| 481 |
Итерационный цикл:
|
| 482 |
1. baseline metrics.
|
| 483 |
+
- общий бюджет шагов оценивается как `sum(цели_стадии × max_iterations)` по всем стадиям (с верхней отсечкой в коде), то есть масштабируется по числу реально требующих улучшения целей.
|
| 484 |
2. выбрать goal.
|
| 485 |
3. выбрать пул чанков и операцию каскада.
|
| 486 |
- **Этап `title`:** если средняя BERT-близость Title к ключам (`title_bert_score`) ниже порога (`TITLE_TARGET_THRESHOLD` ≈ 0.65), цель — **только переписать текст из поля Title** (`target_title`), а не абзац основного текста. LLM получает текущий title, выдержку из body и ключевые слова; метрики пересчитываются с новым title. Пакетные правки по body с title не смешиваются.
|
optimizer.py
CHANGED
|
@@ -545,18 +545,48 @@ def _choose_optimization_goal(
|
|
| 545 |
bert_stage_target: float = BERT_TARGET_THRESHOLD,
|
| 546 |
stage_cursor: int = 0,
|
| 547 |
) -> Dict[str, Any]:
|
| 548 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
|
| 550 |
low_bert = [x for x in bert_details if float(x.get("my_max_score", 0)) < float(bert_stage_target)]
|
| 551 |
if low_bert:
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
|
|
|
|
|
|
|
|
|
| 555 |
|
| 556 |
bm25_remove = [x for x in (analysis.get("bm25_recommendations") or []) if x.get("action") == "remove"]
|
| 557 |
if len(bm25_remove) >= 4:
|
| 558 |
-
|
| 559 |
-
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
# Semantic keyword gaps
|
| 562 |
lang_stop = STOP_WORDS.get(language, STOP_WORDS["en"])
|
|
@@ -579,19 +609,14 @@ def _choose_optimization_goal(
|
|
| 579 |
if _is_semantic_gap(target_w, comp_w):
|
| 580 |
candidate_rows.append((term, gap))
|
| 581 |
if candidate_rows:
|
| 582 |
-
|
| 583 |
-
|
| 584 |
|
| 585 |
# N-gram balancing (toward competitor average with tolerance policy).
|
| 586 |
ngram_rows = _build_ngram_stage_rows(analysis, keywords, language)
|
| 587 |
if ngram_rows:
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
# No more n-gram targets in current stage cursor window.
|
| 591 |
-
pass
|
| 592 |
-
else:
|
| 593 |
-
label, target, comp_avg, tol, _, _ = ngram_rows[pick]
|
| 594 |
-
candidates["ngram"] = {
|
| 595 |
"type": "ngram",
|
| 596 |
"label": label,
|
| 597 |
"focus_terms": [label],
|
|
@@ -602,9 +627,9 @@ def _choose_optimization_goal(
|
|
| 602 |
"ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
|
| 603 |
"ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
|
| 604 |
"ngram_direction": "increase" if target < comp_avg else "decrease",
|
| 605 |
-
"ngram_rank_index":
|
| 606 |
"ngram_candidates_total": len(ngram_rows),
|
| 607 |
-
}
|
| 608 |
|
| 609 |
title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
|
| 610 |
title_target_score = title_bert.get("target_score")
|
|
@@ -613,17 +638,14 @@ def _choose_optimization_goal(
|
|
| 613 |
and title_target_score is not None
|
| 614 |
and float(title_target_score) < TITLE_TARGET_THRESHOLD
|
| 615 |
):
|
| 616 |
-
|
| 617 |
"type": "title",
|
| 618 |
"label": "title alignment",
|
| 619 |
"focus_terms": _filter_stopwords(_tokenize(" ".join(keywords[:8])), language)[:8],
|
| 620 |
"avoid_terms": [],
|
| 621 |
-
}
|
| 622 |
|
| 623 |
-
|
| 624 |
-
return candidates[stage]
|
| 625 |
-
|
| 626 |
-
return {"type": "none", "label": "no-op", "focus_terms": [], "avoid_terms": []}
|
| 627 |
|
| 628 |
|
| 629 |
def _choose_sentence_idx(sentences: List[str], focus_terms: List[str], avoid_terms: List[str], language: str) -> int:
|
|
@@ -1485,11 +1507,24 @@ def optimize_text(
|
|
| 1485 |
baseline_analysis, baseline_semantic, keywords, language, bert_stage_target=bert_stage_target
|
| 1486 |
)
|
| 1487 |
|
| 1488 |
-
#
|
| 1489 |
-
#
|
| 1490 |
-
|
| 1491 |
-
|
| 1492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1493 |
|
| 1494 |
current_text = target_text
|
| 1495 |
current_title = (target_title or "").strip()
|
|
@@ -1574,15 +1609,40 @@ def optimize_text(
|
|
| 1574 |
break
|
| 1575 |
|
| 1576 |
active_stage = STAGE_ORDER[stage_idx]
|
| 1577 |
-
|
| 1578 |
current_analysis,
|
| 1579 |
current_semantic,
|
| 1580 |
keywords,
|
| 1581 |
language,
|
| 1582 |
stage=active_stage,
|
| 1583 |
bert_stage_target=bert_stage_target,
|
| 1584 |
-
stage_cursor=int((stage_goal_cursor.get(active_stage) or {}).get("term_index", 0)),
|
| 1585 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1586 |
if goal["type"] == "none":
|
| 1587 |
stage_idx += 1
|
| 1588 |
stage_no_progress_steps = 0
|
|
@@ -2176,8 +2236,6 @@ def optimize_text(
|
|
| 2176 |
stage_no_progress_steps = 0
|
| 2177 |
else:
|
| 2178 |
stage_no_progress_steps += 1
|
| 2179 |
-
if active_stage == "ngram":
|
| 2180 |
-
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 2181 |
applied_changes += 1
|
| 2182 |
queued_candidates = []
|
| 2183 |
|
|
@@ -2322,8 +2380,6 @@ def optimize_text(
|
|
| 2322 |
stage_no_progress_steps = 0
|
| 2323 |
else:
|
| 2324 |
stage_no_progress_steps += 1
|
| 2325 |
-
if active_stage == "ngram":
|
| 2326 |
-
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 2327 |
applied_changes += 1
|
| 2328 |
batch_applied = True
|
| 2329 |
batch_info = {
|
|
@@ -2442,18 +2498,7 @@ def optimize_text(
|
|
| 2442 |
}
|
| 2443 |
)
|
| 2444 |
stage_no_progress_steps += 1
|
| 2445 |
-
|
| 2446 |
-
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 2447 |
-
# Do not auto-skip BERT stage on local plateau while threshold is unmet.
|
| 2448 |
-
# For BERT we keep iterating (with cascade escalation) until either:
|
| 2449 |
-
# - per-phrase threshold is met in _is_stage_complete, or
|
| 2450 |
-
# - global step budget is exhausted.
|
| 2451 |
-
can_advance_on_plateau = active_stage != "bert"
|
| 2452 |
-
if can_advance_on_plateau and stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
|
| 2453 |
-
stage_idx += 1
|
| 2454 |
-
stage_no_progress_steps = 0
|
| 2455 |
-
logs[-1]["advanced_to_stage"] = STAGE_ORDER[stage_idx]
|
| 2456 |
-
logs[-1]["reason"] = f"{logs[-1].get('reason', '-') } Stage plateau: no primary progress for 3 steps."
|
| 2457 |
consecutive_failures += 1
|
| 2458 |
if consecutive_failures >= 2 and cascade_level < 4:
|
| 2459 |
cascade_level += 1
|
|
@@ -2487,8 +2532,6 @@ def optimize_text(
|
|
| 2487 |
stage_no_progress_steps = 0
|
| 2488 |
else:
|
| 2489 |
stage_no_progress_steps += 1
|
| 2490 |
-
if active_stage == "ngram":
|
| 2491 |
-
_advance_ngram_term_cursor(stage_goal_cursor, active_stage)
|
| 2492 |
applied_changes += 1
|
| 2493 |
queued_candidates = []
|
| 2494 |
|
|
|
|
| 545 |
bert_stage_target: float = BERT_TARGET_THRESHOLD,
|
| 546 |
stage_cursor: int = 0,
|
| 547 |
) -> Dict[str, Any]:
|
| 548 |
+
goals = _collect_optimization_goals(
|
| 549 |
+
analysis=analysis,
|
| 550 |
+
semantic=semantic,
|
| 551 |
+
keywords=keywords,
|
| 552 |
+
language=language,
|
| 553 |
+
stage=stage,
|
| 554 |
+
bert_stage_target=bert_stage_target,
|
| 555 |
+
)
|
| 556 |
+
if not goals:
|
| 557 |
+
return {"type": "none", "label": "no-op", "focus_terms": [], "avoid_terms": []}
|
| 558 |
+
pick = max(0, int(stage_cursor))
|
| 559 |
+
if pick >= len(goals):
|
| 560 |
+
return {"type": "none", "label": "no-op", "focus_terms": [], "avoid_terms": []}
|
| 561 |
+
return goals[pick]
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
def _collect_optimization_goals(
|
| 565 |
+
analysis: Dict[str, Any],
|
| 566 |
+
semantic: Dict[str, Any],
|
| 567 |
+
keywords: List[str],
|
| 568 |
+
language: str,
|
| 569 |
+
stage: str = "bert",
|
| 570 |
+
bert_stage_target: float = BERT_TARGET_THRESHOLD,
|
| 571 |
+
) -> List[Dict[str, Any]]:
|
| 572 |
+
goals: List[Dict[str, Any]] = []
|
| 573 |
bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
|
| 574 |
low_bert = [x for x in bert_details if float(x.get("my_max_score", 0)) < float(bert_stage_target)]
|
| 575 |
if low_bert:
|
| 576 |
+
for row in sorted(low_bert, key=lambda x: float(x.get("my_max_score", 0))):
|
| 577 |
+
phrase = str(row.get("phrase", "")).strip()
|
| 578 |
+
if not phrase:
|
| 579 |
+
continue
|
| 580 |
+
focus_terms = _filter_stopwords(_tokenize(phrase), language)[:4]
|
| 581 |
+
goals.append({"type": "bert", "label": phrase, "focus_terms": focus_terms, "avoid_terms": []})
|
| 582 |
|
| 583 |
bm25_remove = [x for x in (analysis.get("bm25_recommendations") or []) if x.get("action") == "remove"]
|
| 584 |
if len(bm25_remove) >= 4:
|
| 585 |
+
for row in sorted(bm25_remove, key=lambda r: int(r.get("count", 0)), reverse=True)[:8]:
|
| 586 |
+
word = str(row.get("word", "")).strip()
|
| 587 |
+
if not word:
|
| 588 |
+
continue
|
| 589 |
+
goals.append({"type": "bm25", "label": f"reduce spam: {word}", "focus_terms": [], "avoid_terms": [word]})
|
| 590 |
|
| 591 |
# Semantic keyword gaps
|
| 592 |
lang_stop = STOP_WORDS.get(language, STOP_WORDS["en"])
|
|
|
|
| 609 |
if _is_semantic_gap(target_w, comp_w):
|
| 610 |
candidate_rows.append((term, gap))
|
| 611 |
if candidate_rows:
|
| 612 |
+
for term, _gap in sorted(candidate_rows, key=lambda x: x[1], reverse=True)[:12]:
|
| 613 |
+
goals.append({"type": "semantic", "label": term, "focus_terms": [term], "avoid_terms": []})
|
| 614 |
|
| 615 |
# N-gram balancing (toward competitor average with tolerance policy).
|
| 616 |
ngram_rows = _build_ngram_stage_rows(analysis, keywords, language)
|
| 617 |
if ngram_rows:
|
| 618 |
+
for rank, (label, target, comp_avg, tol, _, _) in enumerate(ngram_rows):
|
| 619 |
+
goals.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
"type": "ngram",
|
| 621 |
"label": label,
|
| 622 |
"focus_terms": [label],
|
|
|
|
| 627 |
"ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
|
| 628 |
"ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
|
| 629 |
"ngram_direction": "increase" if target < comp_avg else "decrease",
|
| 630 |
+
"ngram_rank_index": rank,
|
| 631 |
"ngram_candidates_total": len(ngram_rows),
|
| 632 |
+
})
|
| 633 |
|
| 634 |
title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
|
| 635 |
title_target_score = title_bert.get("target_score")
|
|
|
|
| 638 |
and title_target_score is not None
|
| 639 |
and float(title_target_score) < TITLE_TARGET_THRESHOLD
|
| 640 |
):
|
| 641 |
+
goals.append({
|
| 642 |
"type": "title",
|
| 643 |
"label": "title alignment",
|
| 644 |
"focus_terms": _filter_stopwords(_tokenize(" ".join(keywords[:8])), language)[:8],
|
| 645 |
"avoid_terms": [],
|
| 646 |
+
})
|
| 647 |
|
| 648 |
+
return [g for g in goals if g.get("type") == stage]
|
|
|
|
|
|
|
|
|
|
| 649 |
|
| 650 |
|
| 651 |
def _choose_sentence_idx(sentences: List[str], focus_terms: List[str], avoid_terms: List[str], language: str) -> int:
|
|
|
|
| 1507 |
baseline_analysis, baseline_semantic, keywords, language, bert_stage_target=bert_stage_target
|
| 1508 |
)
|
| 1509 |
|
| 1510 |
+
# Unified per-goal budget for all stages:
|
| 1511 |
+
# total steps = sum(goals_in_stage * max_iterations)
|
| 1512 |
+
baseline_goal_counts = {
|
| 1513 |
+
st: len(
|
| 1514 |
+
_collect_optimization_goals(
|
| 1515 |
+
baseline_analysis,
|
| 1516 |
+
baseline_semantic,
|
| 1517 |
+
keywords,
|
| 1518 |
+
language,
|
| 1519 |
+
stage=st,
|
| 1520 |
+
bert_stage_target=bert_stage_target,
|
| 1521 |
+
)
|
| 1522 |
+
)
|
| 1523 |
+
for st in STAGE_ORDER
|
| 1524 |
+
}
|
| 1525 |
+
ngram_row_count = int(baseline_goal_counts.get("ngram", 0))
|
| 1526 |
+
estimated_total = sum(int(c) * int(max_iterations) for c in baseline_goal_counts.values())
|
| 1527 |
+
total_loop_steps = min(240, max(1, estimated_total))
|
| 1528 |
|
| 1529 |
current_text = target_text
|
| 1530 |
current_title = (target_title or "").strip()
|
|
|
|
| 1609 |
break
|
| 1610 |
|
| 1611 |
active_stage = STAGE_ORDER[stage_idx]
|
| 1612 |
+
goals_for_stage = _collect_optimization_goals(
|
| 1613 |
current_analysis,
|
| 1614 |
current_semantic,
|
| 1615 |
keywords,
|
| 1616 |
language,
|
| 1617 |
stage=active_stage,
|
| 1618 |
bert_stage_target=bert_stage_target,
|
|
|
|
| 1619 |
)
|
| 1620 |
+
state = stage_goal_cursor.get(active_stage) or {"goal_index": 0, "attempt_count": 0}
|
| 1621 |
+
goal_index = int(state.get("goal_index", 0))
|
| 1622 |
+
attempt_count = int(state.get("attempt_count", 0))
|
| 1623 |
+
|
| 1624 |
+
# Advance across goals that exhausted per-goal iteration budget.
|
| 1625 |
+
while goal_index < len(goals_for_stage) and attempt_count >= max_iterations:
|
| 1626 |
+
goal_index += 1
|
| 1627 |
+
attempt_count = 0
|
| 1628 |
+
|
| 1629 |
+
if goal_index >= len(goals_for_stage):
|
| 1630 |
+
stage_idx += 1
|
| 1631 |
+
stage_no_progress_steps = 0
|
| 1632 |
+
logs.append(
|
| 1633 |
+
{
|
| 1634 |
+
"step": step + 1,
|
| 1635 |
+
"status": "stage_skipped",
|
| 1636 |
+
"stage": active_stage,
|
| 1637 |
+
"reason": f"All goals exhausted for stage '{active_stage}' (max_iterations={max_iterations} per goal).",
|
| 1638 |
+
}
|
| 1639 |
+
)
|
| 1640 |
+
stage_goal_cursor[active_stage] = {"goal_index": goal_index, "attempt_count": attempt_count}
|
| 1641 |
+
continue
|
| 1642 |
+
|
| 1643 |
+
goal = goals_for_stage[goal_index]
|
| 1644 |
+
attempt_count += 1
|
| 1645 |
+
stage_goal_cursor[active_stage] = {"goal_index": goal_index, "attempt_count": attempt_count}
|
| 1646 |
if goal["type"] == "none":
|
| 1647 |
stage_idx += 1
|
| 1648 |
stage_no_progress_steps = 0
|
|
|
|
| 2236 |
stage_no_progress_steps = 0
|
| 2237 |
else:
|
| 2238 |
stage_no_progress_steps += 1
|
|
|
|
|
|
|
| 2239 |
applied_changes += 1
|
| 2240 |
queued_candidates = []
|
| 2241 |
|
|
|
|
| 2380 |
stage_no_progress_steps = 0
|
| 2381 |
else:
|
| 2382 |
stage_no_progress_steps += 1
|
|
|
|
|
|
|
| 2383 |
applied_changes += 1
|
| 2384 |
batch_applied = True
|
| 2385 |
batch_info = {
|
|
|
|
| 2498 |
}
|
| 2499 |
)
|
| 2500 |
stage_no_progress_steps += 1
|
| 2501 |
+
# Stage transition is controlled by per-stage iteration budget and completion checks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2502 |
consecutive_failures += 1
|
| 2503 |
if consecutive_failures >= 2 and cascade_level < 4:
|
| 2504 |
cascade_level += 1
|
|
|
|
| 2532 |
stage_no_progress_steps = 0
|
| 2533 |
else:
|
| 2534 |
stage_no_progress_steps += 1
|
|
|
|
|
|
|
| 2535 |
applied_changes += 1
|
| 2536 |
queued_candidates = []
|
| 2537 |
|