lsdf committed on
Commit
09a2c0e
·
1 Parent(s): dd4e1d6

Add configurable phrase strategy mode for LLM optimizer.

Browse files

Expose Auto/Distributed/Exact phrase strategy in UI and API, then enforce it in prompt generation and response metadata to reduce unnatural exact-phrase stuffing.

Made-with: Cursor

docs/FULL_FUNCTIONAL_DOCUMENTATION.md CHANGED
@@ -164,7 +164,8 @@
164
  ### Вход (`OptimizerRequest`)
165
  - аналитические данные: `target_text`, `competitors`, `keywords`, `language`, `target_title`, `competitor_titles`
166
  - LLM: `api_key`, `api_base_url`, `model`, `temperature`
167
- - стратегия: `max_iterations`, `candidates_per_iteration`, `optimization_mode`
 
168
 
169
  ### Выход (`OptimizerResponse`)
170
  - `optimized_text`
@@ -172,6 +173,7 @@
172
  - `iterations[]` (подробный лог шагов)
173
  - `applied_changes`
174
  - `optimization_mode`
 
175
  - `error` (если есть)
176
 
177
  ---
@@ -428,7 +430,10 @@ HTML extraction pipeline:
428
  - учитывает `cascade_level` и тип операции (`rewrite`/`insert`)
429
  - явно требует грамматически корректный и естественный текст
430
  - ограничивает число предложений по уровню
431
- - для BERT допускает 2 валидные схемы: exact phrase один раз **или** естественное разнесённое использование core-термов (`mbit`, `alternatives`) в одном абзаце.
 
 
 
432
  - для `rewrite` явно требует сохранить исходный смысл `sentence-by-sentence` и не менять субъект/ключевую сущность без необходимости.
433
 
434
  ### Применение правок
@@ -448,6 +453,9 @@ HTML extraction pipeline:
448
  - hard constraints (не ухудшать критичные метрики сверх допустимого);
449
  - режимы `conservative/balanced/aggressive` задают пороги регрессии;
450
  - решение учитывает и `goal_improved`, и общий `delta_score`.
 
 
 
451
 
452
  ### Главная функция `optimize_text`
453
  Итерационный цикл:
 
164
  ### Вход (`OptimizerRequest`)
165
  - аналитические данные: `target_text`, `competitors`, `keywords`, `language`, `target_title`, `competitor_titles`
166
  - LLM: `api_key`, `api_base_url`, `model`, `temperature`
167
+ - стратегия: `max_iterations`, `candidates_per_iteration`, `optimization_mode`, `phrase_strategy_mode`
168
+ - `phrase_strategy_mode`: `auto | distributed_preferred | exact_preferred`
169
 
170
  ### Выход (`OptimizerResponse`)
171
  - `optimized_text`
 
173
  - `iterations[]` (подробный лог шагов)
174
  - `applied_changes`
175
  - `optimization_mode`
176
+ - `phrase_strategy_mode`
177
  - `error` (если есть)
178
 
179
  ---
 
430
  - учитывает `cascade_level` и тип операции (`rewrite`/`insert`)
431
  - явно требует грамматически корректный и естественный текст
432
  - ограничивает число предложений по уровню
433
+ - для BERT динамически выбирает стратегию по длине целевой фразы:
434
+ - короткие цели: допустим один natural exact match;
435
+ - длинные multi-word цели: приоритет у distributed semantic coverage (части фразы/леммы/близкие формулировки), без forced exact match.
436
+ - exact phrase не должен повторяться: при неестественном звучании он запрещается в пользу распределённой формулировки.
437
  - для `rewrite` явно требует сохранить исходный смысл `sentence-by-sentence` и не менять субъект/ключевую сущность без необходимости.
438
 
439
  ### Применение правок
 
453
  - hard constraints (не ухудшать критичные метрики сверх допустимого);
454
  - режимы `conservative/balanced/aggressive` задают пороги регрессии;
455
  - решение учитывает и `goal_improved`, и общий `delta_score`.
456
+ - `_validate_candidate_text`:
457
+ - отклоняет некачественные/спамные кандидаты (дубли слов/сущностей, подозрительные склейки токенов);
458
+ - добавляет anti-stuffing фильтр для цели BERT (повторы exact phrase и чрезмерные повторы focus-термов).
459
 
460
  ### Главная функция `optimize_text`
461
  Итерационный цикл:
models.py CHANGED
@@ -91,6 +91,7 @@ class OptimizerRequest(BaseModel):
91
  candidates_per_iteration: int = 2
92
  temperature: float = 0.25
93
  optimization_mode: str = "balanced"
 
94
 
95
 
96
  class OptimizerResponse(BaseModel):
@@ -101,4 +102,5 @@ class OptimizerResponse(BaseModel):
101
  iterations: List[Dict[str, Any]] = Field(default_factory=list)
102
  applied_changes: int = 0
103
  optimization_mode: str = "balanced"
 
104
  error: str = ""
 
91
  candidates_per_iteration: int = 2
92
  temperature: float = 0.25
93
  optimization_mode: str = "balanced"
94
+ phrase_strategy_mode: str = "auto" # auto | exact_preferred | distributed_preferred
95
 
96
 
97
  class OptimizerResponse(BaseModel):
 
102
  iterations: List[Dict[str, Any]] = Field(default_factory=list)
103
  applied_changes: int = 0
104
  optimization_mode: str = "balanced"
105
+ phrase_strategy_mode: str = "auto"
106
  error: str = ""
optimizer.py CHANGED
@@ -60,7 +60,13 @@ def _max_sentences_for_level(cascade_level: int, operation: str) -> int:
60
  return 4
61
 
62
 
63
- def _validate_candidate_text(edited_text: str, cascade_level: int, operation: str) -> List[str]:
 
 
 
 
 
 
64
  reasons: List[str] = []
65
  text = (edited_text or "").strip()
66
  if not text:
@@ -79,6 +85,29 @@ def _validate_candidate_text(edited_text: str, cascade_level: int, operation: st
79
  if re.search(r"\b[a-z]{6,}[A-Z][a-z]+\b", text):
80
  reasons.append("suspicious_token_join")
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  return reasons
83
 
84
 
@@ -685,6 +714,7 @@ def _llm_edit_chunk(
685
  focus_terms: List[str],
686
  avoid_terms: List[str],
687
  temperature: float,
 
688
  ) -> Dict[str, Any]:
689
  endpoint = base_url.rstrip("/") + "/chat/completions"
690
  op = operation if operation in {"rewrite", "insert"} else "rewrite"
@@ -701,17 +731,45 @@ def _llm_edit_chunk(
701
  else "Create a short bridge chunk (1-2 sentences) to insert after the chunk."
702
  )
703
  max_sent = _max_sentences_for_level(cascade_level, op)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  user_msg = (
705
  f"Language: {language}\n"
706
  f"Operation: {op}\n"
707
  f"Cascade level: L{cascade_level}\n"
708
  f"Goal: {goal_type} ({goal_label})\n"
 
709
  f"Instruction: {op_instruction}\n"
710
  f"Must preserve overall narrative and style.\n"
711
  "Text must be grammatically correct and natural for native readers.\n"
712
  "Keep edits tightly local to the provided chunk and immediate context only.\n"
713
  "Edit must be substantive (not just synonyms) and should increase relevance to the goal phrase.\n"
714
  "Do not change the sentence subject/entity focus unless absolutely required by grammar.\n"
 
715
  f"Focus terms to strengthen: {', '.join(focus_terms) if focus_terms else '-'}\n"
716
  f"Terms to de-emphasize/avoid overuse: {', '.join(avoid_terms) if avoid_terms else '-'}\n\n"
717
  f"Chunk to edit/expand:\n{chunk_text}\n\n"
@@ -722,11 +780,13 @@ def _llm_edit_chunk(
722
  "2) Keep local coherence with surrounding text.\n"
723
  f"3) Max {max_sent} sentence(s) in edited_text.\n"
724
  "4) Keep key named entities from the original chunk unchanged when possible.\n"
725
- "5) For BERT goal, improve semantic match to goal phrase without keyword stuffing.\n"
726
- "6) For BERT goals you may use either: (a) exact phrase once, or (b) natural distributed use of core terms in one paragraph.\n"
727
- "7) For rewrite: preserve original meaning sentence-by-sentence while improving relevance.\n"
728
- "8) Provide rationale in one short sentence.\n"
729
- "9) Only output JSON object."
 
 
730
  )
731
  payload = {
732
  "model": model,
@@ -764,6 +824,9 @@ def _llm_edit_chunk(
764
  "goal_label": goal_label,
765
  "focus_terms": focus_terms,
766
  "avoid_terms": avoid_terms,
 
 
 
767
  "max_sentences": max_sent,
768
  "chunk_text": chunk_text,
769
  "context_before": context_before,
@@ -930,6 +993,9 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
930
  candidates_per_iteration = max(1, min(5, candidates_per_iteration))
931
  temperature = float(request_data.get("temperature", 0.25) or 0.25)
932
  optimization_mode = str(request_data.get("optimization_mode", "balanced") or "balanced")
 
 
 
933
 
934
  baseline_analysis = _build_analysis_snapshot(
935
  target_text, competitors, keywords, language, target_title, competitor_titles
@@ -1022,6 +1088,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1022
  focus_terms=goal["focus_terms"],
1023
  avoid_terms=goal["avoid_terms"],
1024
  temperature=temp,
 
1025
  )
1026
  edited_text = str((llm_result or {}).get("edited_text", "")).strip()
1027
  llm_rationale = str((llm_result or {}).get("rationale", "")).strip()
@@ -1029,7 +1096,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1029
  if not edited_text or edited_text == original_span_text:
1030
  continue
1031
 
1032
- quality_issues = _validate_candidate_text(edited_text, cascade_level, operation)
 
 
 
 
 
 
1033
  before_rel, after_rel = _chunk_relevance_pair(
1034
  original_span_text,
1035
  edited_text,
@@ -1576,4 +1649,5 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1576
  "iterations": logs,
1577
  "applied_changes": applied_changes,
1578
  "optimization_mode": optimization_mode,
 
1579
  }
 
60
  return 4
61
 
62
 
63
+ def _validate_candidate_text(
64
+ edited_text: str,
65
+ cascade_level: int,
66
+ operation: str,
67
+ goal_label: str = "",
68
+ focus_terms: Optional[List[str]] = None,
69
+ ) -> List[str]:
70
  reasons: List[str] = []
71
  text = (edited_text or "").strip()
72
  if not text:
 
85
  if re.search(r"\b[a-z]{6,}[A-Z][a-z]+\b", text):
86
  reasons.append("suspicious_token_join")
87
 
88
+ # Anti-stuffing checks for BERT phrase goals.
89
+ focus_terms = focus_terms or []
90
+ phrase = (goal_label or "").strip().lower()
91
+ normalized = re.sub(r"\s+", " ", text.lower())
92
+
93
+ if phrase:
94
+ phrase_occurrences = normalized.count(phrase)
95
+ phrase_token_count = len(_tokenize(phrase))
96
+ # For long goal phrases, repeated exact matches are usually unnatural.
97
+ if phrase_token_count >= 3 and phrase_occurrences > 1:
98
+ reasons.append("exact_phrase_stuffing")
99
+ elif phrase_occurrences > 2:
100
+ reasons.append("exact_phrase_stuffing")
101
+
102
+ for term in focus_terms:
103
+ tok = (term or "").strip().lower()
104
+ if not tok:
105
+ continue
106
+ term_occurrences = len(re.findall(rf"\b{re.escape(tok)}\b", normalized))
107
+ if term_occurrences > 3:
108
+ reasons.append("focus_term_overuse")
109
+ break
110
+
111
  return reasons
112
 
113
 
 
714
  focus_terms: List[str],
715
  avoid_terms: List[str],
716
  temperature: float,
717
+ phrase_strategy_mode: str = "auto",
718
  ) -> Dict[str, Any]:
719
  endpoint = base_url.rstrip("/") + "/chat/completions"
720
  op = operation if operation in {"rewrite", "insert"} else "rewrite"
 
731
  else "Create a short bridge chunk (1-2 sentences) to insert after the chunk."
732
  )
733
  max_sent = _max_sentences_for_level(cascade_level, op)
734
+ phrase_tokens = _filter_stopwords(_tokenize(goal_label or ""), language)
735
+ phrase_len = len(phrase_tokens)
736
+ strategy_mode = (phrase_strategy_mode or "auto").strip().lower()
737
+ if strategy_mode not in {"auto", "exact_preferred", "distributed_preferred"}:
738
+ strategy_mode = "auto"
739
+ if strategy_mode == "exact_preferred":
740
+ phrase_strategy = (
741
+ "Prefer one natural exact phrase mention when grammatically correct; otherwise use distributed core-term coverage."
742
+ )
743
+ elif strategy_mode == "distributed_preferred":
744
+ phrase_strategy = (
745
+ "Prefer distributed semantic coverage: spread core terms/lemmas naturally and avoid exact phrase unless absolutely natural."
746
+ )
747
+ elif phrase_len >= 3:
748
+ phrase_strategy = (
749
+ "Prefer distributed semantic coverage for long phrases: naturally spread core terms/lemmas across the local paragraph. "
750
+ "Use exact phrase only if it is grammatically natural."
751
+ )
752
+ elif phrase_len == 2:
753
+ phrase_strategy = (
754
+ "For two-term goals, use either one natural exact phrase or distributed use of both terms without repetition."
755
+ )
756
+ else:
757
+ phrase_strategy = (
758
+ "For single-term goals, improve relevance using natural lexical variants and nearby semantic anchors."
759
+ )
760
  user_msg = (
761
  f"Language: {language}\n"
762
  f"Operation: {op}\n"
763
  f"Cascade level: L{cascade_level}\n"
764
  f"Goal: {goal_type} ({goal_label})\n"
765
+ f"Goal token count (without stopwords): {phrase_len}\n"
766
  f"Instruction: {op_instruction}\n"
767
  f"Must preserve overall narrative and style.\n"
768
  "Text must be grammatically correct and natural for native readers.\n"
769
  "Keep edits tightly local to the provided chunk and immediate context only.\n"
770
  "Edit must be substantive (not just synonyms) and should increase relevance to the goal phrase.\n"
771
  "Do not change the sentence subject/entity focus unless absolutely required by grammar.\n"
772
+ f"Phrase strategy: {phrase_strategy}\n"
773
  f"Focus terms to strengthen: {', '.join(focus_terms) if focus_terms else '-'}\n"
774
  f"Terms to de-emphasize/avoid overuse: {', '.join(avoid_terms) if avoid_terms else '-'}\n\n"
775
  f"Chunk to edit/expand:\n{chunk_text}\n\n"
 
780
  "2) Keep local coherence with surrounding text.\n"
781
  f"3) Max {max_sent} sentence(s) in edited_text.\n"
782
  "4) Keep key named entities from the original chunk unchanged when possible.\n"
783
+ "5) For BERT goals, prioritize semantic alignment over exact phrase repetition.\n"
784
+ "6) If exact phrase sounds unnatural, do NOT force it; use grammatically correct distributed wording.\n"
785
+ "7) Exact phrase may appear at most once, and only when it reads naturally.\n"
786
+ "8) Avoid repeating the same focus term more than needed; no stuffing.\n"
787
+ "9) For rewrite: preserve original meaning sentence-by-sentence while improving relevance.\n"
788
+ "10) Provide rationale in one short sentence.\n"
789
+ "11) Only output JSON object."
790
  )
791
  payload = {
792
  "model": model,
 
824
  "goal_label": goal_label,
825
  "focus_terms": focus_terms,
826
  "avoid_terms": avoid_terms,
827
+ "phrase_strategy_mode": strategy_mode,
828
+ "goal_token_count": phrase_len,
829
+ "phrase_strategy": phrase_strategy,
830
  "max_sentences": max_sent,
831
  "chunk_text": chunk_text,
832
  "context_before": context_before,
 
993
  candidates_per_iteration = max(1, min(5, candidates_per_iteration))
994
  temperature = float(request_data.get("temperature", 0.25) or 0.25)
995
  optimization_mode = str(request_data.get("optimization_mode", "balanced") or "balanced")
996
+ phrase_strategy_mode = str(request_data.get("phrase_strategy_mode", "auto") or "auto").strip().lower()
997
+ if phrase_strategy_mode not in {"auto", "exact_preferred", "distributed_preferred"}:
998
+ phrase_strategy_mode = "auto"
999
 
1000
  baseline_analysis = _build_analysis_snapshot(
1001
  target_text, competitors, keywords, language, target_title, competitor_titles
 
1088
  focus_terms=goal["focus_terms"],
1089
  avoid_terms=goal["avoid_terms"],
1090
  temperature=temp,
1091
+ phrase_strategy_mode=phrase_strategy_mode,
1092
  )
1093
  edited_text = str((llm_result or {}).get("edited_text", "")).strip()
1094
  llm_rationale = str((llm_result or {}).get("rationale", "")).strip()
 
1096
  if not edited_text or edited_text == original_span_text:
1097
  continue
1098
 
1099
+ quality_issues = _validate_candidate_text(
1100
+ edited_text,
1101
+ cascade_level,
1102
+ operation,
1103
+ goal_label=goal.get("label", ""),
1104
+ focus_terms=goal.get("focus_terms", []) or [],
1105
+ )
1106
  before_rel, after_rel = _chunk_relevance_pair(
1107
  original_span_text,
1108
  edited_text,
 
1649
  "iterations": logs,
1650
  "applied_changes": applied_changes,
1651
  "optimization_mode": optimization_mode,
1652
+ "phrase_strategy_mode": phrase_strategy_mode,
1653
  }
templates/index.html CHANGED
@@ -310,6 +310,14 @@
310
  <option value="aggressive">Aggressive</option>
311
  </select>
312
  </div>
 
 
 
 
 
 
 
 
313
  </div>
314
  <div class="d-flex gap-2 mt-3">
315
  <button class="btn btn-dark" onclick="runLlmOptimization()">Запустить оптимизацию</button>
@@ -545,7 +553,8 @@
545
  optimizer_iterations: Number(document.getElementById('optimizerIterations').value || 2),
546
  optimizer_candidates: Number(document.getElementById('optimizerCandidates').value || 2),
547
  optimizer_temperature: Number(document.getElementById('optimizerTemp').value || 0.25),
548
- optimizer_mode: document.getElementById('optimizerMode').value
 
549
  },
550
  state: {
551
  analysis_result: currentData,
@@ -596,6 +605,7 @@
596
  document.getElementById('optimizerCandidates').value = 2;
597
  document.getElementById('optimizerTemp').value = 0.25;
598
  document.getElementById('optimizerMode').value = 'balanced';
 
599
 
600
  // Competitor text fields
601
  const competitorsList = document.getElementById('competitorsList');
@@ -649,6 +659,7 @@
649
  document.getElementById('optimizerCandidates').value = inp.optimizer_candidates ?? 2;
650
  document.getElementById('optimizerTemp').value = inp.optimizer_temperature ?? 0.25;
651
  document.getElementById('optimizerMode').value = inp.optimizer_mode || 'balanced';
 
652
 
653
  // Title character counter refresh
654
  const titleLen = (inp.target_title || '').length;
@@ -924,7 +935,10 @@
924
  <div class="stat-card">
925
  <h6 class="card-title">Результат оптимизации</h6>
926
  <div class="small mb-2">Применено правок: <strong>${data.applied_changes || 0}</strong></div>
927
- <div class="small mb-2">Режим: <strong>${data.optimization_mode || 'balanced'}</strong></div>
 
 
 
928
  <div class="table-responsive">
929
  <table class="table table-sm table-bordered mb-0">
930
  <thead class="table-light"><tr><th>Метрика</th><th>До</th><th>После</th></tr></thead>
@@ -981,7 +995,8 @@
981
  max_iterations: Number(document.getElementById('optimizerIterations').value || 2),
982
  candidates_per_iteration: Number(document.getElementById('optimizerCandidates').value || 2),
983
  temperature: Number(document.getElementById('optimizerTemp').value || 0.25),
984
- optimization_mode: document.getElementById('optimizerMode').value || 'balanced'
 
985
  };
986
 
987
  document.getElementById('loader').style.display = 'block';
 
310
  <option value="aggressive">Aggressive</option>
311
  </select>
312
  </div>
313
+ <div class="col-md-3">
314
+ <label class="form-label small text-muted mb-1">Phrase Strategy</label>
315
+ <select id="optimizerPhraseStrategy" class="form-select">
316
+ <option value="auto" selected>Auto</option>
317
+ <option value="distributed_preferred">Distributed preferred</option>
318
+ <option value="exact_preferred">Exact phrase preferred</option>
319
+ </select>
320
+ </div>
321
  </div>
322
  <div class="d-flex gap-2 mt-3">
323
  <button class="btn btn-dark" onclick="runLlmOptimization()">Запустить оптимизацию</button>
 
553
  optimizer_iterations: Number(document.getElementById('optimizerIterations').value || 2),
554
  optimizer_candidates: Number(document.getElementById('optimizerCandidates').value || 2),
555
  optimizer_temperature: Number(document.getElementById('optimizerTemp').value || 0.25),
556
+ optimizer_mode: document.getElementById('optimizerMode').value,
557
+ optimizer_phrase_strategy: document.getElementById('optimizerPhraseStrategy').value
558
  },
559
  state: {
560
  analysis_result: currentData,
 
605
  document.getElementById('optimizerCandidates').value = 2;
606
  document.getElementById('optimizerTemp').value = 0.25;
607
  document.getElementById('optimizerMode').value = 'balanced';
608
+ document.getElementById('optimizerPhraseStrategy').value = 'auto';
609
 
610
  // Competitor text fields
611
  const competitorsList = document.getElementById('competitorsList');
 
659
  document.getElementById('optimizerCandidates').value = inp.optimizer_candidates ?? 2;
660
  document.getElementById('optimizerTemp').value = inp.optimizer_temperature ?? 0.25;
661
  document.getElementById('optimizerMode').value = inp.optimizer_mode || 'balanced';
662
+ document.getElementById('optimizerPhraseStrategy').value = inp.optimizer_phrase_strategy || 'auto';
663
 
664
  // Title character counter refresh
665
  const titleLen = (inp.target_title || '').length;
 
935
  <div class="stat-card">
936
  <h6 class="card-title">Результат оптимизации</h6>
937
  <div class="small mb-2">Применено правок: <strong>${data.applied_changes || 0}</strong></div>
938
+ <div class="small mb-2">
939
+ Режим: <strong>${data.optimization_mode || 'balanced'}</strong>
940
+ · Phrase Strategy: <strong>${data.phrase_strategy_mode || 'auto'}</strong>
941
+ </div>
942
  <div class="table-responsive">
943
  <table class="table table-sm table-bordered mb-0">
944
  <thead class="table-light"><tr><th>Метрика</th><th>До</th><th>После</th></tr></thead>
 
995
  max_iterations: Number(document.getElementById('optimizerIterations').value || 2),
996
  candidates_per_iteration: Number(document.getElementById('optimizerCandidates').value || 2),
997
  temperature: Number(document.getElementById('optimizerTemp').value || 0.25),
998
+ optimization_mode: document.getElementById('optimizerMode').value || 'balanced',
999
+ phrase_strategy_mode: document.getElementById('optimizerPhraseStrategy').value || 'auto'
1000
  };
1001
 
1002
  document.getElementById('loader').style.display = 'block';