lsdf committed on
Commit
f8eb22b
·
1 Parent(s): d2ba52e

Implement staged multi-objective optimization flow after BERT.

Browse files

Add explicit stage pipeline (BERT -> BM25 -> Semantic -> N-gram -> Title), stage progress/plateau transitions, and expose candidate phrase strategy in debug table for clearer diagnostics.

Made-with: Cursor

Files changed (2) hide show
  1. optimizer.py +127 -8
  2. templates/index.html +4 -2
optimizer.py CHANGED
@@ -22,8 +22,10 @@ STOP_WORDS = {
22
 
23
  BERT_TARGET_THRESHOLD = 0.7
24
  BERT_GOAL_DELTA_MIN = 0.005
 
25
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
26
  SEMANTIC_GAP_MIN_ABS = 3.0
 
27
 
28
 
29
  def _tokenize(text: str) -> List[str]:
@@ -338,18 +340,25 @@ def _compute_metrics(analysis: Dict[str, Any], semantic: Dict[str, Any], keyword
338
  }
339
 
340
 
341
- def _choose_optimization_goal(analysis: Dict[str, Any], semantic: Dict[str, Any], keywords: List[str], language: str) -> Dict[str, Any]:
 
 
 
 
 
 
 
342
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
343
  low_bert = [x for x in bert_details if float(x.get("my_max_score", 0)) < BERT_TARGET_THRESHOLD]
344
  if low_bert:
345
  worst = sorted(low_bert, key=lambda x: float(x.get("my_max_score", 0)))[0]
346
  focus_terms = _filter_stopwords(_tokenize(worst.get("phrase", "")), language)[:4]
347
- return {"type": "bert", "label": str(worst.get("phrase", "")), "focus_terms": focus_terms, "avoid_terms": []}
348
 
349
  bm25_remove = [x for x in (analysis.get("bm25_recommendations") or []) if x.get("action") == "remove"]
350
  if len(bm25_remove) >= 4:
351
  spam_terms = [str(x.get("word", "")) for x in sorted(bm25_remove, key=lambda r: int(r.get("count", 0)), reverse=True)[:4]]
352
- return {"type": "bm25", "label": "reduce spam", "focus_terms": [], "avoid_terms": spam_terms}
353
 
354
  # Semantic keyword gaps
355
  lang_stop = STOP_WORDS.get(language, STOP_WORDS["en"])
@@ -373,7 +382,7 @@ def _choose_optimization_goal(analysis: Dict[str, Any], semantic: Dict[str, Any]
373
  candidate_rows.append((term, gap))
374
  if candidate_rows:
375
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
376
- return {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
377
 
378
  # Fallback: ngram add signal
379
  for bucket_name in ("unigrams", "bigrams"):
@@ -382,7 +391,28 @@ def _choose_optimization_goal(analysis: Dict[str, Any], semantic: Dict[str, Any]
382
  target = float(item.get("target_count", 0))
383
  comp_avg = float(item.get("competitor_avg", 0))
384
  if (target == 0 and comp_avg > 0) or (target > 0 and comp_avg >= target * 2):
385
- return {"type": "ngram", "label": str(item.get("ngram", "")), "focus_terms": _tokenize(str(item.get("ngram", "")))[:3], "avoid_terms": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
  return {"type": "none", "label": "no-op", "focus_terms": [], "avoid_terms": []}
388
 
@@ -906,6 +936,48 @@ def _safe_delta(prev_metrics: Dict[str, Any], next_metrics: Dict[str, Any], key:
906
  return 0.0
907
 
908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  def _candidate_utility(
910
  *,
911
  prev_metrics: Dict[str, Any],
@@ -1036,13 +1108,38 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1036
  goal_attempt_cursor: Dict[str, int] = {}
1037
  attempted_spans = set()
1038
  queued_candidates: List[Dict[str, Any]] = []
 
 
1039
 
1040
  for step in range(max_iterations):
1041
- goal = _choose_optimization_goal(current_analysis, current_semantic, keywords, language)
1042
- if goal["type"] == "none":
1043
- logs.append({"step": step + 1, "status": "stopped", "reason": "No optimization goals left."})
 
 
1044
  break
1045
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
  sentences = _split_sentences(current_text)
1047
  if not sentences:
1048
  logs.append({"step": step + 1, "status": "stopped", "reason": "No sentences available for editing."})
@@ -1347,6 +1444,10 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1347
  current_analysis = best_local["analysis"]
1348
  current_semantic = best_local["semantic"]
1349
  current_metrics = best_local["metrics"]
 
 
 
 
1350
  applied_changes += 1
1351
  queued_candidates = []
1352
 
@@ -1354,6 +1455,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1354
  {
1355
  "step": step + 1,
1356
  "status": "applied_local_progress",
 
1357
  "goal": goal,
1358
  "cascade_level": cascade_level,
1359
  "operation": best_local.get("operation"),
@@ -1482,6 +1584,10 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1482
  current_analysis = best_batch["batch_analysis"]
1483
  current_semantic = best_batch["batch_semantic"]
1484
  current_metrics = best_batch["batch_metrics"]
 
 
 
 
1485
  applied_changes += 1
1486
  batch_applied = True
1487
  batch_info = {
@@ -1509,6 +1615,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1509
  {
1510
  "step": step + 1,
1511
  "status": "applied_batch",
 
1512
  "goal": goal,
1513
  "cascade_level": cascade_level,
1514
  "operation": "batch",
@@ -1553,6 +1660,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1553
  {
1554
  "step": step + 1,
1555
  "status": "rejected",
 
1556
  "goal": goal,
1557
  "cascade_level": cascade_level,
1558
  "operation": primary_span.get("operation"),
@@ -1597,6 +1705,12 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1597
  ],
1598
  }
1599
  )
 
 
 
 
 
 
1600
  consecutive_failures += 1
1601
  if consecutive_failures >= 2 and cascade_level < 4:
1602
  cascade_level += 1
@@ -1621,6 +1735,10 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1621
  current_analysis = best["analysis"]
1622
  current_semantic = best["semantic"]
1623
  current_metrics = best["metrics"]
 
 
 
 
1624
  applied_changes += 1
1625
  queued_candidates = []
1626
 
@@ -1628,6 +1746,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1628
  {
1629
  "step": step + 1,
1630
  "status": "applied",
 
1631
  "goal": goal,
1632
  "cascade_level": cascade_level,
1633
  "operation": best.get("operation"),
 
22
 
23
  BERT_TARGET_THRESHOLD = 0.7
24
  BERT_GOAL_DELTA_MIN = 0.005
25
+ TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
+ STAGE_ORDER = ["bert", "bm25", "semantic", "ngram", "title"]
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
 
340
  }
341
 
342
 
343
+ def _choose_optimization_goal(
344
+ analysis: Dict[str, Any],
345
+ semantic: Dict[str, Any],
346
+ keywords: List[str],
347
+ language: str,
348
+ stage: str = "bert",
349
+ ) -> Dict[str, Any]:
350
+ candidates: Dict[str, Dict[str, Any]] = {}
351
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
352
  low_bert = [x for x in bert_details if float(x.get("my_max_score", 0)) < BERT_TARGET_THRESHOLD]
353
  if low_bert:
354
  worst = sorted(low_bert, key=lambda x: float(x.get("my_max_score", 0)))[0]
355
  focus_terms = _filter_stopwords(_tokenize(worst.get("phrase", "")), language)[:4]
356
+ candidates["bert"] = {"type": "bert", "label": str(worst.get("phrase", "")), "focus_terms": focus_terms, "avoid_terms": []}
357
 
358
  bm25_remove = [x for x in (analysis.get("bm25_recommendations") or []) if x.get("action") == "remove"]
359
  if len(bm25_remove) >= 4:
360
  spam_terms = [str(x.get("word", "")) for x in sorted(bm25_remove, key=lambda r: int(r.get("count", 0)), reverse=True)[:4]]
361
+ candidates["bm25"] = {"type": "bm25", "label": "reduce spam", "focus_terms": [], "avoid_terms": spam_terms}
362
 
363
  # Semantic keyword gaps
364
  lang_stop = STOP_WORDS.get(language, STOP_WORDS["en"])
 
382
  candidate_rows.append((term, gap))
383
  if candidate_rows:
384
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
385
+ candidates["semantic"] = {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
386
 
387
  # Fallback: ngram add signal
388
  for bucket_name in ("unigrams", "bigrams"):
 
391
  target = float(item.get("target_count", 0))
392
  comp_avg = float(item.get("competitor_avg", 0))
393
  if (target == 0 and comp_avg > 0) or (target > 0 and comp_avg >= target * 2):
394
+ candidates["ngram"] = {
395
+ "type": "ngram",
396
+ "label": str(item.get("ngram", "")),
397
+ "focus_terms": _tokenize(str(item.get("ngram", "")))[:3],
398
+ "avoid_terms": [],
399
+ }
400
+ break
401
+ if "ngram" in candidates:
402
+ break
403
+
404
+ title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
405
+ title_target_score = title_bert.get("target_score")
406
+ if title_target_score is not None and float(title_target_score) < TITLE_TARGET_THRESHOLD:
407
+ candidates["title"] = {
408
+ "type": "title",
409
+ "label": "title alignment",
410
+ "focus_terms": _filter_stopwords(_tokenize(" ".join(keywords[:2])), language)[:4],
411
+ "avoid_terms": [],
412
+ }
413
+
414
+ if stage in candidates:
415
+ return candidates[stage]
416
 
417
  return {"type": "none", "label": "no-op", "focus_terms": [], "avoid_terms": []}
418
 
 
936
  return 0.0
937
 
938
 
939
+ def _stage_primary_progress(stage: str, prev_metrics: Dict[str, Any], next_metrics: Dict[str, Any]) -> bool:
940
+ if stage == "bert":
941
+ prev_low = int(prev_metrics.get("bert_low_count", 0))
942
+ next_low = int(next_metrics.get("bert_low_count", 0))
943
+ if next_low < prev_low:
944
+ return True
945
+ prev_max = max([0.0] + [float(v) for v in (prev_metrics.get("bert_phrase_scores") or {}).values()])
946
+ next_max = max([0.0] + [float(v) for v in (next_metrics.get("bert_phrase_scores") or {}).values()])
947
+ return (next_max - prev_max) >= BERT_GOAL_DELTA_MIN
948
+ if stage == "bm25":
949
+ return int(next_metrics.get("bm25_remove_count", 0)) < int(prev_metrics.get("bm25_remove_count", 0))
950
+ if stage == "semantic":
951
+ return (
952
+ int(next_metrics.get("semantic_gap_count", 0)) < int(prev_metrics.get("semantic_gap_count", 0))
953
+ or float(next_metrics.get("semantic_gap_sum", 0.0)) < float(prev_metrics.get("semantic_gap_sum", 0.0))
954
+ )
955
+ if stage == "ngram":
956
+ return int(next_metrics.get("ngram_signal_count", 0)) < int(prev_metrics.get("ngram_signal_count", 0))
957
+ if stage == "title":
958
+ pv = prev_metrics.get("title_bert_score")
959
+ nv = next_metrics.get("title_bert_score")
960
+ if pv is None or nv is None:
961
+ return False
962
+ return float(nv) > float(pv)
963
+ return False
964
+
965
+
966
+ def _is_stage_complete(stage: str, metrics: Dict[str, Any]) -> bool:
967
+ if stage == "bert":
968
+ return int(metrics.get("bert_low_count", 0)) == 0
969
+ if stage == "bm25":
970
+ return int(metrics.get("bm25_remove_count", 0)) <= 3
971
+ if stage == "semantic":
972
+ return int(metrics.get("semantic_gap_count", 0)) <= 0
973
+ if stage == "ngram":
974
+ return int(metrics.get("ngram_signal_count", 0)) <= 0
975
+ if stage == "title":
976
+ score = metrics.get("title_bert_score")
977
+ return (score is None) or (float(score) >= TITLE_TARGET_THRESHOLD)
978
+ return True
979
+
980
+
981
  def _candidate_utility(
982
  *,
983
  prev_metrics: Dict[str, Any],
 
1108
  goal_attempt_cursor: Dict[str, int] = {}
1109
  attempted_spans = set()
1110
  queued_candidates: List[Dict[str, Any]] = []
1111
+ stage_idx = 0
1112
+ stage_no_progress_steps = 0
1113
 
1114
  for step in range(max_iterations):
1115
+ while stage_idx < len(STAGE_ORDER) and _is_stage_complete(STAGE_ORDER[stage_idx], current_metrics):
1116
+ stage_idx += 1
1117
+ stage_no_progress_steps = 0
1118
+ if stage_idx >= len(STAGE_ORDER):
1119
+ logs.append({"step": step + 1, "status": "stopped", "reason": "All optimization stages completed."})
1120
  break
1121
 
1122
+ active_stage = STAGE_ORDER[stage_idx]
1123
+ goal = _choose_optimization_goal(
1124
+ current_analysis,
1125
+ current_semantic,
1126
+ keywords,
1127
+ language,
1128
+ stage=active_stage,
1129
+ )
1130
+ if goal["type"] == "none":
1131
+ stage_idx += 1
1132
+ stage_no_progress_steps = 0
1133
+ logs.append(
1134
+ {
1135
+ "step": step + 1,
1136
+ "status": "stage_skipped",
1137
+ "stage": active_stage,
1138
+ "reason": f"No actionable goals for stage '{active_stage}', moving to next stage.",
1139
+ }
1140
+ )
1141
+ continue
1142
+
1143
  sentences = _split_sentences(current_text)
1144
  if not sentences:
1145
  logs.append({"step": step + 1, "status": "stopped", "reason": "No sentences available for editing."})
 
1444
  current_analysis = best_local["analysis"]
1445
  current_semantic = best_local["semantic"]
1446
  current_metrics = best_local["metrics"]
1447
+ if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
1448
+ stage_no_progress_steps = 0
1449
+ else:
1450
+ stage_no_progress_steps += 1
1451
  applied_changes += 1
1452
  queued_candidates = []
1453
 
 
1455
  {
1456
  "step": step + 1,
1457
  "status": "applied_local_progress",
1458
+ "stage": active_stage,
1459
  "goal": goal,
1460
  "cascade_level": cascade_level,
1461
  "operation": best_local.get("operation"),
 
1584
  current_analysis = best_batch["batch_analysis"]
1585
  current_semantic = best_batch["batch_semantic"]
1586
  current_metrics = best_batch["batch_metrics"]
1587
+ if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
1588
+ stage_no_progress_steps = 0
1589
+ else:
1590
+ stage_no_progress_steps += 1
1591
  applied_changes += 1
1592
  batch_applied = True
1593
  batch_info = {
 
1615
  {
1616
  "step": step + 1,
1617
  "status": "applied_batch",
1618
+ "stage": active_stage,
1619
  "goal": goal,
1620
  "cascade_level": cascade_level,
1621
  "operation": "batch",
 
1660
  {
1661
  "step": step + 1,
1662
  "status": "rejected",
1663
+ "stage": active_stage,
1664
  "goal": goal,
1665
  "cascade_level": cascade_level,
1666
  "operation": primary_span.get("operation"),
 
1705
  ],
1706
  }
1707
  )
1708
+ stage_no_progress_steps += 1
1709
+ if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1710
+ stage_idx += 1
1711
+ stage_no_progress_steps = 0
1712
+ logs[-1]["advanced_to_stage"] = STAGE_ORDER[stage_idx]
1713
+ logs[-1]["reason"] = f"{logs[-1].get('reason', '-') } Stage plateau: no primary progress for 3 steps."
1714
  consecutive_failures += 1
1715
  if consecutive_failures >= 2 and cascade_level < 4:
1716
  cascade_level += 1
 
1735
  current_analysis = best["analysis"]
1736
  current_semantic = best["semantic"]
1737
  current_metrics = best["metrics"]
1738
+ if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
1739
+ stage_no_progress_steps = 0
1740
+ else:
1741
+ stage_no_progress_steps += 1
1742
  applied_changes += 1
1743
  queued_candidates = []
1744
 
 
1746
  {
1747
  "step": step + 1,
1748
  "status": "applied",
1749
+ "stage": active_stage,
1750
  "goal": goal,
1751
  "cascade_level": cascade_level,
1752
  "operation": best.get("operation"),
templates/index.html CHANGED
@@ -871,6 +871,7 @@
871
  const candidateRows = candidates.map(c => {
872
  const reasons = Array.isArray(c.invalid_reasons) ? c.invalid_reasons.join(', ') : '';
873
  const sentAfter = c.sentence_after ? safeHtml(c.sentence_after) : '-';
 
874
  const relBefore = (c.chunk_relevance_before ?? '-');
875
  const relAfter = (c.chunk_relevance_after ?? '-');
876
  const termDiff = c.term_diff ? safeHtml(JSON.stringify(c.term_diff)) : '-';
@@ -880,6 +881,7 @@
880
  return `
881
  <tr>
882
  <td>${c.candidate_index ?? '-'}</td>
 
883
  <td>${c.valid ? 'yes' : 'no'}</td>
884
  <td>${c.goal_improved ? 'yes' : 'no'}</td>
885
  <td>${c.bert_phrase_delta ?? '-'}</td>
@@ -926,10 +928,10 @@
926
  <table class="table table-sm table-bordered mb-0">
927
  <thead class="table-light">
928
  <tr>
929
- <th>#cand</th><th>valid</th><th>goal+</th><th>bert Δ</th><th>local+</th><th>chunk Δ</th><th>rel b→a</th><th>Δ</th><th>score</th><th>reject reason/error</th><th>кандидат правки</th>
930
  </tr>
931
  </thead>
932
- <tbody>${candidateRows || '<tr><td colspan="11" class="text-center text-muted">Нет кандидатов</td></tr>'}</tbody>
933
  </table>
934
  </div>
935
  </div>
 
871
  const candidateRows = candidates.map(c => {
872
  const reasons = Array.isArray(c.invalid_reasons) ? c.invalid_reasons.join(', ') : '';
873
  const sentAfter = c.sentence_after ? safeHtml(c.sentence_after) : '-';
874
+ const strategy = c.phrase_strategy_used || (c.llm_prompt_debug && c.llm_prompt_debug.phrase_strategy_mode) || '-';
875
  const relBefore = (c.chunk_relevance_before ?? '-');
876
  const relAfter = (c.chunk_relevance_after ?? '-');
877
  const termDiff = c.term_diff ? safeHtml(JSON.stringify(c.term_diff)) : '-';
 
881
  return `
882
  <tr>
883
  <td>${c.candidate_index ?? '-'}</td>
884
+ <td>${safeHtml(strategy)}</td>
885
  <td>${c.valid ? 'yes' : 'no'}</td>
886
  <td>${c.goal_improved ? 'yes' : 'no'}</td>
887
  <td>${c.bert_phrase_delta ?? '-'}</td>
 
928
  <table class="table table-sm table-bordered mb-0">
929
  <thead class="table-light">
930
  <tr>
931
+ <th>#cand</th><th>strategy</th><th>valid</th><th>goal+</th><th>bert Δ</th><th>local+</th><th>chunk Δ</th><th>rel b→a</th><th>Δ</th><th>score</th><th>reject reason/error</th><th>кандидат правки</th>
932
  </tr>
933
  </thead>
934
+ <tbody>${candidateRows || '<tr><td colspan="12" class="text-center text-muted">Нет кандидатов</td></tr>'}</tbody>
935
  </table>
936
  </div>
937
  </div>