lsdf committed on
Commit
d5937ae
·
1 Parent(s): 84ebb46

Implement n-gram tolerance stage rules and optimizer principles doc.

Browse files

Apply user-defined n-gram gap policy (20% for avg>=4, 50% for avg<4), optimize n-gram stage toward competitor averages with local-safe acceptance, and add a living optimizer principles document.

Made-with: Cursor

docs/FULL_FUNCTIONAL_DOCUMENTATION.md CHANGED
@@ -44,6 +44,7 @@
44
  - `search.py` — смысловой поиск в графе (фразы + слова).
45
  - `url_fetcher.py` — извлечение текста/title из URL с выбором user-agent.
46
  - `optimizer.py` — LLM-оптимизация с обратной связью от метрик.
 
47
  - `templates/index.html` — frontend (UI + клиентская логика JS).
48
 
49
  ---
 
44
  - `search.py` — смысловой поиск в графе (фразы + слова).
45
  - `url_fetcher.py` — извлечение текста/title из URL с выбором user-agent.
46
  - `optimizer.py` — LLM-оптимизация с обратной связью от метрик.
47
+ - `docs/TEXT_OPTIMIZER_PRINCIPLES.md` — живой регламент принципов оптимизатора (stage-пайплайн, допуски, guardrails).
48
  - `templates/index.html` — frontend (UI + клиентская логика JS).
49
 
50
  ---
docs/TEXT_OPTIMIZER_PRINCIPLES.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Optimizer Principles
2
+
3
+ This document is a living spec for iterative text optimization behavior.
4
+ Update it whenever optimization policy changes.
5
+
6
+ ## 1) Multi-objective optimization model
7
+
8
+ - **Primary objective (by stage):**
9
+ - Stage A: BERT phrase relevance
10
+ - Stage B: BM25 remove cleanup
11
+ - Stage C: N-gram balancing
12
+ - Stage D: Semantic gap balancing
13
+ - Stage E: Title alignment
14
+ - **Guardrails (always active):**
15
+ - Do not allow critical metric regressions beyond mode tolerances.
16
+ - Keep grammar, coherence, and non-spam writing.
17
+
18
+ ## 2) Stage order and skipping
19
+
20
+ - Stage order:
21
+ - `bert -> bm25 -> ngram -> semantic -> title`
22
+ - A stage is skipped if no actionable goal exists.
23
+ - Plateau rule:
24
+ - If no primary progress for 3 steps, move to next stage.
25
+
26
+ ## 3) BERT stage policy
27
+
28
+ - Default Stage A threshold: `0.70`.
29
+ - User may set custom threshold via UI (`BERT target A-stage`), e.g. `0.61`.
30
+ - Stage A is complete when max target phrase score reaches configured threshold.
31
+
32
+ ## 4) BM25 stage policy
33
+
34
+ - Main target: reduce/remove over-optimization signals.
35
+ - The stage is considered healthy when `bm25_remove_count <= 3`.
36
+
37
+ ## 5) N-gram stage policy (quantitative)
38
+
39
+ - Goal: bring target counts closer to competitor average, not force exact equality.
40
+ - Tolerance bands:
41
+ - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
+ - if `avg < 4`: acceptable range is `avg +/- 50%`
43
+ - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
44
+
45
+ ## 6) Local acceptance and batch accumulation
46
+
47
+ - First evaluate candidate locally (chunk-level), then globally (document-level).
48
+ - Locally improved candidates may be queued when global score does not move yet.
49
+ - Non-conflicting queued edits can be applied as a batch (2-4 edits) if guardrails pass.
50
+
51
+ ## 7) Text quality constraints
52
+
53
+ - Reject candidates with:
54
+ - duplicated entities/words,
55
+ - suspicious token joins,
56
+ - excessive sentence count for current cascade level,
57
+ - obvious stuffing/redundancy.
58
+ - Keep narrative continuity and original subject/entity focus.
59
+
60
+ ## 8) Diagnostics requirements
61
+
62
+ - For every iteration, store:
63
+ - stage, goal, cascade level,
64
+ - candidate validity, local improvement, metric deltas,
65
+ - selected strategy and prompt debug payload.
66
+ - UI must show:
67
+ - stage progression,
68
+ - stage transitions,
69
+ - candidate strategy and reason for rejection.
70
+
optimizer.py CHANGED
@@ -25,7 +25,7 @@ BERT_GOAL_DELTA_MIN = 0.005
25
  TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
- STAGE_ORDER = ["bert", "bm25", "semantic", "ngram", "title"]
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
@@ -232,6 +232,44 @@ def _is_semantic_gap(target_weight: float, competitor_avg_weight: float) -> bool
232
  return (competitor_avg_weight > rel_threshold) and (abs_gap >= SEMANTIC_GAP_MIN_ABS)
233
 
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  def _compute_metrics(
236
  analysis: Dict[str, Any],
237
  semantic: Dict[str, Any],
@@ -254,6 +292,7 @@ def _compute_metrics(
254
  bm25_remove_count = len(bm25_remove)
255
 
256
  ngram_signal_count = 0
 
257
  ngrams = analysis.get("ngram_stats", {}) or {}
258
  for bucket_name in ("unigrams", "bigrams"):
259
  for item in (ngrams.get(bucket_name) or []):
@@ -262,9 +301,9 @@ def _compute_metrics(
262
  continue
263
  target = float(item.get("target_count", 0))
264
  comp_avg = float(item.get("competitor_avg", 0))
265
- ratio_signal = comp_avg > 0 if target == 0 else comp_avg >= target * 2
266
- if ratio_signal:
267
  ngram_signal_count += 1
 
268
 
269
  title_score = None
270
  title_bert = analysis.get("title_analysis", {}).get("bert", {})
@@ -335,6 +374,7 @@ def _compute_metrics(
335
  "bert_phrase_scores": bert_phrase_scores,
336
  "bm25_remove_count": bm25_remove_count,
337
  "ngram_signal_count": ngram_signal_count,
 
338
  "title_bert_score": title_score,
339
  "semantic_gap_count": semantic_gap_count,
340
  "semantic_gap_sum": round(semantic_gap_sum, 4),
@@ -391,22 +431,41 @@ def _choose_optimization_goal(
391
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
392
  candidates["semantic"] = {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
393
 
394
- # Fallback: ngram add signal
395
- for bucket_name in ("unigrams", "bigrams"):
396
- bucket = analysis.get("ngram_stats", {}).get(bucket_name, []) or []
 
 
 
397
  for item in bucket:
 
 
 
398
  target = float(item.get("target_count", 0))
399
  comp_avg = float(item.get("competitor_avg", 0))
400
- if (target == 0 and comp_avg > 0) or (target > 0 and comp_avg >= target * 2):
401
- candidates["ngram"] = {
402
- "type": "ngram",
403
- "label": str(item.get("ngram", "")),
404
- "focus_terms": _tokenize(str(item.get("ngram", "")))[:3],
405
- "avoid_terms": [],
406
- }
407
- break
408
- if "ngram" in candidates:
409
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
412
  title_target_score = title_bert.get("target_score")
@@ -584,6 +643,7 @@ def _chunk_goal_relevance(
584
  goal_label: str,
585
  focus_terms: List[str],
586
  language: str,
 
587
  ) -> float:
588
  chunk = (text or "").strip()
589
  if not chunk:
@@ -595,6 +655,8 @@ def _chunk_goal_relevance(
595
  return float(logic.util.cos_sim(embeddings[0:1], embeddings[1:2])[0][0].item())
596
  except Exception:
597
  pass
 
 
598
 
599
  # Lexical fallback for non-BERT goals or if embedding scoring is unavailable.
600
  toks = _filter_stopwords(_tokenize(chunk), language)
@@ -620,15 +682,26 @@ def _chunk_goal_delta(
620
  goal_label: str,
621
  focus_terms: List[str],
622
  language: str,
 
623
  ) -> float:
624
- before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language)
625
- after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language)
 
 
 
 
 
 
 
626
  return round(after_rel - before_rel, 4)
627
 
628
 
629
  def _min_chunk_delta(goal_type: str) -> float:
630
  if goal_type == "bert":
631
  return 0.01
 
 
 
632
  return 0.05
633
 
634
 
@@ -639,9 +712,10 @@ def _chunk_relevance_pair(
639
  goal_label: str,
640
  focus_terms: List[str],
641
  language: str,
 
642
  ) -> Tuple[float, float]:
643
- before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language)
644
- after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language)
645
  return round(before_rel, 4), round(after_rel, 4)
646
 
647
 
@@ -690,6 +764,7 @@ def _metrics_delta(prev_metrics: Dict[str, Any], next_metrics: Dict[str, Any]) -
690
  "bert_low_count",
691
  "bm25_remove_count",
692
  "ngram_signal_count",
 
693
  "semantic_gap_count",
694
  "semantic_gap_sum",
695
  ]
@@ -925,7 +1000,10 @@ def _goal_improved(
925
  if goal_type == "semantic":
926
  return next_metrics["semantic_gap_count"] < prev_metrics["semantic_gap_count"]
927
  if goal_type == "ngram":
928
- return next_metrics["ngram_signal_count"] < prev_metrics["ngram_signal_count"]
 
 
 
929
  return next_metrics["score"] > prev_metrics["score"]
930
 
931
 
@@ -960,7 +1038,10 @@ def _stage_primary_progress(stage: str, prev_metrics: Dict[str, Any], next_metri
960
  or float(next_metrics.get("semantic_gap_sum", 0.0)) < float(prev_metrics.get("semantic_gap_sum", 0.0))
961
  )
962
  if stage == "ngram":
963
- return int(next_metrics.get("ngram_signal_count", 0)) < int(prev_metrics.get("ngram_signal_count", 0))
 
 
 
964
  if stage == "title":
965
  pv = prev_metrics.get("title_bert_score")
966
  nv = next_metrics.get("title_bert_score")
@@ -1250,6 +1331,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1250
  goal["label"],
1251
  goal.get("focus_terms", []) or [],
1252
  language,
 
1253
  )
1254
  chunk_delta = _chunk_goal_delta(
1255
  original_span_text,
@@ -1258,6 +1340,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1258
  goal["label"],
1259
  goal.get("focus_terms", []) or [],
1260
  language,
 
1261
  )
1262
  local_chunk_improved = chunk_delta >= _min_chunk_delta(goal["type"])
1263
 
 
25
  TITLE_TARGET_THRESHOLD = 0.65
26
  SEMANTIC_GAP_TOLERANCE_PCT = 0.15
27
  SEMANTIC_GAP_MIN_ABS = 3.0
28
+ STAGE_ORDER = ["bert", "bm25", "ngram", "semantic", "title"]
29
 
30
 
31
  def _tokenize(text: str) -> List[str]:
 
232
  return (competitor_avg_weight > rel_threshold) and (abs_gap >= SEMANTIC_GAP_MIN_ABS)
233
 
234
 
235
+ def _ngram_tolerance_pct(competitor_avg: float) -> float:
236
+ # User rule:
237
+ # - avg >= 4 -> +/-20%
238
+ # - avg < 4 -> +/-50%
239
+ return 0.20 if competitor_avg >= 4.0 else 0.50
240
+
241
+
242
def _is_ngram_outside_tolerance(target_count: float, competitor_avg: float) -> bool:
    """Check whether a target n-gram count falls outside the tolerance band.

    The band is centered on the competitor average and sized by
    `_ngram_tolerance_pct`. With no competitor signal (avg <= 0) the
    count is never flagged.
    """
    if competitor_avg <= 0:
        return False
    tolerance = _ngram_tolerance_pct(competitor_avg)
    lower = competitor_avg * (1.0 - tolerance)
    upper = competitor_avg * (1.0 + tolerance)
    return not (lower <= target_count <= upper)
249
+
250
+
251
+ def _ngram_deviation_ratio(target_count: float, competitor_avg: float) -> float:
252
+ if competitor_avg <= 0:
253
+ return 0.0
254
+ return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
+
256
+
257
def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
    """Count occurrences of the (stopword-filtered) n-gram `ngram_label` in `text`.

    Both the chunk and the label are tokenized and stopword-filtered with
    the same pipeline, then matched as contiguous token windows.
    """
    tokens = _filter_stopwords(_tokenize(text), language)
    phrase = _filter_stopwords(_tokenize(ngram_label), language)
    if not tokens or not phrase:
        return 0
    size = len(phrase)
    # A window of size 1 degenerates to plain token equality, so a single
    # sliding scan covers both the unigram and the multi-word case.
    return sum(
        1
        for start in range(len(tokens) - size + 1)
        if tokens[start : start + size] == phrase
    )
271
+
272
+
273
  def _compute_metrics(
274
  analysis: Dict[str, Any],
275
  semantic: Dict[str, Any],
 
292
  bm25_remove_count = len(bm25_remove)
293
 
294
  ngram_signal_count = 0
295
+ ngram_gap_sum = 0.0
296
  ngrams = analysis.get("ngram_stats", {}) or {}
297
  for bucket_name in ("unigrams", "bigrams"):
298
  for item in (ngrams.get(bucket_name) or []):
 
301
  continue
302
  target = float(item.get("target_count", 0))
303
  comp_avg = float(item.get("competitor_avg", 0))
304
+ if _is_ngram_outside_tolerance(target, comp_avg):
 
305
  ngram_signal_count += 1
306
+ ngram_gap_sum += _ngram_deviation_ratio(target, comp_avg)
307
 
308
  title_score = None
309
  title_bert = analysis.get("title_analysis", {}).get("bert", {})
 
374
  "bert_phrase_scores": bert_phrase_scores,
375
  "bm25_remove_count": bm25_remove_count,
376
  "ngram_signal_count": ngram_signal_count,
377
+ "ngram_gap_sum": round(ngram_gap_sum, 4),
378
  "title_bert_score": title_score,
379
  "semantic_gap_count": semantic_gap_count,
380
  "semantic_gap_sum": round(semantic_gap_sum, 4),
 
431
  top_term = sorted(candidate_rows, key=lambda x: x[1], reverse=True)[0][0]
432
  candidates["semantic"] = {"type": "semantic", "label": top_term, "focus_terms": [top_term], "avoid_terms": []}
433
 
434
+ # N-gram balancing (toward competitor average with tolerance policy).
435
+ ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
436
+ ngram_stats = analysis.get("ngram_stats", {}) or {}
437
+ for bucket_name, bucket in ngram_stats.items():
438
+ if not isinstance(bucket, list):
439
+ continue
440
  for item in bucket:
441
+ ngram_label = str(item.get("ngram", "")).strip()
442
+ if not ngram_label:
443
+ continue
444
  target = float(item.get("target_count", 0))
445
  comp_avg = float(item.get("competitor_avg", 0))
446
+ comp_occ = int(item.get("comp_occurrence", 0))
447
+ if comp_occ < 2:
448
+ continue
449
+ if not _is_ngram_outside_tolerance(target, comp_avg):
450
+ continue
451
+ tol = _ngram_tolerance_pct(comp_avg)
452
+ dev_ratio = _ngram_deviation_ratio(target, comp_avg)
453
+ ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
454
+ if ngram_rows:
455
+ ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
456
+ label, target, comp_avg, tol, _, _ = ngram_rows[0]
457
+ candidates["ngram"] = {
458
+ "type": "ngram",
459
+ "label": label,
460
+ "focus_terms": [label],
461
+ "avoid_terms": [],
462
+ "ngram_target_count": target,
463
+ "ngram_comp_avg": comp_avg,
464
+ "ngram_tolerance_pct": tol,
465
+ "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
466
+ "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
467
+ "ngram_direction": "increase" if target < comp_avg else "decrease",
468
+ }
469
 
470
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
471
  title_target_score = title_bert.get("target_score")
 
643
  goal_label: str,
644
  focus_terms: List[str],
645
  language: str,
646
+ goal_meta: Optional[Dict[str, Any]] = None,
647
  ) -> float:
648
  chunk = (text or "").strip()
649
  if not chunk:
 
655
  return float(logic.util.cos_sim(embeddings[0:1], embeddings[1:2])[0][0].item())
656
  except Exception:
657
  pass
658
+ if goal_type == "ngram" and (goal_label or "").strip():
659
+ return float(_chunk_ngram_count(chunk, goal_label, language))
660
 
661
  # Lexical fallback for non-BERT goals or if embedding scoring is unavailable.
662
  toks = _filter_stopwords(_tokenize(chunk), language)
 
682
  goal_label: str,
683
  focus_terms: List[str],
684
  language: str,
685
+ goal_meta: Optional[Dict[str, Any]] = None,
686
  ) -> float:
687
+ before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language, goal_meta)
688
+ after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language, goal_meta)
689
+ if goal_type == "ngram":
690
+ target_avg = float((goal_meta or {}).get("ngram_comp_avg", 0.0))
691
+ if target_avg > 0:
692
+ # Positive delta means closer to competitor average regardless of direction.
693
+ before_dist = abs(before_rel - target_avg)
694
+ after_dist = abs(after_rel - target_avg)
695
+ return round(before_dist - after_dist, 4)
696
  return round(after_rel - before_rel, 4)
697
 
698
 
699
  def _min_chunk_delta(goal_type: str) -> float:
700
  if goal_type == "bert":
701
  return 0.01
702
+ if goal_type == "ngram":
703
+ # Require at least one occurrence-equivalent movement toward target zone.
704
+ return 0.5
705
  return 0.05
706
 
707
 
 
712
  goal_label: str,
713
  focus_terms: List[str],
714
  language: str,
715
+ goal_meta: Optional[Dict[str, Any]] = None,
716
  ) -> Tuple[float, float]:
717
+ before_rel = _chunk_goal_relevance(before_text, goal_type, goal_label, focus_terms, language, goal_meta)
718
+ after_rel = _chunk_goal_relevance(after_text, goal_type, goal_label, focus_terms, language, goal_meta)
719
  return round(before_rel, 4), round(after_rel, 4)
720
 
721
 
 
764
  "bert_low_count",
765
  "bm25_remove_count",
766
  "ngram_signal_count",
767
+ "ngram_gap_sum",
768
  "semantic_gap_count",
769
  "semantic_gap_sum",
770
  ]
 
1000
  if goal_type == "semantic":
1001
  return next_metrics["semantic_gap_count"] < prev_metrics["semantic_gap_count"]
1002
  if goal_type == "ngram":
1003
+ return (
1004
+ next_metrics["ngram_signal_count"] < prev_metrics["ngram_signal_count"]
1005
+ or float(next_metrics.get("ngram_gap_sum", 0.0)) < float(prev_metrics.get("ngram_gap_sum", 0.0))
1006
+ )
1007
  return next_metrics["score"] > prev_metrics["score"]
1008
 
1009
 
 
1038
  or float(next_metrics.get("semantic_gap_sum", 0.0)) < float(prev_metrics.get("semantic_gap_sum", 0.0))
1039
  )
1040
  if stage == "ngram":
1041
+ return (
1042
+ int(next_metrics.get("ngram_signal_count", 0)) < int(prev_metrics.get("ngram_signal_count", 0))
1043
+ or float(next_metrics.get("ngram_gap_sum", 0.0)) < float(prev_metrics.get("ngram_gap_sum", 0.0))
1044
+ )
1045
  if stage == "title":
1046
  pv = prev_metrics.get("title_bert_score")
1047
  nv = next_metrics.get("title_bert_score")
 
1331
  goal["label"],
1332
  goal.get("focus_terms", []) or [],
1333
  language,
1334
+ goal,
1335
  )
1336
  chunk_delta = _chunk_goal_delta(
1337
  original_span_text,
 
1340
  goal["label"],
1341
  goal.get("focus_terms", []) or [],
1342
  language,
1343
+ goal,
1344
  )
1345
  local_chunk_improved = chunk_delta >= _min_chunk_delta(goal["type"])
1346
 
templates/index.html CHANGED
@@ -859,7 +859,7 @@
859
  const before = it.metrics_before ? it.metrics_before.score : '-';
860
  const after = it.metrics_after ? it.metrics_after.score : '-';
861
  const baseline = (it.current_score ?? before);
862
- const reason = it.reason || (it.candidates ? 'all candidates rejected by constraints' : '-');
863
  const stage = (it.stage || (it.goal && it.goal.type) || '-');
864
  const advanced = it.advanced_to_stage ? ` → ${it.advanced_to_stage}` : '';
865
  return `<tr>
 
859
  const before = it.metrics_before ? it.metrics_before.score : '-';
860
  const after = it.metrics_after ? it.metrics_after.score : '-';
861
  const baseline = (it.current_score ?? before);
862
+ const reason = it.reason || ((it.status === 'rejected' && it.candidates) ? 'all candidates rejected by constraints' : '-');
863
  const stage = (it.stage || (it.goal && it.goal.type) || '-');
864
  const advanced = it.advanced_to_stage ? ` → ${it.advanced_to_stage}` : '';
865
  return `<tr>